Hi, I am new to deeplearning and pytorch ,
My Seq2seq attention model is not training and my code is
class Encoder(nn.Module):
def init(self,vocab_size,embedding_dim,enc_units,batch_sz):
super(Encoder,self).init()
self.batch_sz = batch_sz
self.enc_units = enc_units
self.embedding=nn.Embedding(vocab_size, embedding_dim)
self.gru = nn.GRU(input_size =embedding_dim,hidden_size = enc_units,batch_first =True)
def forward(self, x, hidden):
x=x.type(torch.LongTensor)
x = self.embedding(x)
output, hidden = self.gru(x, hidden)
return output, hidden
def initialize_hidden_state(self):
return torch.zeros((1,self.batch_sz,self.enc_units))
class Attention(nn.Module):
def init(self,units,max_length_inp):
super(Attention,self).init()
self.units = units
self.max_length_inp=max_length_inp
self.densor1 =nn.Linear(in_features=units*2,out_features=max_length_inp)
self.act1 =nn.Tanh()
self.densor2 =nn.Linear(max_length_inp,max_length_inp)
self.act2=nn.Softmax(dim=-1)
def forward(self,hidden,enc_output):
hidden=hidden.permute(1,0,2)
hidden = hidden.repeat(1,max_length_inp,1)
concat =torch.cat((hidden, enc_output), -1)
score =self.densor1(concat)
score=self.act1(score)
score =self.densor2(score)
score=self.act2(score)
mix =torch.matmul(score,enc_output)
mix=mix.permute(0,2,1)
mix =torch.sum(mix,dim=2)
return mix
class Decoder(nn.Module):
def init(self, vocab_size, embedding_dim, dec_units, batch_sz,attention):
super(Decoder, self).init()
self.batch_sz = batch_sz
self.dec_units = dec_units
self.embedding = nn.Embedding(vocab_size, embedding_dim)
self.gru = nn.GRU(embedding_dim+dec_units, dec_units,batch_first =True)
self.fc = nn.Linear(dec_units,vocab_size)
self.attention =attention
self.act2=nn.Softmax(dim=-1)
def forward(self, x, hidden, enc_output):
context_vector =self.attention(hidden,enc_output)
x=x.type(torch.LongTensor)
x = self.embedding(x)
x=torch.cat([torch.unsqueeze(context_vector,1),x],axis=-1)
output, state = self.gru(x,hidden)
output =torch.reshape(output,(-1,output.shape[2]))
x = self.fc(output)
x = self.act2(x)
return x, state
def initialize_hidden_state(self):
return torch.zeros((1,self.batch_sz,self.enc_units))
class Seq2Seq(nn.Module):
def init(self, encoder, decoder,batch_size,max_length,trg_vocab_size):
super().init()
self.batch_size=batch_size
self.max_length=max_length
self.trg_vocab_size=trg_vocab_size
self.encoder = encoder
self.decoder = decoder
def forward(self,src,targ,hidden):
outputs = torch.zeros(self.max_length,self.batch_size,self.trg_vocab_size)
enc_output, enc_hidden = self.encoder(src, hidden)
a=torch.tensor(targ_lang.word2idx['<start>'])
dec_hidden = enc_hidden
dec_input =a.expand(BATCH_SIZE,1)
for t in range(1,targ.shape[1]):
predictions, dec_hidden =self.decoder(dec_input, dec_hidden, enc_output)
outputs[t]=predictions
dec_input =torch.unsqueeze(targ[:,t],1)
return outputs
model =Seq2Seq(encoder,decoder,BATCH_SIZE,max_length_targ,vocab_tar_size)
optimizer =torch.optim.SGD(model.parameters(),lr=0.01, momentum=0.9)
criterion = nn.CrossEntropyLoss()
def train(model, optimizer, criterion, clip):
model.train()
epoch_loss = 0
for (batch, (inp, targ)) in enumerate(dataloader_train):
hidden=encoder.initialize_hidden_state()
src = inp
trg = targ
optimizer.zero_grad()
output = model(src, trg,hidden)
trg = trg.transpose(0,1)
trg=trg.view(trg.shape[0],trg.shape[1],1)
input=output.reshape(output.shape[0]*output.shape[1],output.shape[2])
trg=trg.reshape(trg.shape[0]*trg.shape[1],trg.shape[2])
trg = trg.type(torch.long)
trg=trg.squeeze_()
loss = criterion(input,trg)
a = list(model.parameters())[0]
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
optimizer.step()
b = list(model.parameters())[0]
print(torch.equal(a.data, b.data))
print(loss)
epoch_loss += loss.item()
return epoch_loss / len(dataloader_train)
CLIP =1
train_loss = train(model, optimizer, criterion, CLIP)