Model not learning anything

Hi, I am trying to create an encoder-decoder model with no attention involved. I have trained it for 15 epochs, but the loss doesn't converge at all, from the first epoch to the last. What mistake might I be making? For reference, I am attaching my code:

```python
class EncoderLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.5):
        super(EncoderLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size, device=device)
        self.LSTM = nn.LSTM(hidden_size, hidden_size, num_layers=2, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        embedding = self.dropout(self.embedding(input))
        out, (hide, cs) = self.LSTM(embedding)
        return out, (hide, cs)

    def predict(self, input):
        z1 = vocab.lookup_indices(input.split(' '))
        z1.insert(0, vocab['<sos>'])  # prepend start-of-sequence token
        z1.append(vocab['<eos>'])     # append end-of-sequence token
        out, (hide, cs) = self.forward(input=torch.tensor(z1).view(1, len(z1)).to(device))
        return out, (hide, cs)
```

```python
class DecoderLSTM(nn.Module):
    def __init__(self, hidden_size, output_size, encoder):
        super(DecoderLSTM, self).__init__()
        self.Embedding = nn.Embedding(output_size, hidden_size)
        self.LSTM = nn.LSTM(hidden_size, hidden_size, num_layers=2, batch_first=True)
        self.out1 = nn.Linear(hidden_size, output_size * 3)
        self.out2 = nn.Linear(output_size * 3, output_size)
        self.grelu = nn.GELU()
        self.relu = nn.ReLU()
        self.encoder = encoder

    def forward(self, encoder_output, encoder_hidden, target_tensor=None):
        batch_size = encoder_output.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(sos_tensor)
        decoder_hidden = encoder_hidden
        decoder_outputs = []
        for i in range(8):
            decoder_output, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)
            if target_tensor is not None:
                # teacher forcing: feed the ground-truth token as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1)
            else:
                # greedy decoding: feed back the most likely token
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()
        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        return decoder_outputs, decoder_hidden, None

    def forward_step(self, input, hidden):
        output = self.Embedding(input)
        output, (hidden_new, cell_state_new) = self.LSTM(output, hidden)
        output = self.grelu(self.out1(output))
        output = self.relu(self.out2(output))
        return output, (self.grelu(hidden_new), self.grelu(cell_state_new))

    def predict(self, encoderpredict):
        encoder_output, encoder_hidden = encoderpredict
        d_o, d_h, _ = self.forward(encoder_output, encoder_hidden, None)
        z = torch.argmax(d_o, dim=2)
        z = z.view(-1, z.shape[1])
        z = z.tolist()
        z1 = vocab.lookup_tokens(z[0])
        return z1
```

```python
encoder = EncoderLSTM(256, 128)
encoder_optimizer = torch.optim.Adam(encoder.parameters(), 1e-3, weight_decay=0.09)
encoder.to(device)
decoder = DecoderLSTM(128, len(vocab), encoder)
loss_fn_decoder = torch.nn.CrossEntropyLoss()
decoder_optimizer = torch.optim.Adam(decoder.parameters(), learning_rate, weight_decay=0.09)
decoder.to(device)
```

```python
for i in range(epochs):
    encoder.train()
    decoder.train()
    train_loss = 0
    for batch in tqdm(train_dataloader, desc='Training'):
        data, labels = batch
        decoder_optimizer.zero_grad()
        encoder_optimizer.zero_grad()
        output_encoder, hidden_encoder = encoder(data.to(device))
        output_decoder, hidden, none = decoder(encoder_output=output_encoder.to(device),
                                               encoder_hidden=hidden_encoder,
                                               target_tensor=labels.to(device))
        loss = loss_fn_decoder(output_decoder.view(-1, output_decoder.shape[-1]).to(device),
                               labels.view(-1).to(device))
        loss.backward()
        encoder_optimizer.step()
        decoder_optimizer.step()
        train_loss += loss
    print(train_loss)
```
[Image: the training loss per epoch]

You may want to format your post so that your code is easier to read. Try wrapping your code in ```…```.

Since you're using an LSTM encoder and decoder, why are you passing only the hidden state but not the cell state from the encoder to the decoder? In fact, if you call self.LSTM in the decoder's forward_step method with only the hidden state, the cell state will always be freshly initialized (PyTorch defaults it to zeros) on every step.
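
For reference, nn.LSTM accepts and returns the hidden and cell state together as a tuple; if you pass nothing, both default to zeros. Here's a minimal self-contained sketch (made-up sizes, not your exact model) of carrying both states from an encoder LSTM into a decoder LSTM and between decoding steps:

```python
import torch
import torch.nn as nn

# hypothetical sizes for illustration
batch, src_len, hidden = 4, 10, 128

enc_lstm = nn.LSTM(hidden, hidden, num_layers=2, batch_first=True)
dec_lstm = nn.LSTM(hidden, hidden, num_layers=2, batch_first=True)

src = torch.randn(batch, src_len, hidden)
enc_out, (h_n, c_n) = enc_lstm(src)       # h_n, c_n: (num_layers, batch, hidden)

# seed the decoder with BOTH encoder states, not just h_n
dec_in = torch.randn(batch, 1, hidden)    # embedded first decoder token
dec_out, (h_1, c_1) = dec_lstm(dec_in, (h_n, c_n))

# feeding the tuple back in at each step keeps the cell state alive
dec_out, (h_2, c_2) = dec_lstm(dec_out, (h_1, c_1))
```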

Here's a working implementation of a basic RNN-based Seq2Seq model, including a notebook. Note the _create_final_hidden() method, which combines the hidden and cell state in case the encoder and decoder are LSTMs.
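
As a rough illustration of that idea (a hypothetical sketch, not the actual code from that notebook): when the encoder is an LSTM, keep its (h, c) pair together when building the decoder's initial state, rather than dropping the cell state:

```python
import torch
from typing import Tuple, Union

State = Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]

def make_decoder_init_state(encoder_state: State) -> State:
    """Hypothetical helper: an LSTM returns a (h_n, c_n) tuple, a GRU/RNN
    just h_n. Preserve whichever form the encoder produced so the decoder
    receives its full initial state."""
    if isinstance(encoder_state, tuple):          # LSTM: (h_n, c_n)
        h_n, c_n = encoder_state
        return (h_n.contiguous(), c_n.contiguous())
    return encoder_state.contiguous()             # GRU/RNN: h_n only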