Loss is the same in each batch and the gradients are also the same

Hi, I am new to deep learning and PyTorch. My seq2seq attention model is not training. My code is:

import torch
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(input_size=embedding_dim, hidden_size=enc_units, batch_first=True)

    def forward(self, x, hidden):
        x = x.type(torch.LongTensor)
        x = self.embedding(x)
        output, hidden = self.gru(x, hidden)
        return output, hidden

    def initialize_hidden_state(self):
        return torch.zeros((1, self.batch_sz, self.enc_units))

class Attention(nn.Module):
    def __init__(self, units, max_length_inp):
        super(Attention, self).__init__()
        self.units = units
        self.max_length_inp = max_length_inp
        self.densor1 = nn.Linear(in_features=units * 2, out_features=max_length_inp)
        self.act1 = nn.Tanh()
        self.densor2 = nn.Linear(max_length_inp, max_length_inp)
        self.act2 = nn.Softmax(dim=-1)

    def forward(self, hidden, enc_output):
        hidden = hidden.permute(1, 0, 2)
        hidden = hidden.repeat(1, self.max_length_inp, 1)
        concat = torch.cat((hidden, enc_output), -1)
        score = self.densor1(concat)
        score = self.act1(score)
        score = self.densor2(score)
        score = self.act2(score)

        mix = torch.matmul(score, enc_output)
        mix = mix.permute(0, 2, 1)
        mix = torch.sum(mix, dim=2)
        return mix

class Decoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz, attention):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim + dec_units, dec_units, batch_first=True)
        self.fc = nn.Linear(dec_units, vocab_size)
        self.attention = attention
        self.act2 = nn.Softmax(dim=-1)

    def forward(self, x, hidden, enc_output):
        context_vector = self.attention(hidden, enc_output)
        x = x.type(torch.LongTensor)
        x = self.embedding(x)
        x = torch.cat([torch.unsqueeze(context_vector, 1), x], axis=-1)
        output, state = self.gru(x, hidden)
        output = torch.reshape(output, (-1, output.shape[2]))
        x = self.fc(output)
        x = self.act2(x)
        return x, state

    def initialize_hidden_state(self):
        return torch.zeros((1, self.batch_sz, self.dec_units))

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, batch_size, max_length, trg_vocab_size):
        super().__init__()
        self.batch_size = batch_size
        self.max_length = max_length
        self.trg_vocab_size = trg_vocab_size
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, targ, hidden):
        outputs = torch.zeros(self.max_length, self.batch_size, self.trg_vocab_size)
        enc_output, enc_hidden = self.encoder(src, hidden)
        a = torch.tensor(targ_lang.word2idx['<start>'])
        dec_hidden = enc_hidden
        dec_input = a.expand(BATCH_SIZE, 1)
        for t in range(1, targ.shape[1]):
            predictions, dec_hidden = self.decoder(dec_input, dec_hidden, enc_output)
            outputs[t] = predictions
            dec_input = torch.unsqueeze(targ[:, t], 1)
        return outputs

model = Seq2Seq(encoder, decoder, BATCH_SIZE, max_length_targ, vocab_tar_size)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
criterion = nn.CrossEntropyLoss()

def train(model, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    for (batch, (inp, targ)) in enumerate(dataloader_train):
        hidden = encoder.initialize_hidden_state()
        src = inp
        trg = targ
        optimizer.zero_grad()
        output = model(src, trg, hidden)
        trg = trg.transpose(0, 1)
        trg = trg.view(trg.shape[0], trg.shape[1], 1)
        input = output.reshape(output.shape[0] * output.shape[1], output.shape[2])
        trg = trg.reshape(trg.shape[0] * trg.shape[1], trg.shape[2])
        trg = trg.type(torch.long)
        trg = trg.squeeze_()
        loss = criterion(input, trg)
        a = list(model.parameters())[0]
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        b = list(model.parameters())[0]
        print(torch.equal(a.data, b.data))
        print(loss)
        epoch_loss += loss.item()
    return epoch_loss / len(dataloader_train)

CLIP =1

train_loss = train(model, optimizer, criterion, CLIP)

No offense, but don’t just post badly formatted and undocumented code and expect others to help.

Having said that, you first want to get a simplified version training properly. You don’t immediately need attention, and I’ve never seen such complicated code for just the decoder.

Start with something working – there are good Seq2Seq PyTorch tutorials – and extend them as needed.
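To make that concrete, here is a rough, untested sketch of a bare-bones GRU encoder-decoder trained with teacher forcing and no attention. The class name, vocabulary sizes, hidden size, and <start> index are placeholders I made up, not anything from your code. Note that it returns raw logits and lets nn.CrossEntropyLoss apply the log-softmax internally:

import torch
import torch.nn as nn

# Placeholder sizes; replace with your own vocab sizes, dims, and <start> index.
SRC_VOCAB, TRG_VOCAB = 1000, 1000
EMB_DIM, HID_DIM = 64, 128
SOS_IDX = 1

class SimpleSeq2Seq(nn.Module):
    """Plain GRU encoder-decoder, no attention, full teacher forcing."""
    def __init__(self):
        super().__init__()
        self.src_emb = nn.Embedding(SRC_VOCAB, EMB_DIM)
        self.trg_emb = nn.Embedding(TRG_VOCAB, EMB_DIM)
        self.encoder = nn.GRU(EMB_DIM, HID_DIM, batch_first=True)
        self.decoder = nn.GRU(EMB_DIM, HID_DIM, batch_first=True)
        self.fc = nn.Linear(HID_DIM, TRG_VOCAB)

    def forward(self, src, trg):
        # src: (batch, src_len), trg: (batch, trg_len), both LongTensors
        _, hidden = self.encoder(self.src_emb(src))
        # Shift the target right: feed <start> + trg[:, :-1], predict trg
        sos = torch.full((trg.size(0), 1), SOS_IDX, dtype=torch.long)
        dec_in = torch.cat([sos, trg[:, :-1]], dim=1)
        out, _ = self.decoder(self.trg_emb(dec_in), hidden)
        return self.fc(out)  # raw logits: (batch, trg_len, TRG_VOCAB)

model = SimpleSeq2Seq()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()  # applies log-softmax internally

# A few steps on random data, just to check that the loss and gradients change.
src = torch.randint(0, SRC_VOCAB, (32, 10))
trg = torch.randint(0, TRG_VOCAB, (32, 12))
for step in range(5):
    optimizer.zero_grad()
    logits = model(src, trg)
    loss = criterion(logits.reshape(-1, TRG_VOCAB), trg.reshape(-1))
    loss.backward()
    optimizer.step()
    print(step, loss.item())

If a toy loop like this prints a loss that never changes between steps, the problem is in the training setup rather than in the attention mechanism; if it trains, you can add your attention module back in one piece at a time.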
