This is the model I wrote for language modelling (basically dialogue A to dialogue B generation). If I pass the key_padding_mask arguments, the test loss becomes NaN; if I don't pass them, the test loss starts increasing right after the first epoch.
import torch
import torch.nn as nn

class Transfomermodel(nn.Module):
    def __init__(self):
        super().__init__()
        self.trans = nn.Transformer(d_model=300, nhead=6, num_encoder_layers=6,
                                    num_decoder_layers=6, dim_feedforward=2048, dropout=0.1)
        self.embedding = nn.Embedding(len(vocab) + 1, 300)
        self.fc = nn.Linear(300, len(vocab) + 1)
        self.srce_mask = None
        self.trg_mask = None
        self.src_pad_mask = None
        self.trg_pad_mask = None

    def forward(self, src, trg):
        # src/trg come in as (seq_len, batch), the default layout for nn.Transformer
        self.srce_mask = self._generate_square_subsequent_mask(len(src)).cuda()
        self.trg_mask = self._generate_square_subsequent_mask(len(trg)).cuda()
        self.src_pad_mask = self._generate_padding_mask(src).cuda()
        self.trg_pad_mask = self._generate_padding_mask(trg).cuda()
        src_inp = self.embedding(src)
        trg_inp = self.embedding(trg)
        out = self.trans(src_inp, trg_inp,
                         src_mask=self.srce_mask, tgt_mask=self.trg_mask,
                         src_key_padding_mask=self.src_pad_mask,
                         tgt_key_padding_mask=self.trg_pad_mask)
        return self.fc(out)

    def _generate_square_subsequent_mask(self, sz):
        # additive causal mask: 0.0 on and below the diagonal, -inf above it
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def _generate_padding_mask(self, tsr):
        # boolean mask, True where the token id is 0 (padding),
        # transposed from (seq_len, batch) to the (batch, seq_len) shape the key_padding_mask args expect
        msk = (tsr == 0).transpose(0, 1)
        return msk

    def init_weights(self):
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()
        self.fc.weight.data.uniform_(-initrange, initrange)
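As a quick sanity check of the mask shapes (a minimal CPU-only sketch with made-up sizes, nothing here depends on my real data), the two helpers produce a (seq_len, seq_len) additive mask for src_mask/tgt_mask and a (batch, seq_len) boolean mask for the *_key_padding_mask arguments, with True meaning "ignore this position":

import torch

# toy batch: 2 sequences of length 5, already in (seq_len, batch) layout, pad id = 0
src = torch.tensor([[5, 2],
                    [3, 9],
                    [7, 4],
                    [0, 8],
                    [0, 1]])

sz = len(src)  # seq_len = 5
# same logic as _generate_square_subsequent_mask above
causal = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
causal = causal.float().masked_fill(causal == 0, float('-inf')).masked_fill(causal == 1, 0.0)
# same logic as _generate_padding_mask above
pad = (src == 0).transpose(0, 1)

print(causal.shape)  # torch.Size([5, 5]) -> additive mask for src_mask / tgt_mask
print(pad.shape)     # torch.Size([2, 5]) -> boolean mask for *_key_padding_mask (True = ignore)
print(pad)           # tensor([[False, False, False,  True,  True],
                     #         [False, False, False, False, False]])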
I am using CrossEntropyLoss, and this is my training code:
for epoch in tqdm(range(epochs)):
    model.train()
    epoch_loss = 0
    for batch in tqdm(train_iterator):
        src_inp = batch['src_sentence'].permute(1, 0).cuda()   # (batch, seq_len) -> (seq_len, batch)
        trg_inp = batch['trg_sentence'].permute(1, 0).cuda()
        in_tgt = trg_inp[:-1, :]    # decoder input: all target tokens except the last
        exp_tgt = trg_inp[1:, :]    # expected output: target shifted by one
        optimizer.zero_grad()
        output = model(src_inp, in_tgt)
        output = output.view(-1, output.shape[-1])   # flatten to (tokens, vocab) for CrossEntropyLoss
        trg = exp_tgt.reshape(-1)
        loss = criterion1(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()
        epoch_loss += loss.item()
    y.append(epoch_loss / len(train_iterator))
    print(y)

    epoch_loss = 0
    model.eval()
    for batch in tqdm(val_iterator):
        src_inp = batch['src_sentence'].permute(1, 0).cuda()
        trg_inp = batch['trg_sentence'].permute(1, 0).cuda()
        in_tgt = trg_inp[:-1, :]
        exp_tgt = trg_inp[1:, :]
        optimizer.zero_grad()
        output = model(src_inp, in_tgt)
        output = output.view(-1, output.shape[-1])
        trg = exp_tgt.reshape(-1)
        loss = criterion1(output, trg)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)   # no backward here, so no gradients to clip
        epoch_loss += loss.item()
    y_val.append(epoch_loss / len(val_iterator))
    print(y_val)

    curr_loss = epoch_loss / len(val_iterator)
    if curr_loss < best_loss:
        best_loss = curr_loss
        torch.save(model.state_dict(), MODEL_DIR)
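For completeness, the objects the loop refers to are set up roughly like this (the learning rate, number of epochs, and MODEL_DIR path are placeholders here, not necessarily the exact values I used):

from tqdm import tqdm

model = Transfomermodel().cuda()
criterion1 = nn.CrossEntropyLoss()                          # as stated above; no ignore_index set
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)   # Adam; the lr is a placeholder
epochs = 10                                                 # placeholder
y, y_val = [], []                                           # per-epoch train / validation losses
best_loss = float('inf')
MODEL_DIR = 'best_model.pt'                                 # placeholder save path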
I am using the Adam optimizer, and I don't know where I am going wrong. The predicted output is either gibberish or a repetition of a single word. Can anyone help me figure out what's wrong?
batch["xxx_sentence] is a long tensor of size [batch_size, sequence length]