Hi,
I have an encoder-decoder for sentence rewriting task. I was following the pytorch tutorial with minor changes of making encoder and decoder accepting batch. The loss and perplexity were fine at training and converged nicely, but at inference, the decoder predicts repeated words and common words, such as ["‘they’, ‘this’, ‘the’, ‘the’, ‘the’, ‘the’, ‘the’"], or [‘i’, ‘is’, ‘like’, ‘that’, ‘that’, ‘that’, ‘that’, ‘that’] using greedy decoding. I’ve been stuck on the debugging for a couple days and could someone point out what I should investigate?
batch = Shuffle(data, batch_size)
encoder_optim.zero_grad()
decoder_optim.zero_grad()
loss = torch.tensor(0).float().to(device)
batch_len = 0
avg_length = []
new_ins = []
targets_tensor_inds = []
for kid in batch:
new_ins.append([i[1] for i in data[kid])
targets_tensor_inds.append(MakeGold(targets[kid],2))
mal_x = max([len(i) for i in new_ins])
x_inds = PadTarget(new_ins,mal_x,2)
mal_y = max([len(i) for i in targets_tensor_inds])
target_tensor = PadTarget(targets_tensor_inds,mal_y,2)
x_h = encoder.initHidden(batch_size)
x_out, x_h = encoder(x_inds, x_h)
decoder_input = torch.tensor([[SOS_token]*batch_size], device=device)
if ISBIDIR:
decoder_hidden = x_h.view(2,batch_size,-1)
else:
decoder_hidden = x_h.view(1,batch_size,-1) # B X H
if random.random() > teacher_forcing:
for di in range(target_length):
decoder_input = target_tensor[:, di].detach()
decoder_output, decoder_hidden, attnweights = decoder(
decoder_input, decoder_hidden,x_out,batch_size)
topv, topi = decoder_output.topk(1)
loss += criterion(decoder_output, target_tensor[:,di])
if decoder_input is torch.tensor([[EOS_token]*batch_size], device=device):
break
else:
for di in range(target_length):
if di !=0:
decoder_input = topi.squeeze().detach() # detach from history as input #MAKE SURE ITS 1 x B X H
decoder_output, decoder_hidden, attnweights = decoder(
decoder_input, decoder_hidden,x_out,batch_size)
topv, topi = decoder_output.topk(1)
loss += criterion(decoder_output, target_tensor[:,di])
if decoder_input is torch.tensor([[EOS_token]*batch_size], device=device):
break
And my model:
class EncoderRNN(nn.Module):
def __init__(self,device, vocab_size, word_dim, hidden_size, biflag):
super(EncoderRNN, self).__init__()
self.device = device
self.hidden_size = hidden_size
self.biflag = bifla
self.embedding = nn.Embedding(vocab_size, word_dim)
if biflag:
self.gru = nn.GRU(input_size=word_dim,
hidden_size=hidden_size,
num_layers=1,
batch_first=True,
bidirectional=True)
else:
self.gru = nn.GRU(input_size=word_dim,
hidden_size=hidden_size,
num_layers=1,
batch_first=True,
bidirectional=False)
if embed_flag:
self.initEmbeddings(weights)
def forward(self, x_input, hidden):
x_input = x_input.to(self.device)
embedded = self.embedding(x_input)
output = embedded
output, x_h = self.gru(output,hidden) # output: length x batch x 2H, hidden: 2 x 1 x H
final = torch.cat([x_h[0,:,:].unsqueeze(0), x_h[1,:,:].unsqueeze(0)], dim=2) # 1 # 2 x H
return output, final
def initHidden(self,batch_size):
if self.biflag:
return torch.zeros(2, batch_size, self.hidden_size, device=self.device)
else:
return torch.zeros(1, batch_size, self.hidden_size, device=self.device)
MAX_LENGTH = 50
class AttnDecoderRNN(nn.Module):
def __init__(self, word_dim, out_vocab_size, hidden_size, biflag, device, dropout_p=0.1, max_length=MAX_LENGTH):
super(AttnDecoderRNN, self).__init__()
self.device = device
self.hidden_size = hidden_size
self.output_size = out_vocab_size
self.dropout_p = dropout_p
self.max_length = max_length
self.biflag = biflag
self.embedding = nn.Embedding(self.output_size, self.hidden_size)
self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
self.attn_combine = nn.Linear(2*self.hidden_size+self.hidden_size , self.hidden_size)
self.dropout = nn.Dropout(self.dropout_p)
if biflag:
self.gru = nn.GRU(input_size=word_dim,
hidden_size=hidden_size,
num_layers=1,
bidirectional=True)
self.out = nn.Linear(2*hidden_size, out_vocab_size)
else:
self.gru = nn.GRU(word_dim, hidden_size, num_layers=1, bidirectional=False)
self.out = nn.Linear(hidden_size, out_vocab_size)
if embed_flag:
self.initEmbeddings(weights)
def forward(self, input, hidden, encoder_outputs,batchsize):
B = batchsize
encoder_outputs = PadTensors(encoder_outputs, self.max_length) # 1 x L X 256
embedded = self.embedding(input) # B x 1 x H
embedded = self.dropout(embedded) # B x 1 x H
attn_weights = F.softmax(
self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1) # B X L
# B x L
attn_weights = attn_weights.unsqueeze(1)
attn_applied = torch.bmm(attn_weights,
encoder_outputs) # 1 x b x h
attn_applied = attn_applied.transpose(1,0)
output = torch.cat((embedded[0], attn_applied[0]), 1) #
output = self.attn_combine(output).unsqueeze(0) # 1 x B x H
output = F.relu(output)
output, hidden = self.gru(output, hidden)
output = F.log_softmax(self.out(output[0]), dim=1)
return output, hidden, attn_weights