Hi all,
I am trying to use packed sequnce as input to RNN for language modeling, but it didn’t work as expected. Here are the codes.
The following code does not use packed sequence and works fine.
class LanguageModel(nn.Module):
def __init__(self, ntoken, ninp, nhid, nlayers):
super(LanguageModel, self).__init__()
self.encoder = nn.Embedding(ntoken, ninp)
self.rnn = nn.GRU(ninp, nhid, nlayers, bidirectional=False, batch_first=True)
self.decoder = nn.Linear(nhid, ntoken)
def forward(self, inputs, mask):
# embedding
emb = self.encoder(inputs.long())
output_, _ = self.rnn(emb)
# mask output
mask_ = mask.unsqueeze(-1).expand_as(output_).float()
output = output_ * mask_
# project output
decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
return decoded.view(output.size(0), output.size(1), decoded.size(1))
The loss is
epoch: 0, it: 100, loss: 3.0152803540229796, acc: 0.21636631093919279
epoch: 0, it: 200, loss: 2.5584751963615417, acc: 0.2976714611053467
epoch: 0, it: 300, loss: 2.424778082370758, acc: 0.31738966673612595
epoch: 0, it: 400, loss: 2.3470527958869933, acc: 0.3234238114953041
epoch: 0, it: 500, loss: 2.3100508141517637, acc: 0.32845291540026667
epoch: 0, it: 600, loss: 2.269477825164795, acc: 0.33436131440103056
epoch: 0, it: 700, loss: 2.2323202776908873, acc: 0.3435117769241333
epoch: 0, it: 800, loss: 2.197794075012207, acc: 0.3516477417945862
epoch: 0, it: 900, loss: 2.161339772939682, acc: 0.36355975896120074
epoch: 0, it: 1000, loss: 2.1328598356246946, acc: 0.37262321919202807
epoch: 0, it: 1100, loss: 2.120845100879669, acc: 0.37346176490187644
epoch: 0, it: 1200, loss: 2.0859076166152954, acc: 0.3842319694161415
epoch: 0, it: 1300, loss: 2.070769666433334, acc: 0.39238578140735625
epoch: 0, it: 1400, loss: 2.057626646757126, acc: 0.394229926019907
The following code only changes the above class in forward function to use packed seqence, and loss is not decreasing.
def forward(inputs, mask):
# embedding
emb = self.encoder(inputs.long())
# sequence length
seq_lengths = torch.sum(mask, dim=-1).squeeze(-1)
# sort sequence by length
sorted_len, sorted_idx = seq_lengths.sort(0, descending=True)
index_sorted_idx = sorted_idx\
.view(-1, 1, 1).expand_as(inputs)
sorted_inputs = inputs.gather(0, index_sorted_idx.long())
# pack sequence
packed_seq = torch.nn.utils.rnn.pack_padded_sequence(
sorted_inputs, sorted_len.cpu().data.numpy(), batch_first=True)
# feed it into RNN
out, _ = self.rnn(packed_seq)
# unpack sequence
unpacked, unpacked_len = \
torch.nn.utils.rnn.pad_packed_sequence(
out, batch_first=True)
# unsort the output
_, original_idx = sorted_idx.sort(0, descending=True)
unsorted_idx = original_idx\
.view(-1, 1, 1).expand_as(unpacked)
output = unpacked.gather(0, unsorted_idx.long())
# project output
decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
return decoded.view(output.size(0), output.size(1), decoded.size(1))
The loss is
epoch: 0, it: 100, loss: 3.2207558250427244, acc: 0.16291182031854987
epoch: 0, it: 200, loss: 3.0119342851638793, acc: 0.17143549174070358
epoch: 0, it: 300, loss: 2.9969462013244628, acc: 0.17032290264964103
epoch: 0, it: 400, loss: 3.004516990184784, acc: 0.16658018743619324
epoch: 0, it: 500, loss: 2.987579824924469, acc: 0.17096973054111003
epoch: 0, it: 600, loss: 2.9835088515281676, acc: 0.1719639204442501
epoch: 0, it: 700, loss: 2.983652164936066, acc: 0.17081086978316307
epoch: 0, it: 800, loss: 2.993579874038696, acc: 0.16737559842411429
epoch: 0, it: 900, loss: 2.981204776763916, acc: 0.1713446132838726
epoch: 0, it: 1000, loss: 2.982670919895172, acc: 0.17059179410338401
epoch: 0, it: 1100, loss: 2.975895357131958, acc: 0.17110723197460176
epoch: 0, it: 1200, loss: 2.9888737654685973, acc: 0.1680946245789528
epoch: 0, it: 1300, loss: 2.982082223892212, acc: 0.17025468410924077
In both experiments, I used SGD with 0.1 learning rate.
Am I using the packed sequence in a wrong way?
Thanks!