Here is my code for some toy data:
# packed_sequence = nn.utils.rnn.pack_padded_sequence(inputs, [10, 9, 8, 7, 5], batch_first=True)
# print(packed_sequence)
# unpacked_tensor, lengths = nn.utils.rnn.pad_packed_sequence(packed_sequence, batch_first=True)
# print(unpacked_tensor)
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
from random import randint
import sys
use_cuda = False
class SentenceEncoderRNN(nn.Module):
def __init__(self, input_dim, embed_dim, hidden_dim, embeddings=None, fine_tune_embeddings=True, variable_lengths=True, bidirectional=True):
super(SentenceEncoderRNN, self).__init__()
self.embed_dim = embed_dim
self.hidden_dim = hidden_dim
self.variable_lengths = variable_lengths
self.bidirectional = bidirectional
self.embedding = nn.Embedding(input_dim, embed_dim, padding_idx=0)
# self.embedding.weight = nn.Parameter(torch.from_numpy(embeddings).float())
# self.embedding.weight.requires_grad = fine_tune_embeddings
self.gru = nn.GRU(embed_dim, hidden_dim, bidirectional=bidirectional, batch_first=True)
def forward(self, inputs, lengths):
embedded = self.embedding(inputs)
if self.variable_lengths:
embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths, batch_first=True)
all_outs, hidden = self.gru(embedded)
return all_outs, hidden
class Classifier(nn.Module):
"""docstring for Classifier"""
def __init__(self, encoder, hidden_dim):
super(Classifier, self).__init__()
self.hidden_dim = hidden_dim
self.encoder = encoder
self.Wc = nn.Linear(hidden_dim, 1)
def forward(self, inputs, lengths):
outputs, hidden = self.encoder(inputs, lengths)
if self.encoder.variable_lengths:
outputs, lengths = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)
last_outputs = outputs[:, -1, :]
temp = self.Wc(last_outputs)
prob = F.sigmoid(temp)
return prob
def predict(self, inputs, lengths):
return self.forward(inputs, lengths).data
def train(encoder, clfr, optimizer, inputs, lengths, targets, batch_size):
epochs = 1000
copy_epochs = epochs
while copy_epochs > 0:
optimizer.zero_grad()
outputs = clfr(inputs, lengths)
loss_function = nn.MSELoss()
loss = loss_function(outputs, targets)
loss.backward()
optimizer.step()
copy_epochs -= 1
if copy_epochs % 100 == 0:
print(loss.data[0])
def prepare_data(batch_size, max_timesteps, vocab_size):
inputs = torch.LongTensor(batch_size, max_timesteps).zero_()
lengths = [10, 9, 7, 7, 5]
targets = Variable(torch.Tensor([1, 0, 0, 0, 1]))
for i in range(batch_size):
for j in range(max_timesteps):
if j == lengths[i]:
break
inputs[i, j] = randint(1, vocab_size - 1)
inputs = Variable(inputs)
return inputs, lengths, targets
if __name__ == '__main__':
variable_lengths = True # or False
print('Variable lengths:', variable_lengths)
batch_size, max_timesteps, vocab_size, embed_dim, hidden_dim = 5, 10, 20, 15, 7
bidirectional = False
if bidirectional:
num_dir = 2
else:
num_dir = 1
inputs, lengths, targets = prepare_data(batch_size, max_timesteps, vocab_size)
print('Inputs:')
print(inputs)
encoder = SentenceEncoderRNN(vocab_size, embed_dim, hidden_dim, variable_lengths=variable_lengths, bidirectional=bidirectional)
clfr = Classifier(encoder, hidden_dim * num_dir)
optimizer = torch.optim.SGD(clfr.parameters(), lr=0.01, momentum=0.9)
print('Loss:')
train(encoder, clfr, optimizer, inputs, lengths, targets, batch_size)
print('Predictions:')
print(clfr.predict(inputs, lengths))
Here are the outputs:
Without using pack_padded_sequence
Variable lengths: False
Inputs:
Variable containing:
4 11 6 10 5 17 1 1 3 9
18 4 8 19 19 14 1 18 11 0
19 3 8 3 10 17 12 0 0 0
5 19 9 6 13 2 17 0 0 0
6 14 2 18 1 0 0 0 0 0
[torch.LongTensor of size 5x10]Loss:
0.19118347764015198
0.15429019927978516
0.1406058967113495
0.11726896464824677
0.05257026106119156
0.013342835009098053
0.005831681191921234
0.003512600902467966
0.0024605693761259317
0.001874120207503438
Predictions:0.9772
0.0425
0.0303
0.0413
0.9337
[torch.FloatTensor of size 5x1]
Using pack_padded_sequence
Variable lengths: True
Inputs:
Variable containing:
9 10 12 11 5 1 7 5 16 16
16 18 15 13 12 17 16 16 16 0
5 2 17 3 19 2 5 0 0 0
18 10 16 16 15 7 3 0 0 0
17 15 2 5 11 0 0 0 0 0
[torch.LongTensor of size 5x10]Loss:
0.1822376549243927
0.16268190741539001
0.1562534123659134
0.15366964042186737
0.1524355411529541
0.1517626792192459
0.15135648846626282
0.15109115839004517
0.15090689063072205
0.15077264606952667
Predictions:0.9393
0.2565
0.2565
0.2565
0.2565
[torch.FloatTensor of size 5x1]
The input to my models is a bunch of sequence of variable lengths.
I am getting significantly lower performance of the model when I use pack_padded_sequence (to try and automatically deal with variable length input).
Is there any bug in my implementation? Can anyone explain why this may be happening?
Thanks!