Running a recurrent (GRU) model over the bAbI dataset. The Encoder
module computes embeddings for the story
and the query
and processes them with two GRUs; the Answer
module consumes the representation produced by the Encoder
and uses a fully connected layer to pick a word from the answer vocabulary, which is separate from the story-query vocabulary.
encoder = EncoderRNN(svocab_size, hidden_size).cuda()
answer = Answer(hidden_size, avocab_size).cuda()
The training loss remains at the same value ([0] loss: 4.1544718742370605
), no matter what. I have tried:
- batch_size - 1, 16, 128, 256, 512
- optimizers - SGD, Adam
- lr - 0.1, 0.01, 0.001
Please find the code for the model below.
class EncoderRNN(nn.Module):
def __init__(self, input_size, hidden_size, n_layers=1):
super(EncoderRNN, self).__init__()
self.n_layers = n_layers
self.hidden_size = hidden_size
self.embedding = nn.Embedding(input_size, hidden_size)
self.storyRnn = nn.GRU(hidden_size, hidden_size)
self.queryRnn = nn.GRU(hidden_size, hidden_size)
def forward(self, story, query, shidden, qhidden):
#print(story.size())
sembedded = self.embedding(story).transpose(1, 0)
soutput = sembedded
#print(sembedded.size())
for i in range(self.n_layers):
soutput, shidden = self.storyRnn(soutput, shidden)
qembedded = self.embedding(query).transpose(1, 0)
qoutput = qembedded
for i in range(self.n_layers):
qoutput, qhidden = self.queryRnn(qoutput, qhidden)
return soutput, shidden, qoutput, qhidden
class Answer(nn.Module):
    """Map the concatenated story/query encoding to answer-vocabulary scores.

    BUG FIX: the original forward() returned F.softmax(self.linear(input))
    (with no dim argument). Feeding already-softmaxed probabilities into a
    cross-entropy-style criterion squashes the gradients and pins the loss
    near ln(vocab_size) — the reported constant loss of 4.1545 ≈ ln(64) is
    the classic symptom (NOTE(review): the criterion is constructed outside
    this snippet; confirm it is nn.CrossEntropyLoss). forward() now returns
    raw logits: pair this module with nn.CrossEntropyLoss, or apply
    F.log_softmax(..., dim=1) and use nn.NLLLoss.

    Args:
        hidden_size: encoder hidden dimension; the layer consumes the
            concatenation of the story and query states (2 * hidden_size).
        output_size: size of the answer vocabulary.
    """

    def __init__(self, hidden_size, output_size):
        super(Answer, self).__init__()
        # Input is [story_state ; query_state] concatenated along dim 1.
        self.input_size = hidden_size * 2
        self.output_size = output_size
        self.linear = nn.Linear(self.input_size, self.output_size)

    def forward(self, input):
        # Raw, unnormalized scores over the answer vocabulary; the loss
        # criterion is responsible for the (log-)softmax.
        return self.linear(input)
def train(input, target, modules, criterion, optimizers):
    """Run one optimization step over a batch.

    Args:
        input: (story, query) pair of integer id sequences (batch-first).
        target: answer-word ids, one per batch element.
        modules: (encoder, answer) — the EncoderRNN and Answer modules.
        criterion: loss function applied to (prediction, target).
        optimizers: (encoder_optimizer, answer_optimizer).

    Returns:
        The scalar batch loss.
    """
    encoder, answer = modules
    eoptim, aoptim = optimizers
    story, query = input
    story, query = torch.LongTensor(story).cuda(), torch.LongTensor(query).cuda()
    story, query = Variable(story), Variable(query)
    target = torch.LongTensor(target).cuda()
    target = Variable(target)
    batch_size = story.data.size()[0]
    shidden_state = encoder.initHidden(batch_size)
    qhidden_state = encoder.initHidden(batch_size)
    # BUG FIX: gradients were never cleared, so .backward() accumulated
    # gradients across batches and every optimizer step used stale sums.
    eoptim.zero_grad()
    aoptim.zero_grad()
    so, sh, qo, qh = encoder(story, query, shidden_state, qhidden_state)
    # Combine the final story and query states (so/qo are time-major, so
    # [-1] is the last time step) into one representation for the answerer.
    representation = F.elu(torch.cat((so[-1], qo[-1]), 1))
    prediction = answer(representation)
    loss = criterion(prediction, target)
    loss.backward()
    eoptim.step()
    aoptim.step()
    return loss.data[0]