The following is copied from github.
I encounter this problem when I am trying to implement seq2seq to familiarize with this new framework. This issue seems related to parameters sharing in mini-batch. I setup a dummy training set with only one mini-batch. This mini-batch has 3 data entries in it. All with the same input, and different outputs:
training_data = [
[[[4,5,1,7,8],[4,5,1,7,8],[4,5,1,7,8]], (input 1)
[[0],[0],[0]], (input 2)
[[1],[3],[5]]] (target)
]
In theory, the model will never learn this data because the contradiction. However, the loss reach near 0 after only a few hundred epochs. But if I split 3 data entries into 3 mini-batch, the model will not learn the data set which should be the correct result.
So the model must be keeping different set of parameter for each position in mini-batch? And the parameters are not updated to be the same after each mini-batch forward-backward? Can someone tell me if I misunderstood something?
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
training_data = [
[[[4,5,1,7,8],[4,5,1,7,8],[4,5,1,7,8]], [[0],[0],[0]], [[1],[3],[5]]]
]
def prepare_sequence(all_seq):
return autograd.Variable(torch.LongTensor(all_seq)).cuda()
class Seq2seqLSTM(nn.Module):
def __init__(self, vocab_size, target_size, embed_dim, hidden_dim, num_layers):
super(Seq2seqLSTM, self).__init__()
self.hidden_dim = hidden_dim
self.embed_dim = embed_dim
self.vocab_size = vocab_size
self.target_size = target_size
self.num_layers = num_layers
self.word_embeddings = nn.Embedding(vocab_size, embed_dim)
self.encoder = nn.LSTM(embed_dim, hidden_dim, num_layers)
self.decoder = nn.LSTM(embed_dim, hidden_dim, num_layers)
self.curr_hidden = None
self.hidden2tag = nn.Linear(hidden_dim, target_size)
def init_hidden(self, batch_size):
return (autograd.Variable(torch.zeros(self.num_layers, batch_size, self.hidden_dim)).cuda(),
autograd.Variable(torch.zeros(self.num_layers, batch_size, self.hidden_dim)).cuda())
def forward(self, enc_seq, dec_seq):
batch_size = enc_seq.size()[0]
self.curr_hidden = self.init_hidden(batch_size)
enc_embeds = self.word_embeddings(enc_seq)
dec_embeds = self.word_embeddings(dec_seq)
enc_out, self.curr_hidden = self.encoder(
enc_embeds.view(-1, batch_size, self.embed_dim), self.curr_hidden)
dec_out, self.curr_hidden = self.decoder(
dec_embeds.view(-1, batch_size, self.embed_dim), self.curr_hidden)
tag_space = self.hidden2tag(dec_out.view(batch_size * len(dec_out), -1))
tag_scores = F.log_softmax(tag_space)
return tag_scores
EMBED_DIM = 10
HIDDEN_DIM = 10
VOCAB_SIZE = 10
TARGET_SIZE = 10
NUM_LAYERS = 2
model = Seq2seqLSTM(VOCAB_SIZE, TARGET_SIZE, EMBED_DIM, HIDDEN_DIM, NUM_LAYERS).cuda()
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters())
print model
for epoch in range(1000):
one_batch = map(lambda x:prepare_sequence(x), training_data[epoch%len(training_data)])
enc_inp = one_batch[0]
dec_inp = one_batch[1]
target = one_batch[2]
model.zero_grad()
tag_scores = model(enc_inp, dec_inp)
loss = loss_function(tag_scores, target.view(-1))
loss.backward()
optimizer.step()
print loss