Issue with mini-batch parameter sharing

The following is copied from GitHub.

I encountered this problem while trying to implement seq2seq to familiarize myself with this new framework. The issue seems related to parameter sharing within a mini-batch. I set up a dummy training set with only one mini-batch. The mini-batch has 3 data entries, all with the same input but different outputs:

training_data = [
    [[[4,5,1,7,8],[4,5,1,7,8],[4,5,1,7,8]],  # input 1
     [[0],[0],[0]],                          # input 2
     [[1],[3],[5]]]                          # target
]

In theory, the model should never be able to fit this data because of the contradiction: the same input maps to three different targets. However, the loss reaches near 0 after only a few hundred epochs. If I instead split the 3 entries into 3 separate mini-batches, the model fails to fit the data set, which should be the correct result.
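
A quick back-of-the-envelope check of that claim (a minimal sketch, not part of the original script): since all three entries share the same input, a model whose parameters really are shared across the batch must emit one and the same distribution for all of them, so the best it can do is put 1/3 on each of the targets 1, 3 and 5, and the NLL cannot drop below ln 3 ≈ 1.0986.

import math
import torch
import torch.nn.functional as F

# Best case for a correctly shared model: one distribution that spreads
# probability evenly over the three contradictory targets (classes 1, 3, 5).
logits = torch.full((3, 10), -1e9)   # 3 decoder steps, vocabulary of 10
logits[:, [1, 3, 5]] = 0.0           # uniform over the three target classes
targets = torch.tensor([1, 3, 5])

loss = F.nll_loss(F.log_softmax(logits, dim=1), targets)
print(loss.item(), math.log(3))      # both ~1.0986, nowhere near 0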

So is the model keeping a different set of parameters for each position in the mini-batch? And are the parameters not being made identical again after each mini-batch's forward-backward pass? Can someone tell me if I have misunderstood something? My full script is below.

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim



# a single mini-batch of 3 entries: encoder input, decoder input, target
training_data = [
	[[[4,5,1,7,8],[4,5,1,7,8],[4,5,1,7,8]], [[0],[0],[0]], [[1],[3],[5]]]
	]


def prepare_sequence(all_seq):
	# wrap a nested Python list as a LongTensor on the GPU
	return autograd.Variable(torch.LongTensor(all_seq)).cuda()

class Seq2seqLSTM(nn.Module):

	def __init__(self, vocab_size, target_size, embed_dim, hidden_dim, num_layers):

		super(Seq2seqLSTM, self).__init__()
		self.hidden_dim = hidden_dim
		self.embed_dim = embed_dim
		self.vocab_size = vocab_size
		self.target_size = target_size
		self.num_layers = num_layers

		self.word_embeddings = nn.Embedding(vocab_size, embed_dim)
		self.encoder = nn.LSTM(embed_dim, hidden_dim, num_layers)
		self.decoder = nn.LSTM(embed_dim, hidden_dim, num_layers)

		self.curr_hidden = None
		self.hidden2tag = nn.Linear(hidden_dim, target_size)

	def init_hidden(self, batch_size):
		return (autograd.Variable(torch.zeros(self.num_layers, batch_size, self.hidden_dim)).cuda(),
			autograd.Variable(torch.zeros(self.num_layers, batch_size, self.hidden_dim)).cuda())

	def forward(self, enc_seq, dec_seq):

		# enc_seq comes in as (batch, seq_len); start from fresh hidden states
		batch_size = enc_seq.size(0)
		self.curr_hidden = self.init_hidden(batch_size)

		enc_embeds = self.word_embeddings(enc_seq)
		dec_embeds = self.word_embeddings(dec_seq)

		# view the (batch, seq_len, embed_dim) embeddings as (seq_len, batch, embed_dim),
		# the time-major layout nn.LSTM expects by default
		enc_out, self.curr_hidden = self.encoder(
			enc_embeds.view(-1, batch_size, self.embed_dim), self.curr_hidden)
		dec_out, self.curr_hidden = self.decoder(
			dec_embeds.view(-1, batch_size, self.embed_dim), self.curr_hidden)

		tag_space = self.hidden2tag(dec_out.view(batch_size * len(dec_out), -1))
		tag_scores = F.log_softmax(tag_space, dim=1)
		return tag_scores

EMBED_DIM = 10
HIDDEN_DIM = 10
VOCAB_SIZE = 10
TARGET_SIZE = 10
NUM_LAYERS = 2

model = Seq2seqLSTM(VOCAB_SIZE, TARGET_SIZE, EMBED_DIM, HIDDEN_DIM, NUM_LAYERS).cuda()
loss_function = nn.NLLLoss()
optimizer = optim.Adam(model.parameters())
print(model)

for epoch in range(1000):

	# prepare one mini-batch on the GPU (a list, so it can be indexed below)
	one_batch = [prepare_sequence(x) for x in training_data[epoch % len(training_data)]]
	enc_inp = one_batch[0]
	dec_inp = one_batch[1]
	target = one_batch[2]

	model.zero_grad()
	tag_scores = model(enc_inp, dec_inp)

	loss = loss_function(tag_scores, target.view(-1))
	loss.backward()
	optimizer.step()

	print(loss)
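
For what it's worth, here is a minimal sketch (not from the original issue) that checks the sharing assumption directly: run a plain nn.LSTM on a batch whose rows are copies of one sequence and verify that every batch position gets exactly the same output, i.e. there is only one set of weights for the whole batch.

import torch
import torch.nn as nn

torch.manual_seed(0)
lstm = nn.LSTM(input_size=10, hidden_size=10, num_layers=2)

# One sequence of length 5, copied 3 times along the batch dimension.
# nn.LSTM expects (seq_len, batch, input_size) by default.
seq = torch.randn(5, 1, 10)
batch = seq.expand(5, 3, 10).contiguous()

out, _ = lstm(batch)
# If each batch position had its own parameters, these could differ.
print(torch.allclose(out[:, 0], out[:, 1]))  # True
print(torch.allclose(out[:, 1], out[:, 2]))  # True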