Consistency problems for simple seq2seq

Hey guys, I wrote an encoder-decoder model with attention that learns to reverse strings - that is, given abc it should return cba.

I get good results for most of my runs, but every few runs, my network doesn’t converge at all.
Any suggestions on how to check what may be causing this?

My encoder and decoder:

class EncoderRNN(rnn.RNN):
    """Encoder: an embedding layer feeding a batch-first GRU.

    The GRU is bidirectional unless ``bidirect`` is false. The constructor
    also builds an Adam optimizer (default settings) over the parameters
    returned by ``self.params()`` and stores it as ``self.opt``.

    ``create_emb`` and ``params`` are not defined in this class; they are
    expected to be inherited from ``rnn.RNN``.
    """

    def __init__(self, hidden_size, emb_size, vocab_size, pre_trained_emb=None, n_layers=1, bidirect=True):
        super(EncoderRNN, self).__init__()

        # Plain configuration values.
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.bidirect = bidirect

        # Sub-modules. The embedding is created before the GRU on purpose:
        # creation order determines parameter order and RNG consumption.
        self.emb = self.create_emb(vocab_size, emb_size, pre_trained_emb)
        self.gru = nn.GRU(
            emb_size,
            hidden_size,
            num_layers=n_layers,
            batch_first=True,
            bidirectional=bidirect,
        )

        # Optimizer is created last so it sees every parameter registered above.
        self.opt = optim.Adam(self.params())

    def forward(self, input_sequence, hidden):
        """Embed ``input_sequence`` and run it through the GRU.

        Args:
            input_sequence: token indices accepted by the embedding layer
                (assumed (batch, seq) because the GRU is batch-first -
                TODO confirm against the caller).
            hidden: initial GRU hidden state.

        Returns:
            The ``(output, hidden)`` pair produced by the GRU.
        """
        embedded = self.emb(input_sequence)
        gru_out, last_hidden = self.gru(embedded, hidden)
        return gru_out, last_hidden

class DecoderRNN(rnn.RNN):
    """Single-step GRU decoder with additive (tanh-scored) attention.

    Every call to ``forward`` scores all encoder outputs against the current
    decoder state, builds a context vector from them, mixes it with the
    embedding of the previous token and advances the GRU by one step.

    The number of GRU layers is read from ``config.values`` (key
    ``'n_layers'``, default 1). The constructor also stores an Adam optimizer
    with ``lr=0.01`` over ``self.params()`` as ``self.opt``.

    ``create_emb`` and ``params`` are not defined in this class; they are
    expected to be inherited from ``rnn.RNN``.
    """

    def __init__(self, hidden_size, emb_size, vocab_size, pre_trained_emb=None):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.n_layers = config.values.get('n_layers', 1)
        self.emb = self.create_emb(vocab_size, emb_size, pre_trained_emb)

        # Attention parameters.
        self.W1 = Par(hidden_size, hidden_size)  # U_a in the paper
        self.W2 = Par(hidden_size, hidden_size)  # W_a in the paper
        # Projects [previous-token embedding ; context vector] to the GRU input size.
        self.W3 = Par(emb_size + hidden_size, hidden_size)
        self.b2 = Par(hidden_size)
        self.b3 = Par(hidden_size)
        self.v = Par(hidden_size)

        self.gru = nn.GRU(hidden_size, hidden_size, num_layers=self.n_layers)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.opt = optim.Adam(self.params(), lr=0.01)

    def forward(self, prev_inp, hidden, enc_outputs):
        """Run one decoding step.

        Args:
            prev_inp: indices of the previously emitted tokens
                (assumed shape (batch,) - TODO confirm against the caller).
            hidden: previous hidden state; it must be a tensor that can be
                viewed as (n_layers, batch, hidden_size).
            enc_outputs: encoder outputs to attend over
                (assumed shape (batch, seq, hidden_size) - TODO confirm).

        Returns:
            A tuple ``(log_probs, hidden, attn_weights)``: log-softmax scores
            over the vocabulary, the new GRU hidden state, and the attention
            weights with a trailing singleton axis.
        """
        # e = v . tanh(enc_outputs @ W1 + state @ W2 + b2)
        Uh = torch.matmul(enc_outputs, self.W1)

        # NOTE(review): the posted snippet had lost its `torch.cat(...)` call
        # here (the line was not valid Python). Reconstructed as: concatenate
        # the slices of `hidden` along the feature axis and keep the first
        # `hidden_size` columns - confirm against the author's original code.
        # `tuple(hidden)` is used because current torch.cat requires a
        # sequence of tensors rather than a single tensor.
        state = torch.cat(tuple(hidden), 1)[:, :self.hidden_size]
        Ws = torch.matmul(state, self.W2)
        Wsb = torch.add(Ws, self.b2).unsqueeze(1)

        # torch.tanh replaces the deprecated F.tanh.
        u = torch.tanh(torch.add(Uh, Wsb))
        attn_weights = torch.mul(self.v, u).sum(2)
        # dim=1 is what the implicit-dim call resolved to for this 2-D input;
        # it is spelled out because calling softmax without `dim` is deprecated.
        attn_weights = F.softmax(attn_weights, dim=1).unsqueeze(2)

        # sum(1) already removes the reduced axis; the former `.squeeze(1)`
        # would additionally have dropped the feature axis when hidden_size == 1.
        context_vector = torch.mul(attn_weights, enc_outputs).sum(1)

        # s_i = f(s_i-1, y_i-1, c_i)
        prev_inp_emb = self.emb(prev_inp)
        # NOTE(review): `torch.cat(` was also missing on this line in the post.
        step_input = torch.cat([prev_inp_emb, context_vector], 1)
        res = torch.matmul(step_input, self.W3)
        # The GRU is not batch-first, so add a leading axis of length 1.
        res = torch.add(res, self.b3).unsqueeze(0)
        res, hidden = self.gru(res, hidden.view(self.n_layers, -1, self.hidden_size))
        res = self.linear(res.squeeze(0))
        # Explicit dim for the same reason as the softmax above.
        res = F.log_softmax(res, dim=1)

        return res, hidden, attn_weights

A few functions that are shared by the encoder and the decoder:

class RNN(nn.Module):
    """Base module with helpers shared by recurrent sub-classes.

    The helpers read ``self.n_layers``, ``self.hidden_size`` and
    ``self.bidirect``; this class does not set them, so sub-classes must.
    """

    def __init__(self):
        super(RNN, self).__init__()

    def initHidden(self, batch_size):
        """Return an all-zero hidden state passed through ``cuda``.

        The tensor has shape
        (n_layers * num_directions(), batch_size, hidden_size).
        """
        leading_dim = self.n_layers * self.num_directions()
        zeros = torch.zeros(leading_dim, batch_size, self.hidden_size)
        return cuda(Variable(zeros))

    def create_emb(self, output_size, emb_size, pre_trained_emb_mat):
        """Build an ``nn.Embedding(output_size, emb_size)``.

        When ``pre_trained_emb_mat`` is given, it is loaded as the embedding
        weight and every parameter of the embedding is frozen
        (``requires_grad = False``).
        """
        emb = nn.Embedding(output_size, emb_size)
        if pre_trained_emb_mat is None:
            return emb

        emb.load_state_dict({'weight': pre_trained_emb_mat})
        for weight in emb.parameters():
            weight.requires_grad = False
        return emb

    def params(self):
        """Iterate lazily over the parameters that still require gradients."""
        return (param for param in self.parameters() if param.requires_grad)

    def num_directions(self):
        """Return 2 when ``self.bidirect`` is truthy, otherwise 1."""
        if self.bidirect:
            return 2
        return 1

And a few more helper functions:

def cuda(var):
    """Return ``var.cuda()`` when CUDA is available, otherwise ``var`` unchanged."""
    return var.cuda() if torch.cuda.is_available() else var

def Arr(*sizes):
    """Return a ``torch.randn`` tensor whose dimensions are ``sizes``."""
    # NOTE(review): torch.randn samples with unit variance. If these tensors
    # are used as initial weights, that scale may be worth checking as a
    # source of the run-to-run instability described in the post.
    shape = sizes
    return torch.randn(shape)
def Par(*sizes):
    """Return a ``torch.nn.Parameter`` wrapping the tensor ``Arr(*sizes)``."""
    initial_value = Arr(*sizes)
    return torch.nn.Parameter(initial_value)

I initialized the encoder and decoder in the following way:

# Encoder: layer count and directionality come from the surrounding script.
encoder = EncoderRNN(
    hidden_size,
    emb_size,
    vocab.n_words,
    n_layers=n_layers,
    bidirect=bidirect,
)
# Decoder: its hidden size is the encoder's hidden size multiplied by the
# number of encoder directions.
decoder = DecoderRNN(
    hidden_size * enc_num_of_directions,
    emb_size,
    vocab.n_words,
)

Notice that the decoder's state size is twice that of the encoder (the encoder is bidirectional).