Hey guys, I wrote an encoder-decoder with attention model to predict reversed strings - that is, given "abc" it should return "cba".
I get good results on most of my runs, but every few runs the network doesn't converge at all.
Any suggestions on how to check what might be causing this?
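For context, the training data is just strings paired with their reverses; here is a minimal sketch of how such pairs can be generated (the lowercase alphabet and the make_pair helper are illustrative assumptions, not my actual data code):

import random
import string

def make_pair(min_len=3, max_len=10):
    # e.g. ("abc", "cba") - a random lowercase string and its reverse
    s = ''.join(random.choices(string.ascii_lowercase, k=random.randint(min_len, max_len)))
    return s, s[::-1]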
My encoder and decoder:
class EncoderRNN(rnn.RNN):
    def __init__(self, hidden_size, emb_size, vocab_size, pre_trained_emb=None, n_layers=1, bidirect=True):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.bidirect = bidirect
        self.emb = self.create_emb(vocab_size, emb_size, pre_trained_emb)
        self.gru = nn.GRU(emb_size, hidden_size, batch_first=True, num_layers=n_layers, bidirectional=bidirect)
        self.opt = optim.Adam(self.params())

    def forward(self, input_sequence, hidden):
        embeddings = self.emb(input_sequence)          # (batch, seq_len, emb_size)
        output, hidden = self.gru(embeddings, hidden)  # output: (batch, seq_len, hidden_size * num_directions)
        return output, hidden
class DecoderRNN(rnn.RNN):
    def __init__(self, hidden_size, emb_size, vocab_size, pre_trained_emb=None):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.n_layers = config.values.get('n_layers', 1)
        self.emb = self.create_emb(vocab_size, emb_size, pre_trained_emb)
        # additive attention parameters
        self.W1 = Par(hidden_size, hidden_size)            # U_a in the paper
        self.W2 = Par(hidden_size, hidden_size)            # W_a in the paper
        self.W3 = Par(emb_size + hidden_size, hidden_size)
        self.b2 = Par(hidden_size)
        self.b3 = Par(hidden_size)
        self.v = Par(hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, num_layers=self.n_layers)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.opt = optim.Adam(self.params(), lr=0.01)

    def forward(self, prev_inp, hidden, enc_outputs):
        # attention scores over the encoder outputs
        Uh = torch.matmul(enc_outputs, self.W1)                                 # (batch, seq_len, hidden_size)
        Ws = torch.matmul(torch.cat(hidden, 1)[:, :self.hidden_size], self.W2)  # (batch, hidden_size)
        Wsb = torch.add(Ws, self.b2).unsqueeze(1)
        u = F.tanh(torch.add(Uh, Wsb))
        attn_weights = torch.mul(self.v, u).sum(2)
        attn_weights = F.softmax(attn_weights, dim=1).unsqueeze(2)              # normalize over source positions
        context_vector = torch.mul(attn_weights, enc_outputs).sum(1).squeeze(1)
        # s_i = f(s_{i-1}, y_{i-1}, c_i)
        prev_inp_emb = self.emb(prev_inp)
        res = torch.matmul(torch.cat([prev_inp_emb, context_vector], 1), self.W3)
        res = torch.add(res, self.b3).unsqueeze(0)                              # (1, batch, hidden_size) for the GRU
        res, hidden = self.gru(res, hidden.view(self.n_layers, -1, self.hidden_size))
        res = self.linear(res.squeeze(0))
        res = F.log_softmax(res, dim=1)
        return res, hidden, attn_weights
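For reference, the attention I'm trying to implement is the additive attention from the Bahdanau et al. paper (that's what the U_a / W_a comments above refer to). In that notation, with encoder outputs h_j and previous decoder state s_{i-1}:

    e_ij = v^T tanh(W_a s_{i-1} + U_a h_j)        # W2, W1 and v above
    a_ij = softmax_j(e_ij)                        # attn_weights
    c_i  = sum_j a_ij h_j                         # context_vector
    s_i  = GRU([y_{i-1}; c_i] W3 + b3, s_{i-1})   # the GRU step above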
A few functions that are shared between the encoder and the decoder:
class RNN(nn.Module):
    def __init__(self):
        super(RNN, self).__init__()

    def initHidden(self, batch_size):
        return cuda(Variable(torch.zeros(self.n_layers * self.num_directions(), batch_size, self.hidden_size)))

    def create_emb(self, output_size, emb_size, pre_trained_emb_mat):
        emb = nn.Embedding(output_size, emb_size)
        if pre_trained_emb_mat is not None:
            # load the pre-trained matrix and freeze it
            emb.load_state_dict({'weight': pre_trained_emb_mat})
            for param in emb.parameters():
                param.requires_grad = False
        return emb

    def params(self):
        # only parameters that still require gradients (skips frozen embeddings)
        return (p for p in self.parameters() if p.requires_grad)

    def num_directions(self):
        return 2 if self.bidirect else 1
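So when a pre-trained embedding matrix is passed in, its weights are loaded and frozen, and params() hands the optimizer only the trainable parameters. For example (pre_trained_mat is just a stand-in; in the initialization shown below I don't pass one, so the embeddings are trained from scratch):

pre_trained_mat = torch.randn(vocab.n_words, emb_size)  # stand-in for a real embedding matrix
enc = EncoderRNN(hidden_size, emb_size, vocab.n_words, pre_trained_emb=pre_trained_mat)
# enc.emb.weight.requires_grad is now False, so enc.params() excludes it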
And a few more helper functions:
def cuda(var):
    # move to GPU if one is available
    if torch.cuda.is_available():
        return var.cuda()
    return var

def Arr(*sizes): return torch.randn(sizes)               # tensor drawn from a standard normal
def Par(*sizes): return torch.nn.Parameter(Arr(*sizes))
I initialized the encoder and decoder in the following way:
encoder = EncoderRNN(hidden_size, emb_size, vocab.n_words, n_layers=n_layers, bidirect=bidirect)
decoder = DecoderRNN(hidden_size*enc_num_of_directions, emb_size, vocab.n_words)
Notice that the decoder's hidden state size is twice the encoder's, since the encoder is bidirectional (enc_num_of_directions = 2).
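My full training loop isn't shown here, but roughly the pieces fit together like the sketch below (simplified: it assumes n_layers = 1, uses greedy decoding instead of teacher forcing, and input_batch, SOS_token, batch_size and target_length are placeholder names):

enc_hidden = encoder.initHidden(batch_size)                 # (2, batch, hidden_size) for a 1-layer bidirectional GRU
enc_outputs, enc_hidden = encoder(input_batch, enc_hidden)  # enc_outputs: (batch, seq_len, 2 * hidden_size)

# one way to build the decoder's initial state: concatenate the forward
# and backward final encoder states into a single 2 * hidden_size vector
dec_hidden = torch.cat([enc_hidden[0], enc_hidden[1]], 1).unsqueeze(0)

dec_input = cuda(Variable(torch.LongTensor([SOS_token] * batch_size)))
for di in range(target_length):
    dec_output, dec_hidden, attn = decoder(dec_input, dec_hidden, enc_outputs)
    dec_input = dec_output.topk(1)[1].squeeze(1)            # feed the argmax token back in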