I’m new to the PyTorch framework (coming mainly from Theano and TensorFlow), which I really enjoy using.
I’ve followed the introduction tutorial and read the Classifying Names with a Character-Level RNN one.
I’m now trying to adapt it to a character-level LSTM model in order to gain some practical experience with the framework.
Basically, I feed the model sequences of character indices and give it as target the same sequence, shifted one step into the future.
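For example, the input/target pairs look roughly like this (a minimal sketch on a toy corpus; the `text`, `to_idx`, and one-hot step below are stand-ins for my actual data helper, and the one-hot encoding is only there because the LSTM’s input size equals the vocabulary size):

```python
import numpy as np

# Toy corpus and char -> index mapping (stand-ins for my real data helper).
text = "hello world"
chars = sorted(set(text))
to_idx = {c: i for i, c in enumerate(chars)}

indices = np.array([to_idx[c] for c in text])

# Input is the sequence; target is the same sequence shifted by one step.
X_idx = indices[:-1]  # "hello worl"
Y = indices[1:]       # "ello world"

# One-hot encode the inputs, since the LSTM uses input_size == vocab_size.
vocab_size = len(chars)
X = np.zeros((1, len(X_idx), vocab_size), dtype=np.float32)
X[0, np.arange(len(X_idx)), X_idx] = 1.0
```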
However, I can’t overfit a simple training example, and I don’t see what I did wrong.
If someone can spot my mistake, it would be very helpful.
Here is my code:
```python
import torch
import torch.nn as nn
import torch.optim as optim
from torch import autograd

class LSTMTxtGen(nn.Module):
    def __init__(self, hidden_dim, n_layer, vocab_size):
        super(LSTMTxtGen, self).__init__()
        self.n_layer = n_layer
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.lstm = nn.LSTM(vocab_size, hidden_dim, n_layer,
                            batch_first=True)
        # The linear layer that maps from hidden state space to tag space
        #self.hidden = self.init_hidden()

    def init_hidden(self, batch_size):
        # Before we've done anything, we don't have any hidden state.
        # Refer to the PyTorch documentation to see exactly why they have
        # this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim).
        return (autograd.Variable(torch.zeros(self.n_layer, batch_size,
                                              self.hidden_dim)),
                autograd.Variable(torch.zeros(self.n_layer, batch_size,
                                              self.hidden_dim)))

    def forward(self, seqs):
        # Fresh zero hidden state on every forward pass.
        self.hidden = self.init_hidden(seqs.size(0))
        lstm_out, self.hidden = self.lstm(seqs, self.hidden)
        # Flatten to (batch * seq_len, hidden_dim) before the projection.
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        lstm_out = nn.Linear(lstm_out.size(1), self.vocab_size)(lstm_out)
        return lstm_out


model = LSTMTxtGen(
    hidden_dim=50,
    n_layer=3,
    vocab_size=44,
)
print(model)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adamax(model.parameters())

# Data is my own helper module; the generator first yields some metadata,
# then (input, target) batches as numpy arrays.
G = Data.batch_generator(5, 100)
batch_per_epoch, to_idx, to_char = next(G)

# Reuse a single batch to try to overfit it.
X, Y = next(G)
X = autograd.Variable(torch.from_numpy(X))
Y = autograd.Variable(torch.from_numpy(Y))

for epoch in range(10):
    losses = []
    for batch_count in range(batch_per_epoch):
        model.zero_grad()
        #model.hidden = model.init_hidden()
        #X, Y = next(G)
        preds = model(X)
        # CrossEntropyLoss expects (N, C) logits and (N,) class indices.
        loss = criterion(preds.view(-1, model.vocab_size), Y.view(-1))
        loss.backward()
        optimizer.step()
        losses.append(loss)
        if batch_count % 20 == 0:
            print('Loss: ', losses[-1])
```
The loss keeps oscillating and no improvement is made.