Hello, I am trying to create an LSTM to learn on toy data that is similar to some data I want to train on. My code runs, but the network does not update itself and does not learn the labels. This is a binary classification problem, so I am storing my labels (0/1) in a tensor of size (num_datapts x 1) and my inputs are in a tensor of size (num_datapts x 3). Here is the complete code:
import torch import torch.nn as nn import torch.nn.functional as F import torch.optim as optim INPUT_SIZE = 3 HIDDEN_SIZE = 3 OUTPUT_SIZE = 1 # why 2? class MyLSTM(nn.Module): def __init__(self, input_dim, hidden_dim, output_size): super(MyLSTM, self).__init__() self.hidden_dim = hidden_dim # The LSTM takes word embeddings as inputs, and outputs hidden states # with dimensionality hidden_dim. self.lstm = nn.LSTM(input_dim, hidden_dim) # The linear layer that maps from hidden state space to tag space self.fc1 = nn.Linear(hidden_dim, output_size) self.hidden = self.init_hidden() def init_hidden(self): # Before we've done anything, we dont have any hidden state. # Refer to the Pytorch documentation to see exactly # why they have this dimensionality. # The axes semantics are (num_layers, minibatch_size, hidden_dim) return (torch.zeros(1, 1, self.hidden_dim), # hx & cx torch.zeros(1, 1, self.hidden_dim)) def forward(self, input): lstm_out, self.hidden = self.lstm(input, self.hidden) tag_space = self.fc1(lstm_out.view(len(input), -1)) # F.log_softmax applies softmax activation function on outputs # Take the log because we're going to define the loss function as NLL and it expects that tag_scores = F.softmax(tag_space, dim=0) return tag_scores # Make some input data inputs = [torch.randn(1, 3), torch.randn(1, 3), torch.randn(1, 3)] # for _ in range(5)] # make a sequence of length 5 #targets = [torch.ones(1,3, dtype=torch.int64), torch.ones(1,3, dtype=torch.int64), torch.ones(1,3, dtype=torch.int64)] #targets = torch.tensor([, , ], dtype=torch.float32) #torch.tensor([, , ]) # Add the extra 2nd dimension # Cat concatenates the list of 3 1x3 tensors into one 3x3 tensor. # View changes the dimensions of the tensor while preserving the data. -1 infers the size from other dimensions. # This goes 3*(1 x 3) -> (3 x 3) -> (3 x 1 x 3) inputs = torch.cat(inputs).view(len(inputs), 1, -1) targets = torch.tensor([0, 0, 0], dtype=torch.float32) #torch.cat(targets)#.view(len(targets), 1, -1) print targets.dim() # Train the model model = MyLSTM(INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE) loss_function = nn.BCELoss() optimizer = optim.SGD(model.parameters(), lr=0.1) # See what the scores are before training # Note that element i,j of the output is the score for tag j for word i. # Here we don't need to train, so the code is wrapped in torch.no_grad() with torch.no_grad(): tag_scores = model(inputs) print(tag_scores) for epoch in range(300): # again, normally you would NOT do 300 epochs, it is toy data # Step 1. Remember that Pytorch accumulates gradients. # We need to clear them out before each instance model.zero_grad() # Also, we need to clear out the hidden state of the LSTM, # detaching it from its history on the last instance. model.hidden = model.init_hidden() # Step 2. Run our forward pass. tag_scores = model(inputs) # Step 3. Compute the loss, gradients, and update the parameters by # calling optimizer.step() loss = loss_function(tag_scores, targets) print tag_scores print loss loss.backward() optimizer.step() with torch.no_grad(): tag_scores = model(inputs) print(tag_scores)
Can anyone tell me what is going on?