Hello, I am trying to create an LSTM to learn on toy data that is similar to some data I want to train on. My code runs, but the network does not update itself and does not learn the labels. This is a binary classification problem, so I am storing my labels (0/1) in a tensor of size (num_datapts x 1) and my inputs are in a tensor of size (num_datapts x 3). Here is the complete code:

```
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
INPUT_SIZE = 3
HIDDEN_SIZE = 3
OUTPUT_SIZE = 1 # why 2?
class MyLSTM(nn.Module):
def __init__(self, input_dim, hidden_dim, output_size):
super(MyLSTM, self).__init__()
self.hidden_dim = hidden_dim
# The LSTM takes word embeddings as inputs, and outputs hidden states
# with dimensionality hidden_dim.
self.lstm = nn.LSTM(input_dim, hidden_dim)
# The linear layer that maps from hidden state space to tag space
self.fc1 = nn.Linear(hidden_dim, output_size)
self.hidden = self.init_hidden()
def init_hidden(self):
# Before we've done anything, we dont have any hidden state.
# Refer to the Pytorch documentation to see exactly
# why they have this dimensionality.
# The axes semantics are (num_layers, minibatch_size, hidden_dim)
return (torch.zeros(1, 1, self.hidden_dim), # hx & cx
torch.zeros(1, 1, self.hidden_dim))
def forward(self, input):
lstm_out, self.hidden = self.lstm(input, self.hidden)
tag_space = self.fc1(lstm_out.view(len(input), -1))
# F.log_softmax applies softmax activation function on outputs
# Take the log because we're going to define the loss function as NLL and it expects that
tag_scores = F.softmax(tag_space, dim=0)
return tag_scores
# Make some input data
inputs = [torch.randn(1, 3), torch.randn(1, 3), torch.randn(1, 3)] # for _ in range(5)] # make a sequence of length 5
#targets = [torch.ones(1,3, dtype=torch.int64), torch.ones(1,3, dtype=torch.int64), torch.ones(1,3, dtype=torch.int64)]
#targets = torch.tensor([[1], [1], [1]], dtype=torch.float32) #torch.tensor([[0], [0], [1]])
# Add the extra 2nd dimension
# Cat concatenates the list of 3 1x3 tensors into one 3x3 tensor.
# View changes the dimensions of the tensor while preserving the data. -1 infers the size from other dimensions.
# This goes 3*(1 x 3) -> (3 x 3) -> (3 x 1 x 3)
inputs = torch.cat(inputs).view(len(inputs), 1, -1)
targets = torch.tensor([0, 0, 0], dtype=torch.float32) #torch.cat(targets)#.view(len(targets), 1, -1)
print targets.dim()
# Train the model
model = MyLSTM(INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE)
loss_function = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)
# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
tag_scores = model(inputs)
print(tag_scores)
for epoch in range(300): # again, normally you would NOT do 300 epochs, it is toy data
# Step 1. Remember that Pytorch accumulates gradients.
# We need to clear them out before each instance
model.zero_grad()
# Also, we need to clear out the hidden state of the LSTM,
# detaching it from its history on the last instance.
model.hidden = model.init_hidden()
# Step 2. Run our forward pass.
tag_scores = model(inputs)
# Step 3. Compute the loss, gradients, and update the parameters by
# calling optimizer.step()
loss = loss_function(tag_scores, targets)
print tag_scores
print loss
loss.backward()
optimizer.step()
with torch.no_grad():
tag_scores = model(inputs)
print(tag_scores)
```

Can anyone tell me what is going on?