Newbie question - LSTM is not updating itself

Hello, I am trying to create an LSTM that learns on toy data similar to some data I eventually want to train on. The code runs, but the network's weights never change and it never learns the labels. It is a binary classification problem, so I store my labels (0/1) in a tensor of size (num_datapts x 1) and my inputs in a tensor of size (num_datapts x 3). For concreteness, here is a minimal sketch of that layout (made-up values, not my real data):
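import torch

num_datapts = 3
example_inputs = torch.randn(num_datapts, 3)       # (num_datapts x 3) feature vectors
example_labels = torch.tensor([[0.], [1.], [0.]])  # (num_datapts x 1) binary labels

Here is the complete code: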

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

INPUT_SIZE = 3
HIDDEN_SIZE = 3
OUTPUT_SIZE = 1     # one score per datapoint (binary classification)

class MyLSTM(nn.Module):

    def __init__(self, input_dim, hidden_dim, output_size):
        super(MyLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        # The LSTM takes the 3-dimensional input vectors and outputs hidden
        # states with dimensionality hidden_dim.
        self.lstm = nn.LSTM(input_dim, hidden_dim)
        # The linear layer that maps from hidden state space to a single output score
        self.fc1 = nn.Linear(hidden_dim, output_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Before we've done anything, we don't have any hidden state.
        # Refer to the PyTorch documentation to see exactly
        # why it has this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(1, 1, self.hidden_dim),     # hx & cx
                torch.zeros(1, 1, self.hidden_dim))

    def forward(self, input):
        lstm_out, self.hidden = self.lstm(input, self.hidden)
        tag_space = self.fc1(lstm_out.view(len(input), -1))
        # Softmax squashes the raw scores into (0, 1) so BCELoss can consume them.
        # Note that dim=0 normalizes the scores over the sequence dimension.
        tag_scores = F.softmax(tag_space, dim=0)
        return tag_scores

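# For reference: given input of shape (seq_len, 1, INPUT_SIZE), forward()
# above returns a tensor of shape (seq_len, OUTPUT_SIZE), one score per step.
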
# Make some input data
inputs = [torch.randn(1, 3), torch.randn(1, 3), torch.randn(1, 3)]  # a sequence of length 3
# Add the extra 2nd dimension
# Cat concatenates the list of 3 1x3 tensors into one 3x3 tensor.
# View changes the dimensions of the tensor while preserving the data. -1 infers the size from other dimensions.
# This goes 3*(1 x 3) -> (3 x 3) -> (3 x 1 x 3)
inputs = torch.cat(inputs).view(len(inputs), 1, -1)
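# Sanity check: nn.LSTM with default arguments expects input of shape
# (seq_len, batch, input_size), so this should be (3, 1, 3).
assert inputs.shape == (3, 1, 3)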
targets = torch.tensor([0, 0, 0], dtype=torch.float32)
print(targets.dim())

# Train the model
model = MyLSTM(INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE)
loss_function = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training
# Note that element i of the output is the score for datapoint i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
    tag_scores = model(inputs)
    print(tag_scores)

for epoch in range(300):  # normally you would not need 300 epochs; this is toy data
    # Step 1. Remember that Pytorch accumulates gradients.
    # We need to clear them out before each instance
    model.zero_grad()

    # Also, we need to clear out the hidden state of the LSTM,
    # detaching it from its history on the last instance.
    model.hidden = model.init_hidden()
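    # (If the hidden state were carried over between epochs without this
    # reset, backward() would try to propagate through the previous epoch's
    # graph, which has already been freed.)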

    # Step 2. Run our forward pass.
    tag_scores = model(inputs)

    # Step 3. Compute the loss, gradients, and update the parameters by
    #  calling optimizer.step()
    loss = loss_function(tag_scores, targets)
    print(tag_scores)
    print(loss)
    loss.backward()
    optimizer.step()

with torch.no_grad():
    tag_scores = model(inputs)
    print(tag_scores)

Can anyone tell me what is going on?