Getting None for gradient of output (LSTM)

Hey there,

I’m coding my first LSTM and am having issues getting the network to train. The network is meant to perform a binary classification task.

It seems there’s some issue with backpropagating the loss as the gradients I see for the network’s parameters are usually extremely small (<e-03). In particular, I’ve noticed that even after backpropagation, output.grad and hidden.grad are both None, which doesn’t seem right.

Here’s the code for defining the network:

class LSTM(nn.Module):
    def __init__(self, input_size, embedding_dim,hidden_size, output_size, batch_size):
        super(LSTM, self).__init__()
        
        self.input_size= input_size
        self.embedding_size = embedding_dim
        self.hidden_size = hidden_size
        self.output_size = 1
        self.batch_size = batch_size
        
        self.linear_f = nn.Linear(embedding_dim + hidden_size, hidden_size)
        self.linear_i = nn.Linear(embedding_dim + hidden_size, hidden_size)
        self.linear_ctilde = nn.Linear(embedding_dim + hidden_size, hidden_size)
        self.linear_o = nn.Linear(embedding_dim + hidden_size, hidden_size)
        self.decoder = nn.Linear(hidden_size, output_size)
        self.init_weights()
        
        self.length=None
        
    def forward(self, x, hidden,c):
        x_emb = x
        length=x_emb.shape[0]
        embs = torch.chunk(x_emb, self.length, 1)
        outputs=[]
        def step(emb, hid, c_t):
            combined = torch.cat((hid, emb), 1)
            f = torch.sigmoid(self.linear_f(combined))
            i = torch.sigmoid(self.linear_i(combined))
            c_tilde = torch.tanh(self.linear_ctilde(combined))
            c_t = f * c_t + i * c_tilde
            o = torch.sigmoid(self.linear_o(combined))
            
            hid = o * torch.tanh(c_t)
            return hid, c_t
            
        for i in range(len(embs)):
            hidden, c = step(embs[i], hidden, c)
        
        decoded=self.decoder(hidden)
        output=torch.softmax(decoded,1)
        return output, hidden

    def init_hidden(self):
        h0 = torch.zeros(self.batch_size, self.hidden_size,requires_grad=True)
        c0 = torch.zeros(self.batch_size, self.hidden_size,requires_grad=True)
        return h0, c0
    
    def init_weights(self):
        initrange = .1
        lin_layers = [self.linear_f, self.linear_i, self.linear_ctilde, self.linear_o, self.decoder]
     
        for layer in lin_layers:
            layer.weight.data.uniform_(-initrange, initrange)
            if layer in lin_layers:
                layer.bias.data.fill_(0)

And here’s the function I’m using to train the network:

def training_loop(batch_size, num_epochs, model, loss_, optim, training_iter, dev_iter, train_eval_iter,verbose,end_early):
    step = 0
    epoch = 0
    total_batches = int(len(training_set) / batch_size)
    epoch_loss=[]
    start_time=time.time()
    outputs=[]
    ground_truths=[]
    last_fi=model.linear_f.weight
    while epoch <= num_epochs:
        model.train()
        x=next(training_iter)
        vectors, conversions = get_batch(x)
        vectors = torch.stack(vectors).view([len(vectors),len(vectors[0])]).float() # batch_size, seq_len
        conversions = torch.stack(conversions).long().view([batch_size])
        
        model.length=len(vectors[0])
        hidden, cell_state = model.init_hidden()
        output, hidden = model(vectors, hidden, cell_state)
        lossy= loss_(output, conversions)
        lossy.backward()
        
        print(output.grad)
        
        optim.step()
        model.zero_grad()
        
        if step % total_batches == 0:
            if not epoch%1:
                model.eval()
                print("Epoch %i; Step %i; Loss %f; Train acc: %f; Dev acc %f" 
                      %(epoch, step, item,\
                        evaluate(model, train_eval_iter, lstm),\
                        evaluate(model, dev_iter, lstm)))
                print('')

        step += 1

Note: Since the sequences are of varying lengths and I didn’t want to do zero-padding (it seemed like it would alter the meaning of the data) I use a batch size of 1 and pass the length of each sequence by setting the model’s ‘length’ attribute