LSTM model weight gradients are all zero

jaysan33 · July 11, 2025, 3:52pm

Hello all, hoping someone can point out a flaw in my model. I have an LSTM model that computes the loss only at the end of a sequence (sequence of length 5 using batching). I apply a linear layer to the final hidden state to compute an output and use MSE as the loss function. When I run the model, my linear layer weights will update, but the LSTM weights will not. When I check the gradients of the LSTM weights, they are all zero, or effectively zero. Any help is greatly appreciated. Code below:

# Fraction of the training set that goes into each mini-batch.
batch_proportion = .1
# Clamp to at least 1: for a small training set the rounded product can be 0,
# and DataLoader refuses a non-positive batch_size.
batch_size = max(1, int(np.round(batch_proportion * x_train.shape[0])))
dataset = TensorDataset(x_train_tensors, y_train_tensors)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# determine LSTM structure
# NOTE(review): assumes axis 1 of `inputs` is the per-time-step feature axis and
# that it matches the last axis of x_train_tensors -- TODO confirm against the
# data pipeline.
input_dim = inputs.shape[1]
hidden_dim = 3
target_dim = 1

class LSTM_Vol(nn.Module):
    """LSTM followed by a linear read-out of the final hidden state.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state.
        target_dim: Size of the prediction produced for each sequence.
        init_state_value: Constant used to fill the initial hidden and cell
            states. Defaults to 15, the value this model has always used.
    """

    def __init__(self, input_dim, hidden_dim, target_dim, init_state_value=15.0):
        super().__init__()

        # NOTE(review): a large initial cell state sits in the flat region of
        # tanh, which may shrink the gradients reaching the LSTM weights on
        # short sequences -- consider 0.0 if those gradients look vanishing.
        self.init_state_value = init_state_value

        # batch_first=True: the LSTM consumes batch x seq_len x input_dim.
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)

        # Linear layer mapping the final hidden state to a prediction.
        self.hidden2out = nn.Linear(hidden_dim, target_dim)

    def forward(self, batch, batch_size=None):
        """Run the sequences through the LSTM and predict from the last hidden state.

        Args:
            batch: Input of shape batch x seq_len x input_dim.
            batch_size: Number of sequences in ``batch``. Optional; when
                omitted it is taken from ``batch.shape[0]``.

        Returns:
            Tensor of shape batch x target_dim.
        """
        if batch_size is None:
            batch_size = batch.shape[0]

        # Size the states from the layer itself instead of a module-level
        # global, so the model works with whatever hidden_dim it was built with.
        state_shape = (self.lstm.num_layers, batch_size, self.lstm.hidden_size)
        # new_full creates the states on the same device/dtype as the input.
        h0 = batch.new_full(state_shape, self.init_state_value)
        c0 = batch.new_full(state_shape, self.init_state_value)

        # hn is num_layers x batch x hidden; only the final hidden state is used.
        _, (hn, _) = self.lstm(batch, (h0, c0))

        # hn[-1] is the last layer's final hidden state: batch x hidden.
        return self.hidden2out(hn[-1])

model = LSTM_Vol(input_dim, hidden_dim, target_dim)

# define loss function and optimizer method
loss_function = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.5)

num_epochs = 1
epoch = np.arange(num_epochs)
epoch_avg_loss = np.zeros((epoch.size))

model.train()  # put model in training mode

for t in epoch:
    counter = 0
    cum_loss = 0.0

    for batch_X, batch_y in data_loader:
        optimizer.zero_grad()
        # Call the module itself rather than .forward() so registered hooks run.
        vol_predict = model(batch_X, batch_X.shape[0])  # batch_size x target_dim
        # Expects batch_y to be batch_size x 1 so it lines up with the prediction.
        loss = loss_function(vol_predict.view(-1, 1), batch_y)
        loss.backward()
        optimizer.step()
        # .item() detaches the value; summing the tensors themselves would keep
        # every batch's autograd graph alive for the whole epoch.
        cum_loss += loss.item()
        counter += 1
    # Root of the mean per-batch MSE; NaN when the loader yielded no batches.
    epoch_avg_loss[t] = math.sqrt(cum_loss / counter) if counter else float("nan")

# Gradients shown here are the ones left over from the last processed batch.
for name, param in model.named_parameters():
    if param.requires_grad:  # skip parameters excluded from autograd
        print(f"Parameter: {name}, Gradient: {param.grad}")

jaysan33 · July 11, 2025, 3:55pm

Here are the gradients I’m getting after one pass through the model:

ptrblck · July 11, 2025, 4:06pm

I cannot reproduce the issue using random input data:

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader


batch_size = 16
# determine LSTM structure
input_dim = 2
hidden_dim = 3
target_dim = 1
temp_dim = 8  # sequence length (number of time steps per sample)

# Random inputs: samples x seq_len x input_dim.
x_train_tensors = torch.randn(batch_size*10, temp_dim, input_dim)
# One target vector per sequence: samples x target_dim. Sizing the targets with
# temp_dim instead would make MSELoss broadcast a (batch, 1) prediction against
# (batch, temp_dim) targets.
y_train_tensors = torch.randn(batch_size*10, target_dim)
dataset = TensorDataset(x_train_tensors, y_train_tensors)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)



class LSTM_Vol(nn.Module):
    """LSTM followed by a linear read-out of the final hidden state.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state.
        target_dim: Size of the prediction produced for each sequence.
        init_state_value: Constant used to fill the initial hidden and cell
            states. Defaults to 15, the value this model has always used.
    """

    def __init__(self, input_dim, hidden_dim, target_dim, init_state_value=15.0):
        super().__init__()

        # NOTE(review): a large initial cell state sits in the flat region of
        # tanh, which may shrink the gradients reaching the LSTM weights on
        # short sequences -- consider 0.0 if those gradients look vanishing.
        self.init_state_value = init_state_value

        # batch_first=True: the LSTM consumes batch x seq_len x input_dim.
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)

        # Linear layer mapping the final hidden state to a prediction.
        self.hidden2out = nn.Linear(hidden_dim, target_dim)

    def forward(self, batch, batch_size=None):
        """Run the sequences through the LSTM and predict from the last hidden state.

        Args:
            batch: Input of shape batch x seq_len x input_dim.
            batch_size: Number of sequences in ``batch``. Optional; when
                omitted it is taken from ``batch.shape[0]``.

        Returns:
            Tensor of shape batch x target_dim.
        """
        if batch_size is None:
            batch_size = batch.shape[0]

        # Size the states from the layer itself instead of a module-level
        # global, so the model works with whatever hidden_dim it was built with.
        state_shape = (self.lstm.num_layers, batch_size, self.lstm.hidden_size)
        # new_full creates the states on the same device/dtype as the input.
        h0 = batch.new_full(state_shape, self.init_state_value)
        c0 = batch.new_full(state_shape, self.init_state_value)

        # hn is num_layers x batch x hidden; only the final hidden state is used.
        _, (hn, _) = self.lstm(batch, (h0, c0))

        # hn[-1] is the last layer's final hidden state: batch x hidden.
        return self.hidden2out(hn[-1])

model = LSTM_Vol(input_dim, hidden_dim, target_dim)

# define loss function and optimizer method
loss_function = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.5)

num_epochs = 1
epoch = torch.arange(num_epochs)
epoch_avg_loss = torch.zeros(epoch.size())

model.train()  # put model in training mode

# Iterate plain ints rather than 0-d tensors taken from `epoch`.
for t in range(num_epochs):
    counter = 0
    cum_loss = 0.0

    for batch_X, batch_y in data_loader:
        optimizer.zero_grad()
        # Call the module itself rather than .forward() so registered hooks run.
        vol_predict = model(batch_X, batch_X.shape[0])  # batch_size x target_dim
        # Expects batch_y to be batch_size x 1 so it lines up with the prediction.
        loss = loss_function(vol_predict.view(-1, 1), batch_y)
        loss.backward()
        optimizer.step()
        # .item() detaches the value; summing the tensors themselves would keep
        # every batch's autograd graph alive and tie epoch_avg_loss to it.
        cum_loss += loss.item()
        counter += 1
    # Root of the mean per-batch MSE; NaN when the loader yielded no batches.
    epoch_avg_loss[t] = (cum_loss / counter) ** 0.5 if counter else float("nan")

# Gradients shown here are the ones left over from the last processed batch.
for name, param in model.named_parameters():
    if param.requires_grad:  # skip parameters excluded from autograd
        print(f"Parameter: {name}, Gradient: {param.grad}")

Output:

Parameter: lstm.weight_ih_l0, Gradient: tensor([[-0.0035,  0.0019],
        [ 0.0024, -0.0009],
        [ 0.0029, -0.0026],
        [-0.0007, -0.0003],
        [-0.0003,  0.0004],
        [ 0.0046, -0.0021],
        [ 0.0032, -0.0024],
        [ 0.0095, -0.0015],
        [ 0.0210, -0.0073],
        [-0.0025,  0.0013],
        [ 0.0003,  0.0005],
        [ 0.0031, -0.0019]])
Parameter: lstm.weight_hh_l0, Gradient: tensor([[ 2.3573e-04, -4.2392e-05, -1.8180e-03],
        [ 6.3321e-04,  3.9709e-04,  6.7277e-04],
        [-2.3476e-04,  2.0882e-05,  1.7179e-03],
        [ 1.4311e-03, -9.9232e-06,  1.1886e-03],
        [ 1.9923e-04,  4.2508e-04,  1.0136e-03],
        [ 5.1547e-03,  5.0098e-03,  7.9001e-03],
        [-5.8394e-05,  6.7433e-04,  2.8024e-03],
        [ 3.4174e-04,  4.3815e-04,  5.4041e-03],
        [-3.0685e-04,  2.0751e-03,  1.0447e-02],
        [ 2.7741e-05, -6.6987e-04, -1.9434e-03],
        [-1.6692e-04,  5.4640e-05,  5.5214e-04],
        [-3.2297e-04,  2.9907e-04,  1.8433e-03]])
Parameter: lstm.bias_ih_l0, Gradient: tensor([-0.0054,  0.0005,  0.0051,  0.0010,  0.0020,  0.0078,  0.0070,  0.0138,
         0.0264, -0.0044,  0.0017,  0.0050])
Parameter: lstm.bias_hh_l0, Gradient: tensor([-0.0054,  0.0005,  0.0051,  0.0010,  0.0020,  0.0078,  0.0070,  0.0138,
         0.0264, -0.0044,  0.0017,  0.0050])
Parameter: hidden2out.weight, Gradient: tensor([[ 0.0812, -0.0401, -0.1410]])
Parameter: hidden2out.bias, Gradient: tensor([-0.4495])

jaysan33 · July 11, 2025, 5:04pm

I just played around with some random data as well, and I was able to get meaningful gradients. There must be something wrong in my data processing pipeline. But at least I can confirm that I have gradient flow in the model. Thank you for quickly confirming.