All Model Parameters Gradient None

Lei_Shi1 · October 31, 2022, 6:22pm

I have a model consisting of a CNN and an RNN. When I try to print the gradients of the model parameters with the code below:

    optimizer.zero_grad()
    loss.backward()
    for name, p in model.named_parameters():
        print(name, 'gradient is', p.grad)
    optimizer.step()

every gradient is printed as None. How can I debug this? Thanks.

Python:3.9.12
OS:Ubuntu 18.04
Pytorch:1.12.1
CUDA:11.6

srishti-git1110 · October 31, 2022, 7:09pm

Could you please post a minimum executable snippet that reproduces the error?

This generally shouldn’t happen: the model parameters are leaf tensors in the computation graph of the loss, so they receive gradients as long as the loss is actually computed from them.

Lei_Shi1 · October 31, 2022, 7:46pm

I’m sorry, my full model is too big to post. Here is a simplified version:

from calendar import EPOCH
import torch
from torch import nn, tensor
import os

# hyperparameters
LEARNING_RATE = 1e-5      # passed to Adam as lr
EPOCHS = int(1e7)         # upper bound of the training loop (one forward/backward per step)
BATCH_SIZE = 1            # first dimension of the action input tensor
horizon = 8               # second dimension (number of time steps) of the action input tensor
WEIGHT_DECAY = 1e-4       # passed to Adam as weight_decay

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class action_input_model(nn.Module):
    """Small MLP that maps each 2-feature action to a 16-feature embedding."""

    def __init__(self):
        super().__init__()

        # Linear(2 -> 16) -> ReLU -> Linear(16 -> 16).
        # Checked by the author: input (8, 2) gives output (8, 16) after the
        # first layer. The attribute names double as parameter-name prefixes.
        self.action_dense1 = nn.Linear(2, 16)
        self.action_relu3 = nn.ReLU()
        self.action_dense2 = nn.Linear(16, 16)

    def forward(self, x):
        """Apply dense -> ReLU -> dense along the last dimension of ``x``."""
        activated = self.action_relu3(self.action_dense1(x))
        return self.action_dense2(activated)
    
    
class rnn_cell(nn.Module):
    """Thin wrapper around a batch-first, 8-layer LSTM (16 -> 64 features)."""

    def __init__(self):
        super().__init__()

        self.rnn_cell = nn.LSTM(
            input_size=16,
            hidden_size=64,
            num_layers=8,
            batch_first=True,
        )

    def forward(self, x):
        """Run the LSTM on ``x`` and hand back its result untouched.

        The return value is exactly what ``nn.LSTM`` produces, i.e. the tuple
        ``(output, (h_n, c_n))``.
        """
        return self.rnn_cell(x)
    
    
class output_model_1(nn.Module):
    """Output head: maps 64 hidden features to 4 output values per step."""

    def __init__(self):
        super().__init__()

        # Linear(64 -> 32) -> ReLU -> Linear(32 -> 4).
        # 64 matches the hidden-layer feature size; 4 is the output dimension
        # per time step (the author notes the full output is 8*4).
        self.output_dense1 = nn.Linear(64, 32)
        self.output_relu3 = nn.ReLU()
        self.output_dense2 = nn.Linear(32, 4)

    def forward(self, x):
        """Apply dense -> ReLU -> dense along the last dimension of ``x``."""
        activated = self.output_relu3(self.output_dense1(x))
        return self.output_dense2(activated)


class combined_model( action_input_model, rnn_cell, output_model_1):
    """Model that inherits from the three sub-model classes.

    The inherited ``__init__`` chain registers the layers of all three base
    classes on this instance, but ``forward`` below does not use them: it
    builds and runs fresh instances of the base classes on every call.
    """
    
    def __init__(self):
        super().__init__()
        
    def forward(self):
        """Build an action input, run it through new sub-model instances, and
        return the output head's result."""
       
        # NOTE(review): torch.empty returns uninitialized memory, so the input
        # values are arbitrary.
        action_input_data = torch.empty(BATCH_SIZE, horizon, 2, device=device)                           # 2 is linear and angular velocity

        # run the model
        
        # NOTE(review): each of the three modules below is a brand-new object
        # created inside forward(); none of them is an attribute of `self`.
        # Their parameters are therefore different tensors from the ones
        # `self.parameters()` / `self.named_parameters()` yield.
        action_model = action_input_model().to(device)
        action_input_processed = action_model(action_input_data)
        
        lstm = rnn_cell().to(device)
        lstm_out, _ = lstm(action_input_processed)
        
        output_model = output_model_1().to(device)
        model_output_temp = output_model(lstm_out)                     # [position x, position y, position z, collision], diff from ground_truth
        
        # NOTE(review): the input built above has `horizon` time steps, so the
        # "horizon+1" in the comment below looks inconsistent - confirm.
        return model_output_temp     # [batch_size, horizon+1, 4], horizon+1 is timestep, 4 is [position x, position y, position z, collision]


# Instantiate the combined model and move it to the selected device.
model = combined_model().to(device)

# Adam optimizer (L2 regularization)
# The optimizer is built from the parameters registered on `model`.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

for step in range(EPOCHS):
            
    # forward() takes no input; it creates its own action tensor internally.
    model_output = model()
    
    # Loss function: MSE for position, cross entropy for collision
    # To get the ground truth data first for the model output
    
    # NOTE(review): torch.empty leaves these targets uninitialized, so their
    # values are arbitrary.
    ground_truth_position = torch.empty(1, horizon, 3, device=device)
    ground_truth_collision = torch.empty(1, horizon, device=device)
    
    loss_mse = nn.MSELoss(reduction='mean')
    # NOTE(review): the slice :2 compares only the first two of the three
    # position channels - confirm whether :3 was intended.
    loss_position = loss_mse(model_output[:, :, :2], ground_truth_position[:,:,:2])
    # retain_grad() keeps .grad on this non-leaf tensor after backward().
    loss_position.retain_grad()
     
    loss_cross_entropy = nn.CrossEntropyLoss(reduction='sum')        
    # Channel 3 of the output is used as the collision prediction.
    loss_collision = loss_cross_entropy(model_output[:, :, 3], ground_truth_collision)
    if loss_collision != 0:
        print('loss_collision', loss_collision)
    
    # Total loss is the plain sum of the two terms.
    loss = loss_position + loss_collision
    loss.retain_grad()
    optimizer.zero_grad()
    loss.backward()
    
    # loss.grad is d(loss)/d(loss), which is why it prints as 1.
    print('loss grad is', loss.grad)
    # Prints the gradients of the parameters registered on `model`.
    for name, p in model.named_parameters():
        print(name, 'gradient is', p.grad)
    
    optimizer.step()

What’s weird is that my loss gradient is always 1.

Thank you. I really appreciate it!

srishti-git1110 · November 2, 2022, 7:23am

Thanks for posting the code. I tried reproducing the error and yes, gradients are coming out to be None.

I tried debugging in several ways and found the following, in order:

With your exact same code, when I used torchviz to visualize the graph of loss, the graph had the model parameters of all 3 models as leaf tensors so I wasn’t sure why the gradients are None after backpropagating as the graph seemed to be ok.
I then tried to replicate what your code does without the combined_model class, but nothing changed - gradients were still None.
To drill more, I removed the combined_model class again and used this:

import itertools

# Input for the action model. torch.empty returns uninitialized memory, so
# the values are arbitrary.
action_input_data = torch.empty(BATCH_SIZE, horizon, 2, device=device) 

# Create the three sub-models BEFORE building the optimizer, so the optimizer
# is bound to exactly the instances used in the forward pass below (building
# it first would reference names that are not assigned yet).
action_model = action_input_model()
action_model = action_model.to(device)
lstm = rnn_cell().to(device)
output_model = output_model_1().to(device)

# One optimizer over the parameters of all three sub-models.
params = [action_model.parameters(), lstm.parameters(), output_model.parameters()]
optimizer = torch.optim.Adam(itertools.chain(*params), lr=0.1)

# forward pass: action MLP -> LSTM -> output head
action_input_processed = action_model(action_input_data)
lstm_out, _ = lstm(action_input_processed)
model_output = output_model(lstm_out)
 
# Targets are uninitialized as well.
ground_truth_position = torch.empty(1, horizon, 3, device=device)
ground_truth_collision = torch.empty(1, horizon, device=device)

# losses
loss_mse = nn.MSELoss(reduction='mean')
loss_position = loss_mse(model_output[:,:,:2], ground_truth_position[:,:,:2])
loss_cross_entropy = nn.CrossEntropyLoss(reduction='sum')        
loss_collision = loss_cross_entropy(model_output[:, :, 3], ground_truth_collision)
loss = loss_position + loss_collision

optimizer.zero_grad()
loss.backward()
optimizer.step()


# Inspect the gradients that backward() stored on the action model.
for name, param in action_model.named_parameters():
  print(name, param.grad)

and this gave NaN for all gradients, which I suspected was caused by a precision problem or an operation such as division by zero (not completely sure, though).

One single line change in the above code:

# action_input_data = torch.empty(BATCH_SIZE, horizon, 2, device=device) 
action_input_data = torch.randn(BATCH_SIZE, horizon, 2, device=device)

and it worked, and so the following code:

for name, param in action_model.named_parameters():
  print(name, param.grad.shape)

gives

action_dense1.weight torch.Size([16, 2])
action_dense1.bias torch.Size([16])
action_dense2.weight torch.Size([16, 16])
action_dense2.bias torch.Size([16])

This makes me wonder whether there is an error in the code itself, or whether the input tensor values alone are causing the problem.
Tagging @ptrblck for better help.

srishti-git1110 · November 2, 2022, 7:25am

Here, are you talking about the output of this step:

    print('loss grad is', loss.grad)

If yes, then that’s mathematically correct: differentiating the loss with respect to itself gives 1.

ptrblck · November 2, 2022, 7:39am

I guess the input tensors are causing the issue since torch.empty will use uninitialized memory which can contain any value including NaNs and it seems the code works properly after you’ve initialized the tensors via torch.randn.

srishti-git1110 · November 2, 2022, 1:39pm

Hi @Lei_Shi1,

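The reason why your gradients are coming up to be None is because you are creating new instances for all the three classes in the forward method of combined_model. The forward pass therefore runs through the parameters of those temporary instances, while model.named_parameters() returns the (unused) parameters registered on model itself, which never receive a gradient.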

To be able to update the parameters of model which is an instance of combined_model, you could do something like this:
(Although there could be cleaner ways to do this without creating 3 separate classes)

from calendar import EPOCH
import torch
from torch import nn, tensor
import os
# Hyperparameters
LEARNING_RATE = 1e-5      # passed to Adam as lr
EPOCHS = int(1e7)         # upper bound of the training loop
BATCH_SIZE = 1            # first dimension of the action input tensor
horizon = 8               # second dimension (number of time steps) of the action input tensor
WEIGHT_DECAY = 1e-4       # passed to Adam as weight_decay
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class action_input_model(nn.Module): 
    """Holds the action-embedding layers (2 -> 16 -> 16).

    Defines no forward() of its own; it serves as a base class of
    combined_model, which calls these layers directly.
    """
    def __init__(self):
        # Cooperative super() call so the other base classes of a subclass
        # are initialized as well.
        super().__init__()
        self.action_dense1 =  nn.Linear(2, 16)           # verified input (8,2), output(8,16) when nn.Linear(2,16)
        self.action_relu3 = nn.ReLU()
        self.action_dense2 = nn.Linear(16, 16)
      
class rnn_cell(nn.Module):
    """Holds the batch-first, 8-layer LSTM (16 input features, 64 hidden).

    Defines no forward() of its own; it serves as a base class of
    combined_model, which calls ``self.rnn_cell`` directly.
    """
    def __init__(self):
        # Cooperative super() call so the other base classes of a subclass
        # are initialized as well.
        super().__init__()
        self.rnn_cell = nn.LSTM(16, 64, 8, batch_first = True)             # (input_size, hidden_size/num_units, num_layers)  

    
class output_model_1(nn.Module):   
    """Holds the output-head layers (64 -> 32 -> 4).

    Defines no forward() of its own; it serves as a base class of
    combined_model, which calls these layers directly.
    """
    def __init__(self):
        super().__init__()
        self.output_dense1 = nn.Linear(64, 32)       # hidden layer features are 64
        self.output_relu3 = nn.ReLU()
        self.output_dense2 = nn.Linear(32, 4)       # 4 is the output dimension, actually 8*4
        
   
class combined_model(action_input_model, rnn_cell, output_model_1):
    """Full pipeline built from the layers of the three base classes.

    All layers are attributes of this one instance (registered by the
    inherited ``__init__`` chain), so the forward pass runs through the same
    parameters that ``parameters()`` exposes.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Run ``x`` through the action MLP, the LSTM and the output head."""
        # action embedding: dense -> ReLU -> dense
        embedded = self.action_dense2(self.action_relu3(self.action_dense1(x)))
        # keep only the LSTM output sequence; the final states are discarded
        sequence_out, _ = self.rnn_cell(embedded)
        # output head: dense -> ReLU -> dense
        hidden = self.output_relu3(self.output_dense1(sequence_out))
        return self.output_dense2(hidden)

# Random (initialized) action input, created once and reused by every step.
action_input_data = torch.randn(BATCH_SIZE, horizon, 2, device=device)
model = combined_model().to(device) 
# The optimizer is built from the parameters registered on `model`, which are
# the same ones its forward() uses.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

for step in range(EPOCHS):   
    model_output = model(action_input_data)
    # Random targets, drawn anew on each iteration.
    ground_truth_position = torch.randn(1, horizon, 3, device=device)
    ground_truth_collision = torch.randn(1, horizon, device=device)
    
    loss_mse = nn.MSELoss(reduction='mean')
    # NOTE(review): the slice :2 compares only the first two of the three
    # position channels - confirm whether :3 was intended.
    loss_position = loss_mse(model_output[:, :, :2], ground_truth_position[:,:,:2])
    # retain_grad() keeps .grad on this non-leaf tensor after backward().
    loss_position.retain_grad()
     
    loss_cross_entropy = nn.CrossEntropyLoss(reduction='sum')        
    # Channel 3 of the output is used as the collision prediction.
    loss_collision = loss_cross_entropy(model_output[:, :, 3], ground_truth_collision)
    if loss_collision != 0:
        print('loss_collision', loss_collision)
    
    # Total loss is the plain sum of the two terms.
    loss = loss_position + loss_collision
    loss.retain_grad()
    optimizer.zero_grad()
    loss.backward()
    
    # loss.grad is d(loss)/d(loss), hence 1.
    print('loss grad is', loss.grad)
    # Accessing .shape would raise if any gradient were still None.
    for name, p in model.named_parameters():
        print(name, p.grad.shape)
    
    optimizer.step()
    # Stop after a single iteration; this run only demonstrates the gradients.
    break

this works and gives:

loss_collision tensor(-4.4471, grad_fn=<NegBackward0>)
loss grad is tensor(1.)
output_dense1.weight torch.Size([32, 64])
output_dense1.bias torch.Size([32])
output_dense2.weight torch.Size([4, 32])
output_dense2.bias torch.Size([4])
rnn_cell.weight_ih_l0 torch.Size([256, 16])
rnn_cell.weight_hh_l0 torch.Size([256, 64])
rnn_cell.bias_ih_l0 torch.Size([256])
rnn_cell.bias_hh_l0 torch.Size([256])
rnn_cell.weight_ih_l1 torch.Size([256, 64])
rnn_cell.weight_hh_l1 torch.Size([256, 64])
rnn_cell.bias_ih_l1 torch.Size([256])
rnn_cell.bias_hh_l1 torch.Size([256])
rnn_cell.weight_ih_l2 torch.Size([256, 64])
rnn_cell.weight_hh_l2 torch.Size([256, 64])
rnn_cell.bias_ih_l2 torch.Size([256])
rnn_cell.bias_hh_l2 torch.Size([256])
rnn_cell.weight_ih_l3 torch.Size([256, 64])
rnn_cell.weight_hh_l3 torch.Size([256, 64])
rnn_cell.bias_ih_l3 torch.Size([256])
rnn_cell.bias_hh_l3 torch.Size([256])
rnn_cell.weight_ih_l4 torch.Size([256, 64])
rnn_cell.weight_hh_l4 torch.Size([256, 64])
rnn_cell.bias_ih_l4 torch.Size([256])
rnn_cell.bias_hh_l4 torch.Size([256])
rnn_cell.weight_ih_l5 torch.Size([256, 64])
rnn_cell.weight_hh_l5 torch.Size([256, 64])
rnn_cell.bias_ih_l5 torch.Size([256])
rnn_cell.bias_hh_l5 torch.Size([256])
rnn_cell.weight_ih_l6 torch.Size([256, 64])
rnn_cell.weight_hh_l6 torch.Size([256, 64])
rnn_cell.bias_ih_l6 torch.Size([256])
rnn_cell.bias_hh_l6 torch.Size([256])
rnn_cell.weight_ih_l7 torch.Size([256, 64])
rnn_cell.weight_hh_l7 torch.Size([256, 64])
rnn_cell.bias_ih_l7 torch.Size([256])
rnn_cell.bias_hh_l7 torch.Size([256])
action_dense1.weight torch.Size([16, 2])
action_dense1.bias torch.Size([16])
action_dense2.weight torch.Size([16, 16])
action_dense2.bias torch.Size([16])

Hope this helps,
S

Lei_Shi1 · November 3, 2022, 5:54pm

Thanks, Srishti!!!

I have debugged it and the reason is precisely what you mentioned:

The reason why your gradients are coming up to be None is because you are creating new instances for all the three classes in the forward method of combined_model.

When I fixed it as shown below, it worked:

class combined_model(nn.Module):
    
    def __init__(self):
        super().__init__()
        
        self.obs_im_model = nn.Sequential(
        nn.Conv2d(3, 32, 5, stride=2),
        nn.ReLU(),
        nn.Conv2d(32, 64, 3, stride=2),
        nn.ReLU(),
        nn.Conv2d(64, 64, 3, stride=2),
        nn.Flatten(start_dim=1),
        nn.Linear(8960, 256),
        nn.ReLU(),
        nn.Linear(256, 128) 
        )
        
        self.obs_lowd_model = nn.Sequential(
        nn.Linear(128, 128),       
        nn.ReLU(),
        nn.Linear(128, 128)
        )
        
        self.action_input_model = nn.Sequential(
        nn.Linear(2, 16),           # verified input (8,2), output(8,16) when nn.Linear(2,16)
        nn.ReLU(),
        nn.Linear(16, 16)
        )
        
        self.rnn_cell = nn.LSTM(16, 64, 8, batch_first = True)             # (input_size, hidden_size/num_units, num_layers)
        
        self.output_model = nn.Sequential(
        nn.Linear(64, 32),       # hidden layer features are 64
        nn.ReLU(),
        nn.Linear(32, 4)       # 4 is the output dimension, actually 8*4   
        )
        
    def forward(self, img_train_dataloader, ground_truth_files_list, action_input_files_list):

My original code could run the forward pass but failed during backpropagation.