nn.Sequential changes parameters' gradients

Dear all,

I'm writing because I have noticed a strange behaviour of the gradient computation in the presence of an nn.Sequential module. I'm interested in computing the gradient of an nn.Module's output. For my application, I had to write a custom module which constructs an equivalent representation of the original PyTorch model but stores a list of layers used to compute the forward pass. I put the model's implementation below:

class ClassicalNeuralNetwork(nn.Module):

    def __init__(self, net):
        nn.Module.__init__(self)
        self.net = net 
        self.layers = self._create_effdim_model(net)
        self.d = self._compute_number_of_parameters()
        self.selected_out = {}

    def forward(self, x):

        tot = False
        
        if not torch.is_tensor(x):
            self.selected_out[0] = x
            x = torch.from_numpy(x)
        else:
            self.selected_out[0] = x.detach().numpy()
        for i in range(len(self.layers) - 1):
            x = F.leaky_relu(self.layers[i](x))
            self.selected_out[i+1] = x.detach().numpy()
            tot = True
        if tot:
            x = self.layers[-1](x)
            self.selected_out[len(self.layers)] = x.detach().numpy()
        else:
            x = F.leaky_relu(self.layers[-1](x))
            self.selected_out[len(self.layers)] = x.detach().numpy()
        return x, self.selected_out
    
    
    def _compute_number_of_parameters(self):
        '''
        Computes the total number of parameters of the model.
        '''
        num = 0
        for mod in self.layers:
            num += sum(p.numel() for p in mod.parameters()) 
        return num

    def _create_effdim_model(self, model):
        '''
        Given a PyTorch model, returns a list of all its linear layers.
        '''
        ret = []
        last = len(list(model.named_modules())) - 1
        for j, (name, m) in enumerate(model.named_modules()):
            # Intermediate linear layers are wrapped in a single-module nn.Sequential
            if type(m) == nn.Linear and j != last:
                ret.append(nn.Sequential(m.double()))
            # The linear layer registered last is appended unwrapped
            if type(m) == nn.Linear and j == last:
                ret.append(m.double())
        return ret

In the last method, _create_effdim_model, I wrap each intermediate layer in an nn.Sequential containing a single linear layer, which should be equivalent to adding just the linear layer itself. Hence, I built an almost copy-and-paste version of ClassicalNeuralNetwork by simply removing the nn.Sequential. The outputs of the two models are the same, but the parameters' gradients are different and I cannot figure out why.
Do you have any idea? Does nn.Sequential itself change the gradients?
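
As a sanity check, this is the kind of equivalence I have in mind (a minimal, standalone snippet with made-up layer sizes, not my actual model): wrapping a single nn.Linear in an nn.Sequential reuses the very same module object, so the forward outputs coincide.

import torch
import torch.nn as nn

lin = nn.Linear(4, 3, bias=False).double()
seq = nn.Sequential(lin)  # wraps the very same module, no copy is made

x = torch.ones(4, dtype=torch.double)
print(torch.allclose(lin(x), seq(x)))  # True: same parameters, same output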

Below, I attach the code to reproduce this behaviour.

import torch.nn as nn
import torch
import torch.nn.functional as F
import numpy as np

class ClassicalNeuralNetwork(nn.Module):
    
    def __init__(self, net):
       
        nn.Module.__init__(self)
        self.net = net 
        self.layers = self._create_effdim_model(net)
        self.d = self._compute_number_of_parameters()
        self.selected_out = {}

    def forward(self, x):
        tot = False
        
        if not torch.is_tensor(x):
            self.selected_out[0] = x
            x = torch.from_numpy(x)
        else:
            self.selected_out[0] = x.detach().numpy()
        for i in range(len(self.layers) - 1):
            x = F.leaky_relu(self.layers[i](x))
            self.selected_out[i+1] = x.detach().numpy()
            tot = True
        if tot:
            x = self.layers[-1](x)
            self.selected_out[len(self.layers)] = x.detach().numpy()
        else:
            x = F.leaky_relu(self.layers[-1](x))
            self.selected_out[len(self.layers)] = x.detach().numpy()
        return x, self.selected_out
    
    
    def _compute_number_of_parameters(self):
        '''
        Computes the total number of parameters of the model.
        '''
        num = 0
        for mod in self.layers:
            num += sum(p.numel() for p in mod.parameters()) 

        return num
    
    def _create_effdim_model(self, model):
        '''
        Given a PyTorch model, returns a list of all its linear layers.
        '''
        ret = []
        last = len(list(model.named_modules())) - 1
        for j, (name, m) in enumerate(model.named_modules()):
            # Intermediate linear layers are wrapped in a single-module nn.Sequential
            if type(m) == nn.Linear and j != last:
                ret.append(nn.Sequential(m.double()))
            # The linear layer registered last is appended unwrapped
            if type(m) == nn.Linear and j == last:
                ret.append(m.double())
        return ret
    
class NoSClassicalNeuralNetwork(nn.Module):

    def __init__(self, net):
    
        nn.Module.__init__(self)
        self.net = net 
        self.layers = self._create_effdim_model(net)
        self.d = self._compute_number_of_parameters()
        self.selected_out = {}

    def forward(self, x):
       
        tot = False
        
        if not torch.is_tensor(x):
            self.selected_out[0] = x
            x = torch.from_numpy(x)
        else:
            self.selected_out[0] = x.detach().numpy()
        for i in range(len(self.layers) - 1):
            x = F.leaky_relu(self.layers[i](x))
            self.selected_out[i+1] = x.detach().numpy()
            tot = True
        if tot:
            x = self.layers[-1](x)
            self.selected_out[len(self.layers)] = x.detach().numpy()
        else:
            x = F.leaky_relu(self.layers[-1](x))
            self.selected_out[len(self.layers)] = x.detach().numpy()
        return x, self.selected_out
    
    def _create_rand_params(self):
        for l in self.layers:
            if type(l) == nn.Linear:
                l.weight.data.uniform_(0,1)

    def _compute_number_of_parameters(self):
        '''
        Computes the total number of parameters of the model.
        '''
        num = 0
        for mod in self.layers:
            num += sum(p.numel() for p in mod.parameters()) 
        return num

    def _create_effdim_model(self, model):
        '''
        Given a PyTorch model, returns a list of all its linear layers.
        '''
        ret = []
        for j, (name, m) in enumerate(model.named_modules()):
            # No nn.Sequential wrapping here: every linear layer is appended as-is
            if type(m) == nn.Linear:
                ret.append(m.double())
        return ret


class net5410210257(nn.Module):

    def __init__(self,input_size,output_size):
        super(net5410210257,self).__init__()
        self.l1 = nn.Linear(input_size, 10, bias=False)
        self.relu = nn.LeakyReLU()
        self.l2 = nn.Linear(10, 2, bias=False)
        self.l3 = nn.Linear(2, 10, bias=False)
        self.l4 = nn.Linear(10, 25, bias=False)
        self.l5 = nn.Linear(25, 7, bias=False)
    

    def forward(self,x):
        output = self.l1(x) 
        output = self.relu(output)
        output = self.l2(output)
        output = self.relu(output)
        output = self.l3(output)
        output = self.relu(output)
        output = self.l4(output)
        output = self.relu(output)
        output = self.l5(output)


        return output

x = np.ones(54)
net1 = net5410210257(54,7)

smodel = ClassicalNeuralNetwork(net1)
model = NoSClassicalNeuralNetwork(net1)
out,_ = model(x)
outs,_ = smodel(x)

print("The output of the model without nn.Sequential is " + str(out))
print("The output of the model with nn.Sequential is " + str(outs))

print('--'*50 + 'PARAMETERS GRADS of NON-SEQUENTIAL MODEL' + '--'*50 )
model.zero_grad()
out[1].backward()
for p in model.parameters():
    print(p.grad)
print('--'*50 + 'PARAMETERS GRADS of SEQUENTIAL MODEL' + '--'*50 )
outs[1].backward()
for p in smodel.parameters():
    print(p.grad)

Best,
Max

You are reusing the same net1 model and are thus accumulating the gradients in the .grad attributes of net1's parameters.
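
Here is a minimal sketch of the accumulation with a made-up single-layer model (unrelated to your code): calling backward() twice without zeroing in between sums the gradients in .grad.

import torch
import torch.nn as nn

lin = nn.Linear(3, 2, bias=False)
x = torch.ones(3)

lin(x)[1].backward()
g1 = lin.weight.grad.clone()   # gradient after the first backward

lin(x)[1].backward()           # second backward, no zero_grad() in between
g2 = lin.weight.grad.clone()   # .grad now holds the sum of both calls

print(torch.allclose(g2, 2 * g1))  # True: the gradients were accumulated
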
Clear the gradients before the second backward call and the results should match:

x = np.ones(54)
net1 = net5410210257(54,7)

smodel = ClassicalNeuralNetwork(net1)
model = NoSClassicalNeuralNetwork(net1)
out,_ = model(x)
outs,_ = smodel(x)

print("The output of the model without nn.Sequential is " + str(out))
print("The output of the model with nn.Sequential is " + str(outs))

print('--'*50 + 'PARAMETERS GRADS of NON-SEQUENTIAL MODEL' + '--'*50 )
model.zero_grad()
out[1].backward()
reference = []
for p in model.parameters():
    print(p.grad)
    reference.append(p.grad.abs().sum().clone())
print('--'*50 + 'PARAMETERS GRADS of SEQUENTIAL MODEL' + '--'*50 )
smodel.zero_grad() # HERE!
current = []
outs[1].backward()
for p in smodel.parameters():
    print(p.grad)
    current.append(p.grad.abs().sum().clone())
    
reference = torch.stack(reference)
current = torch.stack(current)

print((reference - current).abs().max())
# tensor(0., dtype=torch.float64)
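
As a side note, if you mainly need the gradients of the output with respect to the parameters (rather than training), a sketch along these lines (reusing the classes defined above) sidesteps the accumulation issue entirely, since torch.autograd.grad returns the gradients instead of writing them into .grad:

x = np.ones(54)
net1 = net5410210257(54, 7)
smodel = ClassicalNeuralNetwork(net1)

outs, _ = smodel(x)
# Gradients of outs[1] w.r.t. all parameters, returned as a tuple;
# the .grad attributes stay untouched, so nothing accumulates across calls.
grads = torch.autograd.grad(outs[1], list(smodel.parameters()))
print([g.shape for g in grads])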

Thank you, your suggestion fixes the problem!

Best,
Max