Dear all,

I’m writing because I have noticed some strange behaviour in the gradient computation in the presence of an nn.Sequential module. I’m interested in computing the gradient of an nn.Module’s output. For my application, I had to write a custom module that constructs an equivalent representation of the original PyTorch model, but registers a list of layers to compute the forward pass. The model’s implementation is below:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class ClassicalNeuralNetwork(nn.Module):

    def __init__(self, net):
        super().__init__()
        self.net = net
        self.layers = self._create_effdim_model(net)
        self.d = self._compute_number_of_parameters()
        self.selected_out = {}

    def forward(self, x):
        tot = False

        if not torch.is_tensor(x):
            self.selected_out[0] = x
            x = torch.from_numpy(x)
        else:
            self.selected_out[0] = x.detach().numpy()
        for i in range(len(self.layers) - 1):
            x = F.leaky_relu(self.layers[i](x))
            self.selected_out[i + 1] = x.detach().numpy()
            tot = True
        if tot:
            # More than one layer: the last one is applied without activation.
            x = self.layers[-1](x)
            self.selected_out[len(self.layers)] = x.detach().numpy()
        else:
            x = F.leaky_relu(self.layers[-1](x))
            self.selected_out[len(self.layers)] = x.detach().numpy()
        return x, self.selected_out

    def _compute_number_of_parameters(self):
        '''
        Computes the total number of parameters of the model.
        '''
        num = 0
        for mod in self.layers:
            num += sum(p.numel() for p in mod.parameters())
        return num

    def _create_effdim_model(self, model):
        '''
        Given a PyTorch model, returns a list of all its layers.
        '''
        ret = []
        n_modules = len(list(model.named_modules()))
        for j, (name, m) in enumerate(model.named_modules()):
            # Every linear layer except the last is wrapped in nn.Sequential.
            if type(m) == nn.Linear and j != n_modules - 1:
                ret.append(nn.Sequential(m.double()))
            if type(m) == nn.Linear and j == n_modules - 1:
                ret.append(m.double())
        return ret
```

In the last method, `_create_effdim_model`, I wrapped each linear layer (except the last) in an nn.Sequential containing only that layer, which should be equivalent to adding the linear layer directly. To verify this, I built an almost copy-and-paste version of `ClassicalNeuralNetwork` with the nn.Sequential removed. The outputs of the two models are identical, but the parameters’ gradients differ, and I cannot figure out why.
Do you have any idea? Does nn.Sequential change the gradients?
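
For what it’s worth, a quick sanity check on a standalone layer (a minimal sketch, independent of my actual code) suggests the wrapper alone should not matter:

```python
import torch
import torch.nn as nn

# Sanity check: a bare nn.Linear vs. the same layer wrapped in a
# single-element nn.Sequential, with a fresh graph for each backward.
lin = nn.Linear(4, 3, bias=False).double()
seq = nn.Sequential(lin)

x = torch.ones(4, dtype=torch.double)
print(torch.equal(lin(x), seq(x)))  # True: identical forward pass

lin(x).sum().backward()
g_bare = lin.weight.grad.clone()
lin.weight.grad = None  # clear before the second backward

seq(x).sum().backward()
g_wrapped = lin.weight.grad.clone()
print(torch.equal(g_bare, g_wrapped))  # True: identical gradients
```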

Below, I attach the code to reproduce this behaviour.

```python
import torch.nn as nn
import torch
import torch.nn.functional as F
import numpy as np


class ClassicalNeuralNetwork(nn.Module):

    def __init__(self, net):
        super().__init__()
        self.net = net
        self.layers = self._create_effdim_model(net)
        self.d = self._compute_number_of_parameters()
        self.selected_out = {}

    def forward(self, x):
        tot = False

        if not torch.is_tensor(x):
            self.selected_out[0] = x
            x = torch.from_numpy(x)
        else:
            self.selected_out[0] = x.detach().numpy()
        for i in range(len(self.layers) - 1):
            x = F.leaky_relu(self.layers[i](x))
            self.selected_out[i + 1] = x.detach().numpy()
            tot = True
        if tot:
            x = self.layers[-1](x)
            self.selected_out[len(self.layers)] = x.detach().numpy()
        else:
            x = F.leaky_relu(self.layers[-1](x))
            self.selected_out[len(self.layers)] = x.detach().numpy()
        return x, self.selected_out

    def _compute_number_of_parameters(self):
        '''
        Computes the total number of parameters of the model.
        '''
        num = 0
        for mod in self.layers:
            num += sum(p.numel() for p in mod.parameters())
        return num

    def _create_effdim_model(self, model):
        '''
        Given a PyTorch model, returns a list of all its layers.
        '''
        ret = []
        n_modules = len(list(model.named_modules()))
        for j, (name, m) in enumerate(model.named_modules()):
            if type(m) == nn.Linear and j != n_modules - 1:
                ret.append(nn.Sequential(m.double()))
            if type(m) == nn.Linear and j == n_modules - 1:
                ret.append(m.double())
        return ret


class NoSClassicalNeuralNetwork(nn.Module):

    def __init__(self, net):
        super().__init__()
        self.net = net
        self.layers = self._create_effdim_model(net)
        self.d = self._compute_number_of_parameters()
        self.selected_out = {}

    def forward(self, x):
        tot = False

        if not torch.is_tensor(x):
            self.selected_out[0] = x
            x = torch.from_numpy(x)
        else:
            self.selected_out[0] = x.detach().numpy()
        for i in range(len(self.layers) - 1):
            x = F.leaky_relu(self.layers[i](x))
            self.selected_out[i + 1] = x.detach().numpy()
            tot = True
        if tot:
            x = self.layers[-1](x)
            self.selected_out[len(self.layers)] = x.detach().numpy()
        else:
            x = F.leaky_relu(self.layers[-1](x))
            self.selected_out[len(self.layers)] = x.detach().numpy()
        return x, self.selected_out

    def _create_rand_params(self):
        for l in self.layers:
            if type(l) == nn.Linear:
                l.weight.data.uniform_(0, 1)

    def _compute_number_of_parameters(self):
        '''
        Computes the total number of parameters of the model.
        '''
        num = 0
        for mod in self.layers:
            num += sum(p.numel() for p in mod.parameters())
        return num

    def _create_effdim_model(self, model):
        '''
        Given a PyTorch model, returns a list of all its layers.
        '''
        ret = []
        n_modules = len(list(model.named_modules()))
        for j, (name, m) in enumerate(model.named_modules()):
            # Identical to the version above, except nothing is wrapped
            # in nn.Sequential.
            if type(m) == nn.Linear and j != n_modules - 1:
                ret.append(m.double())
            if type(m) == nn.Linear and j == n_modules - 1:
                ret.append(m.double())
        return ret


class net5410210257(nn.Module):

    def __init__(self, input_size, output_size):
        super(net5410210257, self).__init__()
        self.l1 = nn.Linear(input_size, 10, bias=False)
        # A single LeakyReLU module is reused between all layers.
        self.relu = nn.LeakyReLU()
        self.l2 = nn.Linear(10, 2, bias=False)
        self.l3 = nn.Linear(2, 10, bias=False)
        self.l4 = nn.Linear(10, 25, bias=False)
        self.l5 = nn.Linear(25, 7, bias=False)

    def forward(self, x):
        output = self.l1(x)
        output = self.relu(output)
        output = self.l2(output)
        output = self.relu(output)
        output = self.l3(output)
        output = self.relu(output)
        output = self.l4(output)
        output = self.relu(output)
        output = self.l5(output)

        return output


x = np.ones(54)
net1 = net5410210257(54, 7)

smodel = ClassicalNeuralNetwork(net1)
model = NoSClassicalNeuralNetwork(net1)
out, _ = model(x)
outs, _ = smodel(x)

print("The output of the model without nn.Sequential is " + str(out))
print("The output of the model with nn.Sequential is " + str(outs))

print('--' * 50 + 'PARAMETERS GRADS of NON-SEQUENTIAL MODEL' + '--' * 50)
out[1].backward()
for p in model.parameters():
    print(p.grad)

print('--' * 50 + 'PARAMETERS GRADS of SEQUENTIAL MODEL' + '--' * 50)
outs[1].backward()
for p in smodel.parameters():
    print(p.grad)
```

Best,
Max

You are reusing the same `net1` model and are thus accumulating the gradients in the `.grad` attribute of `net1`.
Delete the gradients and the result should match:

```python
x = np.ones(54)
net1 = net5410210257(54, 7)

smodel = ClassicalNeuralNetwork(net1)
model = NoSClassicalNeuralNetwork(net1)
out, _ = model(x)
outs, _ = smodel(x)

print("The output of the model without nn.Sequential is " + str(out))
print("The output of the model with nn.Sequential is " + str(outs))

print('--' * 50 + 'PARAMETERS GRADS of NON-SEQUENTIAL MODEL' + '--' * 50)
out[1].backward()
reference = []
for p in model.parameters():
    print(p.grad)
    reference.append(p.grad.clone())
    p.grad = None  # delete the accumulated gradient before the second backward

print('--' * 50 + 'PARAMETERS GRADS of SEQUENTIAL MODEL' + '--' * 50)
outs[1].backward()
current = []
for p in smodel.parameters():
    print(p.grad)
    current.append(p.grad.clone())

# Both wrappers share the parameters of net1, so the gradients now match.
for ref, cur in zip(reference, current):
    print(torch.allclose(ref, cur))
```
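
More generally, autograd accumulates into the `.grad` attribute on every `backward()` call. A minimal sketch of the effect (standalone, not tied to your model):

```python
import torch
import torch.nn as nn

lin = nn.Linear(2, 1, bias=False)
x = torch.ones(2)

lin(x).sum().backward()
g = lin.weight.grad.clone()

lin(x).sum().backward()  # second backward accumulates into .grad
print(torch.allclose(lin.weight.grad, 2 * g))  # True

lin.weight.grad = None   # delete the gradient (lin.zero_grad() also works)
lin(x).sum().backward()
print(torch.allclose(lin.weight.grad, g))      # True again
```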