Dear all,
I’m writing because I have noticed a strange behaviour of the gradient computation in the presence of an nn.Sequential module. I’m interested in computing the gradient of the output of an nn.Module. For my application, I had to write a custom module that constructs an equivalent representation of the original PyTorch model but registers a list of its layers to compute the forward pass. The model’s implementation is below:
class ClassicalNeuralNetwork(nn.Module):
    def __init__(self, net):
        nn.Module.__init__(self)
        self.net = net
        self.layers = self._create_effdim_model(net)
        self.d = self._compute_number_of_parameters()
        self.selected_out = {}

    def forward(self, x):
        tot = False
        # Accept both numpy arrays and tensors; store the input as entry 0.
        if not torch.is_tensor(x):
            self.selected_out[0] = x
            x = torch.from_numpy(x)
        else:
            self.selected_out[0] = x.detach().numpy()
        # Every layer but the last is followed by a leaky ReLU.
        for i in range(len(self.layers) - 1):
            x = F.leaky_relu(self.layers[i](x))
            self.selected_out[i + 1] = x.detach().numpy()
            tot = True
        # tot is True whenever there is more than one layer, so the last
        # layer is applied without an activation.
        if tot:
            x = self.layers[-1](x)
            self.selected_out[len(self.layers)] = x.detach().numpy()
        else:
            x = F.leaky_relu(self.layers[-1](x))
            self.selected_out[len(self.layers)] = x.detach().numpy()
        return x, self.selected_out

    def _compute_number_of_parameters(self):
        '''
        Computes the total number of parameters of the model.
        '''
        num = 0
        for mod in self.layers:
            num += sum(p.numel() for p in mod.parameters())
        return num

    def _create_effdim_model(self, model):
        '''
        Given a PyTorch model, returns a list of all its linear layers.
        '''
        ret = []
        for j, (name, m) in enumerate(model.named_modules()):
            # Every linear layer except the last is wrapped in an
            # nn.Sequential; the last one is appended as a bare nn.Linear.
            if type(m) == nn.Linear and j != (len(list(model.named_modules())) - 1):
                ret.append(nn.Sequential(m.double()))
            if type(m) == nn.Linear and j == (len(list(model.named_modules())) - 1):
                ret.append(m.double())
        return ret
In the last method, _create_effdim_model, I wrapped each layer in an nn.Sequential holding a single module, which should be equivalent to appending just the linear layer. Hence, I constructed an almost copy-and-paste version of ClassicalNeuralNetwork that simply removes the nn.Sequential wrapper. The outputs of the two models are the same, but the parameters’ gradients are different, and I cannot figure out why.
Do you have any idea? Does nn.Sequential change the gradients?
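To state my expectation explicitly, here is a minimal standalone sketch, independent of my model code (lin, seq, and x are just names for this example): an nn.Sequential wrapping a single nn.Linear holds the very same module, so I would expect identical outputs and identical gradients.

import torch
import torch.nn as nn

torch.manual_seed(0)
lin = nn.Linear(4, 3, bias=False)
seq = nn.Sequential(lin)  # wraps the very same module, so weights are shared

x = torch.randn(4)

# Backward through the bare layer.
lin.zero_grad()
lin(x).sum().backward()
g_bare = lin.weight.grad.clone()

# Backward through the nn.Sequential wrapper; the shared gradient buffer
# must be reset first, otherwise the two backward passes accumulate.
lin.zero_grad()
seq(x).sum().backward()
g_wrapped = lin.weight.grad.clone()

print(torch.allclose(g_bare, g_wrapped))  # expected: True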
Below, I attach the code to reproduce this behaviour.
import torch.nn as nn
import torch
import torch.nn.functional as F
import numpy as np
class ClassicalNeuralNetwork(nn.Module):
    def __init__(self, net):
        nn.Module.__init__(self)
        self.net = net
        self.layers = self._create_effdim_model(net)
        self.d = self._compute_number_of_parameters()
        self.selected_out = {}

    def forward(self, x):
        tot = False
        if not torch.is_tensor(x):
            self.selected_out[0] = x
            x = torch.from_numpy(x)
        else:
            self.selected_out[0] = x.detach().numpy()
        for i in range(len(self.layers) - 1):
            x = F.leaky_relu(self.layers[i](x))
            self.selected_out[i + 1] = x.detach().numpy()
            tot = True
        if tot:
            x = self.layers[-1](x)
            self.selected_out[len(self.layers)] = x.detach().numpy()
        else:
            x = F.leaky_relu(self.layers[-1](x))
            self.selected_out[len(self.layers)] = x.detach().numpy()
        return x, self.selected_out

    def _compute_number_of_parameters(self):
        '''
        Computes the total number of parameters of the model.
        '''
        num = 0
        for mod in self.layers:
            num += sum(p.numel() for p in mod.parameters())
        return num

    def _create_effdim_model(self, model):
        '''
        Given a PyTorch model, returns a list of all its linear layers.
        '''
        ret = []
        for j, (name, m) in enumerate(model.named_modules()):
            if type(m) == nn.Linear and j != (len(list(model.named_modules())) - 1):
                ret.append(nn.Sequential(m.double()))
            if type(m) == nn.Linear and j == (len(list(model.named_modules())) - 1):
                ret.append(m.double())
        return ret
class NoSClassicalNeuralNetwork(nn.Module):
    def __init__(self, net):
        nn.Module.__init__(self)
        self.net = net
        self.layers = self._create_effdim_model(net)
        self.d = self._compute_number_of_parameters()
        self.selected_out = {}

    def forward(self, x):
        tot = False
        if not torch.is_tensor(x):
            self.selected_out[0] = x
            x = torch.from_numpy(x)
        else:
            self.selected_out[0] = x.detach().numpy()
        for i in range(len(self.layers) - 1):
            x = F.leaky_relu(self.layers[i](x))
            self.selected_out[i + 1] = x.detach().numpy()
            tot = True
        if tot:
            x = self.layers[-1](x)
            self.selected_out[len(self.layers)] = x.detach().numpy()
        else:
            x = F.leaky_relu(self.layers[-1](x))
            self.selected_out[len(self.layers)] = x.detach().numpy()
        return x, self.selected_out

    def _create_rand_params(self):
        for l in self.layers:
            if type(l) == nn.Linear:
                l.weight.data.uniform_(0, 1)

    def _compute_number_of_parameters(self):
        '''
        Computes the total number of parameters of the model.
        '''
        num = 0
        for mod in self.layers:
            num += sum(p.numel() for p in mod.parameters())
        return num

    def _create_effdim_model(self, model):
        '''
        Given a PyTorch model, returns a list of all its linear layers.
        '''
        ret = []
        for j, (name, m) in enumerate(model.named_modules()):
            # Here every linear layer, including the last, is appended bare,
            # with no nn.Sequential wrapper.
            if type(m) == nn.Linear and j != (len(list(model.named_modules())) - 1):
                ret.append(m.double())
            if type(m) == nn.Linear and j == (len(list(model.named_modules())) - 1):
                ret.append(m.double())
        return ret
class net5410210257(nn.Module):
    def __init__(self, input_size, output_size):
        super(net5410210257, self).__init__()
        self.l1 = nn.Linear(input_size, 10, bias=False)
        self.relu = nn.LeakyReLU()
        self.l2 = nn.Linear(10, 2, bias=False)
        self.relu = nn.LeakyReLU()  # note: self.relu is reassigned, so a single instance is reused
        self.l3 = nn.Linear(2, 10, bias=False)
        self.relu = nn.LeakyReLU()
        self.l4 = nn.Linear(10, 25, bias=False)
        self.relu = nn.LeakyReLU()
        self.l5 = nn.Linear(25, 7, bias=False)

    def forward(self, x):
        output = self.l1(x)
        output = self.relu(output)
        output = self.l2(output)
        output = self.relu(output)
        output = self.l3(output)
        output = self.relu(output)
        output = self.l4(output)
        output = self.relu(output)
        output = self.l5(output)
        return output
x = np.ones(54)
net1 = net5410210257(54, 7)
smodel = ClassicalNeuralNetwork(net1)
model = NoSClassicalNeuralNetwork(net1)
out, _ = model(x)
outs, _ = smodel(x)
print("The output of the model without nn.Sequential is " + str(out))
print("The output of the model with nn.Sequential is " + str(outs))
print('--' * 50 + 'PARAMETERS GRADS of NON-SEQUENTIAL MODEL' + '--' * 50)
model.zero_grad()
out[1].backward()
for p in model.parameters():
    print(p.grad)
print('--' * 50 + 'PARAMETERS GRADS of SEQUENTIAL MODEL' + '--' * 50)
outs[1].backward()
for p in smodel.parameters():
    print(p.grad)
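For completeness, here is a small helper I would use to compare the two gradient sets automatically (grad_snapshot is just a name I made up for this post; it reruns the forward and backward passes so that each call starts from freshly zeroed gradients):

def grad_snapshot(m, inp, index=1):
    '''Run one forward/backward pass on m and return cloned gradients.'''
    m.zero_grad()
    y, _ = m(inp)
    y[index].backward()
    return [p.grad.clone() for p in m.parameters()]

g_no_seq = grad_snapshot(model, x)
g_seq = grad_snapshot(smodel, x)
print(all(torch.allclose(a, b) for a, b in zip(g_no_seq, g_seq)))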
Best,
Max