Autograd.grad breaks in hypernetwork implementations

Hi guys, I am trying to make a hypernetwork and got stuck.

# Initial state vector; its length fixes the main network's input/output width.
z0 = torch.Tensor([1.,1.])
input_size = z0.shape[0]

# Main network: maps a vector of length input_size back to the same length.
mainNetwork = torch.nn.Sequential(
torch.nn.Linear(input_size,64),
torch.nn.Tanh(),
torch.nn.Linear(64,input_size))

# Flatten the main network's parameters; theta_init is the single flat tensor
# and p_shape holds the per-parameter shapes.
p_shape,_,theta_init = get_parameters(mainNetwork)
input_size_theta = theta_init.shape[0]

# Hypernetwork: maps a flat parameter vector to a new flat parameter vector
# of the same length.
hyperNetwork = torch.nn.Sequential(
torch.nn.Linear(input_size_theta,64),
torch.nn.Tanh(),
torch.nn.Linear(64,input_size_theta))
# Re-create z0, this time as a tensor that requires grad.
z0 = torch.tensor([1.,1.],requires_grad = True)
t = 0
# Produce new main-network weights from the hypernetwork and load them.
theta_0 = hyperNetwork(theta_init)
set_parameters(mainNetwork,theta_0)
z1 = mainNetwork(z0)
# Gradient of z1 w.r.t. the hypernetwork's parameters. With allow_unused = True,
# parameters that z1 does not depend on in the autograd graph yield None.
torch.autograd.grad(z1,hyperNetwork.parameters(),grad_outputs = torch.ones_like(z1),allow_unused = True)
I get (None,None,None,None) as output.
get_parameters is a function that returns all the flattened parameters of a model into a single tensor.
set_parameters takes a single tensor and sets it as the model weights.

def get_parameters(model):
    """Flatten every parameter of ``model`` into a single 1-D tensor.

    Args:
        model: a ``torch.nn.Module``; its ``parameters()`` are visited in
            iteration order.

    Returns:
        A tuple ``(p_shape, flat_parameters, theta)``:
        ``p_shape`` is a list with the ``torch.Size`` of each parameter,
        ``flat_parameters`` is a list with each parameter flattened to 1-D,
        and ``theta`` is the concatenation of ``flat_parameters`` (an empty
        tensor when the model has no parameters).
    """
    p_shape = []
    flat_parameters = []
    for p in model.parameters():
        p_shape.append(p.size())
        flat_parameters.append(p.flatten())
    # Concatenate once instead of growing theta inside the loop. The empty
    # seed tensor keeps the result well-defined for parameter-free models.
    theta = torch.cat([torch.empty(0), *flat_parameters], dim=0)
    return p_shape, flat_parameters, theta

def set_parameters(model,theta):
    """Load the flat vector ``theta`` into the Linear layers of ``model``.

    Args:
        model: an iterable container of layers (e.g. ``torch.nn.Sequential``).
            Only ``torch.nn.Linear`` entries are modified.
        theta: 1-D tensor laid out as weight, bias, weight, bias, ... in the
            order in which ``get_parameters`` reports the shapes.

    NOTE: each slice of ``theta`` is wrapped in a new ``torch.nn.Parameter``.
    A new Parameter is a fresh leaf tensor, so autograd does not connect the
    layer's weights back to whatever computation produced ``theta``.
    """
    p_shape, _, _ = get_parameters(model)
    idx = 0  # running offset into theta
    j = 0    # index into p_shape: weight shape at j, bias shape at j + 1
    for layer in model:
        if not isinstance(layer, torch.nn.Linear):
            continue
        n_weight = p_shape[j].numel()
        n_bias = p_shape[j + 1].numel()
        sub_theta_weight = theta[idx : idx + n_weight].reshape(p_shape[j])
        sub_theta_bias = theta[idx + n_weight : idx + n_weight + n_bias].reshape(p_shape[j + 1])
        layer.weight = torch.nn.Parameter(sub_theta_weight)
        layer.bias = torch.nn.Parameter(sub_theta_bias)
        # Advance (not overwrite) the offset so that models with more than
        # two Linear layers read the correct slice.
        idx += n_weight + n_bias
        j += 2

I also went through Hypernetwork implementation - #8 by mariaalfaroc
Apparently, even in these examples, torch.autograd.grad(x,hyperNetwork.parameters()) is giving the exact same error as in my case even though the loss.backward() is working fine.
Any help will be greatly appreciated. I am new to autograd and trying to learn more about how it works.
Thanks.

Hi Arkaprava!

I don’t follow your code in detail, but it appears that theta_0 depends on
the parameters of hyperNetwork, that the values of the parameters of
mainNetwork depend on the value of theta_0, and that z1 depends on
mainNetwork.

However, because of how set_parameters() is implemented, the computation
graph does not connect the parameters of mainNetwork to theta_0. So the
computation graph that you might imagine connects z1 to hyperNetwork is
“broken.” Therefore autograd.grad (z1, hyperNetwork.parameters())
gives you Nones.

I don’t see this documented anywhere, but, although a new Parameter that is
constructed from an existing Parameter shares the original Parameter’s data,
it is not the same Tensor nor does it share the original Parameter’s grad.
Furthermore, autograd does not track the connection between the new and
original Parameters. That is, constructing a new Parameter from an existing
Parameter “breaks the computation graph.”

Consider:

>>> import torch
>>> torch.__version__
'2.0.1'
>>> p0 = torch.nn.Parameter (torch.ones (3))
>>> p1 = torch.nn.Parameter (p0)
>>> p1.sum().backward()
>>> p1.grad
tensor([1., 1., 1.])
>>> p0.grad
>>>

Best.

K. Frank

Thanks. Is there any way to get around this?

Hi Arkaprava!

Probably – but it depends on what you’re actually trying to accomplish.

The problem is that Parameters don’t like to be modified while being tracked
by autograd.

Here is an example script that addresses this core issue:

import torch
# Report the installed PyTorch version alongside the example's output.
print (torch.__version__)

class MainNetwork (torch.nn.Module):
    """Two-layer tanh network whose weights are plain tensors, not Parameters.

    The weights are ordinary attributes so that they can be replaced by
    tensors that are part of an autograd graph without re-wrapping them
    in ``torch.nn.Parameter``.

    Args:
        input_size: width of the input and of the output.
        hidden_size: width of the hidden layer (defaults to 64).
    """

    def __init__(self, input_size, hidden_size = 64):
        super().__init__()
        # no Parameters nor requires_grad = True
        self.lin1_weight = torch.zeros (hidden_size, input_size)
        self.lin1_bias = torch.zeros (hidden_size)
        self.lin2_weight = torch.zeros (input_size, hidden_size)
        self.lin2_bias = torch.zeros (input_size)

    def forward (self, x):
        """Apply linear -> tanh -> linear using the currently assigned tensors."""
        # use functional.linear() instead of Linear
        x = torch.nn.functional.linear (x, self.lin1_weight, self.lin1_bias)
        x = torch.tanh (x)
        x = torch.nn.functional.linear (x, self.lin2_weight, self.lin2_bias)
        return x

def set_parameters (model, w1, b1, w2, b2):
    """Attach the given weight and bias tensors to ``model`` as plain attributes.

    The tensors are assigned as-is (no Parameter is created), so each
    attribute is the very same object that was passed in.
    """
    # first layer
    model.lin1_weight, model.lin1_bias = w1, b1
    # second layer
    model.lin2_weight, model.lin2_bias = w2, b2

# Leaf tensors: z0 is the network input; w1/b1/w2/b2 are the externally
# supplied weights with respect to which gradients are requested.
input_size = 2
z0 = torch.ones (input_size, requires_grad = True)
w1 = torch.ones (64, input_size, requires_grad = True)
b1 = torch.ones (64, requires_grad = True)
w2 = torch.ones (input_size, 64, requires_grad = True)
b2 = torch.ones (input_size, requires_grad = True)
leaf_weights = (w1, b1, w2, b2)

# Hand the very same tensor objects to the network so that the forward
# pass is recorded against them.
mainNetwork = MainNetwork (input_size)
set_parameters (mainNetwork, *leaf_weights)

z1 = mainNetwork (z0)
print (z0, z1)

# One gradient per weight tensor, in the order of leaf_weights.
w1_grad, b1_grad, w2_grad, b2_grad = torch.autograd.grad (
    z1,
    leaf_weights,
    grad_outputs = torch.ones_like (z1),
    allow_unused = True,
)

print (w1_grad.shape, b1_grad.shape, w2_grad.shape, b2_grad.shape)

And here is its output:

2.0.1
tensor([1., 1.], requires_grad=True) tensor([64.6835, 64.6835], grad_fn=<AddBackward0>)
torch.Size([64, 2]) torch.Size([64]) torch.Size([2, 64]) torch.Size([2])

If this approach doesn’t work for your use case, please post a fully-self-contained,
runnable script that illustrates a simplified version of your issue, together with the
output you get when you run it.

Best.

K. Frank

Hi KFrank,

Thank you for this answer. Is there any way to use this functional API for an RNN, or do I have to create the RNN from scratch?