I was pretty happy to see that computation of Jacobian and Hessian matrices are now built into the new torch.autograd.functional
API which avoids laboriously writing code using nested for loops and multiple calls to autograd.grad
. However, I have been having a hard time understanding how to use them when the independent variables are parameters of an nn.Module
. For example, I would like to be able to use hessian
to compute the Hessian of a loss function w.r.t. the model’s parameters. If I don’t use an nn.Module
I can successfully compute the Hessian using
import torch
x = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
y = torch.tensor([1.0, 2.0])
def compute_z(a, b):
output = (x * a.unsqueeze(0) ** 3 + x * b.unsqueeze(0) ** 3).sum(dim=1)
z = ((output - y)**2).mean()
return z
a = torch.tensor([1.0, -1.0, 2.0], requires_grad=True)
b = torch.tensor([2.0, -2.0, -1.0], requires_grad=True)
params = (a, b)
hessians = torch.autograd.functional.hessian(compute_z, params, strict=True)
param_names = ('a', 'b')
for d_name, d_hessians in zip(param_names, hessians):
for dd_name, dd_hessian in zip(param_names, d_hessians):
print(f'dz/d{dd_name}d{d_name} = \n{dd_hessian}\n')
which works as I would expect
dz/dada =
tensor([[ 963., 198., 972.],
[ 198., -801., 1296.],
[ 972., 1296., 9108.]])
dz/dbda =
tensor([[ 612., 792., 243.],
[ 792., 1044., 324.],
[3888., 5184., 1620.]])
dz/dadb =
tensor([[ 612., 792., 3888.],
[ 792., 1044., 5184.],
[ 243., 324., 1620.]])
dz/dbdb =
tensor([[4068., 3168., 972.],
[3168., 2052., 1296.],
[ 972., 1296., -909.]])
But when I encapsulate this within an nn.Module
I can’t get it to work. Among other things I’ve tried
import torch
class Net(torch.nn.Module):
def __init__(self):
super().__init__()
self.a = torch.nn.Parameter(torch.tensor([1.0, -1.0, 2.0]))
self.b = torch.nn.Parameter(torch.tensor([2.0, -2.0, -1.0]))
def forward(self, x):
output = (x * self.a.unsqueeze(0) ** 3 + x * self.b.unsqueeze(0) ** 3).sum(dim=1)
return output
x = torch.tensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
y = torch.tensor([1.0, 2.0])
net = Net()
def compute_z(*net_parameters):
# Is there a proper way to set the parameters that works with hessian?
for p_src, p_dst in zip(net_parameters, net.parameters()):
p_dst.data = p_src.data
output = net(x)
z = ((output - y)**2).mean()
return z
# strict=True raises exception, allowing non-strict for demonstration
hessians = torch.autograd.functional.hessian(compute_z, tuple(net.parameters()))
param_names = [n for n, _ in net.named_parameters()]
for d_name, d_hessians in zip(param_names, hessians):
for dd_name, dd_hessian in zip(param_names, d_hessians):
print(f'dz/d{dd_name}d{d_name} = \n{dd_hessian}\n')
but this results in all zeros
dz/dada =
tensor([[0., 0., 0.],
[0., 0., 0.],
[0., 0., 0.]])
dz/dbda =
tensor([[0., 0., 0.],
[0., 0., 0.],
[0., 0., 0.]])
dz/dadb =
tensor([[0., 0., 0.],
[0., 0., 0.],
[0., 0., 0.]])
dz/dbdb =
tensor([[0., 0., 0.],
[0., 0., 0.],
[0., 0., 0.]])
Could someone explain what I’m doing wrong here and if there is some way to utilize autograd.functional.hessian
to compute the hessian w.r.t. a module’s parameters?