I am trying to implement a custom optimizer that performs a step-length (line) search.
It requires computing $\phi(a) = net(w + a \cdot p)$ and $d\phi/da$,
where
- $a$ is the step-length,
- $w$ is the weight vector,
- $p$ is the step-direction vector,
- $net$ is the neural network function that outputs a scalar loss value, $L$.
I can compute $d\phi/da$, which is equivalent to $dL/da$, for the simple net below.
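(By the chain rule, $d\phi/da = \nabla_{w'} L(w')^{\top} p$ with $w' = w + a \cdot p$, i.e. the directional derivative of the loss along $p$.)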
The question is: for a net constructed using nn.Module,
- how do I compute the gradient of the loss w.r.t. the step-length?
- how do I update the net's parameters so that this gradient can be computed?
#!/usr/bin/env python3
import torch
torch.manual_seed(12345)
N, D_in, H, D_out = 10, 2, 5, 1
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)
w1 = torch.randn(D_in, H, requires_grad=True)
w2 = torch.randn(H, D_out, requires_grad=True)
p1 = torch.randn(D_in, H, requires_grad=True)
p2 = torch.randn(H, D_out, requires_grad=True)
a = torch.randn(1, requires_grad=True)
# Perturb the weights out-of-place along the search direction.
w1 = w1 + a * p1
w2 = w2 + a * p2
h = x.mm(w1)
h_relu = h.clamp(min=0)
y_pred = h_relu.mm(w2)
loss = (y_pred - y).pow(2).mean()
# Compute ga: grad of loss wrt a
ga, = torch.autograd.grad(loss, a, create_graph=True)
print(ga)
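Here, loss is connected to a through the rebound w1 and w2, so torch.autograd.grad(loss, a) returns $d\phi/da$ directly.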
# Net using nn.Module #####
class Net(torch.nn.Module):
def __init__(self, D_in, H, D_out):
super(Net, self).__init__()
self.hidden = torch.nn.Linear(D_in, H)
self.output = torch.nn.Linear(H, D_out)
def forward(self, x):
y = torch.nn.functional.relu( self.hidden(x) )
y = self.output(y)
return y
net = Net(2, 5, 1)
loss_fn = torch.nn.MSELoss()
# nn.Linear stores its weight as (out_features, in_features), so transpose the direction tensors to match.
p1 = torch.transpose(p1, 0, 1)
p2 = torch.transpose(p2, 0, 1)
for name, p in net.named_parameters():
    # Attempt 1: in-place update.
    # RuntimeError: a leaf Variable that requires grad has been used in an in-place operation.
    # (In-place ops on leaf tensors that require grad are not allowed while autograd is recording.)
    # if name == 'hidden.weight':
    #     p.add_(a * p1)
    # elif name == 'output.weight':
    #     p.add_(a * p2)
    # else:
    #     pass

    # Attempt 2: out-of-place update.
    # RuntimeError: One of the differentiated Tensors appears to not have been used in the graph.
    # Set allow_unused=True if this is the desired behavior.
    # (Rebinding the local name p does not change the parameter stored in net,
    # so a never enters the graph built by the forward pass below.)
    # if name == 'hidden.weight':
    #     p = p + (a * p1)
    # elif name == 'output.weight':
    #     p = p + (a * p2)
    # else:
    #     pass

    # TODO: how to update the weights so that the gradient of the loss
    # w.r.t. the step-length a can be computed?
    pass
y_pred = net(x)
loss = loss_fn(y_pred, y)
print(loss.item())
# TODO: Compute ga: grad of loss wrt a
# ga, = torch.autograd.grad(loss, a, create_graph=True)
# print(ga)
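One possible approach I am considering (a sketch, not verified end-to-end; it assumes a PyTorch version that provides torch.func.functional_call, with torch.nn.utils.stateless.functional_call as the older spelling): build the perturbed tensors w + a * p out-of-place and substitute them for the parameters only during the forward call, so the net's own leaf parameters are never modified and the loss stays connected to a in the graph.

# Build w + a * p out-of-place; the net's own (leaf) parameters stay untouched.
perturbed = {name: param for name, param in net.named_parameters()}
perturbed['hidden.weight'] = perturbed['hidden.weight'] + a * p1
perturbed['output.weight'] = perturbed['output.weight'] + a * p2

# Forward pass with the perturbed tensors substituted for this call only.
y_pred = torch.func.functional_call(net, perturbed, (x,))
loss = loss_fn(y_pred, y)

# ga: gradient of the loss w.r.t. the step-length a
ga, = torch.autograd.grad(loss, a, create_graph=True)
print(ga)

Once the line search has settled on a value for a, the actual update w <- w + a * p could then be applied in-place under torch.no_grad(), where in-place modification of the leaf parameters is allowed.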