In my code, I am trying to do something like this:
```python
lr = [nn.Parameter(torch.ones(1) * 0.5) for _ in range(n)]  # a list of per-parameter learning rates that I want to learn
grad = autograd.grad(train_loss, model.parameters(), create_graph=True)
for param_indx, param in enumerate(model.parameters()):
    param.data.add_(-lr[param_indx] * grad[param_indx])  # manual SGD step on the parameters
model_loss = loss(model(input), y)
autograd.grad(model_loss, lr)  # compute the loss gradient with respect to the learning rates
```
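For reference, here is a minimal self-contained version that reproduces the problem on my end. The toy `nn.Linear` model, `MSELoss`, and the random `input`/`y` tensors are just stand-ins for my actual `model`, `loss`, and data:

```python
import torch
import torch.nn as nn
from torch import autograd

torch.manual_seed(0)

# stand-ins for my real setup: a toy linear model and MSE loss
model = nn.Linear(3, 1)
loss = nn.MSELoss()
input, y = torch.randn(8, 3), torch.randn(8, 1)

# one learnable learning rate per model parameter
n = len(list(model.parameters()))
lr = [nn.Parameter(torch.ones(1) * 0.5) for _ in range(n)]

train_loss = loss(model(input), y)
grad = autograd.grad(train_loss, list(model.parameters()), create_graph=True)
for param_indx, param in enumerate(model.parameters()):
    param.data.add_(-lr[param_indx] * grad[param_indx])  # manual SGD step

model_loss = loss(model(input), y)
autograd.grad(model_loss, lr)  # raises the RuntimeError below
```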
Autograd should theoretically be able to compute that gradient, shouldn't it?
But I get an error saying:
```
RuntimeError: One of the differentiated Tensors appears to not have been used in the graph. Set allow_unused=True if this is the desired behavior.
```