Is it possible to perform backpropagation and accumulate gradients in only a subset of the leaf variables involved in the computational graph? My current solution is to call `autograd.grad` and then copy the results into the `.grad` tensors of those parameters. Is there a way to do this directly?

```
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd
model1 = nn.Sequential(nn.Linear(10, 10), nn.ReLU(), nn.Linear(10, 10))
model2 = nn.Sequential(nn.Linear(10, 10), nn.ReLU(), nn.Linear(10, 10))
noise = torch.randn(5, 10)
target1, target2 = (torch.randint(10, (5,)).long() for _ in range(2))
data = model1(noise)
output = model2(data)
loss1 = F.cross_entropy(output, target1) # we care about model1's params
loss2 = F.cross_entropy(output, target2) # we care about model2's params
model1.zero_grad()
loss1.backward(retain_graph=True)  # also fills model2's grads, cleared again below
# -------------------------------------------------------------------------
# do something such that model2 has grads of loss2 while
# model1's grads remain untouched.
model2.zero_grad()
grads = autograd.grad(loss2, model2.parameters())
for param, grad in zip(model2.parameters(), grads):
    param.grad.data.copy_(grad.data)
```
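For reference, the copy step can also be wrapped in a small helper that avoids `.data`, handles parameters whose `.grad` has not been allocated yet, and accumulates rather than overwrites (mirroring what `backward()` would do). This is just a sketch of the same workaround; the helper name `accumulate_grads` is illustrative, not an existing API:

```
import torch
import torch.autograd as autograd

def accumulate_grads(params, loss, retain_graph=False):
    # Hypothetical helper: compute gradients of `loss` w.r.t. `params` only,
    # then accumulate them into the parameters' .grad fields, leaving every
    # other leaf in the graph untouched.
    params = list(params)
    grads = autograd.grad(loss, params, retain_graph=retain_graph)
    with torch.no_grad():
        for param, grad in zip(params, grads):
            if param.grad is None:
                param.grad = grad.clone()  # .grad not allocated yet
            else:
                param.grad += grad         # accumulate like backward() would
```

With this, `model2.zero_grad()` followed by `accumulate_grads(model2.parameters(), loss2)` should produce the same result as the copy loop above.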