I see. So we can assign a gradient tensor to .grad (an example is shown below) so that torch does not have to create another gradient tensor during .backward(). Am I right?
The example below is adapted from another answer: How to split backward process wrt each layer of neural network?
So in the example below, I can calculate the gradient manually and have a pre-defined gradient tensor for every layer (see the backward() function)?
I greatly appreciate your help!
import torch
import torch.nn as nn
from torch.autograd import Variable  # deprecated in recent PyTorch; a tensor with requires_grad=True behaves the same


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.layers = nn.ModuleList([
            nn.Linear(10, 10),
            nn.Linear(10, 10),
            nn.Linear(10, 10),
            nn.Linear(10, 10),
        ])

    def forward(self, x):
        self.output = []
        self.input = []
        for layer in self.layers:
            # detach from previous history so each layer's input is a leaf
            x = Variable(x.data, requires_grad=True)
            self.input.append(x)
            # compute output
            x = layer(x)
            # add to list of outputs
            self.output.append(x)
        return x

    def backward(self, g):
        for i, output in reversed(list(enumerate(self.output))):
            # pre-assign a gradient tensor; backward() accumulates into it
            # instead of allocating a new one
            a = torch.ones(4, 10)
            self.input[i].grad = a
            if i == (len(self.output) - 1):
                # for the last layer, use the external gradient g
                output.backward(g)
                print(self.input[i].grad.shape)
            else:
                # for earlier layers, use the gradient accumulated at the
                # next layer's input
                output.backward(self.input[i + 1].grad.data)


model = Net()
inp = Variable(torch.randn(4, 10))
output = model(inp)
gradients = torch.randn(*output.size())
model.backward(gradients)
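
For reference, here is a minimal sketch (a single linear map, not the network above) of what I mean by reusing a pre-assigned .grad. The data_ptr comparison is only my assumption about the usual dense, matching-shape case, where autograd can accumulate in place into the existing buffer:

import torch

x = torch.randn(4, 10, requires_grad=True)
w = torch.randn(10, 10, requires_grad=True)

# pre-assign a gradient buffer for x before calling backward()
x.grad = torch.zeros(4, 10)
buf_ptr = x.grad.data_ptr()

loss = (x @ w).sum()
loss.backward()

# backward() accumulates into the pre-assigned tensor rather than replacing it;
# in the usual case the same underlying buffer should be reused
print(x.grad.data_ptr() == buf_ptr)
print(x.grad)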