import torch

# Map each tensor to a readable name so the hooks can report which tensor fired.
tensor_to_name = {}

def make_tensor_hook(tensor):
    def tensor_hook(grad):
        if tensor.requires_grad:
            print("\ntensor_hook", tensor_to_name[tensor])
            print("\tgrad", grad)
            # Print the .grad attribute of every tracked tensor at the moment
            # this hook fires.
            for t, name in tensor_to_name.items():
                print("\t", name, t, t.grad)
        else:
            print("tensor doesn't require grad", tensor_to_name[tensor])
    return tensor_hook

a = torch.tensor([1.0, 2.0], requires_grad=True)
b = a + a
c = b.mean()
b.retain_grad()
c.retain_grad()

tensor_to_name[a] = "a"
tensor_to_name[b] = "b"
tensor_to_name[c] = "c"

a.register_hook(make_tensor_hook(a))
b.register_hook(make_tensor_hook(b))
c.register_hook(make_tensor_hook(c))

c.backward()
After executing the code, we get:
tensor_hook c
    grad tensor(1.)
    a tensor([1., 2.], requires_grad=True) None
    b tensor([2., 4.], grad_fn=<AddBackward0>) None
    c tensor(3., grad_fn=<MeanBackward0>) None

tensor_hook b
    grad tensor([0.5000, 0.5000])
    a tensor([1., 2.], requires_grad=True) None
    b tensor([2., 4.], grad_fn=<AddBackward0>) None
    c tensor(3., grad_fn=<MeanBackward0>) tensor(1.)

tensor_hook a
    grad tensor([1., 1.])
    a tensor([1., 2.], requires_grad=True) None
    b tensor([2., 4.], grad_fn=<AddBackward0>) tensor([0.5000, 0.5000])
    c tensor(3., grad_fn=<MeanBackward0>) tensor(1.)
From the output above, we can see that a tensor's grad attribute is still None while its own hook is running. For example, c.grad is None inside c's registered hook, but it already has its value by the time b's hook fires. The Backward Hooks execution doc says the grad field is updated after the pre-hooks of the node, but that is too vague for me. So the question is: when exactly is a tensor's grad attribute updated, and where is the corresponding source code?
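To isolate just that observation, here is a stripped-down sketch of the same graph (the same a, b, c as above) with only the two hooks that matter; the comments reflect the output shown above:

import torch

a = torch.tensor([1.0, 2.0], requires_grad=True)
b = a + a
c = b.mean()
c.retain_grad()

# Inside c's own hook, c.grad has not been written yet (prints None).
c.register_hook(lambda grad: print("in c's hook: c.grad =", c.grad))
# By the time b's hook fires, c.grad is already populated (prints tensor(1.)).
b.register_hook(lambda grad: print("in b's hook: c.grad =", c.grad))

c.backward()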
Can you help me locate the specific code that updates the grad field?
From the following source code in torch/csrc/autograd/engine.cpp, I can identify the ordering of tensor_pre_hooks, pre_hooks, and post_hooks, but I cannot find when or where the engine updates the grad field. (A small sanity-check of that ordering follows the listing.)
static variable_list call_function(
    std::shared_ptr<GraphTask>& graph_task,
    Node* func,
    InputBuffer& inputBuffer) {
  CheckpointValidGuard cpvguard(graph_task);
  auto& fn = *func;
  auto inputs =
      call_tensor_pre_hooks(fn, InputBuffer::variables(std::move(inputBuffer)));
  inputs = call_pre_hooks(fn, std::move(inputs));
  if (!graph_task->keep_graph_) {
    fn.will_release_variables();
  }

  const auto has_post_hooks = !fn.post_hooks().empty();
  variable_list outputs;

  if (has_post_hooks) {
    // In functions/accumulate_grad.cpp, there is some logic to check the
    // conditions under which the incoming gradient can be stolen directly
    // (which elides a deep copy) instead of cloned. One of these conditions
    // is that the incoming gradient's refcount must be 1 (nothing else is
    // referencing the same data). Stashing inputs_copy here bumps the
    // refcount, so if post hooks are employed, it's actually still ok for
    // accumulate_grad.cpp to steal the gradient if the refcount is 2.
    //
    // "new_grad.use_count() <= 1 + !post_hooks().empty()" in
    // accumulate_grad.cpp accounts for this, but also creates a silent
    // dependency between engine.cpp (ie, this particular engine
    // implementation) and accumulate_grad.cpp.
    //
    // If you change the logic here, make sure it's compatible with
    // accumulate_grad.cpp.
    auto inputs_copy = inputs;
    outputs = fn(std::move(inputs_copy));
  } else {
    outputs = fn(std::move(inputs));
  }

  validate_outputs(fn.next_edges(), outputs, [&](const std::string& msg) {
    std::ostringstream ss;
    ss << "Function " << fn.name() << " returned an " << msg;
    return ss.str();
  });

  if (has_post_hooks) {
    // NOLINTNEXTLINE(bugprone-use-after-move)
    return call_post_hooks(fn, std::move(outputs), inputs);
  }
  return outputs;
}
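As a sanity check of that ordering (only the ordering, not the grad update itself), here is a small sketch I used, assuming a recent PyTorch where torch.autograd.graph.Node.register_prehook and Node.register_hook are available:

import torch

a = torch.tensor([1.0, 2.0], requires_grad=True)
b = a + a          # b.grad_fn is AddBackward0
c = b.mean()

# 1) tensor pre-hook: registered on the tensor b
b.register_hook(lambda grad: print("1. tensor pre-hook on b"))

# 2) Node pre-hook: runs after the tensor pre-hooks, before AddBackward0 executes
b.grad_fn.register_prehook(
    lambda grad_outputs: print("2. node pre-hook on", b.grad_fn.name()))

# 3) Node post-hook: runs after AddBackward0 has produced its grad_inputs
b.grad_fn.register_hook(
    lambda grad_inputs, grad_outputs: print("3. node post-hook on", b.grad_fn.name()))

c.backward()
# Expected print order: 1, 2, 3 -- matching call_tensor_pre_hooks,
# call_pre_hooks, fn(...), and call_post_hooks in call_function above.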