Two forward passes with the same network give two losses: why do two backward() calls work without retain_graph=True?

import torch
import torch.nn as nn
import torch.optim as optim

class Linear(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(4, 3)

    def forward(self, x):
        output = self.linear(x)
        return output

def p1():
    linear_layer = Linear()
    optimizer = optim.SGD(linear_layer.parameters(), lr=0.001)

    x0 = torch.randn((3, 4))
    x1 = torch.randn((3, 4))

    output_1 = linear_layer(x0)
    output_2 = linear_layer(x1)
    loss1 = output_1.mean()
    loss2 = output_2.mean()
    optimizer.zero_grad()
    loss1.backward()
    loss2.backward()  # why does this work without retain_graph=True?
    optimizer.step()
    for name, params in linear_layer.named_parameters():
        print('name: ', name)
        print('grad: ', params.grad)

def p2():
    linear_layer = Linear()
    optimizer = optim.SGD(linear_layer.parameters(), lr=0.001)

    x = torch.randn((3, 4))

    output_1 = linear_layer(x)
    loss1 = output_1.mean()
    loss2 = output_1.sum()
    optimizer.zero_grad()
    loss1.backward(retain_graph=True)  # why does this need retain_graph=True?
    loss2.backward()
    optimizer.step()
    for name, params in linear_layer.named_parameters():
        print('name: ', name)
        print('grad: ', params.grad)

Hello, I know the computational graph is freed after calling backward(). Can anyone explain why p1() does not need retain_graph=True? Is it because two separate computational graphs are constructed?
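As a quick sanity check (this snippet is my own addition, reusing the Linear module defined above), the two forward calls in p1() do produce outputs with distinct graph nodes:

linear_layer = Linear()
out_a = linear_layer(torch.randn(3, 4))
out_b = linear_layer(torch.randn(3, 4))
print(out_a.grad_fn is out_b.grad_fn)                     # False: two separate graphs
print(out_a.is_leaf, linear_layer.linear.weight.is_leaf)  # False True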

retain_graph=True is required only if you backpropagate through the same non-leaf tensor (i.e. the same part of the graph) more than once. backward() frees the intermediate buffers of the graph it traverses, so a second pass through that graph fails.

In p1(), each forward call builds its own graph, so each backward() frees a different graph. The only tensors shared between the two backward passes are linear_layer.weight and linear_layer.bias, and those are leaf tensors, which are not freed.
In p2(), both losses come from output_1, a non-leaf tensor created by linear_layer, so the second backward() would need the graph that the first one already freed.
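To make the difference concrete, here is a minimal sketch (my own addition, reusing the same Linear module) of what happens in the p2() pattern when retain_graph=True is omitted:

linear_layer = Linear()
output = linear_layer(torch.randn(3, 4))   # one graph, shared by both losses
loss1 = output.mean()
loss2 = output.sum()
loss1.backward()                           # frees the shared graph's intermediate buffers
try:
    loss2.backward()                       # needs the same, already freed, graph
except RuntimeError as e:
    print(e)  # roughly: "Trying to backward through the graph a second time ..."

In both p1() and p2() the gradients from the two backward() calls accumulate into params.grad; retain_graph only controls whether the graph's intermediate buffers are kept alive for a second pass.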
