What exactly does `retain_variables=True` in `loss.backward()` do?

There is no difference between them when it comes to updating the weights: when you call loss1.backward(retain_graph=True) and then loss2.backward(), the gradients are accumulated in the parameters' .grad attributes exactly as if you had called total_loss.backward().
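In other words, retain_graph=True (retain_variables in older PyTorch releases) only keeps the graph's buffers alive so that a second backward pass through the same graph is possible; the gradients themselves are always accumulated into .grad. A tiny standalone sketch (toy tensors, unrelated to the snippet further down):

import torch

x = torch.randn(3, requires_grad=True)
y = (x * x).sum()

y.backward(retain_graph=True)   # keep the graph so we can call backward on it again
y.backward()                    # second backward; gradients accumulate: x.grad = 2x + 2x
print(x.grad)                   # equals 4 * x
# a third y.backward() here would raise a RuntimeError,
# because the graph was freed by the second (non-retaining) call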

If you use two optimizers, or if you compute the step size based on the gradients, then yes, the two approaches can lead to different updates.
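For example (a minimal sketch; gradient clipping is used here only as one concrete gradient-dependent adjustment, it is not mentioned above): rescaling after each backward call acts on the intermediate gradient, while rescaling after the summed loss acts on the total gradient, so the resulting updates can differ.

import torch

torch.manual_seed(0)
lin = torch.nn.Linear(2, 2)
x = torch.randn(1, 2)
y = torch.randn(1, 2)
mse = torch.nn.MSELoss()
l1 = torch.nn.L1Loss()

# variant A: backward per loss, clip the gradient in between
lin.zero_grad()
y_hat = lin(x)
mse(y_hat, y).backward(retain_graph=True)
torch.nn.utils.clip_grad_norm_(lin.parameters(), max_norm=0.05)  # acts on the MSE gradient only
l1(y_hat, y).backward()
torch.nn.utils.clip_grad_norm_(lin.parameters(), max_norm=0.05)
grad_a = lin.weight.grad.clone()

# variant B: backward once on the summed loss, clip once
lin.zero_grad()
y_hat = lin(x)
(mse(y_hat, y) + l1(y_hat, y)).backward()
torch.nn.utils.clip_grad_norm_(lin.parameters(), max_norm=0.05)
grad_b = lin.weight.grad.clone()

print(torch.allclose(grad_a, grad_b))  # generally False once clipping kicks in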

They are different, I have to say.

This question has gained more traction than I expected. It seems to bother many users why method 1 and method 2 are the same, so here is a small snippet showing that the gradient obtained from method 1 and method 2 is the same.

import torch 

# define model 1 and model 2
class net1(torch.nn.Module):
	def __init__(self):
		super(net1, self).__init__()
		self.fc = torch.nn.Linear(2,2)

	def forward(self, x):
		return self.fc(x)

class net2(torch.nn.Module):
	def __init__(self):
		super(net2, self).__init__()
		self.fc = torch.nn.Linear(2,1)

	def forward(self, x):
		return self.fc(x)

# define loss 1 and loss 2
loss1 = torch.nn.MSELoss()
loss2 = torch.nn.L1Loss()

# define random input (x) and output (y)
x = torch.randn(1,2)
y1 = torch.randn(1,2)
y2 = torch.randn(1,1)
model1 = net1()
model2 = net2()

# method 1
y1_hat = model1(x)
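# retain_graph=True keeps the graph of y1_hat alive so that loss2.backward() below can still backprop through model1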
loss1(y1_hat, y1).backward(retain_graph=True)
y2_hat = model2(y1_hat)
loss2(y2_hat, y2).backward()
# we compute gradient norm of weights on model1
print("gradient norm method1: {0}".format(torch.norm(model1.fc.weight.grad)))

# method 2 (zero previous gradient to run method 2)
model1.zero_grad()
model2.zero_grad()
y1_hat = model1(x)
y2_hat = model2(y1_hat)
total_loss = loss1(y1_hat, y1) + loss2(y2_hat, y2)
total_loss.backward()
# we compute gradient norm of weights on model1
print("gradient norm method2: {0}".format(torch.norm(model1.fc.weight.grad)))