There is no difference between them when it comes to updating the weights. When you call loss1.backward(retain_graph=True) and then loss2.backward(), the gradients accumulate in the .grad buffers, exactly as if you had called total_loss.backward().
If you use two separate optimizers, or compute the step size from the gradients of each loss individually, then yes, the two methods can lead to different updates.
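For example, if each loss had its own optimizer and you stepped in between the two backward calls, the second forward/backward pass would see already-updated weights, which is not what total_loss.backward() does. A minimal sketch of that situation (the layer size, losses, and learning rates below are just illustrative):

import torch
model = torch.nn.Linear(2, 2)
x, y = torch.randn(1, 2), torch.randn(1, 2)
# one optimizer (and learning rate) per loss -- illustrative choice
opt1 = torch.optim.SGD(model.parameters(), lr=0.1)
opt2 = torch.optim.SGD(model.parameters(), lr=0.01)
# step on loss1 first ...
torch.nn.MSELoss()(model(x), y).backward()
opt1.step()
opt1.zero_grad()
# ... then loss2 runs on already-updated weights, so the combined effect
# differs from backpropagating (loss1 + loss2) once
torch.nn.L1Loss()(model(x), y).backward()
opt2.step()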
This question has gotten more traction than I expected; it seems to puzzle many users why method 1 and method 2 are the same. So here is a small snippet showing that the gradient obtained from method 1 and method 2 is identical.
import torch
# define model 1 and model 2
class net1(torch.nn.Module):
    def __init__(self):
        super(net1, self).__init__()
        self.fc = torch.nn.Linear(2, 2)
    def forward(self, x):
        return self.fc(x)
class net2(torch.nn.Module):
    def __init__(self):
        super(net2, self).__init__()
        self.fc = torch.nn.Linear(2, 1)
    def forward(self, x):
        return self.fc(x)
# define loss 1 and loss 2
loss1 = torch.nn.MSELoss()
loss2 = torch.nn.L1Loss()
# define random input (x) and output (y)
x = torch.randn(1,2)
y1 = torch.randn(1,2)
y2 = torch.randn(1,1)
model1 = net1()
model2 = net2()
# method 1
y1_hat = model1(x)
loss1(y1_hat, y1).backward(retain_graph=True)  # keep the graph so loss2 can also backprop through model1
y2_hat = model2(y1_hat)
loss2(y2_hat, y2).backward()
# we compute gradient norm of weights on model1
print("gradient norm method1: {0}".format(torch.norm(model1.fc.weight.grad)))
# method 2 (zero previous gradient to run method 2)
model1.zero_grad()
model2.zero_grad()
y1_hat = model1(x)
y2_hat = model2(y1_hat)
total_loss = loss1(y1_hat, y1) + loss2(y2_hat, y2)
total_loss.backward()
# we compute gradient norm of weights on model1
print("gradient norm method2: {0}".format(torch.norm(model1.fc.weight.grad)))