Hi, suppose we have two losses. I tried summing the losses and then calling backward once, versus calling backward on each loss separately. I expected the gradients with respect to a given layer's weights to be equal in both cases, but I found that they differ.
# Reproducibility settings: force deterministic cuDNN kernels and disable the
# benchmark autotuner so the same convolution algorithm is selected every run.
# NOTE(review): assumes `import torch`, `import torch.nn as nn`, and
# `from torch.backends import cudnn` were executed earlier — not shown here.
cudnn.deterministic = True
cudnn.benchmark = False
torch.manual_seed(2)
# High print precision so tiny floating-point differences are visible.
torch.set_printoptions(precision=16)
# `a` is only used below as a shape/dtype template for randn_like.
a = torch.randn(2, 3, 10, 10).cuda()
label = torch.tensor([0, 1]).cuda()  # NOTE(review): unused in the snippet below
class Model(nn.Module):
    """Two stacked 3x3 convolutions (3 -> 10 -> 2 channels, stride 1,
    padding 1), with the spatial output flattened to (batch, 2*H*W)."""

    def __init__(self) -> None:
        super().__init__()
        self.l1 = nn.Conv2d(3, 10, 3, 1, 1)
        self.l2 = nn.Conv2d(10, 2, 3, 1, 1)

    def forward(self, x):
        # Chain both convolutions, then collapse every non-batch
        # dimension into one (equivalent to x.view(x.size(0), -1)).
        return torch.flatten(self.l2(self.l1(x)), 1)
model = Model().cuda()
model.zero_grad()
x = torch.randn_like(a)
y = model(x)
# Per-sample scalar: sum each sample's 200 outputs -> shape (2,).
loss = y.sum(dim=1)
print(loss)
# Variant 1: a single backward pass through the summed loss.
loss.sum().backward(retain_graph=True)
grad_l1_1 = model.l1.weight.grad.clone()
grad_l2_1 = model.l2.weight.grad.clone()
model.zero_grad()
# Variant 2: one backward per sample; gradients accumulate into .grad,
# so after both calls .grad holds the same mathematical sum as variant 1.
loss[0].backward(retain_graph=True)
loss[1].backward(retain_graph=True)
grad_l1_2 = model.l1.weight.grad.clone()
grad_l2_2 = model.l2.weight.grad.clone()
# L1 distance between the two variants' gradients. Mathematically zero,
# but the two variants add the per-sample contributions in a different
# order/kernel, so a ~1e-5 float32 discrepancy is expected — NOTE(review):
# this is float non-associativity, not a bug.
diff_l1 = (grad_l1_2 - grad_l1_1).abs().sum()
diff_l2 = (grad_l2_2 - grad_l2_1).abs().sum()
print(diff_l1)
print(diff_l2)
tensor(2.1591782569885254e-05, device='cuda:0')
tensor(0., device='cuda:0')
I think the reason might be a floating-point precision issue. But it's just a two-layer network — is it normal to get gradients that differ by this much?
Thanks!