You can even check:
# Numerical check that autograd sums, into W.grad, the contributions from
# BOTH places the weight is used: directly (D = W + C) and through the
# linear layer (B = M(A)).
M = nn.Linear(100, 50)
W = M.weight  # (50, 100): same shape as C and T1 below
# Not used by the check below. A loss module is constructed without model
# parameters (those are handed to an optimizer); passing M.parameters() here
# would be silently interpreted as the legacy `size_average` flag.
criterion = nn.MSELoss()
C = Variable(torch.rand(50, 100), requires_grad=True)
A = Variable(torch.rand(4, 100), requires_grad=True)  # batch of 4 inputs
T1 = Variable(torch.rand(50, 100))  # target for D
T2 = Variable(torch.rand(4, 50))    # target for B
B = M(A)    # (4, 50): first use of W, inside the layer
D = W + C   # (50, 100): second, direct use of W
l1 = torch.sum((D - T1) ** 2)
l2 = torch.sum((B - T2) ** 2)
l = l1 + l2
l.backward()

# Hand-derived gradient of l with respect to W.
x = 2 * (D - T1)                              # d(l1)/dW
y = (B - T2).transpose(0, 1).unsqueeze(0)     # (1, 50, 4)
z = A.unsqueeze(0)                            # (1, 4, 100)
# d(l2)/dW = 2 (B - T2)^T A; squeeze(0) drops only the dummy batch axis.
t = 2 * torch.bmm(y, z).squeeze(0)            # (50, 100)
grad_test = x + t  # 2 (W + C - T1) + 2 (A W^T + b - T2)^T A
# Sum of squared differences; prints (approximately) zero when autograd agrees.
print(torch.sum((W.grad - grad_test) ** 2))
Variable containing:
0
[torch.FloatTensor of size 1]