I am training a ConvLSTM network, and I decided to check whether BPTT gives the same result when backward() is called once on a cumulative loss versus once per individual loss object. I compared the gradient of the weight w1 in the two cases, and it is not exactly the same. The code is below; can anyone comment on that difference?
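My only guess so far (unconfirmed) is floating-point summation order: with the cumulative loss the per-step gradients are summed inside a single backward pass, while per-step backward calls accumulate them into w1.grad one call at a time, and float32 addition is not associative. A minimal sketch of that effect, independent of the model (the numbers are just an illustration):

import torch
a = torch.rand(1000)
s1 = a.sum()                   # one reduction over the whole tensor
s2 = torch.zeros(())
for chunk in a.split(10):      # accumulate piecewise, like per-step backward calls
    s2 += chunk.sum()
print(s1.item() == s2.item())  # frequently False: different rounding order
print(torch.isclose(s1, s2))   # tensor(True): equal up to float32 rounding

The full experiment: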
import torch
import numpy

# weight we take gradients with respect to
w1 = torch.nn.Parameter(torch.tensor([[1., 2.], [4., 1.]]))
# fixed input and target, no gradients needed
x = torch.tensor([[1., 5.], [4., 5.]])
y = torch.tensor([[2., 7.], [3., 4.]])
loss_fn = torch.nn.MSELoss(reduction='mean')
t = 20  # number of timesteps
# initial grad (None before any backward call)
print(w1.grad)
# case 1: call backward once, outside the loop, on the cumulative loss
loss = 0
x1 = x.clone()
for i in range(t):
    y_pred = w1.mm(x1)
    loss += loss_fn(y_pred, y)
    print(i, loss_fn(y_pred, y).item())
    # detach so the next step does not backpropagate through earlier steps
    x1 = y_pred.detach()
loss.backward()
case1 = w1.grad.clone()
# reset the grad of w1
w1.grad = None
# case 2: call backward inside the loop, once per step loss
loss = 0
x1 = x.clone()
for i in range(t):
    y_pred = w1.mm(x1)
    loss += loss_fn(y_pred, y)
    # each call accumulates this step's gradient into w1.grad
    loss_fn(y_pred, y).backward(retain_graph=True)
    print(i, loss_fn(y_pred, y).item())
    x1 = y_pred.detach()
case2 = w1.grad.clone()
print(torch.all(torch.eq(case1, case2)))  # False: not bitwise identical
print(numpy.isclose(case1, case2))        # all True: close, but not the same
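For completeness, I also compared the two gradients with torch.allclose and printed the largest elementwise gap (just a follow-up check on the same case1/case2 tensors):

print(torch.allclose(case1, case2))        # True with the default rtol/atol
print((case1 - case2).abs().max().item())  # size of the largest elementwise difference

So the two procedures agree up to what looks like float32 rounding, but I would like to confirm that the accumulation order is really the only source of the difference.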