Why does accumulating gradients give a different value than summing them manually? Is it because of round-off errors?
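Just to make precise what I mean by "accumulating" versus "summing manually", here is a tiny self-contained sketch (toy tensors, purely illustrative names) of the identity I expect to hold for a sum-reduced loss: the gradient of one big batch should match the gradients accumulated over its halves, up to floating-point error.

import torch

torch.manual_seed(0)
w = torch.randn(3, requires_grad=True)
x = torch.randn(8, 3)
y = torch.randn(8)

# Gradient from the whole batch in one backward pass.
loss_full = ((x @ w - y) ** 2).sum()
loss_full.backward()
g_full = w.grad.clone()
w.grad = None

# Gradients accumulated over two half-batches (backward adds into w.grad).
((x[:4] @ w - y[:4]) ** 2).sum().backward()
((x[4:] @ w - y[4:]) ** 2).sum().backward()
g_accum = w.grad.clone()

print((g_full - g_accum).norm().item())  # I expect this to be ~0 up to round-off

My real repro is below.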
Code:
import torch
import torch.nn as nn
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torch.autograd import Variable


class LogisticRegression(torch.nn.Module):
    def __init__(self, input_dim, output_dim):
        super(LogisticRegression, self).__init__()
        self.linear = torch.nn.Linear(input_dim, output_dim)

    def forward(self, x):
        outputs = self.linear(x)
        return outputs


batch_size = 128
seed = 0
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

train_dataset = dsets.MNIST(root='./dataMNIST', train=True, transform=transforms.ToTensor(), download=True)
test_dataset = dsets.MNIST(root='./dataMNIST', train=False, transform=transforms.ToTensor(), download=True)
trainloader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
testloader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

device = torch.device('cuda:0')
input_dim = 784   # 28 * 28 pixels
output_dim = 10   # 10 digit classes

net = LogisticRegression(input_dim, output_dim)
net = net.to(device)
criterion = nn.CrossEntropyLoss(reduction='sum')

# First pass: backprop on the first batch of the loader and add up the parameter gradients.
for data, target in trainloader:
    data = Variable(data.view(-1, 28 * 28))
    target = Variable(target)
    data, target = data.to(device), target.to(device)
    output = net(data)
    cost = criterion(output, target)
    cost.backward()
    total_grad = 0.0
    for p in net.parameters():
        # Note: the bias grad (10,) gets broadcast against the weight grad (10, 784) here.
        total_grad += p.grad.data
    break
net.zero_grad()

# Second pass: iterate the loader again, backprop on its first batch, and subtract the gradients.
# I expected the difference to be (numerically close to) zero.
for data, target in trainloader:
    data = Variable(data.view(-1, 28 * 28))
    target = Variable(target)
    data, target = data.to(device), target.to(device)
    output = net(data)
    cost = criterion(output, target)
    cost.backward()
    for p in net.parameters():
        total_grad -= p.grad.data
    break
net.zero_grad()

print(total_grad.norm(2).item())
Output:
0.029222624376416206
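For completeness, here is a hedged variant of the check I could try (reusing the net, criterion, trainloader and device defined above; this is not the run that produced the output shown): it draws one fixed batch outside the loops and backprops on it twice, so that shuffle=True handing each iteration of trainloader a different first batch is ruled out and any remaining difference would have to be purely numerical.

# Hypothetical check: one fixed batch, two backward passes, per-parameter comparison.
data, target = next(iter(trainloader))
data, target = data.view(-1, 28 * 28).to(device), target.to(device)

net.zero_grad()
criterion(net(data), target).backward()
grads_a = [p.grad.detach().clone() for p in net.parameters()]

net.zero_grad()
criterion(net(data), target).backward()
grads_b = [p.grad.detach().clone() for p in net.parameters()]

print(sum((a - b).norm(2).item() for a, b in zip(grads_a, grads_b)))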
Thank you!