Gradient Accumulation - lower accuracy as accumulation steps increases

Hello,
I’m doing a gradient accumulation on a toy problem (MNIST) and it seems like the gradient accumulation works well, except for getting a lower accuracy by a few percents as I increase the accumulation steps beyond 1.
The train-set’s size is divisible by the batch’s size, so I don’t expect a partial (last ) “mini-batch” to affect on the results.
For example, when the train batch size is set to 5000 while the accumulation steps=1 (regular) I get a higher accuracy in comparison to setting the training batch size to 1000 and increase the accumulation steps to 5. That’s a repeatable pattern, not due to stochastics.

Any idea why I get lower accuracy as I increase the accumulation_steps?
(I can’t fix the identation, so sorry for that)

import torch
print(torch.version)
from copy import deepcopy
from collections import OrderedDict
import gc
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import SGD,Adam,lr_scheduler
from torch.utils.data import random_split
import torchvision
from torchvision import transforms,models
import torchvision.datasets as datasets

mnist_transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])

traindata = datasets.MNIST(root=’.’, train=True,download=True, transform=mnist_transform)
trainset,valset = random_split(traindata,[50000,10000])
train_loader = torch.utils.data.DataLoader(trainset, batch_size=1000,shuffle=True)
val_loader = torch.utils.data.DataLoader(valset, batch_size=1000,shuffle=False, num_workers=2)
testset = datasets.MNIST(root=’.’, train=False,download=True, transform=mnist_transform)
test_loader = torch.utils.data.DataLoader(testset, batch_size=256,shuffle=False, num_workers=2, drop_last=False)

class Net(nn.Module):

def init(self):
super(Net, self).init()
self.conv1 = nn.Conv2d(1, 32, 3, 1)
self.conv2 = nn.Conv2d(32, 64, 3, 1)
self.dropout1 = nn.Dropout(0.25)
self.dropout2 = nn.Dropout(0.5)
self.fc1 = nn.Linear(9216, 128)
self.fc2 = nn.Linear(128, 10)

def forward(self, x):
    x = self.conv1(x)
    x = F.relu(x)
    x = self.conv2(x)
    x = F.relu(x)
    x = F.max_pool2d(x, 2)
    x = self.dropout1(x)
    x = torch.flatten(x, 1)
    x = self.fc1(x)
    x = F.relu(x)
    x = self.dropout2(x)
    x = self.fc2(x)
    output = F.log_softmax(x, dim=1)
    return output

device = torch.device(“cuda:0” if torch.cuda.is_available() else “cpu”)
print('device: ', device)
model = Net().to(device)

optimizer = SGD(model.parameters(), lr=0.1, momentum=0.9)
lr_schedule = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)
states = {}

best_val_acc = 0
best_val_model = None
loss_mini_batch = 0
accumulation_steps = 1

for epoch in range(4):
print('Epoch: ', epoch)
model.train(True)
running_loss = 0.0
running_correct = 0

for i, data in enumerate(train_loader, 0):
    print('i: ', i)
    inputs, labels = data
    if torch.cuda.is_available():
        inputs, labels = inputs.cuda(),labels.cuda()
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    print('loss before division (direct): ', loss.item())
    loss = loss / accumulation_steps                ### Normalize our loss (if averaged)
    print('loss after division: ', loss)
    print('loss_mini_batch before add: ', loss_mini_batch)
    loss_mini_batch +=  loss.item()
    print('loss_mini_batch after add: ', loss_mini_batch)
    loss.backward()
    
    if (i+1) % accumulation_steps == 0:             # Wait for several backward steps
        optimizer.step()                            # Now we can do an optimizer step
        model.zero_grad()
        print('loss_mini_batch: ', loss_mini_batch)
        loss_mini_batch = 0                           # Reset gradients tensors
        print('******')
        
    running_loss += loss.item() * inputs.size(0)
    out = torch.argmax(outputs.detach(),dim=1)
    assert out.shape==labels.shape
    running_correct += (labels==out).sum().item()
Train_Acc = np.round(running_correct/len(trainset),decimals=4)
Train_Loss = np.round(running_loss/len(trainset),decimals=4)
print('Train Acc: ', Train_Acc, ',Train Loss: ', Train_Loss)
val_correct = 0
val_loss = 0
model.train(False)
with torch.no_grad():
    for inputs,labels in val_loader:
        if torch.cuda.is_available():
            inputs, labels = inputs.cuda(),labels.cuda()
        output = model(inputs)
        loss = criterion(output, labels)
        val_loss += loss.item()*inputs.size(0)
        scores, predictions = torch.max(output.data, 1) # only the whole batch
        val_correct += int(sum(predictions == labels))
Val_Acc = np.round(val_correct/len(valset),decimals=4)
Val_Loss = np.round(val_loss/len(valset),decimals=4)
print('Val Acc: ', Val_Acc, ',Val Loss: ', Val_Loss)
if Val_Acc>best_val_acc:
    best_val_acc = Val_Acc
    best_val_model = deepcopy(model.state_dict())
lr_schedule.step()

I get equal gradients (up to floating point precision) using your model and this example script for different number of accumulation steps:

import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)
    
    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output

model = Net()
N = 50
x = torch.randn(N, 1, 28, 28)
target = torch.randint(0, 10, (N,))
criterion = nn.NLLLoss()
model.eval() # disable dropout

# raw run
out = model(x)
loss = criterion(out, target)
loss.backward()
grads_ref = [param.grad.clone() for param in model.parameters()]

# gradient accumulation
model.zero_grad()
nb_steps = 5
step = int(x.size(0)/nb_steps)
for i in range(0, x.size(0), step):
    print(i, i+step)
    x_ = x[i:i+step]
    out = model(x)
    loss = criterion(out, target)
    loss = loss / nb_steps
    loss.mean().backward()

grads = [param.grad.clone() for param in model.parameters()]

# compare
for g1, g2 in zip(grads_ref, grads):
    print((g1 - g2).abs().max())

> tensor(1.9325e-08)
  tensor(1.5600e-08)
  tensor(2.1886e-08)
  tensor(3.5390e-08)
  tensor(4.6566e-09)
  tensor(1.8626e-09)
  tensor(1.6764e-08)
  tensor(7.4506e-09)

You could also check, if your code yields the same gradients to narrow down the difference between the different approaches.
Note that I had to disable dropout in order to compare the results. Otherwise the randomly dropped activations wouldn’t allow for the comparison.