# Gradient Accumulation - lower accuracy as accumulation steps increases

Hello,
I’m using gradient accumulation on a toy problem (MNIST), and it seems to work well, except that the accuracy drops by a few percent as I increase the number of accumulation steps beyond 1.
The training-set size is divisible by the batch size, so I don’t expect a partial (last) “mini-batch” to affect the results.
For example, with a training batch size of 5000 and accumulation_steps=1 (the regular case), I get higher accuracy than with a training batch size of 1000 and accumulation_steps=5. This pattern is repeatable, not due to randomness.

Any idea why I get lower accuracy as I increase the accumulation_steps?
(I can’t fix the indentation, so sorry for that)

import torch
# Print the installed PyTorch version string. `torch.version` is a submodule,
# so printing it would show a module repr rather than the version number.
print(torch.__version__)
from copy import deepcopy
from collections import OrderedDict
import gc
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import random_split
import torchvision
from torchvision import transforms,models
import torchvision.datasets as datasets

# Preprocessing pipeline: convert each image to a tensor, then normalise its
# single channel with the given mean and standard deviation.
mnist_transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.1307,), std=(0.3081,)),
    ]
)

# NOTE(review): `traindata` is not defined anywhere in this snippet; presumably
# it is the 60,000-image MNIST training set built with `mnist_transform`
# (50000 + 10000 = 60000) -- confirm before running.
# Randomly split it into 50,000 training and 10,000 validation samples.
trainset, valset = random_split(traindata, lengths=[50000, 10000])

class Net(nn.Module):
    """Small CNN classifier: two 3x3 convolutions, a 2x2 max-pool, two linear layers.

    The first linear layer expects 9216 features, which is what a 1x28x28
    input produces after the convolutions and pooling (64 * 12 * 12).
    ``forward`` returns log-probabilities over 10 classes.
    """

    def __init__(self):
        # The constructor must be spelled ``__init__`` (with the double
        # underscores); otherwise it is never called and no layers exist.
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images ``x``."""
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        # Keep the batch dimension, flatten everything else for the linear layers.
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output

# Run on the first CUDA device when one is available, otherwise on the CPU.
# (The string literals need plain ASCII quotes; typographic quotes are a syntax error.)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('device: ', device)
model = Net().to(device)

# NOTE(review): `criterion` is used by the training loop but was not defined in
# the original snippet. The model returns log-probabilities (log_softmax), so
# negative log-likelihood with the default mean reduction is assumed -- confirm.
criterion = nn.NLLLoss()

# `SGD` and `lr_scheduler` are never imported by name in this file, so reach
# them through the `torch.optim` package, which `import torch` makes available.
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
# step_size=1, gamma=0.9: the learning rate is multiplied by 0.9 on every
# scheduler step.
lr_schedule = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)
states = {}

# Best validation accuracy seen so far and the matching model weights.
best_val_acc = 0
best_val_model = None
# Sum of the scaled losses inside the current accumulation window.
loss_mini_batch = 0
# Number of mini-batches whose gradients are summed before each optimizer step.
accumulation_steps = 1

# Train for 4 epochs with gradient accumulation, validate after each epoch,
# keep a copy of the best weights, then decay the learning rate.
for epoch in range(4):
    print('Epoch: ', epoch)
    model.train(True)
    running_loss = 0.0
    running_correct = 0
    # Start every epoch from clean gradients so nothing left over from an
    # incomplete accumulation window leaks into the first update.
    optimizer.zero_grad()

    for i, data in enumerate(train_loader, 0):
        print('i: ', i)
        inputs, labels = data
        if torch.cuda.is_available():
            inputs, labels = inputs.cuda(), labels.cuda()
        outputs = model(inputs)
        batch_loss = criterion(outputs, labels)
        print('loss before division (direct): ', batch_loss.item())
        # Normalize the loss (if averaged): with a mean-reduced criterion and
        # equally sized mini-batches, summing `accumulation_steps` scaled
        # gradients matches the gradient of one large batch.
        loss = batch_loss / accumulation_steps
        print('loss after division: ', loss)
        loss_mini_batch += loss.item()
        loss.backward()  # gradients are added to the existing .grad buffers

        if (i + 1) % accumulation_steps == 0:   # wait for several backward steps
            optimizer.step()                    # now we can do an optimizer step
            # Reset the gradient tensors; without this every later update
            # would still contain the gradients of all earlier windows.
            optimizer.zero_grad()
            print('loss_mini_batch: ', loss_mini_batch)
            loss_mini_batch = 0
            print('******')

        # Use the undivided loss so the reported training loss does not
        # shrink by a factor of `accumulation_steps`.
        running_loss += batch_loss.item() * inputs.size(0)
        out = torch.argmax(outputs.detach(), dim=1)
        assert out.shape == labels.shape
        running_correct += (labels == out).sum().item()

    Train_Acc = np.round(running_correct / len(trainset), decimals=4)
    Train_Loss = np.round(running_loss / len(trainset), decimals=4)
    print('Train Acc: ', Train_Acc, ',Train Loss: ', Train_Loss)

    val_correct = 0
    val_loss = 0
    model.train(False)
    # NOTE(review): the pasted snippet had no loop header here, so it scored
    # the last *training* batch while dividing by len(valset). `val_loader` is
    # assumed to be a DataLoader over `valset`, defined next to `train_loader`
    # -- confirm.
    with torch.no_grad():  # no gradients are needed for evaluation
        for inputs, labels in val_loader:
            if torch.cuda.is_available():
                inputs, labels = inputs.cuda(), labels.cuda()
            output = model(inputs)
            loss = criterion(output, labels)
            val_loss += loss.item() * inputs.size(0)
            predictions = torch.argmax(output, dim=1)
            val_correct += (predictions == labels).sum().item()

    Val_Acc = np.round(val_correct / len(valset), decimals=4)
    Val_Loss = np.round(val_loss / len(valset), decimals=4)
    print('Val Acc: ', Val_Acc, ',Val Loss: ', Val_Loss)

    # Snapshot the weights whenever validation accuracy improves.
    if Val_Acc > best_val_acc:
        best_val_acc = Val_Acc
        best_val_model = deepcopy(model.state_dict())

    lr_schedule.step()

I get equal gradients (up to floating point precision) using your model and this example script for different numbers of accumulation steps:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """Small CNN classifier: two 3x3 convolutions, a 2x2 max-pool, two linear layers.

    The first linear layer expects 9216 features, which is what a 1x28x28
    input produces after the convolutions and pooling (64 * 12 * 12).
    ``forward`` returns log-probabilities over 10 classes.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images ``x``."""
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        # Keep the batch dimension, flatten everything else for the linear layers.
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output

# Compare the gradients of one full-batch backward pass with the gradients
# accumulated over `nb_steps` smaller chunks of the same batch.
model = Net()
N = 50
x = torch.randn(N, 1, 28, 28)
target = torch.randint(0, 10, (N,))
criterion = nn.NLLLoss()
model.eval()  # disable dropout so both runs see the same activations

# raw run: a single backward pass over the whole batch
out = model(x)
loss = criterion(out, target)
loss.backward()
# Save the reference gradients, then clear them so the second run starts
# from zero instead of adding to the first run's gradients.
grads_full = [p.grad.clone() for p in model.parameters()]
model.zero_grad()

# accumulated run: nb_steps chunks of equal size (N must be divisible by
# nb_steps for the 1/nb_steps scaling to reproduce the full-batch mean)
nb_steps = 5
step = x.size(0) // nb_steps
for i in range(0, x.size(0), step):
    print(i, i+step)
    # Feed only the current chunk and its matching targets.
    x_ = x[i:i+step]
    target_ = target[i:i+step]
    out = model(x_)
    loss = criterion(out, target_)
    # Each chunk loss is a mean over `step` samples; dividing by nb_steps
    # makes the summed gradients equal the mean over all N samples.
    loss = loss / nb_steps
    loss.backward()
grads_accum = [p.grad.clone() for p in model.parameters()]

# compare: largest absolute difference for each parameter tensor
for g1, g2 in zip(grads_full, grads_accum):
    print((g1 - g2).abs().max())

> tensor(1.9325e-08)
tensor(1.5600e-08)
tensor(2.1886e-08)
tensor(3.5390e-08)
tensor(4.6566e-09)
tensor(1.8626e-09)
tensor(1.6764e-08)
tensor(7.4506e-09)
``````

You could also check whether your code yields the same gradients, to narrow down the difference between the different approaches.
Note that I had to disable dropout in order to compare the results. Otherwise the randomly dropped activations wouldn’t allow for the comparison.