Hello,
I’m doing a gradient accumulation on a toy problem (MNIST) and it seems like the gradient accumulation works well, except for getting a lower accuracy by a few percents as I increase the accumulation steps beyond 1.
The train-set’s size is divisible by the batch’s size, so I don’t expect a partial (last ) “mini-batch” to affect on the results.
For example, when the train batch size is set to 5000 while the accumulation steps=1 (regular) I get a higher accuracy in comparison to setting the training batch size to 1000 and increase the accumulation steps to 5. That’s a repeatable pattern, not due to stochastics.
Any idea why I get lower accuracy as I increase the accumulation_steps?
(I can’t fix the identation, so sorry for that)
import torch
print(torch.version)
from copy import deepcopy
from collections import OrderedDict
import gc
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import SGD,Adam,lr_scheduler
from torch.utils.data import random_split
import torchvision
from torchvision import transforms,models
import torchvision.datasets as datasetsmnist_transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])traindata = datasets.MNIST(root=‘.’, train=True,download=True, transform=mnist_transform)
trainset,valset = random_split(traindata,[50000,10000])
train_loader = torch.utils.data.DataLoader(trainset, batch_size=1000,shuffle=True)
val_loader = torch.utils.data.DataLoader(valset, batch_size=1000,shuffle=False, num_workers=2)
testset = datasets.MNIST(root=‘.’, train=False,download=True, transform=mnist_transform)
test_loader = torch.utils.data.DataLoader(testset, batch_size=256,shuffle=False, num_workers=2, drop_last=False)
class Net(nn.Module):
def init(self):
super(Net, self).init()
self.conv1 = nn.Conv2d(1, 32, 3, 1)
self.conv2 = nn.Conv2d(32, 64, 3, 1)
self.dropout1 = nn.Dropout(0.25)
self.dropout2 = nn.Dropout(0.5)
self.fc1 = nn.Linear(9216, 128)
self.fc2 = nn.Linear(128, 10)def forward(self, x): x = self.conv1(x) x = F.relu(x) x = self.conv2(x) x = F.relu(x) x = F.max_pool2d(x, 2) x = self.dropout1(x) x = torch.flatten(x, 1) x = self.fc1(x) x = F.relu(x) x = self.dropout2(x) x = self.fc2(x) output = F.log_softmax(x, dim=1) return output
device = torch.device(“cuda:0” if torch.cuda.is_available() else “cpu”)
print('device: ', device)
model = Net().to(device)optimizer = SGD(model.parameters(), lr=0.1, momentum=0.9)
lr_schedule = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)
states = {}best_val_acc = 0
best_val_model = None
loss_mini_batch = 0
accumulation_steps = 1
for epoch in range(4):
print('Epoch: ', epoch)
model.train(True)
running_loss = 0.0
running_correct = 0for i, data in enumerate(train_loader, 0):
print('i: ', i) inputs, labels = data if torch.cuda.is_available(): inputs, labels = inputs.cuda(),labels.cuda() optimizer.zero_grad() outputs = model(inputs) loss = criterion(outputs, labels) print('loss before division (direct): ', loss.item()) loss = loss / accumulation_steps ### Normalize our loss (if averaged) print('loss after division: ', loss) print('loss_mini_batch before add: ', loss_mini_batch) loss_mini_batch += loss.item() print('loss_mini_batch after add: ', loss_mini_batch) loss.backward() if (i+1) % accumulation_steps == 0: # Wait for several backward steps optimizer.step() # Now we can do an optimizer step model.zero_grad() print('loss_mini_batch: ', loss_mini_batch) loss_mini_batch = 0 # Reset gradients tensors print('******') running_loss += loss.item() * inputs.size(0) out = torch.argmax(outputs.detach(),dim=1) assert out.shape==labels.shape running_correct += (labels==out).sum().item() Train_Acc = np.round(running_correct/len(trainset),decimals=4) Train_Loss = np.round(running_loss/len(trainset),decimals=4) print('Train Acc: ', Train_Acc, ',Train Loss: ', Train_Loss) val_correct = 0 val_loss = 0 model.train(False) with torch.no_grad(): for inputs,labels in val_loader: if torch.cuda.is_available(): inputs, labels = inputs.cuda(),labels.cuda() output = model(inputs) loss = criterion(output, labels) val_loss += loss.item()*inputs.size(0) scores, predictions = torch.max(output.data, 1) # only the whole batch val_correct += int(sum(predictions == labels)) Val_Acc = np.round(val_correct/len(valset),decimals=4) Val_Loss = np.round(val_loss/len(valset),decimals=4) print('Val Acc: ', Val_Acc, ',Val Loss: ', Val_Loss) if Val_Acc>best_val_acc: best_val_acc = Val_Acc best_val_model = deepcopy(model.state_dict()) lr_schedule.step()