How to set batch size correctly when using multi-GPU training?

Hi,
I have a question on how to set the batch size correctly when using DistributedDataParallel.
If I have N GPUs across which I’m training the model, and I set the batch size of the DataLoader to 16, would the effective batch size be 16 or 16 x N?

Here is a small worked example to make it clearer.

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler
import torch.multiprocessing as mp
import torch.distributed as dist
from argparse import ArgumentParser
import os


class MyModel(nn.Module):
    """Three-layer fully connected network ending in a softmax.

    The layers are ``fc1 -> ReLU -> fc2 -> ReLU -> fc3 -> softmax``.

    Args:
        input_dim: Number of input features fed to ``fc1``.
        inner_layer_1: Width of the first hidden layer.
        inner_layer_2: Width of the second hidden layer.
        output_dim: Number of outputs produced by ``fc3``.
    """

    def __init__(self, input_dim, inner_layer_1, inner_layer_2, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_dim, out_features=inner_layer_1)
        self.fc2 = nn.Linear(in_features=inner_layer_1, out_features=inner_layer_2)
        self.fc3 = nn.Linear(in_features=inner_layer_2, out_features=output_dim)

    def forward(self, x):
        """Return the softmax (taken over dimension 1) of the network output.

        Presumably ``x`` is a 2-D ``(batch, input_dim)`` tensor; the softmax
        over ``dim=1`` requires at least two dimensions.
        """
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        logits = self.fc3(hidden)
        return F.softmax(logits, dim=1)


def train(gpu_number, n_epochs, model, train_data, optimizer, loss_fn, log_interval=2, world_size=2):
    """Run one DistributedDataParallel training worker on a single GPU.

    Intended to be launched once per GPU (e.g. via ``mp.spawn``), with
    ``gpu_number`` serving as both the process rank and the CUDA device index.

    Args:
        gpu_number: Rank of this process and index of the GPU it uses.
        n_epochs: Number of passes over this rank's shard of ``train_data``.
        model: Module to train; it is moved to ``gpu_number`` and wrapped in DDP.
        train_data: Dataset to shard across ranks. Each batch is sliced as a
            2-D tensor: the first 8 columns are the inputs and the last 2
            columns are the targets.
        optimizer: Optimizer used to update the parameters.
            NOTE(review): it is constructed by the caller before the model is
            moved to the GPU; confirm it references this process's parameters.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar loss.
        log_interval: Accepted for interface compatibility; currently unused.
        world_size: Total number of participating processes (one per GPU).
    """
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29500'
    torch.distributed.init_process_group(
        backend='nccl',
        init_method='env://',
        world_size=world_size,  # total number of gpus
        rank=gpu_number,
    )
    try:
        # Pin this process to its own GPU so NCCL collectives (including the
        # final barrier) run on the right device instead of defaulting to GPU 0.
        torch.cuda.set_device(gpu_number)

        sampler = DistributedSampler(train_data, num_replicas=world_size, rank=gpu_number)
        # batch_size is per process: every rank draws 16 samples per step.
        trainloader = DataLoader(train_data, batch_size=16, sampler=sampler)

        model = model.cuda(gpu_number)
        model = DDP(model, device_ids=[gpu_number], output_device=gpu_number)
        for epoch in range(n_epochs):
            # Without this the sampler reuses the same shuffle order every epoch.
            sampler.set_epoch(epoch)
            for batch in trainloader:
                inputs = batch[:, :8].cuda(gpu_number)
                labels = batch[:, -2:].cuda(gpu_number)
                optimizer.zero_grad()
                # Call the module rather than .forward() so hooks are honoured.
                outputs = model(inputs)

                loss = loss_fn(outputs, labels)
                loss.backward()
                optimizer.step()
        dist.barrier()
    finally:
        # Release the process group's resources even if training raised.
        dist.destroy_process_group()

if __name__ == "__main__":
    # Synthetic dataset: 30000 rows of 100 uniform random values each.
    train_data = torch.rand(30000, 100)

    # Hyper-parameters.
    learning_rate = 0.001
    n_epochs = 4

    # Model, optimizer and loss are built once here and handed to every worker.
    model = MyModel(input_dim=8, inner_layer_1=800, inner_layer_2=300, output_dim=2)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    loss_fn = nn.MSELoss()

    # mp.spawn calls train(i, *args) with i being the process index, so the
    # tuple below supplies every argument after gpu_number.
    worker_args = (n_epochs, model, train_data, optimizer, loss_fn, 2)
    mp.spawn(train, args=worker_args, nprocs=2)

In this line, `trainloader = DataLoader(train_data, batch_size=16, sampler=sampler)`, I set the batch size to 16, but I have two GPUs. What would be the equivalent / effective batch size? Would it be 16 or 32 in this case?

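The effective batch size is 16 × N. The 16 is only the per-GPU (per-process) batch size. During `loss.backward()`, DDP performs an all-reduce that averages the gradients across all GPUs, so each optimizer step uses gradients computed from 16 × N samples. In your example with two GPUs, the effective batch size is 32.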

1 Like