Model trained with DistributedDataParallel doesn't perform as well as on a single GPU

I am working with DistributedDataParallel, trying to speed up the training process. However, after two epochs, the distributed version did not perform as well as the single-GPU version.

The log of the distributed version:

Epoch [1/2], Step [100/150], Loss: 2.1133
Epoch [2/2], Step [100/150], Loss: 1.9204
Training complete in: 0:00:27.426653
Dev loss: 1.8674346208572388

The log of the single-GPU version:

Epoch [1/2], Step [100/600], Loss: 2.1626
Epoch [1/2], Step [200/600], Loss: 1.9929
Epoch [1/2], Step [300/600], Loss: 1.9224
Epoch [1/2], Step [400/600], Loss: 1.7479
Epoch [1/2], Step [500/600], Loss: 1.6264
Epoch [1/2], Step [600/600], Loss: 1.5411
Epoch [2/2], Step [100/600], Loss: 1.4387
Epoch [2/2], Step [200/600], Loss: 1.3243
Epoch [2/2], Step [300/600], Loss: 1.2894
Epoch [2/2], Step [400/600], Loss: 1.1754
Epoch [2/2], Step [500/600], Loss: 1.1271
Epoch [2/2], Step [600/600], Loss: 1.1246
Training complete in: 0:00:53.779830
Dev loss: 1.1193695068359375

The source code

The distributed version:

import os
from datetime import datetime
import argparse
import torch.multiprocessing as mp
import torchvision
import torchvision.transforms as transforms
import torch
import torch.nn as nn
import torch.distributed as dist

class ConvNet(nn.Module):
    def __init__(self, num_classes=10):
        super(ConvNet, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.fc = nn.Linear(7*7*32, num_classes)

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        return out

def train(gpu, args):
    rank = args.nr * args.gpus + gpu
    dist.init_process_group(backend='nccl', init_method='env://', world_size=args.world_size, rank=rank)
    torch.manual_seed(0)
    model = ConvNet()
    torch.cuda.set_device(gpu)
    model.cuda(gpu)
    batch_size = 100
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(gpu)
    optimizer = torch.optim.SGD(model.parameters(), 1e-4)
    # Wrap the model
    model = nn.parallel.DistributedDataParallel(model, device_ids=[gpu])
    # Data loading code
    train_dataset = torchvision.datasets.MNIST(root='./data',
                                               train=True,
                                               transform=transforms.ToTensor(),
                                               download=True)
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset,
                                                                    num_replicas=args.world_size,
                                                                    rank=rank)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=False,
                                               num_workers=0,
                                               pin_memory=True,
                                               sampler=train_sampler)

    start = datetime.now()
    total_step = len(train_loader)
    for epoch in range(args.epochs):
        train_sampler.set_epoch(epoch)
        for i, (images, labels) in enumerate(train_loader):
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)
            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if (i + 1) % 100 == 0 and gpu == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(epoch + 1, args.epochs, i + 1, total_step,
                                                                         loss.item()))
    if gpu == 0:
        print("Training complete in: " + str(datetime.now() - start))
        dev_dataset = torchvision.datasets.MNIST(root='./data',
                                                 train=False,
                                                 transform=transforms.ToTensor(),
                                                 download=False)
        dev_loader = torch.utils.data.DataLoader(dataset=dev_dataset,
                                                 batch_size=batch_size,
                                                 shuffle=True,
                                                 num_workers=0,
                                                 pin_memory=True)
        _ = model.eval()
        with torch.no_grad():
            y_hat = []
            y = []
            for i, (images, labels) in enumerate(dev_loader):
                y.append(labels.cuda(non_blocking=True))
                y_hat.append(model(images.cuda(non_blocking=True)))
            y_hat = torch.cat(y_hat)
            y = torch.cat(y)
            loss = criterion(y_hat, y)
            print(f'Dev loss: {loss.item()}')

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--nodes', default=1, type=int, metavar='N')
    parser.add_argument('-g', '--gpus', default=1, type=int,
                        help='number of gpus per node')
    parser.add_argument('-nr', '--nr', default=0, type=int,
                        help='ranking within the nodes')
    parser.add_argument('--epochs', default=2, type=int, metavar='N',
                        help='number of total epochs to run')
    args = parser.parse_args()
    ###################################################
    args.world_size = args.gpus * args.nodes
    os.environ['MASTER_ADDR'] = HOST  # placeholder: address of the rank-0 node
    os.environ['MASTER_PORT'] = PORT  # placeholder: a free port on the rank-0 node
    mp.spawn(train, nprocs=args.gpus, args=(args,))
    ###################################################

if __name__ == '__main__':
    main()

The single-GPU version:

import os
from datetime import datetime
import argparse
import torch.multiprocessing as mp
import torchvision
import torchvision.transforms as transforms
import torch
import torch.nn as nn
import torch.distributed as dist

class ConvNet(nn.Module):
    def __init__(self, num_classes=10):
        super(ConvNet, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.fc = nn.Linear(7*7*32, num_classes)

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        return out

def train(gpu, args):
    torch.manual_seed(0)
    model = ConvNet()
    torch.cuda.set_device(gpu)
    model.cuda(gpu)
    batch_size = 100
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(gpu)
    optimizer = torch.optim.SGD(model.parameters(), 1e-4)
    # Data loading code
    train_dataset = torchvision.datasets.MNIST(root='./data',
                                               train=True,
                                               transform=transforms.ToTensor(),
                                               download=True)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=True,
                                               num_workers=0,
                                               pin_memory=True)

    start = datetime.now()
    total_step = len(train_loader)
    for epoch in range(args.epochs):
        for i, (images, labels) in enumerate(train_loader):
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)
            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if (i + 1) % 100 == 0 and gpu == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                    epoch + 1,
                    args.epochs,
                    i + 1,
                    total_step,
                    loss.item())
                   )
    if gpu == 0:
        print("Training complete in: " + str(datetime.now() - start))
        dev_dataset = torchvision.datasets.MNIST(root='./data',
                                                 train=False,
                                                 transform=transforms.ToTensor(),
                                                 download=False)
        dev_loader = torch.utils.data.DataLoader(dataset=dev_dataset,
                                                 batch_size=batch_size,
                                                 shuffle=True,
                                                 num_workers=0,
                                                 pin_memory=True)
        _ = model.eval()
        with torch.no_grad():
            y_hat = []
            y = []
            for i, (images, labels) in enumerate(dev_loader):
                y.append(labels.cuda(non_blocking=True))
                y_hat.append(model(images.cuda(non_blocking=True)))
            y_hat = torch.cat(y_hat)
            y = torch.cat(y)
            loss = criterion(y_hat, y)
            print(f'Dev loss: {loss.item()}')

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--nodes', default=1, type=int, metavar='N')
    parser.add_argument('-g', '--gpus', default=1, type=int,
                        help='number of gpus per node')
    parser.add_argument('-nr', '--nr', default=0, type=int,
                        help='ranking within the nodes')
    parser.add_argument('--epochs', default=2, type=int, metavar='N',
                        help='number of total epochs to run')
    args = parser.parse_args()
    train(0, args)

if __name__ == '__main__':
    main()

Any help is appreciated.

Oops, I just found out that it's a learning-rate issue.

When I set the learning rate to 4e-4, four times the rate used on the single GPU, I get:

Epoch [1/2], Step [100/150], Loss: 1.7276
Epoch [2/2], Step [100/150], Loss: 1.2062
Training complete in: 0:00:18.275619
Dev loss: 1.1129298210144043
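
For reference, the numbers work out like this (a rough sketch; the factor of 4 assumes one process per GPU, as implied by 150 vs. 600 steps per epoch):

# Single-GPU run: 60000 MNIST images / batch_size 100 = 600 steps per epoch.
# Distributed run: 150 steps per epoch, so world_size = 600 / 150 = 4 processes.
# With a per-process batch_size of 100, each optimizer step now consumes
# 4 * 100 = 400 samples, so scaling the learning rate 1e-4 -> 4e-4 roughly
# matches the per-sample update magnitude of the single-GPU run.
world_size = 600 // 150              # 4
effective_batch = world_size * 100   # 400
scaled_lr = 1e-4 * world_size        # 4e-4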

Right, learning rate, batch size, and the loss function can all play a role here. For example, if you keep the same per-process batch size, the DDP gang collectively consumes more samples per optimizer step as the world size grows, so the learning rate needs to be adjusted accordingly.
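
A minimal sketch of the usual adjustment (the linear scaling rule; the `base_lr` value, `world_size`, and the stand-in model below are hypothetical, not something DDP does for you):

import torch
import torch.nn as nn

# Hypothetical values for illustration.
base_lr = 1e-4       # learning rate tuned for the single-GPU run
world_size = 4       # number of DDP processes (one per GPU)

# DDP averages gradients across replicas, so with the same per-process batch
# size each optimizer step effectively consumes world_size times as many
# samples. The linear scaling rule compensates by scaling the learning rate.
scaled_lr = base_lr * world_size     # 4e-4

model = nn.Linear(784, 10)           # stand-in for ConvNet()
optimizer = torch.optim.SGD(model.parameters(), lr=scaled_lr)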

Can you explain a bit further, please? Why would we need to change the learning rate on multiple GPUs? If we keep the same per-process batch size, isn't the only difference that four machines are simultaneously processing batches of the same size as on a single GPU?