Distributed Training slower than DataParallel

Hey @tyb_10, I tried a toy model locally but cannot reproduce this behavior. With 2 GPUs, the code below shows DP being about 9X slower than DDP. Can you try this code in your environment, or share a minimal repro of your code that I can run locally?

DP execution time (ms) by CUDA event: 2938.427490234375
DP execution time (s) by Python time: 2.9386751651763916
DDP rank-1 execution time (ms) by CUDA event: 326.289306640625
DDP rank-0 execution time (ms) by CUDA event: 326.19061279296875
DDP rank-1 execution time (s) by Python time: 0.3264338970184326
DDP rank-0 execution time (s) by Python time: 0.32636237144470215
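
(For reference, 2938.4 ms / 326.3 ms ≈ 9.0, which is where the "about 9X" figure above comes from.)
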
import os
import time

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn
import torch.optim as optim
from torch.nn.parallel import DataParallel as DP
from torch.nn.parallel import DistributedDataParallel as DDP

X = 100  # feature dimension of the toy linear layer
B = 200  # global batch size (split across the 2 GPUs in the DDP case)

def ddp_example(rank, world_size):
    # rendezvous settings for the default process group (single node; the port is arbitrary, any free port works)
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "29500")
    # create default process group
    dist.init_process_group("gloo", rank=rank, world_size=world_size)
    # each DDP process only handles its shard of the global batch
    b = B // world_size
    # create local model on this rank's GPU
    model = nn.Linear(X, X).to(rank)
    # construct DDP model
    ddp_model = DDP(model, device_ids=[rank])
    # define loss function and optimizer
    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

    with torch.cuda.device(rank):
        tik = time.time()
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        for _ in range(20):
            # clear gradients from the previous iteration
            optimizer.zero_grad()
            # forward pass
            outputs = ddp_model(torch.randn(b, X).to(rank))
            labels = torch.randn(b, X).to(rank)
            # backward pass
            loss_fn(outputs, labels).backward()
            # update parameters
            optimizer.step()
        end.record()
        # wait for queued GPU work to finish before reading either timer
        torch.cuda.synchronize()
        print(f"DDP rank-{rank} execution time (ms) by CUDA event: {start.elapsed_time(end)}")
        tok = time.time()
        print(f"DDP rank-{rank} execution time (s) by Python time: {tok - tik}")


def dp_example():
    b = B  # no need to shard the batch here; DataParallel scatters inputs across GPUs itself
    model = nn.Linear(X, X).to(0)
    # construct DP model
    dp_model = DP(model, device_ids=[0, 1])
    # define loss function and optimizer
    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(dp_model.parameters(), lr=0.001)

    tik = time.time()
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(20):
        # clear gradients from the previous iteration
        optimizer.zero_grad()
        # forward pass
        outputs = dp_model(torch.randn(b, X).to(0))
        labels = torch.randn(b, X).to(0)
        # backward pass
        loss_fn(outputs, labels).backward()
        # update parameters
        optimizer.step()
    end.record()
    # wait for queued GPU work to finish before reading either timer
    torch.cuda.synchronize()
    print(f"DP execution time (ms) by CUDA event: {start.elapsed_time(end)}")
    tok = time.time()
    print(f"DP execution time (s) by Python time: {tok - tik}")


def main():
    dp_example()

    world_size = 2
    mp.spawn(ddp_example,
        args=(world_size,),
        nprocs=world_size,
        join=True)

if __name__ == "__main__":
    main()
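
One more note: the repro above uses the gloo backend just to keep the example simple. For multi-GPU DDP, NCCL is the recommended backend and is usually noticeably faster. Here is a minimal sketch of how the process-group setup could look with NCCL; the helper name and the localhost/29500 rendezvous values are just placeholders for a single-node run, not anything specific to your setup:

import os

import torch.distributed as dist


def init_ddp_process_group(rank, world_size, backend="nccl"):
    # NCCL is the recommended backend for GPU collectives; gloo also works but is slower on GPUs
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "29500")  # any free port
    dist.init_process_group(backend, rank=rank, world_size=world_size)

In ddp_example you would then call init_ddp_process_group(rank, world_size) at the top instead of the gloo init and keep everything else the same.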

