DDP (with gloo): All processes take extra memory on GPU 0

Hi! I’m running a minimal DDP example (adapted from examples/distributed/ddp/main.py in pytorch/examples on GitHub; code below).

import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn
import torch.optim as optim

from torch.nn.parallel import DistributedDataParallel as DDP


def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'

    # pin this process to its own GPU before creating the process group
    torch.cuda.set_device(f"cuda:{rank}")
    torch.cuda.empty_cache()

    # initialize the process group
    dist.init_process_group("gloo", rank=rank, world_size=world_size)


def cleanup():
    dist.destroy_process_group()


class ToyModel(nn.Module):
    def __init__(self):
        super(ToyModel, self).__init__()
        self.net1 = nn.Linear(10, 10)
        self.relu = nn.ReLU()
        self.net2 = nn.Linear(10, 5)

    def forward(self, x):
        return self.net2(self.relu(self.net1(x)))


def demo_basic(rank, world_size):
    print(f"Running basic DDP example on rank {rank}.")
    setup(rank, world_size)

    # create model and move it to GPU with id rank
    model = ToyModel().to(rank)
    ddp_model = DDP(model, device_ids=[rank])

    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

    optimizer.zero_grad()
    outputs = ddp_model(torch.randn(20, 10))
    labels = torch.randn(20, 5).to(rank)
    loss_fn(outputs, labels).backward()
    optimizer.step()

    cleanup()


def run_demo(demo_fn, world_size):
    mp.spawn(demo_fn,
             args=(world_size,),
             nprocs=world_size,
             join=True)


if __name__ == "__main__":
    run_demo(demo_basic, 2)

When running with the gloo backend, some extra memory gets allocated on GPU 0: the process for rank 1 shows up on GPU 0 with an extra ~146MiB on top of the ~170MiB it uses on its own GPU. If I switch the backend to NCCL, this extra allocation does not happen (the nvidia-smi output below is for gloo):

$ nvidia-smi
Sat Mar  9 19:26:56 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA TITAN Xp                On  | 00000000:03:00.0 Off |                  N/A |
| 23%   35C    P2              61W / 250W |    320MiB / 12288MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA TITAN Xp                On  | 00000000:81:00.0 Off |                  N/A |
| 23%   30C    P2              59W / 250W |    174MiB / 12288MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+

+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
|    0   N/A  N/A    656420      C   ...r/miniconda3/envs/ddp/bin/python      170MiB |
|    0   N/A  N/A    656421      C   ...r/miniconda3/envs/ddp/bin/python      146MiB |
|    1   N/A  N/A    656421      C   ...r/miniconda3/envs/ddp/bin/python      170MiB |
+---------------------------------------------------------------------------------------+
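
To narrow down whether that extra memory is tensors or just a CUDA context, one check that can help (a sketch; log_gpu_memory is a hypothetical helper, not part of the example above) is printing the caching-allocator stats from inside each rank, e.g. right after optimizer.step() in demo_basic:

def log_gpu_memory(rank):
    # Report caching-allocator usage for every visible device. These counters
    # only cover tensor allocations made by this process; driver/context
    # overhead that nvidia-smi reports will not show up here.
    for d in range(torch.cuda.device_count()):
        allocated = torch.cuda.memory_allocated(d) / 2**20  # MiB of live tensors
        reserved = torch.cuda.memory_reserved(d) / 2**20    # MiB held by the caching allocator
        print(f"rank {rank} cuda:{d}: allocated={allocated:.1f}MiB reserved={reserved:.1f}MiB")

If rank 1 reports roughly zero allocated/reserved memory on cuda:0 while nvidia-smi still charges it ~146MiB there, the extra usage is presumably context/driver overhead rather than tensors allocated by the script.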

I’ve seen a few posts about how torch.cuda.set_device fixes this kind of issue with NCCL (and I’m already calling it in setup). How does gloo differ here, and why is the extra memory allocated on GPU 0 in this case? This is with PyTorch 2.2.1.
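
One isolation check worth trying (a sketch only, under the assumption that nothing touches CUDA before the environment variable is set; demo_basic_isolated is hypothetical and reuses ToyModel from above): hide every GPU except the rank’s own via CUDA_VISIBLE_DEVICES at the top of the spawned worker, so nothing in the gloo/DDP path can even see GPU 0 from rank 1:

def demo_basic_isolated(rank, world_size):
    # Must run before any CUDA call in this child process.
    os.environ["CUDA_VISIBLE_DEVICES"] = str(rank)
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "12355"
    dist.init_process_group("gloo", rank=rank, world_size=world_size)

    device = torch.device("cuda:0")  # the only GPU this process can see now
    model = ToyModel().to(device)
    ddp_model = DDP(model, device_ids=[0])

    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
    optimizer.zero_grad()
    outputs = ddp_model(torch.randn(20, 10))
    loss_fn(outputs, torch.randn(20, 5).to(device)).backward()
    optimizer.step()

    dist.destroy_process_group()

If the 146MiB entry for the second process then disappears from GPU 0, that would point at a CUDA context being created on device 0 somewhere along the gloo path rather than at actual tensor allocations made by the script.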

Thanks in advance!

Getting the same problem here…