Hi! I’m running a minimal DDP example (adapted from examples/distributed/ddp/main.py at main · pytorch/examples · GitHub; code provided below).
import os
import tempfile
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP

def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    torch.cuda.set_device(f"cuda:{rank}")
    torch.cuda.empty_cache()
    # initialize the process group
    dist.init_process_group("gloo", rank=rank, world_size=world_size)

def cleanup():
    dist.destroy_process_group()

class ToyModel(nn.Module):
    def __init__(self):
        super(ToyModel, self).__init__()
        self.net1 = nn.Linear(10, 10)
        self.relu = nn.ReLU()
        self.net2 = nn.Linear(10, 5)

    def forward(self, x):
        return self.net2(self.relu(self.net1(x)))

def demo_basic(rank, world_size):
    print(f"Running basic DDP example on rank {rank}.")
    setup(rank, world_size)

    # create model and move it to GPU with id rank
    model = ToyModel().to(rank)
    ddp_model = DDP(model, device_ids=[rank])

    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

    optimizer.zero_grad()
    outputs = ddp_model(torch.randn(20, 10))
    labels = torch.randn(20, 5).to(rank)
    loss_fn(outputs, labels).backward()
    optimizer.step()

    cleanup()

def run_demo(demo_fn, world_size):
    mp.spawn(demo_fn,
             args=(world_size,),
             nprocs=world_size,
             join=True)

if __name__ == "__main__":
    run_demo(demo_basic, 2)
When running with the gloo backend, it seems like some extra memory is allocated on GPU 0. If I switch the backend to nccl, this extra memory is not allocated (the nvidia-smi output below is for gloo):
$ nvidia-smi
Sat Mar 9 19:26:56 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03 Driver Version: 535.129.03 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA TITAN Xp On | 00000000:03:00.0 Off | N/A |
| 23% 35C P2 61W / 250W | 320MiB / 12288MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
| 1 NVIDIA TITAN Xp On | 00000000:81:00.0 Off | N/A |
| 23% 30C P2 59W / 250W | 174MiB / 12288MiB | 0% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
+---------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=======================================================================================|
| 0 N/A N/A 656420 C ...r/miniconda3/envs/ddp/bin/python 170MiB |
| 0 N/A N/A 656421 C ...r/miniconda3/envs/ddp/bin/python 146MiB |
| 1 N/A N/A 656421 C ...r/miniconda3/envs/ddp/bin/python 170MiB |
+---------------------------------------------------------------------------------------+
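For reference, here is a small debugging snippet I was thinking of dropping into demo_basic right after optimizer.step(), just to see what each process reports per device. Note that torch.cuda.memory_allocated / memory_reserved only count caching-allocator memory, so they wouldn't include the CUDA context overhead that nvidia-smi shows:

    # debugging only: print per-device caching-allocator stats from each rank
    for d in range(torch.cuda.device_count()):
        print(f"rank {rank}: cuda:{d} "
              f"allocated={torch.cuda.memory_allocated(d)} "
              f"reserved={torch.cuda.memory_reserved(d)}")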
I’ve seen a few posts about how torch.cuda.set_device fixes this issue with NCCL. How might gloo differ in this setting, and why is the extra memory allocated in this case? This is with PyTorch 2.2.1.
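In case it's relevant: one workaround I was considering, but haven't verified, is hiding the other GPUs from each spawned process before any CUDA call, so a process can't touch GPU 0 at all. The setup_isolated name below is just mine, and the rest of demo_basic would then need to use local device 0 instead of rank:

    # untested sketch: restrict each spawned process to a single visible GPU
    def setup_isolated(rank, world_size):
        os.environ['MASTER_ADDR'] = 'localhost'
        os.environ['MASTER_PORT'] = '12355'
        # must be set before CUDA is initialized in this process
        os.environ['CUDA_VISIBLE_DEVICES'] = str(rank)
        torch.cuda.set_device(0)  # the only visible device in this process
        dist.init_process_group("gloo", rank=rank, world_size=world_size)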
Thanks in advance!