I am using an a2-megagpu-16g instance on GCP, which has 16 A100 GPUs. My environment:
CUDA 11.1
NCCL 2.8.4
PyTorch 1.8.0 (installed via pip)
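Note that the pip wheel appears to bundle its own NCCL: the traceback below reports NCCL 2.7.8, not the system's 2.8.4. The versions PyTorch itself sees can be printed with a quick check like this (the exact return format of `torch.cuda.nccl.version()` varies across PyTorch releases):

```python
import torch

print(torch.__version__)          # e.g. 1.8.0
print(torch.version.cuda)         # CUDA version the wheel was built against
print(torch.cuda.nccl.version())  # NCCL version bundled with the wheel
```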
I am testing DDP following the "Getting Started with Distributed Data Parallel" tutorial (PyTorch Tutorials 1.9.0+cu102 documentation). With the "gloo" backend (i.e., `dist.init_process_group("gloo", ...)`) the example runs fine, but with "nccl" it fails:
```
Running basic DDP example on rank 0.
Running basic DDP example on rank 1.
Traceback (most recent call last):
  File "quick_tutorial3.py", line 66, in <module>
    run_demo(demo_basic, 2)
  File "quick_tutorial3.py", line 57, in run_demo
    join=True)
  File "/home/wonkyum/venv/espnet/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 230, in spawn
    return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
  File "/home/wonkyum/venv/espnet/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 188, in start_processes
    while not context.join():
  File "/home/wonkyum/venv/espnet/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 150, in join
    raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:

-- Process 0 terminated with the following error:
Traceback (most recent call last):
  File "/home/wonkyum/venv/espnet/lib/python3.6/site-packages/torch/multiprocessing/spawn.py", line 59, in _wrap
    fn(i, *args)
  File "/home/wonkyum/quick_tutorial3.py", line 39, in demo_basic
    ddp_model = DDP(model, device_ids=[rank])
  File "/home/wonkyum/venv/espnet/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 446, in __init__
    self._sync_params_and_buffers(authoritative_rank=0)
  File "/home/wonkyum/venv/espnet/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 460, in _sync_params_and_buffers
    authoritative_rank)
  File "/home/wonkyum/venv/espnet/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 1156, in _distributed_broadcast_coalesced
    self.process_group, tensors, buffer_size, authoritative_rank
RuntimeError: NCCL error in: /pytorch/torch/lib/c10d/ProcessGroupNCCL.cpp:825, unhandled cuda error, NCCL version 2.7.8
ncclUnhandledCudaError: Call to CUDA function failed.
```
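For reference, NCCL's own debug logging can surface which CUDA call is failing; a minimal sketch of enabling it before any process group is created (`NCCL_DEBUG` and `NCCL_DEBUG_SUBSYS` are standard NCCL environment variables):

```python
import os

# Enable NCCL's logging before spawning, so every rank prints its
# transport/ring setup and the failing CUDA call.
os.environ["NCCL_DEBUG"] = "INFO"
os.environ["NCCL_DEBUG_SUBSYS"] = "ALL"  # optional: even more verbose output
```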
My code is as follows:
```python
import os
import sys
import tempfile

import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP


def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'

    # initialize the process group
    dist.init_process_group("nccl", rank=rank, world_size=world_size)


def cleanup():
    dist.destroy_process_group()


class ToyModel(nn.Module):
    def __init__(self):
        super(ToyModel, self).__init__()
        self.net1 = nn.Linear(10, 10)
        self.relu = nn.ReLU()
        self.net2 = nn.Linear(10, 5)

    def forward(self, x):
        return self.net2(self.relu(self.net1(x)))


def demo_basic(rank, world_size):
    print(f"Running basic DDP example on rank {rank}.")
    setup(rank, world_size)

    # create model and move it to GPU with id rank
    model = ToyModel().to(rank)
    ddp_model = DDP(model, device_ids=[rank])

    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

    optimizer.zero_grad()
    outputs = ddp_model(torch.randn(20, 10))
    labels = torch.randn(20, 5).to(rank)
    loss_fn(outputs, labels).backward()
    optimizer.step()

    cleanup()


def run_demo(demo_fn, world_size):
    mp.spawn(demo_fn,
             args=(world_size,),
             nprocs=world_size,
             join=True)
if __name__ == "__main__":
    n_gpus = torch.cuda.device_count()
    # world_size below is 2, so the check only needs 2 GPUs
    if n_gpus < 2:
        print(f"Requires at least 2 GPUs to run, but got {n_gpus}.")
    else:
        run_demo(demo_basic, 2)
```
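To help isolate whether the failure is in NCCL itself rather than in DDP, a minimal all_reduce smoke test could be run with the same helpers (a sketch reusing `setup`/`cleanup` above; `demo_allreduce` is a name I made up, and the `torch.cuda.set_device(rank)` pinning call is an extra step not in the original code):

```python
def demo_allreduce(rank, world_size):
    """Smallest possible NCCL exercise: sum one tensor across ranks."""
    setup(rank, world_size)
    torch.cuda.set_device(rank)               # pin this process to its own GPU
    t = torch.ones(1, device=rank)
    dist.all_reduce(t, op=dist.ReduceOp.SUM)  # NCCL collective, no DDP involved
    print(f"rank {rank}: all_reduce -> {t.item()}")  # expect float(world_size)
    cleanup()

# run with: run_demo(demo_allreduce, 2)
```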
Is there a way to resolve this issue?