background
I’m basically following the DDP tutorial verbatim.
When I try to send my model to its rank's device (in a process group initialized with the NCCL backend), I get "CUDA error: CUDA-capable device(s) is/are busy or unavailable".
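The traceback (pasted at the bottom) points at the plain model.to(rank) call inside each spawned worker, i.e. at CUDA context creation, not at the DDP wrapper itself. A stripped-down sketch of just that step, with a plain tensor in place of the model (hypothetical code, only to show which operation I mean; the real script is below):

import torch
import torch.multiprocessing as mp

def touch_device(rank):
    # the same kind of call that fails in my script: move something onto cuda:<rank>
    t = torch.ones(1).to(rank)
    print(f"rank {rank}: tensor is on {t.device}")

if __name__ == "__main__":
    mp.set_start_method("spawn")
    procs = [mp.Process(target=touch_device, args=(r,)) for r in range(2)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()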
My environment is:
Python 3.10.12 (main, Jul 5 2023, 18:54:27) [GCC 11.2.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import torch
>>> torch.distributed.is_nccl_available()
True
>>> torch.cuda.device_count()
2
>>> print(torch.__version__)
2.3.0
>>> print(torch.version.cuda)
12.1
DDP example script:
#!/usr/bin/env python
import os
import torch
import torch.nn as nn
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP


class ToyModel(nn.Module):
    def __init__(self):
        super(ToyModel, self).__init__()
        self.net1 = nn.Linear(10, 10)
        self.relu = nn.ReLU()
        self.net2 = nn.Linear(10, 5)

    def forward(self, x):
        return self.net2(self.relu(self.net1(x)))


def run(rank, size):
    """ Distributed function to be implemented later. """
    print(f"sending ToyModel to rank {rank}.")
    model = ToyModel().to(rank)
    print(f"initializing ToyModel on rank {rank}.")
    ddp_model = DDP(model, device_ids=[rank])
    print(f"defining loss function.")
    loss_fn = nn.MSELoss()
    print(f"initializing optimizer on rank {rank}.")
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
    print(f"zero gradients optimizer on rank {rank}.")
    optimizer.zero_grad()
    print(f"forward pass on rank {rank}.")
    outputs = ddp_model(torch.randn(20, 10))
    labels = torch.randn(20, 5).to(rank)
    print(f"computing loss on rank {rank}.")
    loss = loss_fn(outputs, labels)
    print(f"loss = {loss.item()}")
    loss.backward()
    optimizer.step()


def init_process(rank, size, fn, backend='nccl'):
    """ Initialize the distributed environment. """
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29501'
    if not torch.distributed.is_nccl_available():
        raise RuntimeError("NCCL backend not available.")
    dist.init_process_group(backend, rank=rank, world_size=size)
    fn(rank, size)


if __name__ == "__main__":
    # size = torch.cuda.device_count()
    size = 2
    processes = []
    mp.set_start_method("spawn")
    for rank in range(size):
        p = mp.Process(target=init_process, args=(rank, size, run))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
error:
The above script gives the following error:
(/nfs/turbo/umms-welchjd/mkarikom/shared3012) [mkarikom@lh0473 scratch]$ /nfs/turbo/umms-welchjd/mkarikom/enhancer_work/scratch/test_ddp.py
sending ToyModel to rank 1.
sending ToyModel to rank 0.
Process Process-2:
Traceback (most recent call last):
File "/nfs/turbo/umms-welchjd/mkarikom/shared3012/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
self.run()
File "/nfs/turbo/umms-welchjd/mkarikom/shared3012/lib/python3.10/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/nfs/turbo/umms-welchjd/mkarikom/enhancer_work/scratch/test_ddp.py", line 59, in init_process
fn(rank, size)
File "/nfs/turbo/umms-welchjd/mkarikom/enhancer_work/scratch/test_ddp.py", line 23, in run
model = ToyModel().to(rank)
File "/nfs/turbo/umms-welchjd/mkarikom/shared3012/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1173, in to
return self._apply(convert)
File "/nfs/turbo/umms-welchjd/mkarikom/shared3012/lib/python3.10/site-packages/torch/nn/modules/module.py", line 779, in _apply
module._apply(fn)
File "/nfs/turbo/umms-welchjd/mkarikom/shared3012/lib/python3.10/site-packages/torch/nn/modules/module.py", line 804, in _apply
param_applied = fn(param)
File "/nfs/turbo/umms-welchjd/mkarikom/shared3012/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1159, in convert
return t.to(
RuntimeError: CUDA error: CUDA-capable device(s) is/are busy or unavailable
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
Process Process-1:
Traceback (most recent call last):
File "/nfs/turbo/umms-welchjd/mkarikom/shared3012/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
self.run()
File "/nfs/turbo/umms-welchjd/mkarikom/shared3012/lib/python3.10/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/nfs/turbo/umms-welchjd/mkarikom/enhancer_work/scratch/test_ddp.py", line 59, in init_process
fn(rank, size)
File "/nfs/turbo/umms-welchjd/mkarikom/enhancer_work/scratch/test_ddp.py", line 23, in run
model = ToyModel().to(rank)
File "/nfs/turbo/umms-welchjd/mkarikom/shared3012/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1173, in to
return self._apply(convert)
File "/nfs/turbo/umms-welchjd/mkarikom/shared3012/lib/python3.10/site-packages/torch/nn/modules/module.py", line 779, in _apply
module._apply(fn)
File "/nfs/turbo/umms-welchjd/mkarikom/shared3012/lib/python3.10/site-packages/torch/nn/modules/module.py", line 804, in _apply
param_applied = fn(param)
File "/nfs/turbo/umms-welchjd/mkarikom/shared3012/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1159, in convert
return t.to(
RuntimeError: CUDA error: CUDA-capable device(s) is/are busy or unavailable
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
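Is there a way to tell from inside Python whether this is a device-exclusivity / compute-mode problem on the node rather than something in my DDP setup? The only sanity check I can think of is touching each device from a single process, something like this sketch (nothing beyond standard torch calls, and I'm not sure it's the right diagnostic):

import torch

# touch each visible GPU once from a single process
for i in range(torch.cuda.device_count()):
    x = torch.ones(1, device=f"cuda:{i}")
    print(f"cuda:{i}: {torch.cuda.get_device_name(i)}, tensor on {x.device}")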