Can't move model to its rank's CUDA device (DDP with NCCL backend)

Background

I'm basically following the DDP tutorial verbatim.
When I try to move my model to its rank's CUDA device (the process group uses the NCCL backend), I get: RuntimeError: CUDA error: CUDA-capable device(s) is/are busy or unavailable
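
As far as I can tell, the failing step is just the plain move of the model's parameters onto each rank's GPU, before any NCCL communication happens. A minimal sanity check I could run, without DDP or any process group, would look something like the sketch below (my own naming, not part of the original script) to see whether bare per-process CUDA context creation already triggers the same error:

#!/usr/bin/env python
# Hypothetical sanity check: move a tensor onto each GPU from two spawned
# processes, with no process group or DDP involved. If this also fails with
# "busy or unavailable", the problem is with per-process CUDA context
# creation rather than with DDP/NCCL itself.
import torch
import torch.multiprocessing as mp

def touch_gpu(rank):
    # Same kind of .to(rank) call that fails in the full script below.
    x = torch.ones(1).to(rank)
    print(f"rank {rank}: tensor is on {x.device}")

if __name__ == "__main__":
    mp.set_start_method("spawn")
    procs = [mp.Process(target=touch_gpu, args=(r,)) for r in range(2)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()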

My environment is:

Python 3.10.12 (main, Jul  5 2023, 18:54:27) [GCC 11.2.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import torch
>>> torch.distributed.is_nccl_available()
True
>>> torch.cuda.device_count()
2
>>> print(torch.__version__)
2.3.0
>>> print(torch.version.cuda)
12.1

DDP example script:

#!/usr/bin/env python
import os
import torch
import torch.nn as nn
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP

class ToyModel(nn.Module):
    def __init__(self):
        super(ToyModel, self).__init__()
        self.net1 = nn.Linear(10, 10)
        self.relu = nn.ReLU()
        self.net2 = nn.Linear(10, 5)

    def forward(self, x):
        return self.net2(self.relu(self.net1(x)))

def run(rank, size):
    """ Distributed function to be implemented later. """
    print(f"sending ToyModel to rank {rank}.")
    model = ToyModel().to(rank)
    print(f"initializing ToyModel on rank {rank}.")
    ddp_model = DDP(model, device_ids=[rank])

    print(f"defining loss function.")
    loss_fn = nn.MSELoss()

    print(f"initializing optimizer on rank {rank}.")
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

    print(f"zero gradients optimizer on rank {rank}.")
    optimizer.zero_grad()
    print(f"forward pass on rank {rank}.")
    outputs = ddp_model(torch.randn(20, 10))
    labels = torch.randn(20, 5).to(rank)
    print(f"computing loss on rank {rank}.")
    loss = loss_fn(outputs, labels)
    print(f"loss = {loss.item()}")
    loss.backward()
    optimizer.step()

def init_process(rank, size, fn, backend='nccl'):
    """ Initialize the distributed environment. """
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29501'
    if not torch.distributed.is_nccl_available():
        raise RuntimeError("NCCL backend not available.")
    dist.init_process_group(backend, rank=rank, world_size=size)
    fn(rank, size)


if __name__ == "__main__":
    # size = torch.cuda.device_count()
    size = 2
    processes = []
    mp.set_start_method("spawn")
    for rank in range(size):
        p = mp.Process(target=init_process, args=(rank, size, run))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()
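
In case the launch method matters: I believe the torch.multiprocessing.spawn launcher used in the tutorial is equivalent to the Process loop above. For reference, reusing init_process and run from the script, it would look roughly like this (untested sketch):

import torch.multiprocessing as mp

if __name__ == "__main__":
    size = 2
    # spawn invokes init_process(rank, size, run) once per rank and supplies
    # the rank index automatically as the first argument.
    mp.spawn(init_process, args=(size, run), nprocs=size, join=True)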

Error

The above script gives the following error:

(/nfs/turbo/umms-welchjd/mkarikom/shared3012) [mkarikom@lh0473 scratch]$ /nfs/turbo/umms-welchjd/mkarikom/enhancer_work/scratch/test_ddp.py
sending ToyModel to rank 1.
sending ToyModel to rank 0.
Process Process-2:
Traceback (most recent call last):
  File "/nfs/turbo/umms-welchjd/mkarikom/shared3012/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/nfs/turbo/umms-welchjd/mkarikom/shared3012/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/nfs/turbo/umms-welchjd/mkarikom/enhancer_work/scratch/test_ddp.py", line 59, in init_process
    fn(rank, size)
  File "/nfs/turbo/umms-welchjd/mkarikom/enhancer_work/scratch/test_ddp.py", line 23, in run
    model = ToyModel().to(rank)
  File "/nfs/turbo/umms-welchjd/mkarikom/shared3012/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1173, in to
    return self._apply(convert)
  File "/nfs/turbo/umms-welchjd/mkarikom/shared3012/lib/python3.10/site-packages/torch/nn/modules/module.py", line 779, in _apply
    module._apply(fn)
  File "/nfs/turbo/umms-welchjd/mkarikom/shared3012/lib/python3.10/site-packages/torch/nn/modules/module.py", line 804, in _apply
    param_applied = fn(param)
  File "/nfs/turbo/umms-welchjd/mkarikom/shared3012/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1159, in convert
    return t.to(
RuntimeError: CUDA error: CUDA-capable device(s) is/are busy or unavailable
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

Process Process-1:
Traceback (most recent call last):
  File "/nfs/turbo/umms-welchjd/mkarikom/shared3012/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/nfs/turbo/umms-welchjd/mkarikom/shared3012/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/nfs/turbo/umms-welchjd/mkarikom/enhancer_work/scratch/test_ddp.py", line 59, in init_process
    fn(rank, size)
  File "/nfs/turbo/umms-welchjd/mkarikom/enhancer_work/scratch/test_ddp.py", line 23, in run
    model = ToyModel().to(rank)
  File "/nfs/turbo/umms-welchjd/mkarikom/shared3012/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1173, in to
    return self._apply(convert)
  File "/nfs/turbo/umms-welchjd/mkarikom/shared3012/lib/python3.10/site-packages/torch/nn/modules/module.py", line 779, in _apply
    module._apply(fn)
  File "/nfs/turbo/umms-welchjd/mkarikom/shared3012/lib/python3.10/site-packages/torch/nn/modules/module.py", line 804, in _apply
    param_applied = fn(param)
  File "/nfs/turbo/umms-welchjd/mkarikom/shared3012/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1159, in convert
    return t.to(
RuntimeError: CUDA error: CUDA-capable device(s) is/are busy or unavailable
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
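
One thing I haven't ruled out yet: I understand this particular error can appear when the GPUs are set to an exclusive compute mode and another process already holds a context on them. If it's relevant, I can check and post the compute mode (e.g. with nvidia-smi --query-gpu=index,compute_mode --format=csv) and whether anything else is running on these GPUs.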