Multi-GPU training on server

Hi, I am trying to get a minimal multi-GPU training example to work. I have managed to successfully run it with PyTorch 1.x, but not with PyTorch 2.x. Below is the GPU information and the logs I get when running it on PyTorch 2.

The file I run is taken from the PyTorch DDP tutorial:
https://pytorch.org/tutorials/intermediate/ddp_tutorial.html

import os
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim

from torch.nn.parallel import DistributedDataParallel as DDP

class ToyModel(nn.Module):
    def __init__(self):
        super(ToyModel, self).__init__()
        self.net1 = nn.Linear(10, 10)
        self.relu = nn.ReLU()
        self.net2 = nn.Linear(10, 5)

    def forward(self, x):
        return self.net2(self.relu(self.net1(x)))


def demo_basic():
    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
    dist.init_process_group("nccl")
    rank = dist.get_rank()
    print(f"Start running basic DDP example on rank {rank}.")
    # create model and move it to GPU with id rank
    device_id = rank % torch.cuda.device_count()
    model = ToyModel().to(device_id)
    ddp_model = DDP(model, device_ids=[device_id])
    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

    optimizer.zero_grad()
    outputs = ddp_model(torch.randn(20, 10))
    labels = torch.randn(20, 5).to(device_id)
    loss_fn(outputs, labels).backward()
    optimizer.step()
    dist.destroy_process_group()
    print(f"Finished running basic DDP example on rank {rank}.")

if __name__ == "__main__":
    demo_basic()

torchrun --nnodes=1 --nproc_per_node=2 --standalone elastic_ddp.py
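
For reference, here is a stripped-down sanity check that could be launched with the same torchrun command, just to confirm that both worker processes start and see a GPU (this is only a sketch; check_env.py is a made-up name, not part of the tutorial):

# check_env.py -- minimal check of the environment torchrun sets up for each worker
import os
import torch

rank = int(os.environ["RANK"])              # global rank assigned by torchrun
local_rank = int(os.environ["LOCAL_RANK"])  # GPU index on this node
world_size = int(os.environ["WORLD_SIZE"])  # total number of workers

print(f"rank={rank} local_rank={local_rank} world_size={world_size} "
      f"cuda_available={torch.cuda.is_available()} "
      f"device_count={torch.cuda.device_count()}")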

+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.144.03             Driver Version: 550.144.03     CUDA Version: 12.4      |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                Persistence-M  | Bus-Id         Disp.A  | Volatile Uncorr. ECC  |
| Fan  Temp  Perf          Pwr:Usage/Cap  |          Memory-Usage  | GPU-Util  Compute M.  |
|                                         |                        |               MIG M.  |
|=========================================+========================+======================|
|   0  NVIDIA GeForce RTX 2080 Ti    On   |  00000000:05:00.0 Off  |                  N/A  |
|  0%  26C   P8             22W / 252W    |     1MiB / 11264MiB    |     0%   E. Process   |
|                                         |                        |                  N/A  |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA GeForce RTX 2080 Ti    On   |  00000000:44:00.0 Off  |                  N/A  |
|  0%  28C   P8             17W / 252W    |     1MiB / 11264MiB    |     0%   E. Process   |
|                                         |                        |                  N/A  |
+-----------------------------------------+------------------------+----------------------+

master_addr is only used for static rdzv_backend and when rdzv_endpoint is not specified.
[W socket.cpp:426] [c10d] The server socket cannot be initialized on [::]:29400 (errno: 97 - Address family not supported by protocol).
[W socket.cpp:601] [c10d] The client socket cannot be initialized to connect to [localhost]:29400 (errno: 97 - Address family not supported by protocol).
[W socket.cpp:601] [c10d] The client socket cannot be initialized to connect to [localhost]:29400 (errno: 97 - Address family not supported by protocol).
[W socket.cpp:601] [c10d] The client socket cannot be initialized to connect to [localhost]:29400 (errno: 97 - Address family not supported by protocol).
[W socket.cpp:601] [c10d] The client socket cannot be initialized to connect to [localhost]:29400 (errno: 97 - Address family not supported by protocol).
[W socket.cpp:426] [c10d] The server socket cannot be initialized on [::]:37139 (errno: 97 - Address family not supported by protocol).
[W socket.cpp:601] [c10d] The client socket cannot be initialized to connect to [sh03-12n12.int]:37139 (errno: 97 - Address family not supported by protocol).
[W socket.cpp:601] [c10d] The client socket cannot be initialized to connect to [sh03-12n12.int]:37139 (errno: 97 - Address family not supported by protocol).
[W socket.cpp:601] [c10d] The client socket cannot be initialized to connect to [sh03-12n12.int]:37139 (errno: 97 - Address family not supported by protocol).
[W socket.cpp:601] [c10d] The client socket cannot be initialized to connect to [sh03-12n12.int]:37139 (errno: 97 - Address family not supported by protocol).
Start running basic DDP example on rank 0.
Start running basic DDP example on rank 1.
sh03-12n12:1768:1768 [0] enqueue.cc:100 NCCL WARN Cuda failure 'invalid device function'
sh03-12n12:1768:1768 [0] NCCL INFO Bootstrap : Using ib0:10.51.12.12<0>
sh03-12n12:1768:1768 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
sh03-12n12:1769:1769 [1] NCCL INFO cudaDriverVersion 12040
sh03-12n12:1768:1768 [0] NCCL INFO cudaDriverVersion 12040
NCCL version 2.14.3+cuda11.5
sh03-12n12:1769:1769 [1] enqueue.cc:100 NCCL WARN Cuda failure 'invalid device function'
sh03-12n12:1769:1769 [1] NCCL INFO Bootstrap : Using ib0:10.51.12.12<0>
sh03-12n12:1769:1769 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
sh03-12n12:1769:1807 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/RoCE [RO]; OOB ib0:10.51.12.12<0>
sh03-12n12:1769:1807 [1] NCCL INFO Using network IB
sh03-12n12:1768:1808 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/RoCE [RO]; OOB ib0:10.51.12.12<0>
sh03-12n12:1768:1808 [0] NCCL INFO Using network IB
sh03-12n12:1768:1808 [0] NCCL INFO P2P is disabled between connected GPUs 1 and 0. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.
sh03-12n12:1768:1808 [0] NCCL INFO P2P is disabled between connected GPUs 1 and 0. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.
sh03-12n12:1768:1808 [0] NCCL INFO Could not enable P2P between dev 1(=44000) and dev 0(=5000)
sh03-12n12:1768:1808 [0] NCCL INFO P2P is disabled between connected GPUs 0 and 1. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.
sh03-12n12:1768:1808 [0] NCCL INFO P2P is disabled between connected GPUs 0 and 1. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.
sh03-12n12:1768:1808 [0] NCCL INFO Could not enable P2P between dev 0(=5000) and dev 1(=44000)
sh03-12n12:1768:1808 [0] NCCL INFO P2P is disabled between connected GPUs 1 and 0. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.
sh03-12n12:1768:1808 [0] NCCL INFO P2P is disabled between connected GPUs 1 and 0. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.
sh03-12n12:1768:1808 [0] NCCL INFO Could not enable P2P between dev 1(=44000) and dev 0(=5000)
sh03-12n12:1768:1808 [0] NCCL INFO P2P is disabled between connected GPUs 0 and 1. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.
sh03-12n12:1768:1808 [0] NCCL INFO P2P is disabled between connected GPUs 0 and 1. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.
sh03-12n12:1768:1808 [0] NCCL INFO Could not enable P2P between dev 0(=5000) and dev 1(=44000)
sh03-12n12:1768:1808 [0] NCCL INFO Setting affinity for GPU 0 to 010000
sh03-12n12:1769:1807 [1] NCCL INFO P2P is disabled between connected GPUs 1 and 0. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.
sh03-12n12:1769:1807 [1] NCCL INFO P2P is disabled between connected GPUs 1 and 0. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.
sh03-12n12:1769:1807 [1] NCCL INFO Could not enable P2P between dev 1(=44000) and dev 0(=5000)
sh03-12n12:1769:1807 [1] NCCL INFO P2P is disabled between connected GPUs 0 and 1. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.
sh03-12n12:1769:1807 [1] NCCL INFO P2P is disabled between connected GPUs 0 and 1. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.
sh03-12n12:1769:1807 [1] NCCL INFO Could not enable P2P between dev 0(=5000) and dev 1(=44000)
sh03-12n12:1769:1807 [1] NCCL INFO P2P is disabled between connected GPUs 1 and 0. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.
sh03-12n12:1769:1807 [1] NCCL INFO P2P is disabled between connected GPUs 1 and 0. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.
sh03-12n12:1769:1807 [1] NCCL INFO Could not enable P2P between dev 1(=44000) and dev 0(=5000)
sh03-12n12:1769:1807 [1] NCCL INFO P2P is disabled between connected GPUs 0 and 1. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.
sh03-12n12:1769:1807 [1] NCCL INFO P2P is disabled between connected GPUs 0 and 1. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.
sh03-12n12:1769:1807 [1] NCCL INFO Could not enable P2P between dev 0(=5000) and dev 1(=44000)
sh03-12n12:1769:1807 [1] NCCL INFO Setting affinity for GPU 1 to 010000
sh03-12n12:1769:1807 [1] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0
sh03-12n12:1769:1807 [1] NCCL INFO P2P is disabled between connected GPUs 1 and 0. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.
sh03-12n12:1769:1807 [1] NCCL INFO Could not enable P2P between dev 1(=44000) and dev 0(=5000)
sh03-12n12:1768:1808 [0] NCCL INFO Channel 00/02 : 0 1
sh03-12n12:1768:1808 [0] NCCL INFO Channel 01/02 : 0 1
sh03-12n12:1768:1808 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1
sh03-12n12:1768:1808 [0] NCCL INFO P2P is disabled between connected GPUs 0 and 1. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.
sh03-12n12:1768:1808 [0] NCCL INFO Could not enable P2P between dev 0(=5000) and dev 1(=44000)
sh03-12n12:1769:1807 [1] NCCL INFO P2P is disabled between connected GPUs 1 and 0. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.
sh03-12n12:1769:1807 [1] NCCL INFO Could not enable P2P between dev 1(=44000) and dev 0(=5000)
sh03-12n12:1768:1808 [0] NCCL INFO P2P is disabled between connected GPUs 0 and 1. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.
sh03-12n12:1768:1808 [0] NCCL INFO Could not enable P2P between dev 0(=5000) and dev 1(=44000)
sh03-12n12:1769:1807 [1] NCCL INFO P2P is disabled between connected GPUs 1 and 0. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.
sh03-12n12:1769:1807 [1] NCCL INFO Could not enable P2P between dev 1(=44000) and dev 0(=5000)
sh03-12n12:1769:1807 [1] NCCL INFO Channel 00 : 1[44000] -> 0[5000] via SHM/direct/direct
sh03-12n12:1769:1807 [1] NCCL INFO P2P is disabled between connected GPUs 1 and 0. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.
sh03-12n12:1769:1807 [1] NCCL INFO Could not enable P2P between dev 1(=44000) and dev 0(=5000)
sh03-12n12:1769:1807 [1] NCCL INFO Channel 01 : 1[44000] -> 0[5000] via SHM/direct/direct
sh03-12n12:1768:1808 [0] NCCL INFO P2P is disabled between connected GPUs 0 and 1. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.
sh03-12n12:1768:1808 [0] NCCL INFO Could not enable P2P between dev 0(=5000) and dev 1(=44000)
sh03-12n12:1768:1808 [0] NCCL INFO Channel 00 : 0[5000] -> 1[44000] via SHM/direct/direct
sh03-12n12:1768:1808 [0] NCCL INFO P2P is disabled between connected GPUs 0 and 1. You can repress this message with NCCL_IGNORE_DISABLED_P2P=1.
sh03-12n12:1768:1808 [0] NCCL INFO Could not enable P2P between dev 0(=5000) and dev 1(=44000)
sh03-12n12:1768:1808 [0] NCCL INFO Channel 01 : 0[5000] -> 1[44000] via SHM/direct/direct
sh03-12n12:1769:1807 [1] NCCL INFO Connected all rings
sh03-12n12:1769:1807 [1] NCCL INFO Connected all trees
sh03-12n12:1769:1807 [1] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512
sh03-12n12:1769:1807 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
sh03-12n12:1768:1808 [0] NCCL INFO Connected all rings
sh03-12n12:1768:1808 [0] NCCL INFO Connected all trees
sh03-12n12:1768:1808 [0] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512
sh03-12n12:1768:1808 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
sh03-12n12:1768:1808 [0] NCCL INFO comm 0x53da9150 rank 0 nranks 2 cudaDev 0 busId 5000 - Init COMPLETE
sh03-12n12:1769:1807 [1] NCCL INFO comm 0x54abacb0 rank 1 nranks 2 cudaDev 1 busId 44000 - Init COMPLETE

sh03-12n12:1768:1768 [0] misc/strongstream.cc:166 NCCL WARN Cuda failure 'invalid device function'
sh03-12n12:1768:1768 [0] NCCL INFO enqueue.cc:992 -> 1
sh03-12n12:1768:1768 [0] NCCL INFO group.cc:173 -> 1
sh03-12n12:1768:1768 [0] NCCL INFO group.cc:340 -> 1
sh03-12n12:1768:1768 [0] NCCL INFO group.cc:421 -> 1
sh03-12n12:1768:1768 [0] NCCL INFO group.cc:106 -> 1
Traceback (most recent call last):
File "/home/users/msena/deqn_env/mgpu_training/distributed_tutorial.py", line 41, in <module>

sh03-12n12:1769:1769 [1] misc/strongstream.cc:166 NCCL WARN Cuda failure 'invalid device function'
sh03-12n12:1769:1769 [1] NCCL INFO enqueue.cc:992 -> 1
sh03-12n12:1769:1769 [1] NCCL INFO group.cc:173 -> 1
sh03-12n12:1769:1769 [1] NCCL INFO group.cc:340 -> 1
sh03-12n12:1769:1769 [1] NCCL INFO group.cc:421 -> 1
sh03-12n12:1769:1769 [1] NCCL INFO group.cc:106 -> 1
Traceback (most recent call last):
File "/home/users/msena/deqn_env/mgpu_training/distributed_tutorial.py", line 41, in <module>
demo_basic()
File "/home/users/msena/deqn_env/mgpu_training/distributed_tutorial.py", line 28, in demo_basic
demo_basic()
File "/home/users/msena/deqn_env/mgpu_training/distributed_tutorial.py", line 28, in demo_basic
ddp_model = DDP(model, device_ids=[device_id])
File "/share/software/user/open/py-pytorch/2.0.0_py39/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 674, in __init__
ddp_model = DDP(model, device_ids=[device_id])
File "/share/software/user/open/py-pytorch/2.0.0_py39/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 674, in __init__
_verify_param_shape_across_processes(self.process_group, parameters)
File "/share/software/user/open/py-pytorch/2.0.0_py39/lib/python3.9/site-packages/torch/distributed/utils.py", line 118, in _verify_param_shape_across_processes
_verify_param_shape_across_processes(self.process_group, parameters)
File "/share/software/user/open/py-pytorch/2.0.0_py39/lib/python3.9/site-packages/torch/distributed/utils.py", line 118, in _verify_param_shape_across_processes
return dist._verify_params_across_processes(process_group, tensors, logger)
RuntimeError: NCCL Error 1: unhandled cuda error
return dist._verify_params_across_processes(process_group, tensors, logger)
RuntimeError: NCCL Error 1: unhandled cuda error
sh03-12n12:1768:1813 [0] NCCL INFO [Service thread] Connection closed by localRank 0
sh03-12n12:1768:1768 [0] NCCL INFO comm 0x53da9150 rank 0 nranks 2 cudaDev 0 busId 5000 - Abort COMPLETE
sh03-12n12:1769:1814 [1] NCCL INFO [Service thread] Connection closed by localRank 1
sh03-12n12:1769:1769 [1] NCCL INFO comm 0x54abacb0 rank 1 nranks 2 cudaDev 1 busId 44000 - Abort COMPLETE
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 1768) of binary: /share/software/user/open/python/3.9.0/bin/python3.9
Traceback (most recent call last):
File "/share/software/user/open/py-pytorch/2.0.0_py39/bin/torchrun", line 33, in <module>
sys.exit(load_entry_point('torch==2.0.0+gitunknown', 'console_scripts', 'torchrun')())
File "/share/software/user/open/py-pytorch/2.0.0_py39/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "/share/software/user/open/py-pytorch/2.0.0_py39/lib/python3.9/site-packages/torch/distributed/run.py", line 794, in main
run(args)
File "/share/software/user/open/py-pytorch/2.0.0_py39/lib/python3.9/site-packages/torch/distributed/run.py", line 785, in run
elastic_launch(
File "/share/software/user/open/py-pytorch/2.0.0_py39/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/share/software/user/open/py-pytorch/2.0.0_py39/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:

distributed_tutorial.py FAILED

Failures:
[1]:
time : 2025-03-20_11:51:07
host : sh03-12n12.int
rank : 1 (local_rank: 1)
exitcode : 1 (pid: 1769)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html

Root Cause (first observed failure):
[0]:
time : 2025-03-20_11:51:07
host : sh03-12n12.int
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 1768)
error_file: <N/A>

Here are the modules I currently load:
module load cuda/12.4.0
module load python/3.9.0
module load py-pytorch/2.0.0_py39

I suspect the error is caused by loading an incorrect combination of software versions. Any help is much appreciated!
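
In case it is relevant, here is a rough way to check which CUDA and NCCL versions the loaded PyTorch build actually ships with (just a sketch; I have not included its output here):

# version check run inside the loaded PyTorch module environment
import torch

print(torch.__version__)                    # PyTorch version, e.g. 2.0.0
print(torch.version.cuda)                   # CUDA version PyTorch was compiled against
print(torch.cuda.nccl.version())            # NCCL version bundled with PyTorch
print(torch.cuda.get_device_capability(0))  # compute capability, (7, 5) for an RTX 2080 Ti

The "NCCL version 2.14.3+cuda11.5" line in the log above already hints that the build was compiled against CUDA 11.5 while the cluster provides a 12.4 driver and toolkit, so a mismatch like that seems plausible.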

Are you seeing these issues with any newer PyTorch release as well, since 2.0.0 is already quite old?

When I run it on PyTorch 2.2.1 with Python 3.12, I get the following error:

torchrun --nnodes=1 --nproc_per_node=2 --standalone distributed_training.py
[W socket.cpp:464] [c10d] The server socket cannot be initialized on [::]:0 (errno: 97 - Address family not supported by protocol).
[W socket.cpp:697] [c10d] The client socket cannot be initialized to connect to [localhost]:38726 (errno: 97 - Address family not supported by protocol).
[W socket.cpp:697] [c10d] The client socket cannot be initialized to connect to [localhost]:38726 (errno: 97 - Address family not supported by protocol).
Fatal Python error: Segmentation fault

Current thread 0x00007f857c2d9740 (most recent call first):
File "/share/software/user/open/py-pytorch/2.2.1_py312/lib/python3.12/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113 in _call_store
File "/share/software/user/open/py-pytorch/2.2.1_py312/lib/python3.12/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 64 in __init__
File "/share/software/user/open/py-pytorch/2.2.1_py312/lib/python3.12/site-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 253 in create_backend
File "/share/software/user/open/py-pytorch/2.2.1_py312/lib/python3.12/site-packages/torch/distributed/elastic/rendezvous/registry.py", line 36 in _create_c10d_handler
File "/share/software/user/open/py-pytorch/2.2.1_py312/lib/python3.12/site-packages/torch/distributed/elastic/rendezvous/api.py", line 258 in create_handler
File "/share/software/user/open/py-pytorch/2.2.1_py312/lib/python3.12/site-packages/torch/distributed/elastic/rendezvous/registry.py", line 66 in get_rendezvous_handler
File "/share/software/user/open/py-pytorch/2.2.1_py312/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 238 in launch_agent
File "/share/software/user/open/py-pytorch/2.2.1_py312/lib/python3.12/site-packages/torch/distributed/launcher/api.py", line 135 in __call__
File "/share/software/user/open/py-pytorch/2.2.1_py312/lib/python3.12/site-packages/torch/distributed/run.py", line 803 in run
File "/share/software/user/open/py-pytorch/2.2.1_py312/lib/python3.12/site-packages/torch/distributed/run.py", line 812 in main
File "/share/software/user/open/py-pytorch/2.2.1_py312/lib/python3.12/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347 in wrapper
File "/share/software/user/open/py-pytorch/2.2.1_py312/bin/torchrun", line 33 in <module>

Extension modules: numpy.core._multiarray_umath, numpy.core._multiarray_tests, numpy.linalg._umath_linalg, numpy.fft._pocketfft_internal, numpy.random._common, numpy.random.bit_generator, numpy.random._bounded_integers, numpy.random.mt19937, numpy.random.mtrand, numpy.random._philox, numpy.random._pcg64, numpy.random._sfc64, numpy.random._generator, torch._C, torch._C._fft, torch._C._linalg, torch._C._nested, torch._C._nn, torch._C._sparse, torch._C._special (total: 20)
Segmentation fault

Any idea? Thanks again!