NCCL WARN Duplicate GPU detected error

Dear all,
I'm using an image-inpainting codebase for multi-node training. Here is the subprocess function:

```python
def subprocess_fn(rank, args, temp_dir):
    dnnlib.util.Logger(file_name=os.path.join(args.run_dir, 'log.txt'), file_mode='a', should_flush=True)

    # Init torch.distributed.
    if args.num_gpus > 1:
        init_file = os.path.abspath(os.path.join(temp_dir, '.torch_distributed_init'))
        if os.name == 'nt':
            init_method = 'file:///' + init_file.replace('\\', '/')
            torch.distributed.init_process_group(backend='gloo', init_method=init_method, rank=rank, world_size=args.num_gpus)
        else:
            init_method = f'file://{init_file}'
            torch.distributed.init_process_group(backend='nccl', init_method=init_method, rank=rank, world_size=args.num_gpus)

    # Init torch_utils.
    sync_device = torch.device('cuda', rank) if args.num_gpus > 1 else None
    training_stats.init_multiprocessing(rank=rank, sync_device=sync_device)
    if rank != 0:
        custom_ops.verbosity = 'none'

    # Execute training loop.
    training_loop.training_loop(rank=rank, **args)
```
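For context, my understanding of the `file://` rendezvous is that every rank in the job has to point at the same init file; here that file lives inside the per-launch `temp_dir`. A minimal sketch of the call with a hypothetical shared path and placeholder rank/world-size values (these are my assumptions, not taken from the code above):

```python
import os
import torch.distributed as dist

# Placeholder values for illustration only; in the real code they come from
# args and from torch.multiprocessing.spawn.
world_size  = 8                                   # total processes across all nodes
global_rank = int(os.environ.get('RANK', '0'))    # assumed to be set per process

# file:// rendezvous: every rank must resolve this to the *same* file, so the
# path needs to live on storage that all participating nodes can see.
init_method = 'file:///shared/scratch/.torch_distributed_init'   # assumed shared path

dist.init_process_group(backend='nccl', init_method=init_method,
                        rank=global_rank, world_size=world_size)
```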

In the training loop, we have:

```python
# Initialize.
start_time = time.time()
device = torch.device('cuda', rank)
np.random.seed(random_seed * num_gpus + rank)
torch.manual_seed(random_seed * num_gpus + rank)
torch.backends.cudnn.benchmark = cudnn_benchmark # Improves training speed.
torch.backends.cuda.matmul.allow_tf32 = allow_tf32 # Allow PyTorch to internally use tf32 for matmul
torch.backends.cudnn.allow_tf32 = allow_tf32 # Allow PyTorch to internally use tf32 for convolutions
conv2d_gradfix.enabled = True # Improves training speed.
grid_sample_gradfix.enabled = True # Avoids errors with the augmentation pipe.

# Load training set.
if rank == 0:
    print('Loading training set...')
training_set = dnnlib.util.construct_class_by_name(**training_set_kwargs) # subclass of training.dataset.Dataset
val_set = dnnlib.util.construct_class_by_name(**val_set_kwargs) # subclass of training.dataset.Dataset
training_set_sampler = misc.InfiniteSampler(dataset=training_set, rank=rank, num_replicas=num_gpus, seed=random_seed)
training_set_iterator = iter(torch.utils.data.DataLoader(dataset=training_set, sampler=training_set_sampler, batch_size=batch_size//num_gpus, **data_loader_kwargs))
if rank == 0:
    print()
    print('Num images: ', len(training_set))
    print('Image shape:', training_set.image_shape)
    print('Label shape:', training_set.label_shape)
    print()

# Construct networks.
if rank == 0:
    print('Constructing networks...')
common_kwargs = dict(c_dim=training_set.label_dim, img_resolution=training_set.resolution, img_channels=training_set.num_channels)
G = dnnlib.util.construct_class_by_name(**G_kwargs, **common_kwargs).train().requires_grad_(False).to(device) # subclass of torch.nn.Module
D = dnnlib.util.construct_class_by_name(**D_kwargs, **common_kwargs).train().requires_grad_(False).to(device) # subclass of torch.nn.Module
G_ema = copy.deepcopy(G).eval()

```
This function is used in this way:

```python
# Launch processes.
print('Launching processes...')
torch.multiprocessing.set_start_method('spawn')
with tempfile.TemporaryDirectory() as temp_dir:
    if args.num_gpus == 1:
        subprocess_fn(rank=0, args=args, temp_dir=temp_dir)
    else:
        torch.multiprocessing.spawn(fn=subprocess_fn, args=(args, temp_dir), nprocs=args.num_gpus)
```

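To clarify what `rank` means in the snippet above: as far as I understand, `torch.multiprocessing.spawn` passes the process index `0..nprocs-1` as the first argument to `fn`, so every launch of the script numbers its own processes from 0. A small standalone sketch of that behaviour (the `worker`/`tag` names are just for illustration):

```python
import torch.multiprocessing as mp

def worker(rank, tag):
    # 'rank' is simply the process index assigned by mp.spawn (0..nprocs-1).
    print(f'{tag}: spawned process with rank {rank}')

if __name__ == '__main__':
    # Each call to spawn numbers its children from 0, independently of any
    # other node that may be running the same script.
    mp.spawn(fn=worker, args=('node-local',), nprocs=4)
```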
After running this code with 8 GPUs across 2 nodes, I get the following error:
```

htc-gpu007:225031:225226 [3] init.cc:573 NCCL WARN Duplicate GPU detected : rank 3 and rank 7 both on CUDA device e3000

htc-gpu007:225035:225224 [3] init.cc:573 NCCL WARN Duplicate GPU detected : rank 7 and rank 3 both on CUDA device e3000
htc-gpu007:225035:225224 [3] NCCL INFO init.cc:840 → 5
htc-gpu007:225031:225226 [3] NCCL INFO init.cc:840 → 5

htc-gpu007:225028:225214 [0] init.cc:573 NCCL WARN Duplicate GPU detected : rank 0 and rank 4 both on CUDA device 17000

htc-gpu007:225034:225222 [2] init.cc:573 NCCL WARN Duplicate GPU detected : rank 6 and rank 2 both on CUDA device ca000

htc-gpu007:225032:225223 [0] init.cc:573 NCCL WARN Duplicate GPU detected : rank 4 and rank 0 both on CUDA device 17000
htc-gpu007:225035:225224 [3] NCCL INFO group.cc:73 → 5 [Async thread]
htc-gpu007:225034:225222 [2] NCCL INFO init.cc:840 → 5
htc-gpu007:225028:225214 [0] NCCL INFO init.cc:840 → 5
htc-gpu007:225031:225226 [3] NCCL INFO group.cc:73 → 5 [Async thread]

htc-gpu007:225033:225228 [1] init.cc:573 NCCL WARN Duplicate GPU detected : rank 5 and rank 1 both on CUDA device 65000
htc-gpu007:225032:225223 [0] NCCL INFO init.cc:840 → 5
htc-gpu007:225033:225228 [1] NCCL INFO init.cc:840 → 5
htc-gpu007:225034:225222 [2] NCCL INFO group.cc:73 → 5 [Async thread]

htc-gpu007:225029:225221 [1] init.cc:573 NCCL WARN Duplicate GPU detected : rank 1 and rank 5 both on CUDA device 65000
htc-gpu007:225029:225221 [1] NCCL INFO init.cc:840 → 5
htc-gpu007:225028:225214 [0] NCCL INFO group.cc:73 → 5 [Async thread]
htc-gpu007:225032:225223 [0] NCCL INFO group.cc:73 → 5 [Async thread]
htc-gpu007:225029:225221 [1] NCCL INFO group.cc:73 → 5 [Async thread]

htc-gpu007:225030:225225 [2] init.cc:573 NCCL WARN Duplicate GPU detected : rank 2 and rank 6 both on CUDA device ca000
htc-gpu007:225030:225225 [2] NCCL INFO init.cc:840 → 5
htc-gpu007:225033:225228 [1] NCCL INFO group.cc:73 → 5 [Async thread]
htc-gpu007:225030:225225 [2] NCCL INFO group.cc:73 → 5 [Async thread]
Traceback (most recent call last):
  File "/home/htc/jkasravi/SCRATCH/projects/spine/Sotware/MAT/train.py", line 652, in <module>
    main() # pylint: disable=no-value-for-parameter
  File "/scratch/htc/jkasravi/miniconda/envs/MAT/lib/python3.7/site-packages/click/core.py", line 1157, in __call__
    return self.main(*args, **kwargs)
  File "/scratch/htc/jkasravi/miniconda/envs/MAT/lib/python3.7/site-packages/click/core.py", line 1078, in main
    rv = self.invoke(ctx)
  File "/scratch/htc/jkasravi/miniconda/envs/MAT/lib/python3.7/site-packages/click/core.py", line 1434, in invoke
    return ctx.invoke(self.callback, **ctx.params)
  File "/scratch/htc/jkasravi/miniconda/envs/MAT/lib/python3.7/site-packages/click/core.py", line 783, in invoke
    return __callback(*args, **kwargs)
  File "/scratch/htc/jkasravi/miniconda/envs/MAT/lib/python3.7/site-packages/click/decorators.py", line 33, in new_func
    return f(get_current_context(), *args, **kwargs)
  File "/home/htc/jkasravi/SCRATCH/projects/spine/Sotware/MAT/train.py", line 647, in main
    torch.multiprocessing.spawn(fn=subprocess_fn, args=(args, temp_dir), nprocs=args.num_gpus)
  File "/scratch/htc/jkasravi/miniconda/envs/MAT/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 199, in spawn
    return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
  File "/scratch/htc/jkasravi/miniconda/envs/MAT/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 157, in start_processes
    while not context.join():
  File "/scratch/htc/jkasravi/miniconda/envs/MAT/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 118, in join
    raise Exception(msg)
Exception:

-- Process 7 terminated with the following error:
Traceback (most recent call last):
  File "/scratch/htc/jkasravi/miniconda/envs/MAT/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 19, in _wrap
    fn(i, *args)
  File "/home/htc/jkasravi/SCRATCH/projects/spine/Sotware/MAT/train.py", line 464, in subprocess_fn
    torch.distributed.init_process_group(backend='nccl', init_method=init_method, rank=rank, world_size=args.num_gpus)
  File "/scratch/htc/jkasravi/miniconda/envs/MAT/lib/python3.7/site-packages/torch/distributed/distributed_c10d.py", line 455, in init_process_group
    barrier()
  File "/scratch/htc/jkasravi/miniconda/envs/MAT/lib/python3.7/site-packages/torch/distributed/distributed_c10d.py", line 1960, in barrier
    work = _default_pg.barrier()
RuntimeError: NCCL error in: /pytorch/torch/lib/c10d/ProcessGroupNCCL.cpp:784, invalid usage, NCCL version 2.7.8

srun: error: htc-gpu007: task 0: Exited with exit code 1
htc-gpu008:298389:298389 [0] NCCL INFO Bootstrap : Using [0]eth0:130.73.31.150<0>
htc-gpu008:298389:298389 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
htc-gpu008:298389:298389 [0] NCCL INFO NET/IB : Using [0]ibp75s0:1/IB ; OOB eth0:130.73.31.150<0>
htc-gpu008:298389:298389 [0] NCCL INFO Using network IB
NCCL version 2.7.8+cuda11.0
htc-gpu008:298395:298395 [2] NCCL INFO Bootstrap : Using [0]eth0:130.73.31.150<0>
htc-gpu008:298395:298395 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
htc-gpu008:298393:298393 [0] NCCL INFO Bootstrap : Using [0]eth0:130.73.31.150<0>
htc-gpu008:298393:298393 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
htc-gpu008:298391:298391 [2] NCCL INFO Bootstrap : Using [0]eth0:130.73.31.150<0>
htc-gpu008:298391:298391 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
htc-gpu008:298396:298396 [3] NCCL INFO Bootstrap : Using [0]eth0:130.73.31.150<0>
htc-gpu008:298396:298396 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
htc-gpu008:298390:298390 [1] NCCL INFO Bootstrap : Using [0]eth0:130.73.31.150<0>
htc-gpu008:298390:298390 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
htc-gpu008:298394:298394 [1] NCCL INFO Bootstrap : Using [0]eth0:130.73.31.150<0>
htc-gpu008:298394:298394 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
htc-gpu008:298391:298391 [2] NCCL INFO NET/IB : Using [0]ibp75s0:1/IB ; OOB eth0:130.73.31.150<0>
htc-gpu008:298391:298391 [2] NCCL INFO Using network IB
htc-gpu008:298393:298393 [0] NCCL INFO NET/IB : Using [0]ibp75s0:1/IB ; OOB eth0:130.73.31.150<0>
htc-gpu008:298393:298393 [0] NCCL INFO Using network IB
htc-gpu008:298395:298395 [2] NCCL INFO NET/IB : Using [0]ibp75s0:1/IB ; OOB eth0:130.73.31.150<0>
htc-gpu008:298395:298395 [2] NCCL INFO Using network IB
htc-gpu008:298396:298396 [3] NCCL INFO NET/IB : Using [0]ibp75s0:1/IB ; OOB eth0:130.73.31.150<0>
htc-gpu008:298396:298396 [3] NCCL INFO Using network IB
htc-gpu008:298390:298390 [1] NCCL INFO NET/IB : Using [0]ibp75s0:1/IB ; OOB eth0:130.73.31.150<0>
htc-gpu008:298390:298390 [1] NCCL INFO Using network IB
htc-gpu008:298394:298394 [1] NCCL INFO NET/IB : Using [0]ibp75s0:1/IB ; OOB eth0:130.73.31.150<0>
htc-gpu008:298394:298394 [1] NCCL INFO Using network IB
htc-gpu008:298392:298392 [3] NCCL INFO Bootstrap : Using [0]eth0:130.73.31.150<0>
htc-gpu008:298392:298392 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
htc-gpu008:298392:298392 [3] NCCL INFO NET/IB : Using [0]ibp75s0:1/IB ; OOB eth0:130.73.31.150<0>
htc-gpu008:298392:298392 [3] NCCL INFO Using network IB

htc-gpu008:298390:298589 [1] init.cc:573 NCCL WARN Duplicate GPU detected : rank 1 and rank 5 both on CUDA device 65000

htc-gpu008:298394:298591 [1] init.cc:573 NCCL WARN Duplicate GPU detected : rank 5 and rank 1 both on CUDA device 65000

htc-gpu008:298395:298587 [2] init.cc:573 NCCL WARN Duplicate GPU detected : rank 6 and rank 2 both on CUDA device ca000
htc-gpu008:298390:298589 [1] NCCL INFO init.cc:840 → 5
htc-gpu008:298394:298591 [1] NCCL INFO init.cc:840 → 5

htc-gpu008:298393:298586 [0] init.cc:573 NCCL WARN Duplicate GPU detected : rank 4 and rank 0 both on CUDA device 17000
htc-gpu008:298395:298587 [2] NCCL INFO init.cc:840 → 5
htc-gpu008:298393:298586 [0] NCCL INFO init.cc:840 → 5

htc-gpu008:298391:298585 [2] init.cc:573 NCCL WARN Duplicate GPU detected : rank 2 and rank 6 both on CUDA device ca000
htc-gpu008:298394:298591 [1] NCCL INFO group.cc:73 → 5 [Async thread]
```

I read somewhere that this error is caused by sending the model or data to `cuda` in general instead of to the device given by the rank. I looked through the code, and everything seems fine to me.
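For reference, this is the pattern I understood that advice to mean, i.e. using the node-local rank as the CUDA device index rather than the global rank (a minimal sketch; the 4-GPUs-per-node value is my assumption about our cluster):

```python
import torch

gpus_per_node = 4                              # assumption: each node has 4 GPUs
global_rank   = 5                              # e.g. second process on the second node
local_rank    = global_rank % gpus_per_node    # device index must stay within 0..3

# Bind this process to its own GPU before any NCCL communication happens.
device = torch.device('cuda', local_rank)
torch.cuda.set_device(device)
```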

Could you please help me to solve this problem?
Thanks

I have the same question. Have you solved it?