Hi, I guess I have the same issue.
I want to run the PyTorch tutorial code (Getting Started with Distributed Data Parallel). All three demos launched via run_demo work fine with the 'gloo' backend, which is the original code, but when I change 'gloo' to 'nccl', the third demo, demo_model_parallel, breaks down.
Env: CentOS 7.6, torch 1.10.2 + CUDA 11.3 + 4x 2080 Ti
Here's the full script from the tutorial; only the backend is changed.
import os
import sys
import tempfile
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
# On Windows platform, the torch.distributed package only
# supports Gloo backend, FileStore and TcpStore.
# For FileStore, set init_method parameter in init_process_group
# to a local file. Example as follow:
# init_method="file:///f:/libtmp/some_file"
# dist.init_process_group(
#     "gloo",
#     rank=rank,
#     init_method=init_method,
#     world_size=world_size)
# For TcpStore, same way as on Linux.
def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'

    # initialize the process group
    ########
    ######## Here, just change 'gloo' to 'nccl'
    dist.init_process_group("nccl", rank=rank, world_size=world_size)


def cleanup():
    dist.destroy_process_group()

class ToyModel(nn.Module):
    def __init__(self):
        super(ToyModel, self).__init__()
        self.net1 = nn.Linear(10, 10)
        self.relu = nn.ReLU()
        self.net2 = nn.Linear(10, 5)

    def forward(self, x):
        return self.net2(self.relu(self.net1(x)))

def demo_basic(rank, world_size):
    print(f"Running basic DDP example on rank {rank}.")
    setup(rank, world_size)

    # create model and move it to GPU with id rank
    model = ToyModel().to(rank)
    ddp_model = DDP(model, device_ids=[rank])

    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

    optimizer.zero_grad()
    outputs = ddp_model(torch.randn(20, 10))
    labels = torch.randn(20, 5).to(rank)
    loss_fn(outputs, labels).backward()
    optimizer.step()

    cleanup()


def run_demo(demo_fn, world_size):
    mp.spawn(demo_fn,
             args=(world_size,),
             nprocs=world_size,
             join=True)

def demo_checkpoint(rank, world_size):
    print(f"Running DDP checkpoint example on rank {rank}.")
    setup(rank, world_size)

    model = ToyModel().to(rank)
    ddp_model = DDP(model, device_ids=[rank])

    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

    CHECKPOINT_PATH = tempfile.gettempdir() + "/model.checkpoint"
    if rank == 0:
        # All processes should see same parameters as they all start from same
        # random parameters and gradients are synchronized in backward passes.
        # Therefore, saving it in one process is sufficient.
        torch.save(ddp_model.state_dict(), CHECKPOINT_PATH)

    # Use a barrier() to make sure that process 1 loads the model after process
    # 0 saves it.
    dist.barrier()
    # configure map_location properly
    map_location = {'cuda:%d' % 0: 'cuda:%d' % rank}
    ddp_model.load_state_dict(
        torch.load(CHECKPOINT_PATH, map_location=map_location))

    optimizer.zero_grad()
    outputs = ddp_model(torch.randn(20, 10))
    labels = torch.randn(20, 5).to(rank)
    loss_fn = nn.MSELoss()
    loss_fn(outputs, labels).backward()
    optimizer.step()

    # Not necessary to use a dist.barrier() to guard the file deletion below
    # as the AllReduce ops in the backward pass of DDP already served as
    # a synchronization.
    if rank == 0:
        os.remove(CHECKPOINT_PATH)

    cleanup()

class ToyMpModel(nn.Module):
    def __init__(self, dev0, dev1):
        super(ToyMpModel, self).__init__()
        self.dev0 = dev0
        self.dev1 = dev1
        self.net1 = torch.nn.Linear(10, 10).to(dev0)
        self.relu = torch.nn.ReLU()
        self.net2 = torch.nn.Linear(10, 5).to(dev1)

    def forward(self, x):
        x = x.to(self.dev0)
        x = self.relu(self.net1(x))
        x = x.to(self.dev1)
        return self.net2(x)

def demo_model_parallel(rank, world_size):
    print(f"Running DDP with model parallel example on rank {rank}.")
    setup(rank, world_size)

    # setup mp_model and devices for this process
    dev0 = (rank * 2) % world_size
    dev1 = (rank * 2 + 1) % world_size
    mp_model = ToyMpModel(dev0, dev1)
    ddp_mp_model = DDP(mp_model)

    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_mp_model.parameters(), lr=0.001)

    optimizer.zero_grad()
    # outputs will be on dev1
    outputs = ddp_mp_model(torch.randn(20, 10))
    labels = torch.randn(20, 5).to(dev1)
    loss_fn(outputs, labels).backward()
    optimizer.step()

    cleanup()


if __name__ == "__main__":
    n_gpus = torch.cuda.device_count()
    assert n_gpus >= 2, f"Requires at least 2 GPUs to run, but got {n_gpus}"
    world_size = n_gpus
    run_demo(demo_basic, world_size)
    run_demo(demo_checkpoint, world_size)
    run_demo(demo_model_parallel, world_size)
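For reference, since world_size = n_gpus = 4 here, the dev0/dev1 arithmetic in demo_model_parallel maps the ranks to devices like this (just working through the expressions above, if I'm reading them right):

    rank 0: dev0 = (0*2) % 4 = 0, dev1 = (0*2+1) % 4 = 1
    rank 1: dev0 = (1*2) % 4 = 2, dev1 = (1*2+1) % 4 = 3
    rank 2: dev0 = (2*2) % 4 = 0, dev1 = (2*2+1) % 4 = 1
    rank 3: dev0 = (3*2) % 4 = 2, dev1 = (3*2+1) % 4 = 3

So ranks 0 and 2 land on the same pair of GPUs, and so do ranks 1 and 3.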
Here's the output printed in the terminal:
(pt102) [ZDD@localhost _multiprocessing]$ export NCCL_DEBUG=INFO
(pt102) [ZDD@localhost _multiprocessing]$ python 3test.py
Running basic DDP example on rank 2.
Running basic DDP example on rank 3.
Running basic DDP example on rank 1.
Running basic DDP example on rank 0.
localhost:174222:174222 [0] NCCL INFO Bootstrap : Using eno2:172.160.153.254<0>
localhost:174222:174222 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
localhost:174222:174222 [0] NCCL INFO NET/IB : No device found.
localhost:174222:174222 [0] NCCL INFO NET/Socket : Using [0]eno2:172.160.153.254<0> [1]virbr0:192.168.122.1<0>
localhost:174222:174222 [0] NCCL INFO Using network Socket
NCCL version 2.10.3+cuda11.3
localhost:174225:174225 [3] NCCL INFO Bootstrap : Using eno2:172.160.153.254<0>
localhost:174225:174225 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
localhost:174225:174225 [3] NCCL INFO NET/IB : No device found.
localhost:174225:174225 [3] NCCL INFO NET/Socket : Using [0]eno2:172.160.153.254<0> [1]virbr0:192.168.122.1<0>
localhost:174225:174225 [3] NCCL INFO Using network Socket
localhost:174224:174224 [2] NCCL INFO Bootstrap : Using eno2:172.160.153.254<0>
localhost:174224:174224 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
localhost:174224:174224 [2] NCCL INFO NET/IB : No device found.
localhost:174224:174224 [2] NCCL INFO NET/Socket : Using [0]eno2:172.160.153.254<0> [1]virbr0:192.168.122.1<0>
localhost:174224:174224 [2] NCCL INFO Using network Socket
localhost:174223:174223 [1] NCCL INFO Bootstrap : Using eno2:172.160.153.254<0>
localhost:174223:174223 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
localhost:174223:174223 [1] NCCL INFO NET/IB : No device found.
localhost:174223:174223 [1] NCCL INFO NET/Socket : Using [0]eno2:172.160.153.254<0> [1]virbr0:192.168.122.1<0>
localhost:174223:174223 [1] NCCL INFO Using network Socket
localhost:174222:174437 [0] NCCL INFO Channel 00/02 : 0 1 2 3
localhost:174225:174490 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2
localhost:174222:174437 [0] NCCL INFO Channel 01/02 : 0 1 2 3
localhost:174223:174492 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0
localhost:174224:174491 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1
localhost:174222:174437 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1
localhost:174223:174492 [1] NCCL INFO Setting affinity for GPU 1 to 03ff
localhost:174225:174490 [3] NCCL INFO Setting affinity for GPU 3 to 0ffc00
localhost:174224:174491 [2] NCCL INFO Setting affinity for GPU 2 to 0ffc00
localhost:174222:174437 [0] NCCL INFO Setting affinity for GPU 0 to 03ff
localhost:174222:174437 [0] NCCL INFO Channel 00 : 0[18000] -> 1[3b000] via direct shared memory
localhost:174224:174491 [2] NCCL INFO Channel 00 : 2[86000] -> 3[af000] via direct shared memory
localhost:174222:174437 [0] NCCL INFO Channel 01 : 0[18000] -> 1[3b000] via direct shared memory
localhost:174224:174491 [2] NCCL INFO Channel 01 : 2[86000] -> 3[af000] via direct shared memory
localhost:174225:174490 [3] NCCL INFO Channel 00 : 3[af000] -> 0[18000] via direct shared memory
localhost:174223:174492 [1] NCCL INFO Channel 00 : 1[3b000] -> 2[86000] via direct shared memory
localhost:174225:174490 [3] NCCL INFO Channel 01 : 3[af000] -> 0[18000] via direct shared memory
localhost:174223:174492 [1] NCCL INFO Channel 01 : 1[3b000] -> 2[86000] via direct shared memory
localhost:174222:174437 [0] NCCL INFO Connected all rings
localhost:174225:174490 [3] NCCL INFO Connected all rings
localhost:174223:174492 [1] NCCL INFO Connected all rings
localhost:174224:174491 [2] NCCL INFO Connected all rings
localhost:174225:174490 [3] NCCL INFO Channel 00 : 3[af000] -> 2[86000] via direct shared memory
localhost:174225:174490 [3] NCCL INFO Channel 01 : 3[af000] -> 2[86000] via direct shared memory
localhost:174223:174492 [1] NCCL INFO Channel 00 : 1[3b000] -> 0[18000] via direct shared memory
localhost:174223:174492 [1] NCCL INFO Channel 01 : 1[3b000] -> 0[18000] via direct shared memory
localhost:174224:174491 [2] NCCL INFO Channel 00 : 2[86000] -> 1[3b000] via direct shared memory
localhost:174224:174491 [2] NCCL INFO Channel 01 : 2[86000] -> 1[3b000] via direct shared memory
localhost:174222:174437 [0] NCCL INFO Connected all trees
localhost:174222:174437 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512
localhost:174222:174437 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
localhost:174225:174490 [3] NCCL INFO Connected all trees
localhost:174225:174490 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512
localhost:174225:174490 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
localhost:174223:174492 [1] NCCL INFO Connected all trees
localhost:174223:174492 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512
localhost:174223:174492 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
localhost:174224:174491 [2] NCCL INFO Connected all trees
localhost:174224:174491 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512
localhost:174224:174491 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
localhost:174223:174492 [1] NCCL INFO comm 0x7f9734002010 rank 1 nranks 4 cudaDev 1 busId 3b000 - Init COMPLETE
localhost:174224:174491 [2] NCCL INFO comm 0x7f6470002010 rank 2 nranks 4 cudaDev 2 busId 86000 - Init COMPLETE
localhost:174222:174437 [0] NCCL INFO comm 0x7ff730002010 rank 0 nranks 4 cudaDev 0 busId 18000 - Init COMPLETE
localhost:174225:174490 [3] NCCL INFO comm 0x7f4094002010 rank 3 nranks 4 cudaDev 3 busId af000 - Init COMPLETE
localhost:174222:174222 [0] NCCL INFO Launch mode Parallel
Running DDP checkpoint example on rank 0.
Running DDP checkpoint example on rank 2.
Running DDP checkpoint example on rank 1.
Running DDP checkpoint example on rank 3.
localhost:174530:174530 [0] NCCL INFO Bootstrap : Using eno2:172.160.153.254<0>
localhost:174530:174530 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
localhost:174530:174530 [0] NCCL INFO NET/IB : No device found.
localhost:174530:174530 [0] NCCL INFO NET/Socket : Using [0]eno2:172.160.153.254<0> [1]virbr0:192.168.122.1<0>
localhost:174530:174530 [0] NCCL INFO Using network Socket
NCCL version 2.10.3+cuda11.3
localhost:174532:174532 [2] NCCL INFO Bootstrap : Using eno2:172.160.153.254<0>
localhost:174532:174532 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
localhost:174532:174532 [2] NCCL INFO NET/IB : No device found.
localhost:174532:174532 [2] NCCL INFO NET/Socket : Using [0]eno2:172.160.153.254<0> [1]virbr0:192.168.122.1<0>
localhost:174532:174532 [2] NCCL INFO Using network Socket
localhost:174533:174533 [3] NCCL INFO Bootstrap : Using eno2:172.160.153.254<0>
localhost:174533:174533 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
localhost:174533:174533 [3] NCCL INFO NET/IB : No device found.
localhost:174533:174533 [3] NCCL INFO NET/Socket : Using [0]eno2:172.160.153.254<0> [1]virbr0:192.168.122.1<0>
localhost:174533:174533 [3] NCCL INFO Using network Socket
localhost:174531:174531 [1] NCCL INFO Bootstrap : Using eno2:172.160.153.254<0>
localhost:174531:174531 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
localhost:174531:174531 [1] NCCL INFO NET/IB : No device found.
localhost:174531:174531 [1] NCCL INFO NET/Socket : Using [0]eno2:172.160.153.254<0> [1]virbr0:192.168.122.1<0>
localhost:174531:174531 [1] NCCL INFO Using network Socket
localhost:174530:174718 [0] NCCL INFO Channel 00/02 : 0 1 2 3
localhost:174533:174778 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2
localhost:174530:174718 [0] NCCL INFO Channel 01/02 : 0 1 2 3
localhost:174531:174779 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0
localhost:174530:174718 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1
localhost:174532:174765 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1
localhost:174530:174718 [0] NCCL INFO Setting affinity for GPU 0 to 03ff
localhost:174531:174779 [1] NCCL INFO Setting affinity for GPU 1 to 03ff
localhost:174533:174778 [3] NCCL INFO Setting affinity for GPU 3 to 0ffc00
localhost:174532:174765 [2] NCCL INFO Setting affinity for GPU 2 to 0ffc00
localhost:174530:174718 [0] NCCL INFO Channel 00 : 0[18000] -> 1[3b000] via direct shared memory
localhost:174530:174718 [0] NCCL INFO Channel 01 : 0[18000] -> 1[3b000] via direct shared memory
localhost:174533:174778 [3] NCCL INFO Channel 00 : 3[af000] -> 0[18000] via direct shared memory
localhost:174532:174765 [2] NCCL INFO Channel 00 : 2[86000] -> 3[af000] via direct shared memory
localhost:174533:174778 [3] NCCL INFO Channel 01 : 3[af000] -> 0[18000] via direct shared memory
localhost:174532:174765 [2] NCCL INFO Channel 01 : 2[86000] -> 3[af000] via direct shared memory
localhost:174531:174779 [1] NCCL INFO Channel 00 : 1[3b000] -> 2[86000] via direct shared memory
localhost:174531:174779 [1] NCCL INFO Channel 01 : 1[3b000] -> 2[86000] via direct shared memory
localhost:174530:174718 [0] NCCL INFO Connected all rings
localhost:174533:174778 [3] NCCL INFO Connected all rings
localhost:174533:174778 [3] NCCL INFO Channel 00 : 3[af000] -> 2[86000] via direct shared memory
localhost:174532:174765 [2] NCCL INFO Connected all rings
localhost:174531:174779 [1] NCCL INFO Connected all rings
localhost:174533:174778 [3] NCCL INFO Channel 01 : 3[af000] -> 2[86000] via direct shared memory
localhost:174532:174765 [2] NCCL INFO Channel 00 : 2[86000] -> 1[3b000] via direct shared memory
localhost:174531:174779 [1] NCCL INFO Channel 00 : 1[3b000] -> 0[18000] via direct shared memory
localhost:174532:174765 [2] NCCL INFO Channel 01 : 2[86000] -> 1[3b000] via direct shared memory
localhost:174531:174779 [1] NCCL INFO Channel 01 : 1[3b000] -> 0[18000] via direct shared memory
localhost:174530:174718 [0] NCCL INFO Connected all trees
localhost:174530:174718 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512
localhost:174530:174718 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
localhost:174533:174778 [3] NCCL INFO Connected all trees
localhost:174533:174778 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512
localhost:174533:174778 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
localhost:174532:174765 [2] NCCL INFO Connected all trees
localhost:174532:174765 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512
localhost:174532:174765 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
localhost:174531:174779 [1] NCCL INFO Connected all trees
localhost:174531:174779 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512
localhost:174531:174779 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
localhost:174532:174765 [2] NCCL INFO comm 0x7f1b8c002010 rank 2 nranks 4 cudaDev 2 busId 86000 - Init COMPLETE
localhost:174530:174718 [0] NCCL INFO comm 0x7f7a98002010 rank 0 nranks 4 cudaDev 0 busId 18000 - Init COMPLETE
localhost:174533:174778 [3] NCCL INFO comm 0x7f9e40002010 rank 3 nranks 4 cudaDev 3 busId af000 - Init COMPLETE
localhost:174530:174530 [0] NCCL INFO Launch mode Parallel
localhost:174531:174779 [1] NCCL INFO comm 0x7f06ac002010 rank 1 nranks 4 cudaDev 1 busId 3b000 - Init COMPLETE
Running DDP with model parallel example on rank 3.
Running DDP with model parallel example on rank 1.
Running DDP with model parallel example on rank 0.
Running DDP with model parallel example on rank 2.
localhost:174829:174829 [0] NCCL INFO Bootstrap : Using eno2:172.160.153.254<0>
localhost:174829:174829 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
localhost:174829:174829 [0] NCCL INFO NET/IB : No device found.
localhost:174829:174829 [0] NCCL INFO NET/Socket : Using [0]eno2:172.160.153.254<0> [1]virbr0:192.168.122.1<0>
localhost:174829:174829 [0] NCCL INFO Using network Socket
NCCL version 2.10.3+cuda11.3
localhost:174832:174832 [0] NCCL INFO Bootstrap : Using eno2:172.160.153.254<0>
localhost:174832:174832 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
localhost:174832:174832 [0] NCCL INFO NET/IB : No device found.
localhost:174832:174832 [0] NCCL INFO NET/Socket : Using [0]eno2:172.160.153.254<0> [1]virbr0:192.168.122.1<0>
localhost:174832:174832 [0] NCCL INFO Using network Socket
localhost:174830:174830 [2] NCCL INFO Bootstrap : Using eno2:172.160.153.254<0>
localhost:174830:174830 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
localhost:174830:174830 [2] NCCL INFO NET/IB : No device found.
localhost:174830:174830 [2] NCCL INFO NET/Socket : Using [0]eno2:172.160.153.254<0> [1]virbr0:192.168.122.1<0>
localhost:174830:174830 [2] NCCL INFO Using network Socket
localhost:174833:174833 [2] NCCL INFO Bootstrap : Using eno2:172.160.153.254<0>
localhost:174833:174833 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
localhost:174833:174833 [2] NCCL INFO NET/IB : No device found.
localhost:174833:174833 [2] NCCL INFO NET/Socket : Using [0]eno2:172.160.153.254<0> [1]virbr0:192.168.122.1<0>
localhost:174833:174833 [2] NCCL INFO Using network Socket
localhost:174833:175151 [2] init.cc:521 NCCL WARN Duplicate GPU detected : rank 3 and rank 1 both on CUDA device 86000
localhost:174833:175151 [2] NCCL INFO init.cc:904 -> 5
localhost:174833:175151 [2] NCCL INFO group.cc:72 -> 5 [Async thread]
localhost:174832:175097 [0] init.cc:521 NCCL WARN Duplicate GPU detected : rank 2 and rank 0 both on CUDA device 18000
localhost:174830:175150 [2] init.cc:521 NCCL WARN Duplicate GPU detected : rank 1 and rank 3 both on CUDA device 86000
localhost:174829:175096 [0] init.cc:521 NCCL WARN Duplicate GPU detected : rank 0 and rank 2 both on CUDA device 18000
localhost:174832:175097 [0] NCCL INFO init.cc:904 -> 5
localhost:174830:175150 [2] NCCL INFO init.cc:904 -> 5
localhost:174829:175096 [0] NCCL INFO init.cc:904 -> 5
localhost:174830:175150 [2] NCCL INFO group.cc:72 -> 5 [Async thread]
localhost:174832:175097 [0] NCCL INFO group.cc:72 -> 5 [Async thread]
localhost:174829:175096 [0] NCCL INFO group.cc:72 -> 5 [Async thread]
Traceback (most recent call last):
  File "3test.py", line 170, in <module>
    run_demo(demo_model_parallel, world_size)
  File "3test.py", line 74, in run_demo
    join=True)
  File "/home/ZDD/anaconda3/envs/pt102/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 230, in spawn
    return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
  File "/home/ZDD/anaconda3/envs/pt102/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 188, in start_processes
    while not context.join():
  File "/home/ZDD/anaconda3/envs/pt102/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 150, in join
    raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:

-- Process 0 terminated with the following error:
Traceback (most recent call last):
  File "/home/ZDD/anaconda3/envs/pt102/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 59, in _wrap
    fn(i, *args)
  File "/home/ZDD/learn_pt/_multiprocessing/3test.py", line 147, in demo_model_parallel
    ddp_mp_model = DDP(mp_model)
  File "/home/ZDD/anaconda3/envs/pt102/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 578, in __init__
    dist._verify_model_across_ranks(self.process_group, parameters)
RuntimeError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:957, invalid usage, NCCL version 21.0.3
ncclInvalidUsage: This usually reflects invalid usage of NCCL library (such as too many async ops, too many collectives at once, mixing streams in a group, etc).
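The "Duplicate GPU detected" warnings match the mapping I worked out above: rank 0 and rank 2 both try to build the NCCL communicator on CUDA device 18000 (GPU 0), and ranks 1 and 3 collide on 86000 (GPU 2). So it looks like NCCL refuses to put two ranks on the same device, while gloo tolerates it. My guess (unverified, just a sketch assuming each rank is supposed to own its own disjoint pair of GPUs) is that the model-parallel demo should drop the modulo and be spawned with half as many processes:

# Sketch of my guess, not an official fix: give each rank a disjoint
# pair of GPUs and spawn only n_gpus // 2 processes.
def demo_model_parallel(rank, world_size):
    print(f"Running DDP with model parallel example on rank {rank}.")
    setup(rank, world_size)

    # rank 0 -> GPUs 0 and 1, rank 1 -> GPUs 2 and 3: no overlap between ranks
    dev0 = rank * 2
    dev1 = rank * 2 + 1
    mp_model = ToyMpModel(dev0, dev1)
    ddp_mp_model = DDP(mp_model)

    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_mp_model.parameters(), lr=0.001)

    optimizer.zero_grad()
    outputs = ddp_mp_model(torch.randn(20, 10))  # outputs end up on dev1
    labels = torch.randn(20, 5).to(dev1)
    loss_fn(outputs, labels).backward()
    optimizer.step()

    cleanup()

# and in __main__:
#     run_demo(demo_model_parallel, n_gpus // 2)

Is that the intended way to run this demo with 'nccl', or is something wrong with my environment?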
Here's the PyTorch and GPU information:
(pt102) [ZDD@localhost _multiprocessing]$ python
Python 3.7.11 (default, Jul 27 2021, 14:32:16)
[GCC 7.5.0] :: Anaconda, Inc. on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import torch
>>> torch.__version__
'1.10.2+cu113'
(pt102) [ZDD@localhost _multiprocessing]$ nvidia-smi -L
GPU 0: NVIDIA GeForce RTX 2080 Ti (UUID: GPU-23f3423a-d4dc-3935-9cae-7f78b3fcd0b8)
GPU 1: NVIDIA GeForce RTX 2080 Ti (UUID: GPU-721a94a1-9f09-930e-a8d8-78213e5e3c31)
GPU 2: NVIDIA GeForce RTX 2080 Ti (UUID: GPU-004f8588-04a5-1a05-9f16-f48e5bfa8064)
GPU 3: NVIDIA GeForce RTX 2080 Ti (UUID: GPU-4104537b-4ce7-3d92-534e-80e4263a2abe)
(pt102) [ZDD@localhost _multiprocessing]$ nvidia-smi
Sun Feb 27 04:15:54 2022
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.54 Driver Version: 510.54 CUDA Version: 11.6 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 NVIDIA GeForce ... On | 00000000:18:00.0 Off | N/A |
| 27% 25C P8 1W / 250W | 0MiB / 11264MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 1 NVIDIA GeForce ... On | 00000000:3B:00.0 Off | N/A |
| 27% 27C P8 14W / 250W | 0MiB / 11264MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 2 NVIDIA GeForce ... On | 00000000:86:00.0 Off | N/A |
| 27% 26C P8 21W / 250W | 0MiB / 11264MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
| 3 NVIDIA GeForce ... On | 00000000:AF:00.0 Off | N/A |
| 27% 25C P8 3W / 250W | 0MiB / 11264MiB | 0% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| No running processes found |
+-----------------------------------------------------------------------------+
By the way, I have the same issue in another virtual env (torch 1.8.1 + CUDA 10.1), and 'gloo' still works fine there.
This is my first time posting, so forgive me if anything is not proper. I hope the issue can be fixed. Thanks!