Hi, I am running the following code on an Ubuntu machine with 2 GPUs:
import argparse
import torch
import os
import torch.distributed

def distributed_training_init(model, backend='nccl', sync_bn=False):
    if sync_bn:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

    rank = int(os.environ['RANK'])
    world_size = int(os.environ['WORLD_SIZE'])
    gpu = int(os.environ['LOCAL_RANK'])
    print(rank, world_size, gpu)
    torch.distributed.init_process_group(backend, world_size=world_size,
                                         rank=rank, init_method='env://')
    print('gpu', gpu)

    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu])
    return model

if __name__ == "__main__":
    args = argparse.ArgumentParser()
    args.add_argument('--local_rank', type=int)
    args = args.parse_args()
    print(args)
    net = torch.nn.Linear(3, 4)
    net.to('cuda:0')
    net = distributed_training_init(net)
# python -m torch.distributed.launch --nproc_per_node=2 w1.py
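When I launch it with torch.distributed.launch (the command in the comment above), both worker processes crash inside the DistributedDataParallel constructor. My current guess is that net.to('cuda:0') puts the model on GPU 0 in both processes while device_ids=[gpu] tells rank 1 to use GPU 1, but I am not sure this is really the cause. For reference, this is the per-rank placement I think the function should be doing (just a sketch of my understanding, not tested yet):

def distributed_training_init(model, backend='nccl', sync_bn=False):
    rank = int(os.environ['RANK'])
    world_size = int(os.environ['WORLD_SIZE'])
    gpu = int(os.environ['LOCAL_RANK'])
    # pin this process to its own device before any NCCL communication,
    # so rank 0 and rank 1 do not both end up on cuda:0
    torch.cuda.set_device(gpu)
    torch.distributed.init_process_group(backend, world_size=world_size,
                                         rank=rank, init_method='env://')
    if sync_bn:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    model = model.to(f'cuda:{gpu}')  # per-rank device instead of a hard-coded 'cuda:0'
    return torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu])

This is the full output I get with the original script: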
(pytorch-env) wfang@Precision-5820-Tower-X-Series:~/tempdir$ python -m torch.distributed.launch --nproc_per_node=2 w1.py
*****************************************
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
*****************************************
Namespace(local_rank=0)
Namespace(local_rank=1)
0 2 0
1 2 1
gpu 0
gpu 1
Traceback (most recent call last):
  File "/home/wfang/tempdir/w1.py", line 28, in <module>
    net = distributed_training_init(net)
  File "/home/wfang/tempdir/w1.py", line 18, in distributed_training_init
    model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu])
  File "/home/wfang/anaconda3/envs/pytorch-env/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 446, in __init__
    self._sync_params_and_buffers(authoritative_rank=0)
  File "/home/wfang/anaconda3/envs/pytorch-env/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 457, in _sync_params_and_buffers
    self._distributed_broadcast_coalesced(
  File "/home/wfang/anaconda3/envs/pytorch-env/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1155, in _distributed_broadcast_coalesced
    dist._broadcast_coalesced(
RuntimeError: NCCL error in: /opt/conda/conda-bld/pytorch_1614378073850/work/torch/lib/c10d/ProcessGroupNCCL.cpp:825, invalid usage, NCCL version 2.7.8
ncclInvalidUsage: This usually reflects invalid usage of NCCL library (such as too many async ops, too many collectives at once, mixing streams in a group, etc).
(the second worker process prints an identical traceback, interleaved with this one in the console)
Killing subprocess 2167196
Killing subprocess 2167197
Traceback (most recent call last):
  File "/home/wfang/anaconda3/envs/pytorch-env/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/home/wfang/anaconda3/envs/pytorch-env/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/home/wfang/anaconda3/envs/pytorch-env/lib/python3.9/site-packages/torch/distributed/launch.py", line 340, in <module>
    main()
  File "/home/wfang/anaconda3/envs/pytorch-env/lib/python3.9/site-packages/torch/distributed/launch.py", line 326, in main
    sigkill_handler(signal.SIGTERM, None)  # not coming back
  File "/home/wfang/anaconda3/envs/pytorch-env/lib/python3.9/site-packages/torch/distributed/launch.py", line 301, in sigkill_handler
    raise subprocess.CalledProcessError(returncode=last_return_code, cmd=cmd)
subprocess.CalledProcessError: Command '['/home/wfang/anaconda3/envs/pytorch-env/bin/python', '-u', 'w1.py', '--local_rank=1']' returned non-zero exit status 1.
The machine has two GPUs:
(pytorch-env) wfang@Precision-5820-Tower-X-Series:~/tempdir$ nvidia-smi
Thu Sep 30 19:48:19 2021
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.32.00    Driver Version: 455.32.00    CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  GeForce RTX 208...  On   | 00000000:17:00.0 Off |                  N/A |
| 18%   44C    P8    16W / 250W |      0MiB / 11019MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 208...  On   | 00000000:B3:00.0 Off |                  N/A |
| 35%   56C    P0    66W / 250W |      0MiB / 11011MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+
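For completeness, this is the kind of snippet (separate from the failing script) that I use to check which PyTorch/NCCL build is installed in pytorch-env and how many CUDA devices it can see:

import torch

print(torch.__version__)              # PyTorch build installed in pytorch-env
print(torch.cuda.device_count())      # how many CUDA devices PyTorch can see
print(torch.cuda.nccl.version())      # NCCL version bundled with this build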