Hi, I am using the NCCL
backend with dist.send
and dist.recv
to perform multi-node training on separate machines, but I get a "system call failed" error during the process.
Here is the minimal reproduce code
Client
import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def main():
    """Spawn a single worker process for the receiving (client) side."""
    mp.spawn(main_worker, nprocs=1, args=(2,))


def main_worker(rank, n):
    """Join a 2-rank NCCL job as global rank 1 and receive a 10x10 tensor.

    NOTE(review): in the attached logs this node selects the InfiniBand
    transport ("Using network IB") while the server node selects plain
    sockets ("Using network Socket"). Both ranks must agree on the NCCL
    network transport; exporting NCCL_IB_DISABLE=1 (or pinning a common
    interface with NCCL_SOCKET_IFNAME) on BOTH machines is the usual fix
    for the "Connection closed by remote peer" failure seen here.
    """
    print("begin ")
    # `rank` is the local spawn index (always 0 with nprocs=1); the
    # global rank of this process is hard-coded to 1 below.
    dist.init_process_group(backend='nccl',
                            init_method='tcp://xx.xx.x.30:23456',
                            rank=1, world_size=2)
    print(rank)
    # Receive buffer on GPU 1 of this machine; dist.recv fills it in place.
    # (Renamed from `input` to avoid shadowing the builtin.)
    buf = torch.rand([10, 10]).to(1)
    dist.recv(buf, 0)
    print("finish")


if __name__ == '__main__':
    main()
And here is the server
import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def main():
    """Spawn a single worker process for the sending (server) side."""
    mp.spawn(main_worker, nprocs=1, args=(2,))


def main_worker(rank, n):
    """Join a 2-rank NCCL job as global rank 0 and send a 10x10 tensor.

    NOTE(review): per the logs, this node falls back to the socket
    transport (libibverbs.so failed to load) while the peer uses IB.
    Set NCCL_IB_DISABLE=1 (or NCCL_SOCKET_IFNAME=<common nic>) on both
    machines so the two ranks negotiate the same transport.
    """
    print("begin 0")
    # `rank` is the local spawn index (always 0 with nprocs=1); the
    # global rank of this process is hard-coded to 0 below.
    dist.init_process_group(backend='nccl',
                            init_method='tcp://xx.xx.x.30:23456',
                            rank=0, world_size=2)
    print(rank)
    # Payload on GPU 0 of this machine, sent to global rank 1.
    # (Renamed from `input` to avoid shadowing the builtin.)
    buf = torch.rand([10, 10]).to(0)
    dist.send(buf, 1)


if __name__ == '__main__':
    main()
Both processes initialize successfully, but they cannot send and receive.
The result(error) is
Client
begin
0
x1:870114:870114 [1] NCCL INFO Bootstrap : Using eno1:xx.xx.x.xx<0>
x1:870114:870114 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
x1:870114:870114 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 0.
x1:870114:870114 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE ; OOB eno1:xx.xx.x.16<0>
x1:870114:870114 [1] NCCL INFO Using network IB
x1:870114:870206 [1] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] 0/-1/-1->1->-1
x1:870114:870206 [1] NCCL INFO Setting affinity for GPU 1 to 3ff003ff
x1:870114:870206 [1] NCCL INFO Channel 00 : 0[5000] -> 1[3000] [receive] via NET/IB/0
x1:870114:870206 [1] NCCL INFO Channel 01 : 0[5000] -> 1[3000] [receive] via NET/IB/0
x1:870114:870206 [1] NCCL INFO Channel 00 : 1[3000] -> 0[5000] [send] via NET/IB/0
x1:870114:870206 [1] NCCL INFO Channel 01 : 1[3000] -> 0[5000] [send] via NET/IB/0
x1:870114:870206 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer xx.xx.x.30<45376>
x1:870114:870206 [1] NCCL INFO include/socket.h:445 -> 2
x1:870114:870206 [1] NCCL INFO include/socket.h:457 -> 2
x1:870114:870206 [1] NCCL INFO transport/net_ib.cc:505 -> 2
x1:870114:870206 [1] NCCL INFO include/net.h:22 -> 2
x1:870114:870206 [1] NCCL INFO transport/net.cc:234 -> 2
x1:870114:870206 [1] NCCL INFO transport.cc:119 -> 2
x1:870114:870206 [1] NCCL INFO init.cc:778 -> 2
x1:870114:870206 [1] NCCL INFO init.cc:904 -> 2
x1:870114:870206 [1] NCCL INFO group.cc:72 -> 2 [Async thread]
Traceback (most recent call last):
File "test.py", line 28, in <module>
main()
File "test.py", line 6, in main
mp.spawn(main_worker, nprocs=1, args=(2,))
File "/home/haokang/miniconda3/envs/3.8/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 240, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/home/haokang/miniconda3/envs/3.8/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 198, in start_processes
while not context.join():
File "/home/haokang/miniconda3/envs/3.8/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 160, in join
raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:
-- Process 0 terminated with the following error:
Traceback (most recent call last):
File "/home/haokang/miniconda3/envs/3.8/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
fn(i, *args)
File "/NFS/home/haokang/test.py", line 23, in main_worker
dist.recv(input,0)
File "/home/haokang/miniconda3/envs/3.8/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 1002, in recv
pg.recv([tensor], src, tag).wait()
RuntimeError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1169, unhandled system error, NCCL version 21.0.3
ncclSystemError: System call (socket, malloc, munmap, etc) failed.
Server
begin 0
0
x0:26431:26431 [0] NCCL INFO Bootstrap : Using enp14s0:0:xx.xx.x.xx<0>
x0:26431:26431 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
x0:26431:26431 [0] misc/ibvwrap.cc:63 NCCL WARN Failed to open libibverbs.so[.1]
x0:26431:26431 [0] NCCL INFO NET/Socket : Using [0]enp14s0:0:xx.xx.x.30<0> [1]lxdbr0:10.192.229.1<0> [2]br-a2bb5c34e1e1:172.18.0.1<0> [3]vethVPT6RK:fe80::fc8b:a2ff:fe77:b6d0%vethVPT6RK<0> [4]vethf00825f:fe80::5811:50ff:fe0f:65b6%vethf00825f<0> [5]veth88c9646:fe80::18b8:e8ff:feaf:7b41%veth88c9646<0> [6]veth6755258:fe80::7c74:71ff:fe97:44dc%veth6755258<0>
x0:26431:26431 [0] NCCL INFO Using network Socket
NCCL version 2.10.3+cuda10.2
x0:26431:26608 [0] NCCL INFO Channel 00/02 : 0 1
x0:26431:26608 [0] NCCL INFO Channel 01/02 : 0 1
x0:26431:26608 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] -1/-1/-1->0->1
x0:26431:26608 [0] NCCL INFO Setting affinity for GPU 0 to 0fff
x0:26431:26608 [0] NCCL INFO Channel 00 : 1[3000] -> 0[5000] [receive] via NET/Socket/0
x0:26431:26608 [0] NCCL INFO Channel 01 : 1[3000] -> 0[5000] [receive] via NET/Socket/0
x0:26431:26608 [0] NCCL INFO Channel 00 : 0[5000] -> 1[3000] [send] via NET/Socket/0
x0:26431:26608 [0] NCCL INFO Channel 01 : 0[5000] -> 1[3000] [send] via NET/Socket/0
x0:26431:26608 [0] NCCL INFO Connected all rings
x0:26431:26608 [0] NCCL INFO Connected all trees
x0:26431:26608 [0] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 8/8/512
x0:26431:26608 [0] NCCL INFO 2 coll channels, 2 p2p channels, 1 p2p channels per peer
x0:26431:26608 [0] NCCL INFO comm 0x7efec4001200 rank 0 nranks 2 cudaDev 0 busId 5000 - Init COMPLETE
x0:26431:26431 [0] NCCL INFO Launch mode Parallel
It seems that something went wrong on the client side.
When I print the debug info
x1:947320:947407 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer
Why was the connection closed, given that initialization succeeded?