We’ve built PyTorch from source and tried to call send/recv, but the program fails with a segmentation fault. Could you tell me what I am doing wrong? Here is a toy program:
1 import os
2 import socket
3 import torch
4 import torch.distributed as dist
5 from torch.multiprocessing import Process
6
7
def run(rank, size, hostname):
    """Pass a one-element CUDA tensor from rank 0 to rank 1 with blocking send/recv.

    Args:
        rank: Rank of this process; also used as the index of the CUDA
            device on which the tensor is allocated.
        size: Total number of processes (only reported in the greeting).
        hostname: Name of the host this process runs on (only reported in
            the greeting).

    NOTE(review): the tensor lives on a GPU, so with the MPI backend this
    presumably requires a CUDA-aware MPI build; a non-CUDA-aware MPI is a
    plausible cause of a crash inside send/recv -- TODO confirm.
    """
    print(f"I am {rank} of {size} in {hostname}")
    tensor = torch.zeros(1, device=torch.device(f'cuda:{rank}'))
    if rank == 0:
        tensor += 1
        # Send the tensor to process 1
        dist.send(tensor=tensor, dst=1)
    elif rank == 1:
        # Receive tensor from process 0.  Rank 0 only ever sends to rank 1,
        # so any additional rank must not block in recv waiting for a
        # message that is never sent.
        dist.recv(tensor=tensor, src=0)
    print('Rank ', rank, ' has data ', tensor[0])
19
20
def init_processes(rank, size, hostname, fn, backend='gloo'):
    """Initialize the distributed environment, then run ``fn``.

    Args:
        rank: Rank of this process within the process group.
        size: Total number of processes in the group (the world size).
        hostname: Host name forwarded unchanged to ``fn``.
        fn: Callable invoked as ``fn(rank, size, hostname)`` once the
            process group has been initialized.
        backend: Name of the ``torch.distributed`` backend. The default used
            to be ``'tcp'``, which PyTorch 1.x rejects as deprecated, so the
            default path could never succeed; ``'gloo'`` is its replacement.
    """
    dist.init_process_group(backend, rank=rank, world_size=size)
    fn(rank, size, hostname)
25
26
if __name__ == "__main__":
    # The job size and this process's rank are read from the OMPI_COMM_WORLD_*
    # environment variables (size first, then rank); a missing variable
    # raises KeyError.
    world_size, world_rank = (
        int(os.environ[key])
        for key in ('OMPI_COMM_WORLD_SIZE', 'OMPI_COMM_WORLD_RANK')
    )
    hostname = socket.gethostname()
    init_processes(
        rank=world_rank,
        size=world_size,
        hostname=hostname,
        fn=run,
        backend='mpi',
    )
The cluster I use is managed by Slurm. Here is the list of loaded modules:
1) /gpu/cuda-10.0 2) /mpi/hpcx-v2.4.0 3) /python/python-3.6.8 4) /python/pytorch-1.3.0,
where pytorch-1.3.0 is installed from source.
And this is how I call this script:
mpirun -np 2 python3 pytorch_distributed.py
The error output is as follows:
I am 1 of 2 in gn10.zhores
I am 0 of 2 in gn10.zhores
--------------------------------------------------------------------------
Primary job terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpirun noticed that process rank 0 with PID 11502 on node gn10 exited on signal 11 (Segmentation fault).
--------------------------------------------------------------------------