I am using dist.send and dist.recv to send tensors using backend == ‘nccl’.
When I am trying to send torch.ShortTensor, some errors came.
Also backend == ‘gloo’ will cause trouble, too.
here is the code example
import torch.multiprocessing as mp
import torch
import torch.distributed as dist
def testfunc(rank,nothing):
print(rank)
dist.init_process_group(
backend="nccl", init_method="tcp://127.0.0.1:1214", world_size=2, rank=rank,group_name="test"
)
if rank == 0:
# something = torch.rand([1,2]).to(0)
something = torch.rand([1,2]).type(torch.cuda.ShortTensor).to(0)
dist.send(something,1)
print(something)
if rank == 1:
# something = torch.rand([1,2]).to(3)
something = torch.rand([1,2]).type(torch.cuda.ShortTensor).to(3)
dist.recv(something,0)
def main():
torch.multiprocessing.set_start_method("spawn")
for i in range(2):
# print(i)
p = mp.Process(target = testfunc,args = (i,1))
p.start()
if __name__ == '__main__':
main()
here is the result
0
1
Process Process-2:
Process Process-1:
Traceback (most recent call last):
Traceback (most recent call last):
File "/home/haokang/anaconda3/envs/kh3.8/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/home/haokang/anaconda3/envs/kh3.8/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/home/haokang/distributed/test.py", line 22, in testfunc
dist.recv(something,0)
File "/home/haokang/anaconda3/envs/kh3.8/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 1002, in recv
pg.recv([tensor], src, tag).wait()
File "/home/haokang/anaconda3/envs/kh3.8/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/home/haokang/anaconda3/envs/kh3.8/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
RuntimeError: Unconvertible NCCL type Short
File "/home/haokang/distributed/test.py", line 16, in testfunc
dist.send(something,1)
File "/home/haokang/anaconda3/envs/kh3.8/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 959, in send
default_pg.send([tensor], dst, tag).wait()
RuntimeError: Unconvertible NCCL type Short