How to use dist.send to send torch.cuda.ShortTensor or torch.cuda.CharTensor

I am using dist.send and dist.recv to send tensors using backend == ‘nccl’.
When I am trying to send torch.ShortTensor, some errors came.
Also backend == ‘gloo’ will cause trouble, too.
here is the code example

import torch.multiprocessing as mp
import torch

import torch.distributed as dist

def testfunc(rank,nothing):
    print(rank)
    dist.init_process_group(
        backend="nccl", init_method="tcp://127.0.0.1:1214", world_size=2, rank=rank,group_name="test"
    )
    if rank == 0:
        # something = torch.rand([1,2]).to(0)
        something = torch.rand([1,2]).type(torch.cuda.ShortTensor).to(0)
        
        dist.send(something,1)
        print(something)
    if rank == 1:
        # something = torch.rand([1,2]).to(3)
        something = torch.rand([1,2]).type(torch.cuda.ShortTensor).to(3)
        
        dist.recv(something,0)


    
    
def main():
    torch.multiprocessing.set_start_method("spawn")
    for i in range(2):
        # print(i)
        p = mp.Process(target = testfunc,args = (i,1))
        p.start()




if __name__ == '__main__':
    main()

here is the result

0
1
Process Process-2:
Process Process-1:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/haokang/anaconda3/envs/kh3.8/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/haokang/anaconda3/envs/kh3.8/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/haokang/distributed/test.py", line 22, in testfunc
    dist.recv(something,0)
  File "/home/haokang/anaconda3/envs/kh3.8/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 1002, in recv
    pg.recv([tensor], src, tag).wait()
  File "/home/haokang/anaconda3/envs/kh3.8/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/haokang/anaconda3/envs/kh3.8/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
RuntimeError: Unconvertible NCCL type Short
  File "/home/haokang/distributed/test.py", line 16, in testfunc
    dist.send(something,1)
  File "/home/haokang/anaconda3/envs/kh3.8/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 959, in send
    default_pg.send([tensor], dst, tag).wait()
RuntimeError: Unconvertible NCCL type Short

Hi, @Maxwell_Albert, do you have the actual error msg?

import torch.multiprocessing as mp
import torch

import torch.distributed as dist

def testfunc(rank,nothing):
    print(rank)
    dist.init_process_group(
        backend="nccl", init_method="tcp://127.0.0.1:1214", world_size=2, rank=rank,group_name="test"
    )
    if rank == 0:
        # something = torch.rand([1,2]).to(0)
        something = torch.rand([1,2]).type(torch.cuda.ShortTensor).to(0)
        
        dist.send(something,1)
        print(something)
    if rank == 1:
        # something = torch.rand([1,2]).to(3)
        something = torch.rand([1,2]).type(torch.cuda.ShortTensor).to(3)
        
        dist.recv(something,0)


    
    
def main():
    torch.multiprocessing.set_start_method("spawn")
    for i in range(2):
        # print(i)
        p = mp.Process(target = testfunc,args = (i,1))
        p.start()




if __name__ == '__main__':
    main()

here is the code example
and here is the error

0
1
Process Process-2:
Process Process-1:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/haokang/anaconda3/envs/kh3.8/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/haokang/anaconda3/envs/kh3.8/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/haokang/distributed/test.py", line 22, in testfunc
    dist.recv(something,0)
  File "/home/haokang/anaconda3/envs/kh3.8/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 1002, in recv
    pg.recv([tensor], src, tag).wait()
  File "/home/haokang/anaconda3/envs/kh3.8/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/haokang/anaconda3/envs/kh3.8/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
RuntimeError: Unconvertible NCCL type Short
  File "/home/haokang/distributed/test.py", line 16, in testfunc
    dist.send(something,1)
  File "/home/haokang/anaconda3/envs/kh3.8/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 959, in send
    default_pg.send([tensor], dst, tag).wait()
RuntimeError: Unconvertible NCCL type Short

Thanks for sharing your code example here. It looks like ShortTensor is not supported. Is it intentional that you want to use ShortTensor here?

cc: @cbalioglu

Hi!
Sure I want to use short tensor to represent some quantization tensors. I tried ‘gloo’ backend and it is successful. Maybe just ‘nccl’ does not support it.