I am trying to all gather a list of qint8
tensors on Pytorch 1.10.1 with the following code:
import torch
import torch.multiprocessing as mp
import torch.distributed as dist
import os
def setup(rank, world_size):
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '17777'
# initialize the process group
dist.init_process_group("nccl", rank=rank, world_size=world_size)
def subprocess_fn(rank):
setup(rank, 4)
test_tensor = torch.Tensor([rank]).cuda(rank)
tensor_list=[torch.ones(1,dtype=torch.qint8).cuda(rank) for _ in range(4)]
print(f'Before broadcast and quant data on rank {rank} is {test_tensor},type is {type(test_tensor)}.')
dist.all_gather(tensor_list,torch.quantize_per_tensor(inputx=test_tensor,scale=test_tensor.max(),zero_point=0,dtype=torch.qint8))
print(f'After broadcast data on rank {rank} is {tensor_list}, dtype is {tensor_list[0].dtype}.')
tensor_list=[n.float() for n in tensor_list]
print(f'After broadcast and expand data on rank {rank} is {tensor_list}, dtype is {tensor_list[0].dtype}.')
if __name__ == '__main__':
mp.spawn(subprocess_fn, args=(), nprocs=4)
and I got the following error:
Traceback (most recent call last):
File "/data/users/heyangqin/anaconda3/envs/deepspeed_fp8/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 59, in _wrap
fn(i, *args)
File "/vc_data/users/heyangqin/fp8/fp8_dist_test.py", line 18, in subprocess_fn
tensor_list=[torch.ones(1,dtype=torch.qint8).cuda(rank) for _ in range(4)]
File "/vc_data/users/heyangqin/fp8/fp8_dist_test.py", line 18, in <listcomp>
tensor_list=[torch.ones(1,dtype=torch.qint8).cuda(rank) for _ in range(4)]
NotImplementedError: Could not run 'aten::empty.memory_format' with arguments from the 'QuantizedCPU' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). If you are a Facebook employee using PyTorch on mobile, please visit https://fburl.com/ptmfixes for possible resolutions. 'aten::empty.memory_format' is only available for these backends: [CPU, CUDA, Meta, MkldnnCPU, SparseCPU, SparseCUDA, BackendSelect, Python, Named, Conjugate, Negative, ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradXLA, AutogradLazy, AutogradXPU, AutogradMLC, AutogradHPU, AutogradNestedTensor, AutogradPrivateUse1, AutogradPrivateUse2, AutogradPrivateUse3, Tracer, UNKNOWN_TENSOR_TYPE_ID, Autocast, Batched, VmapMode].
Is there any way to work around this or the torch.distributed
cannot work with quantized tensors yet? Thank you!