How to use quantized tensors in torch.distributed?

I am trying to all_gather a list of qint8 tensors on PyTorch 1.10.1 with the following code:

import torch
import torch.multiprocessing as mp
import torch.distributed as dist
import os

def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '17777'
    # initialize the process group
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

def subprocess_fn(rank):
    setup(rank, 4)
    test_tensor = torch.Tensor([rank]).cuda(rank)
    # This line raises the NotImplementedError below: qint8 tensors
    # cannot be created through the regular factory functions.
    tensor_list = [torch.ones(1, dtype=torch.qint8).cuda(rank) for _ in range(4)]
    print(f'Before broadcast and quant data on rank {rank} is {test_tensor}, type is {type(test_tensor)}.')
    print(f'After broadcast data on rank {rank} is {tensor_list}, dtype is {tensor_list[0].dtype}.')
    tensor_list = [n.float() for n in tensor_list]
    print(f'After broadcast and expand data on rank {rank} is {tensor_list}, dtype is {tensor_list[0].dtype}.')

if __name__ == '__main__':
    mp.spawn(subprocess_fn, args=(), nprocs=4)

and I got the following error:

Traceback (most recent call last):
  File "/data/users/heyangqin/anaconda3/envs/deepspeed_fp8/lib/python3.9/site-packages/torch/multiprocessing/", line 59, in _wrap
    fn(i, *args)
  File "/vc_data/users/heyangqin/fp8/", line 18, in subprocess_fn
    tensor_list=[torch.ones(1,dtype=torch.qint8).cuda(rank) for _ in range(4)]
  File "/vc_data/users/heyangqin/fp8/", line 18, in <listcomp>
    tensor_list=[torch.ones(1,dtype=torch.qint8).cuda(rank) for _ in range(4)]
NotImplementedError: Could not run 'aten::empty.memory_format' with arguments from the 'QuantizedCPU' backend. This could be because the operator doesn't exist for this backend, or was omitted during the selective/custom build process (if using custom build). If you are a Facebook employee using PyTorch on mobile, please visit for possible resolutions. 'aten::empty.memory_format' is only available for these backends: [CPU, CUDA, Meta, MkldnnCPU, SparseCPU, SparseCUDA, BackendSelect, Python, Named, Conjugate, Negative, ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradXLA, AutogradLazy, AutogradXPU, AutogradMLC, AutogradHPU, AutogradNestedTensor, AutogradPrivateUse1, AutogradPrivateUse2, AutogradPrivateUse3, Tracer, UNKNOWN_TENSOR_TYPE_ID, Autocast, Batched, VmapMode].

Is there any way to work around this, or does torch.distributed not support quantized tensors yet? Thank you!

It looks like this approach results in an error because some of the ops used in all_gather are not implemented for quantized tensors. We have, however, built out prototype APIs for some quantized collective communication. See:

Please note that these APIs are prototypes and may have some issues if you decide to give them a try. cc @wanchaol
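One possible workaround, sketched below under a few assumptions (the function name `all_gather_quantized` is mine, not a PyTorch API, and `torch._make_per_tensor_quantized_tensor` is a private helper): since the collectives reject quantized dtypes, gather the underlying `torch.int8` representation via `int_repr()`, which NCCL does support, and rebuild quantized tensors on each rank afterwards.

```python
import torch
import torch.distributed as dist

def all_gather_quantized(q_tensor, world_size):
    """Workaround sketch: all_gather the raw int8 data of a per-tensor
    quantized tensor, then rebuild quantized tensors on every rank.
    Assumes a process group is already initialized and that all ranks
    share the same scale/zero_point (otherwise gather those as well)."""
    int8_data = q_tensor.int_repr()  # plain torch.int8 tensor, supported by NCCL
    gathered = [torch.empty_like(int8_data) for _ in range(world_size)]
    dist.all_gather(gathered, int8_data)
    scale, zero_point = q_tensor.q_scale(), q_tensor.q_zero_point()
    # _make_per_tensor_quantized_tensor is a private API; it wraps an int8
    # tensor back into a quantized tensor without re-quantizing the values.
    return [torch._make_per_tensor_quantized_tensor(t, scale, zero_point)
            for t in gathered]
```

On each rank you would first build a quantized tensor with `torch.quantize_per_tensor(...)` and then call `all_gather_quantized(q, world_size)`; depending on the PyTorch version, quantizing may only work on CPU, in which case you can move the `int_repr()` tensor to the GPU before the collective.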