Quantization scales and bias

I was using PyTorch for post-training quantization on my resnet18 model. The following is part of the code.

net.qconfig = torch.quantization.QConfig(
    activation=torch.quantization.MinMaxObserver.with_args(dtype=torch.quint8, qscheme=torch.per_tensor_symmetric), 
    weight=torch.quantization.MinMaxObserver.with_args(dtype=torch.qint8, qscheme=torch.per_tensor_symmetric))

I wanted to print the bias and scale that are used internally for each tensor.
Can someone please help me do it the right way?

Thanks.

Hi @shas19, if you print out the quantized network, it should show the scale and zero_point of the various layers. Is there something else you are looking for? Could you be more specific?
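
For example, something along these lines should print them per layer (a minimal sketch, assuming net has already been prepared, calibrated, and converted with the eager-mode API):

import torch

# After torch.quantization.convert(), quantized modules (e.g. quantized
# Conv2d / Linear) store their output quantization parameters as attributes.
for name, module in net.named_modules():
    if hasattr(module, 'scale') and hasattr(module, 'zero_point'):
        print(name, 'scale =', module.scale, 'zero_point =', module.zero_point)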

Hi,

I have the same question about printing quantized weights. Is there a way to see the quantized integer values that are used in the convolution operators, as in the example shown below:

class M(torch.nn.Module):
    def __init__(self):
        super(M, self).__init__()
        # QuantStub converts tensors from floating point to quantized
        self.quant = torch.quantization.QuantStub()
        self.conv = torch.nn.Conv2d(1, 1, 1)
        self.bn = torch.nn.BatchNorm2d(1)
        self.relu = torch.nn.ReLU()
        # DeQuantStub converts tensors from quantized to floating point
        self.dequant = torch.quantization.DeQuantStub()

    def forward(self, x):
        x = self.quant(x)
        x = self.conv(x)
        print(x)  # <- x is a quantized tensor here
        print(self.conv.weight)  # <- prints: <bound method Conv2d.weight of QuantizedConv2d(1, 1, kernel_size=(1, 1), stride=(1, 1), scale=1.0, zero_point=0)>
        print(self.conv.bias)    # <- prints: <bound method Conv2d.bias of QuantizedConv2d(1, 1, kernel_size=(1, 1), stride=(1, 1), scale=1.0, zero_point=0)>
        x = self.bn(x)
        x = self.relu(x)
        x = self.dequant(x)
        return x

from torch.quantization import (QConfig, FakeQuantize,
                                MovingAverageMinMaxObserver,
                                prepare_qat, convert)

# convert the model to QAT
qconfig = QConfig(
    activation = FakeQuantize.with_args(observer=MovingAverageMinMaxObserver),
    weight = FakeQuantize.with_args(
        observer=MovingAverageMinMaxObserver,
        quant_min=-128, quant_max=127,
        dtype=torch.qint8)
)
model_fp32 = M()
model_fp32.train()
model_fp32.qconfig = qconfig
model_fp32_prepared = prepare_qat(model_fp32)
model_fp32_prepared.eval()
model_int8 = convert(model_fp32_prepared)
# run the model
input_fp32 = torch.randn(4, 1, 4, 4)
res = model_int8(input_fp32)
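
As a side note, the quantized integer values can be read back outside of forward(). A sketch, assuming the per-tensor weight qconfig above (with a per-channel scheme, q_scale() would not apply):

# On quantized modules, weight and bias are accessor methods rather than
# parameters, which is why the prints inside forward() show bound methods.
w = model_int8.conv.weight()          # the quantized weight tensor
print(w.int_repr())                   # raw int8 values used by the kernel
print(w.q_scale(), w.q_zero_point())  # per-tensor quantization parameters
print(model_int8.conv.bias())         # the bias is kept in floating point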

Also, I looked into the code and found the QAT conv module calling fake quant on the weight before doing the convolution.
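
As far as I can tell, the forward of the QAT conv is roughly equivalent to the following (a paraphrased sketch, not the actual source; details vary across PyTorch versions):

import torch.nn as nn
import torch.nn.functional as F

class QATConv2dSketch(nn.Conv2d):
    # rough equivalent of torch.nn.qat.Conv2d: fake-quantize the fp32
    # weight, then run an ordinary fp32 convolution on the result
    def __init__(self, *args, qconfig=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.weight_fake_quant = qconfig.weight()  # a FakeQuantize module

    def forward(self, input):
        return F.conv2d(input, self.weight_fake_quant(self.weight), self.bias,
                        self.stride, self.padding, self.dilation, self.groups)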

And the fake quant calls a fake-quantize affine function, for example the fake_quantize_per_tensor_affine function in pytorch/torch/onnx/symbolic_opset10.py.

I printed the output of self.weight_fake_quant(self.weight) and saw only quantized-then-dequantized floating-point numbers. My question is: the convolution seems to use the floating-point r instead of the integers q from the formula r = S * (q - Z) in the QAT paper?
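
Working through the formula from the paper, fake quantization computes r' = S * (clamp(round(r / S) + Z) - Z) entirely in floating point, so the fp32 convolution sees exactly the values an int8 kernel could reproduce; the real integer kernels are only used after convert. A small numeric sketch (the scale and zero point are made-up values):

import torch

r = torch.tensor([0.30, -1.23, 0.72])
S, Z = 0.1, 0   # example scale / zero point (assumed for illustration)

# the integers an int8 kernel would use: q = clamp(round(r / S) + Z, -128, 127)
q = torch.clamp(torch.round(r / S) + Z, -128, 127)

# what fake quant feeds into the fp32 convolution: r' = S * (q - Z)
fq = torch.fake_quantize_per_tensor_affine(r, S, Z, -128, 127)

print(q)            # tensor([  3., -12.,   7.])
print(fq)           # tensor([ 0.3000, -1.2000,  0.7000])
print(S * (q - Z))  # same values as fq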