Dear Mates,
I’m learning about quantization and understand how scales are computed for INT8. When I compare my manual calculation with a MinMaxObserver the results match, but when I compare the MinMaxObserver, the manual computation, and the scale stored in the statically quantized model, they do not match.
PS: I compare against MinMaxObserver since it is the default observer in the default qconfig.
It would be great to understand why, as it will help me in my future work. I have attached the code and sample output for reference.
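For reference, the manual scale I compute follows the asymmetric (affine) formula as I understand it, so please correct me if this assumption is already wrong:

scale = (r_max - r_min) / (q_max - q_min)  # for int8: q_max - q_min = 127 - (-128) = 255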
import torch
import torch.ao.quantization
from torchvision import models
from torch import Tensor
from torch.ao.quantization import MinMaxObserver
def quant_forward_hook(module, input, output):
    # Observe the module's output with a fresh MinMaxObserver and compare its
    # scale against the manual (max - min) / (qmax - qmin) computation.
    output_ = output.detach()
    obs = MinMaxObserver(dtype=torch.qint8, qscheme=torch.per_tensor_affine)
    obs(output_)
    scale_o, _ = obs.calculate_qparams()
    int8_ = torch.iinfo(torch.int8)
    scale_m = (output_.max().item() - output_.min().item()) / (int8_.max - int8_.min)
    print(f"scale [observer]: {scale_o}")
    print(f"scale [manual]: {scale_m}")
def expr_minmax(tensor: Tensor):
    # Compare the observer's min/max and scale with the manual computation.
    minmax = MinMaxObserver(dtype=torch.qint8, qscheme=torch.per_tensor_affine)
    minmax(tensor)
    print(f"min [observer]: {minmax.min_val}")
    print(f"max [observer]: {minmax.max_val}")
    r_max = tensor.max().item()
    r_min = tensor.min().item()
    print(f"min [manual]: {r_min}")
    print(f"max [manual]: {r_max}")
    scale_o, _ = minmax.calculate_qparams()
    int8_ = torch.iinfo(torch.int8)
    scale_m = (r_max - r_min) / (int8_.max - int8_.min)
    print(f"scale [observer]: {scale_o}")
    print(f"scale [manual]: {scale_m}")
class QuantizedVGG16(torch.nn.Module):
    # VGG16 wrapped with quant/dequant stubs for post-training static quantization.
    def __init__(self):
        super().__init__()
        self.quant = torch.ao.quantization.QuantStub()
        self.vgg16 = models.vgg16(pretrained=True)
        self.dequant = torch.ao.quantization.DeQuantStub()

    def forward(self, x):
        x = self.quant(x)
        x = self.vgg16(x)
        return self.dequant(x)
def main():
    tensor = torch.rand([1, 3, 224, 224])
    if tensor.numel() == 0:
        return
    expr_minmax(tensor)

    # Hook the first Conv2d of a float VGG16 and print observer vs. manual scale.
    model = models.vgg16(pretrained=True)
    model.eval()
    for module in model.modules():
        if isinstance(module, torch.nn.Conv2d):
            module.register_forward_hook(quant_forward_hook)
            break
    model(tensor)

    # Post-training static quantization with the default qconfig.
    q_model = QuantizedVGG16()
    q_model.eval()
    q_model.qconfig = torch.ao.quantization.default_qconfig
    print(q_model.qconfig)
    torch.ao.quantization.prepare(q_model, inplace=True)
    q_model(tensor)  # calibration pass
    vgg16_int = torch.ao.quantization.convert(q_model, inplace=True)

    # Scale stored on the first quantized Conv2d after convert().
    for module in vgg16_int.modules():
        if isinstance(module, torch.ao.nn.quantized.Conv2d):
            print(f"scale [torch-computed]: {module.scale}")
            break
if __name__ == "__main__":
    main()
Output:
min [observer]: 1.2934207916259766e-05
max [observer]: 0.9999988675117493
min [manual]: 1.2934207916259766e-05
max [manual]: 0.9999988675117493
scale [observer]: tensor([0.0039])
scale [manual]: 0.0039215134639366
/home/vimal/.local/lib/python3.10/site-packages/torchvision/models/_utils.py:208: UserWarning: The parameter 'pretrained' is deprecated since 0.13 and may be removed in the future, please use 'weights' instead.
warnings.warn(
/home/vimal/.local/lib/python3.10/site-packages/torchvision/models/_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and may be removed in the future. The current behavior is equivalent to passing `weights=VGG16_Weights.IMAGENET1K_V1`. You can also use `weights=VGG16_Weights.DEFAULT` to get the most up-to-date weights.
warnings.warn(msg)
scale [observer]: tensor([0.0268])
scale [manual]: 0.02681882147695504
QConfig(activation=functools.partial(<class 'torch.ao.quantization.observer.MinMaxObserver'>, quant_min=0, quant_max=127){}, weight=functools.partial(<class 'torch.ao.quantization.observer.MinMaxObserver'>, dtype=torch.qint8, qscheme=torch.per_tensor_symmetric){})
scale [torch-computed]: 0.053848814219236374
Kindly help me understand why the scale reported by the quantized Conv2d (0.053849) differs from the observer/manual scale (0.0268).
Thanks
Vimal William