Rounding error using quantized model parameters

I’m writing my own implementation of quantized layers, but I don’t get the same answers as PyTorch’s quantizedCPU backend.

PyTorch output: tensor([[ 0.1730, 0.7621, -0.3675, -0.2648, -0.4000]])
Manual output: tensor([[ 0.2096, 0.7595, -0.3691, -0.2660, -0.4016]])

Of course, the values aren’t that far off, but they seem different enough that I’m wondering whether there’s more to it than rounding error. Here’s how I am calculating my outputs. Does PyTorch do something substantially different that would account for the difference?

import torch
import torch.nn as nn


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.quant = torch.quantization.QuantStub()
        self.fc_1 = nn.Linear(in_features=10, out_features=5)
        self.dequant = torch.quantization.DeQuantStub()

    def forward(self, x):
        x = self.quant(x)
        x = self.fc_1(x)
        x = self.dequant(x)
        return x


torch.manual_seed(0)
testInput = torch.rand(5, 10)
model = Net()   # Model will be initialized to random values.

quant_net = Net().cpu()
quant_net.load_state_dict(model.state_dict())
quant_net.eval()
quant_net.qconfig = torch.quantization.get_default_qconfig('qnnpack')
quant_net_prepared = torch.quantization.prepare(quant_net)
for i in range(5):
    singleInput = torch.unsqueeze(testInput[i], 0)
    quant_net_prepared(singleInput)
quant_model = torch.quantization.convert(quant_net_prepared)

singleInput = torch.unsqueeze(testInput[0], 0)
print(f"PyTorch output: {quant_model(singleInput)}")

# Unpack the values in the quantized model's state dict to run the layer manually.
state_dict = quant_model.state_dict()
quant_scale = state_dict['quant.scale'].item()
quant_zero_point = state_dict['quant.zero_point'].item()
fc_out_scale = state_dict['fc_1.scale'].item()
fc_out_zero_point = state_dict['fc_1.zero_point'].item()
weight_tensor, bias = state_dict['fc_1._packed_params._packed_params']
fc_scale, fc_zero_point = weight_tensor.q_scale(), weight_tensor.q_zero_point()
weight = weight_tensor.dequantize()

# Quantize the input (quint8) and the weights (qint8), then take their integer representations.
# Keep them as floats to avoid overflow during accumulation.
singleInputInt8 = torch.quantize_per_tensor(singleInput, quant_scale, quant_zero_point, dtype=torch.quint8)
singleInputInt8 = torch.int_repr(singleInputInt8).float()
weightInt8 = torch.quantize_per_tensor(weight, fc_scale, fc_zero_point, dtype=torch.qint8)
weightInt8 = torch.int_repr(weightInt8).float()

weightInt8 = weightInt8.transpose(0, 1)
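# Naive integer matmul: subtract the zero points, accumulate, then rescale by the product of the scales.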
m, n, k = singleInputInt8.size(0), singleInputInt8.size(1), weightInt8.size(1)
out = torch.zeros(m, k)
for i in range(m):
    for j in range(k):
        acc = 0
        for h in range(n):
            A = singleInputInt8[i, h].item() - quant_zero_point
            B = weightInt8[h, j].item() - fc_zero_point
            acc += A * B
        out[i, j] = (acc * quant_scale * fc_scale) + bias[j]

print(f"Manual output: {out}")

You may want to check out the unit tests to see how the reference implementation is defined there, and try to pass those unit tests instead of writing your own and being unsure about what’s different.
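In the meantime, a quick way to narrow it down is to isolate fc_1 from the converted model and diff its dequantized output against your manual loop. This is only a sketch reusing the objects your code already defines (quant_model, singleInput, out), not the reference implementation itself:

x_q = quant_model.quant(singleInput)        # Quantize module produced by convert()
ref = quant_model.fc_1(x_q).dequantize()    # quantized Linear, then back to float
print(ref - out)                            # difference vs. the manual loop

If the gap here is the same as in the full forward pass, the discrepancy lives inside the linear layer itself rather than in the quant/dequant stubs.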

and

Although I’d rather not dig through the actual linear implementation’s C++ code, I believe it’s something along the lines of Lei Mao's Log Book – Quantization for Neural Networks.
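For what it’s worth, my reading of that article (an assumption on my part, not something I’ve checked against the C++ kernels) is that the int32 accumulator is requantized to the layer’s output scale and zero point before being dequantized, with the bias folded in at scale input_scale * weight_scale. A minimal sketch of that math, using made-up argument names:

import torch


def quantized_linear_sketch(x_q, x_scale, x_zp,    # integer input and its qparams (quint8)
                            w_q, w_scale, w_zp,    # integer weight and its qparams (qint8)
                            bias,                  # float bias
                            out_scale, out_zp):    # output qparams of the layer
    # (x - zp_x) @ (w - zp_w)^T, accumulated in float to avoid overflow.
    acc = (x_q.float() - x_zp) @ (w_q.float() - w_zp).t()
    # Fold the float bias in at scale x_scale * w_scale.
    acc = acc + torch.round(bias / (x_scale * w_scale))
    # Requantize the accumulator to the output qparams and clamp to quint8 ...
    out_q = torch.clamp(torch.round(acc * (x_scale * w_scale) / out_scale) + out_zp, 0, 255)
    # ... then dequantize, which is what comes out of the DeQuantStub.
    return (out_q - out_zp) * out_scale

Called with the integer tensors and scales/zero points your code already unpacks (the weight before your transpose, plus fc_out_scale/fc_out_zero_point for the output), this is roughly what I’d expect the backend to do, but the exact rounding and clamping behaviour is a guess, which is why the unit tests are the better reference.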

Thank you! I did come across that link you posted and based my implementation on that. But I didn’t think of using the unit tests, so I’ll be sure to try that and reply back to close the issue.