Since matrix multiplication is not supported for quantized models, I'm performing it with an nn.Linear layer whose weights I overwrite in every forward pass.
This approach works well for the FP32 model, but it crashes once the model is quantized. The issue is that, after the model is converted to int8, the following lines of code are no longer valid:
self.linear.weight.requires_grad = False
self.linear.weight.copy_(input1[b])
because in the converted model self.linear.weight is no longer a torch.nn.Parameter but a method that returns a Tensor.
Any workaround for this?
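For reference, here is a minimal standalone sketch of the difference I'm seeing (this is my own illustration, not part of the model below). It assumes the converted module is torch.nn.quantized.Linear, which, as far as I can tell, exposes weight() as a method returning a quantized Tensor and set_weight_bias() for replacing the packed weight; I haven't verified that updating the weight this way on every forward pass is actually supported.

import torch
import torch.nn as nn
import torch.nn.quantized as nnq

# Float Linear: .weight is an nn.Parameter, so requires_grad / copy_() work.
fp32_linear = nn.Linear(3, 3, bias=False)
print(type(fp32_linear.weight))    # <class 'torch.nn.parameter.Parameter'>

# Quantized Linear (what convert() produces for nn.Linear): .weight is a method.
q_linear = nnq.Linear(3, 3)
print(type(q_linear.weight))       # <class 'method'>
print(type(q_linear.weight()))     # <class 'torch.Tensor'> (a quantized tensor)

# Possible direction (unverified): quantize the new weight manually and hand it
# to set_weight_bias(); scale and zero_point here are placeholder values.
new_w = torch.quantize_per_tensor(torch.ones(3, 3), scale=0.1, zero_point=0,
                                  dtype=torch.qint8)
q_linear.set_weight_bias(new_w, q_linear.bias())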
FULL CODE
import torch
import torch.nn as nn


class BatchedMatMul(nn.Module):
    def __init__(self):
        super().__init__()
        self.quant = torch.quantization.QuantStub()
        self.linear = nn.Linear(3, 3, bias=False)
        self.dequant = torch.quantization.DeQuantStub()

    def forward(self, input1, input2):
        y = []
        for b in range(input1.shape[0]):
            print(f"Linear's type: {type(self.linear)}")
            print(f"Linear's weight type: {type(self.linear.weight)}")
            # Load the b-th slice of input1 into the layer's weight, then
            # apply the layer to the b-th slice of input2 (i.e. a matmul).
            self.linear.weight.requires_grad = False
            self.linear.weight.copy_(self.quant(input1[b]))
            y.append(self.linear(self.quant(input2[b])))
        return self.dequant(torch.stack(y))
print("Cronstruct model...")
matmul = BatchedMatMul()
print("Cronstruct model... [OK]")
matmul.eval()
print("Running FP32 inference...")
inp = torch.ones(3, 3).repeat(2,1,1)
y = matmul(inp, inp)
print(y)
print("Running FP32 inference... [OK]")
print("Quantizing...")
matmul.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')
matmul_prepared = torch.quantization.prepare(matmul)
matmul_prepared(inp, inp)
model_int8 = torch.quantization.convert(matmul_prepared)
print("Quantizing... [OK]")
print("Running INT8 inference...")
y = model_int8(inp, inp)
print(y)
print("Running INT8 inference..[OK]")
OUTPUT
Construct model...
Construct model... [OK]
Running FP32 inference...
Linear's weight type: <class 'torch.nn.parameter.Parameter'>
Linear's weight type: <class 'torch.nn.parameter.Parameter'>
tensor([[[3., 3., 3.],
         [3., 3., 3.],
         [3., 3., 3.]],

        [[3., 3., 3.],
         [3., 3., 3.],
         [3., 3., 3.]]])
Running FP32 inference... [OK]
Quantizing...
Linear's weight type: <class 'torch.nn.parameter.Parameter'>
Linear's weight type: <class 'torch.nn.parameter.Parameter'>
Quantizing... [OK]
Running INT8 inference...
Linear's weight type: <class 'method'>
/usr/local/lib/python3.6/dist-packages/torch/quantization/observer.py:121: UserWarning: Please use quant_min and quant_max to specify the range for observers. reduce_range will be deprecated in a future release of PyTorch.
reduce_range will be deprecated in a future release of PyTorch."
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-81-024fd82f94de> in <module>()
34 print("Quantizing... [OK]")
35 print("Running INT8 inference...")
---> 36 y = model_int8(inp, inp)
37 print(y)
38 print("Running INT8 inference..[OK]")
1 frames
<ipython-input-81-024fd82f94de> in forward(self, input1, input2)
10 for b in range(input1.shape[0]):
11 print(f"Linear's weigth type: {type(self.linear.weight)}")
---> 12 self.linear.weight.requires_grad = False
13 self.linear.weight.copy_ (input1[b])
14 y.append(self.linear(input2[b]))
AttributeError: 'method' object has no attribute 'requires_grad'