Hi,

I am trying to quantize a sequential model made of convolution blocks and test its performance. When I benchmark the quantized model on my local MacBook it performs extremely well, getting more than a 4x speedup over the fp32 model. However, when I run the same code on Google Colab and on another Linux system, the quantized model is only slightly faster than the non-quantized one (about 25%).

Average run times:

- quantized model (local MacBook): ~0.39 s
- non-quantized model (local MacBook): ~3.53 s
- quantized model (Google Colab): ~4.25 s
- non-quantized model (Google Colab): ~4.97 s

I tried switching the qconfig from “fbgemm” to “qnnpack”, but it did not yield better performance.
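
For reference, this is how I select the backend (a minimal sketch; as far as I understand, the quantized engine has to be set to match the qconfig, otherwise the default kernels may still be used):

```
import torch

# Select the qnnpack backend for quantized kernels.
# Both the engine and the qconfig should agree.
torch.backends.quantized.engine = 'qnnpack'
net.qconfig = torch.quantization.get_default_qconfig('qnnpack')
```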

I also tried fixing all kernel sizes to 5 and all dilations to 1, but that did not help either.
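
To rule out an environment difference, one can also check which quantized engines are available on each machine:

```
import torch

# List the quantized backends this build of PyTorch supports,
# and the engine currently selected.
print(torch.backends.quantized.supported_engines)
print(torch.backends.quantized.engine)
```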

Here’s the script I use to test model run time. Is there any possible reason for this? Many thanks.

```
import time

import numpy as np
import torch
import torch.nn as nn

# Kernel sizes and dilations used in the conv blocks
class MyConfig:
    kernel_sizes = [(1, 7), (7, 1), (5, 5), (5, 5), (5, 5), (5, 5), (5, 5),
                    (5, 5), (5, 5), (5, 5), (5, 5), (5, 5), (5, 5), (5, 5)]
    dilations = [(1, 1), (1, 1), (1, 1), (2, 1), (4, 1), (8, 1), (16, 1),
                 (32, 1), (1, 1), (2, 2), (4, 4), (8, 8), (16, 16), (32, 32)]

config = MyConfig()

class ConvBlock(nn.Sequential):
    """Conv2d -> BatchNorm2d -> ReLU, fuseable for quantization."""
    def __init__(self, in_planes, out_planes, kernel_size=(3, 3),
                 dilation=(1, 1), groups=1, stride=1):
        # "Same" padding for the given kernel size and dilation
        pad = ((kernel_size[0] - 1) // 2 * dilation[0],
               (kernel_size[1] - 1) // 2 * dilation[1])
        super(ConvBlock, self).__init__(
            nn.Conv2d(in_planes, out_planes, kernel_size, stride, pad,
                      dilation, groups=groups, bias=False),
            nn.BatchNorm2d(out_planes, momentum=0.1),
            nn.ReLU(inplace=False),
        )

# A sequential stack of convolution blocks
class Model(nn.Module):
    def __init__(self, kernel_sizes, dilations, nf=96):
        super(Model, self).__init__()
        self.Encoder = self.encoder(kernel_sizes, dilations, nf)
        self.quant = torch.quantization.QuantStub()
        self.dequant = torch.quantization.DeQuantStub()

    def encoder(self, kernel_sizes, dilations, nf=96, outf=8):
        block = []
        for i in range(len(kernel_sizes)):
            if i == 0:
                block.append(ConvBlock(2, nf, kernel_sizes[i], dilations[i]))
            else:
                block.append(ConvBlock(nf, nf, kernel_sizes[i], dilations[i]))
        block.append(ConvBlock(nf, outf, (1, 1), (1, 1)))
        return nn.Sequential(*block)

    def forward(self, x):
        x = self.quant(x)
        x = self.Encoder(x)
        x = self.dequant(x)
        return x

    def fuse_module(self):
        # Fuse Conv2d + BatchNorm2d + ReLU inside every ConvBlock
        for m in self.modules():
            if type(m) == ConvBlock:
                torch.quantization.fuse_modules(m, [['0', '1', '2']], inplace=True)

# Test the quantized model
net = Model(config.kernel_sizes, config.dilations)
net.eval()
net.fuse_module()
net.qconfig = torch.quantization.get_default_qconfig('fbgemm')
torch.quantization.prepare(net, inplace=True)

# Calibrate the observers and convert to int8
test_input = torch.randn((1, 2, 256, 203))
net(test_input)
torch.quantization.convert(net, inplace=True)

run_times = []
net.eval()
with torch.no_grad():
    for i in range(10):
        test_input = torch.randn((1, 2, 256, 203))
        t = time.time()
        net(test_input)
        run_times.append(time.time() - t)
print("int8 model time = {} s".format(np.mean(run_times)))

# Do not quantize the model; run fp32 inference directly
net = Model(config.kernel_sizes, config.dilations)
run_times = []
net.eval()
with torch.no_grad():
    for i in range(10):
        test_input = torch.randn((1, 2, 256, 203))
        t = time.time()
        net(test_input)
        run_times.append(time.time() - t)
print("fp32 model time = {} s".format(np.mean(run_times)))
```
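
One thing I am not sure about is whether this timing loop is fair across machines, so below is a slightly more careful version (just a sketch; the warm-up iterations and the explicit thread count are my own additions, not part of the original script):

```
import time
import torch

def benchmark(model, input_shape=(1, 2, 256, 203), warmup=3, iters=10):
    """Average inference time with warm-up runs and a fixed thread count."""
    torch.set_num_threads(1)  # pin threads so differing core counts don't skew results
    model.eval()
    x = torch.randn(input_shape)
    with torch.no_grad():
        for _ in range(warmup):      # warm-up runs are not timed
            model(x)
        times = []
        for _ in range(iters):
            t = time.perf_counter()  # monotonic, higher resolution than time.time()
            model(x)
            times.append(time.perf_counter() - t)
    return sum(times) / len(times)

# e.g. benchmark(int8_net) and benchmark(fp32_net) for the two models
```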