Much smaller quantization speedup on Linux systems

Hi,
I am trying to quantize a sequential model made of convolution blocks and test its performance. When I test the quantized model on my local MacBook it performs extremely well, getting more than a 4x speedup. However, when I run the same code on Google Colab and on another Linux system, the quantized model is only slightly faster than the non-quantized model (about 25%).

Average run time for the quantized model (local MacBook): ~0.39 s
Average run time for the non-quantized model (local MacBook): ~3.53 s
Average run time for the quantized model (Google Colab): ~4.25 s
Average run time for the non-quantized model (Google Colab): ~4.97 s

I tried switching the qconfig from "fbgemm" to "qnnpack", but it does not yield better performance; roughly what I did is sketched below.
I also tried fixing all kernel sizes to 5 and all dilations to 1, and that does not help either.
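For reference, this is roughly how I switched backends (a sketch, assuming that both the qconfig and the quantized engine need to point at qnnpack; net is the model built in the script below):

# sketch of the backend switch I tried: set both the engine and the qconfig
torch.backends.quantized.engine = 'qnnpack'
net.qconfig = torch.quantization.get_default_qconfig('qnnpack')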

Here is the script I use to measure model run time. Is there any possible reason for this? Many thanks.

import torch
import torch.nn as nn
import time
import numpy as np

# kernel sizes and dilations used in the conv blocks
class MyConfig:
    kernel_sizes = [(1, 7), (7, 1), (5, 5), (5, 5), (5, 5), (5, 5), (5, 5), (5, 5), (5, 5), (5, 5), (5, 5), (5, 5), (5, 5), (5, 5)]
    dilations    = [(1, 1), (1, 1), (1, 1), (2, 1), (4, 1), (8, 1), (16, 1), (32, 1), (1, 1), (2, 2), (4, 4), (8, 8), (16, 16), (32, 32)]
config = MyConfig()

# Conv2d -> BatchNorm2d -> ReLU block (kernel_size and dilation are (h, w) tuples)
class ConvBlock(nn.Sequential):
    def __init__(self, in_planes, out_planes, kernel_size=(3, 3), dilation=(1, 1),
                 groups=1, stride=1):
        # "same"-style padding for the given kernel size and dilation
        pad = ((kernel_size[0] - 1) // 2 * dilation[0],
               (kernel_size[1] - 1) // 2 * dilation[1])
        super(ConvBlock, self).__init__(
            nn.Conv2d(in_planes, out_planes, kernel_size, stride, pad, dilation,
                      groups=groups, bias=False),
            nn.BatchNorm2d(out_planes, momentum=0.1),
            nn.ReLU(inplace=False),
        )
# A sequential stack of convolution blocks
class Model(nn.Module):
    def __init__(self, kernel_sizes, dilations, nf=96):
        super(Model, self).__init__()
        self.Encoder = self.encoder(kernel_sizes, dilations, nf)
        self.quant = torch.quantization.QuantStub()
        self.dequant = torch.quantization.DeQuantStub()

    def encoder(self, kernel_sizes, dilations, nf=96, outf=8):
        block = []
        for i in range(len(kernel_sizes)):
            if i == 0:
                block.append(ConvBlock(2, nf, kernel_sizes[i], dilations[i]))
            else:
                block.append(ConvBlock(nf, nf, kernel_sizes[i], dilations[i]))
        block.append(ConvBlock(nf, outf, (1, 1), (1, 1)))
        return nn.Sequential(*block)

    def forward(self, x):
        x = self.quant(x)
        x = self.Encoder(x)
        x = self.dequant(x)
        return x

    def fuse_module(self):
        # fuse Conv + BN + ReLU inside every ConvBlock before quantization
        for m in self.modules():
            if type(m) == ConvBlock:
                torch.quantization.fuse_modules(m, [['0', '1', '2']], inplace=True)

# quantize the model (fuse, prepare, calibrate, convert) and test it
net = Model(config.kernel_sizes, config.dilations)
net.eval()
net.fuse_module()
net.qconfig = torch.quantization.get_default_qconfig('fbgemm')
torch.quantization.prepare(net, inplace=True)
# calibrate the observers with one sample input, then convert to int8
test_input = torch.randn((1, 2, 256, 203))
net(test_input)
torch.quantization.convert(net, inplace=True)

run_times = []
net.eval()  
with torch.no_grad():
    for i in range(10):
        test_input = torch.randn((1,2,256,203))
        t = time.time()
        net(test_input)
        run_times.append(time.time()-t)
print("int8 model time = {} s".format(np.mean(run_times)))

# fp32 baseline: do not quantize the model, run inference directly
net = Model(config.kernel_sizes, config.dilations)
run_times = []
net.eval()  
with torch.no_grad():
    for i in range(10):
        test_input = torch.randn((1,2,256,203))
        t = time.time()
        net(test_input)
        run_times.append(time.time()-t)
print("fp32 model time = {} s".format(np.mean(run_times)))