Hi @supriyar ,
I have a similar problem with mobilenet_v3: I’m measuring the inference time of my float32 and quantized models. The quantized model is significantly slower than the float32 model, with both ‘fbgemm’ and ‘qnnpack’, and on both PC and Android:
PC, one thread, fbgemm: 0.011s vs 0.034s (average of 100 trials)
PC, one thread, qnnpack: 0.012s vs 0.035s (average of 100 trials)
What I basically did is:
- took Duo Li implementation of MobileNetV3
- added QuantStub at the beginning and DeQuantStub at the end of the model
- changed all adds, muls and divs into FloatFunctional for quantized tensors support
- set model.qconfig to ‘fbgemm’ or ‘qnnpack’
- prepared model for qat
- converted model to quantized model
- compared performance between quantized version and non-quantized
The quantized model is ~4x smaller, but inference is significantly slower.
Is there something that I’m missing?
to reproduce:
torchvision 0.8.2
pytorch 1.7.1
Windows10
"""
MIT License
Copyright (c) 2019 Duo LI
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
"""
import torch
from tqdm import tqdm
import time
import torch.nn as nn
import math
def _make_divisible(v, divisor, min_value=None):
    """
    Round ``v`` to a nearby multiple of ``divisor``, never going below
    ``min_value``.

    Taken from the original TensorFlow MobileNet code, where it keeps every
    layer's channel count divisible by 8:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py

    :param v: value to round (a channel count, possibly fractional)
    :param divisor: the result is a multiple of this number
    :param min_value: lower bound for the result; defaults to ``divisor``
    :return: the rounded value, raised by one extra ``divisor`` step when
        rounding would otherwise drop it below 90% of ``v``
    """
    floor = divisor if min_value is None else min_value
    rounded = int(v + divisor / 2) // divisor * divisor
    candidate = max(floor, rounded)
    # Rounding down must not shrink the value by more than 10%.
    return candidate + divisor if candidate < 0.9 * v else candidate
##################################################################################
# FLOAT32 ARCHITECTURE
##################################################################################
class h_sigmoid(nn.Module):
    """Hard sigmoid: ``ReLU6(x + 3) / 6``."""

    def __init__(self, inplace=True):
        super().__init__()
        self.relu = nn.ReLU6(inplace=inplace)

    def forward(self, x):
        # ``x + 3`` produces a new tensor, so the (possibly in-place) ReLU6
        # is applied to that temporary rather than to the caller's input.
        shifted = x + 3
        clipped = self.relu(shifted)
        return clipped / 6
class h_swish(nn.Module):
    """Hard swish: the input multiplied by its hard sigmoid."""

    def __init__(self, inplace=True):
        super().__init__()
        self.sigmoid = h_sigmoid(inplace=inplace)

    def forward(self, x):
        gate = self.sigmoid(x)
        return x * gate
class SELayer(nn.Module):
    """
    Squeeze-and-Excite block: global-average-pools the input, passes the
    pooled vector through a Linear -> ReLU -> Linear -> h_sigmoid stack, and
    multiplies the input by the resulting per-channel weights.

    :param channel: number of input (and output) channels
    :param reduction: divisor applied to ``channel`` to size the bottleneck
        (the bottleneck width is then made divisible by 8)
    """

    def __init__(self, channel, reduction=4):
        super().__init__()
        squeezed = _make_divisible(channel // reduction, 8)
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(channel, squeezed),
            nn.ReLU(inplace=True),
            nn.Linear(squeezed, channel),
            h_sigmoid(),
        )

    def forward(self, x):
        # Unpacking four dimensions requires a 4-D input.
        batch, channels, _, _ = x.size()
        pooled = self.avg_pool(x).view(batch, channels)
        weights = self.fc(pooled).view(batch, channels, 1, 1)
        return x * weights
def conv_3x3_bn(inp, oup, stride):
    """
    Build a bias-free 3x3 Conv2d (padding 1) -> BatchNorm2d -> h_swish stack.

    :param inp: input channels
    :param oup: output channels
    :param stride: convolution stride
    :return: the three layers wrapped in an ``nn.Sequential``
    """
    conv = nn.Conv2d(inp, oup, kernel_size=3, stride=stride, padding=1, bias=False)
    norm = nn.BatchNorm2d(oup)
    return nn.Sequential(conv, norm, h_swish())
def conv_1x1_bn(inp, oup):
    """
    Build a bias-free 1x1 Conv2d (stride 1, no padding) -> BatchNorm2d ->
    h_swish stack.

    :param inp: input channels
    :param oup: output channels
    :return: the three layers wrapped in an ``nn.Sequential``
    """
    conv = nn.Conv2d(inp, oup, kernel_size=1, stride=1, padding=0, bias=False)
    norm = nn.BatchNorm2d(oup)
    return nn.Sequential(conv, norm, h_swish())
class InvertedResidual(nn.Module):
    """
    MobileNetV3 inverted-residual block.

    When ``inp == hidden_dim`` the block is depthwise conv -> BN ->
    activation -> (SE) -> pointwise-linear conv -> BN. Otherwise a pointwise
    expansion (conv -> BN -> activation) comes first and the order after the
    depthwise conv is BN -> (SE) -> activation -> pointwise-linear conv -> BN.
    The input is added to the output when ``stride == 1`` and ``inp == oup``.

    :param inp: input channels
    :param hidden_dim: channels of the depthwise convolution
    :param oup: output channels
    :param kernel_size: depthwise kernel size (padding is ``(k - 1) // 2``)
    :param stride: depthwise stride, 1 or 2
    :param use_se: truthy to insert an ``SELayer``, otherwise ``nn.Identity``
    :param use_hs: truthy to use ``h_swish``, otherwise in-place ``nn.ReLU``
    """

    def __init__(self, inp, hidden_dim, oup, kernel_size, stride, use_se, use_hs):
        super().__init__()
        assert stride in [1, 2]
        self.identity = stride == 1 and inp == oup

        def activation():
            return h_swish() if use_hs else nn.ReLU(inplace=True)

        def squeeze_excite():
            return SELayer(hidden_dim) if use_se else nn.Identity()

        def depthwise():
            padding = (kernel_size - 1) // 2
            return nn.Conv2d(hidden_dim, hidden_dim, kernel_size, stride, padding,
                             groups=hidden_dim, bias=False)

        def pointwise(c_in, c_out):
            return nn.Conv2d(c_in, c_out, 1, 1, 0, bias=False)

        # Layers are created strictly in their final order so that both the
        # Sequential indices and the weight-initialisation order are stable.
        layers = []
        if inp != hidden_dim:
            # pw (expansion)
            layers += [pointwise(inp, hidden_dim), nn.BatchNorm2d(hidden_dim), activation()]
            # dw, then Squeeze-and-Excite before the activation
            layers += [depthwise(), nn.BatchNorm2d(hidden_dim)]
            layers += [squeeze_excite(), activation()]
        else:
            # dw, activation, then Squeeze-and-Excite
            layers += [depthwise(), nn.BatchNorm2d(hidden_dim), activation(), squeeze_excite()]
        # pw-linear
        layers += [pointwise(hidden_dim, oup), nn.BatchNorm2d(oup)]
        self.conv = nn.Sequential(*layers)

    def forward(self, x):
        out = self.conv(x)
        return x + out if self.identity else out
class MobileNetV3(nn.Module):
    """
    Float32 MobileNetV3 built from a table of inverted-residual block configs.

    :param cfgs: iterable of rows ``(k, t, c, use_se, use_hs, s)``: depthwise
        kernel size, expansion multiplier applied to the block's input
        channels, output channels (before ``width_mult``), SE flag, h-swish
        flag and stride. Must contain at least one row.
    :param mode: ``'large'`` or ``'small'``; selects the classifier's hidden
        width (1280 or 1024)
    :param num_classes: size of the final Linear layer's output (default 1)
    :param width_mult: multiplier applied to channel counts
    :raises ValueError: if ``cfgs`` yields no rows
    """

    def __init__(self, cfgs, mode, num_classes=1, width_mult=1.):
        super().__init__()
        # setting of inverted residual blocks
        self.cfgs = cfgs
        assert mode in ['large', 'small']

        # building first layer
        input_channel = _make_divisible(16 * width_mult, 8)
        layers = [conv_3x3_bn(3, input_channel, 2)]

        # building inverted residual blocks
        exp_size = None
        for k, t, c, use_se, use_hs, s in self.cfgs:
            output_channel = _make_divisible(c * width_mult, 8)
            exp_size = _make_divisible(input_channel * t, 8)
            layers.append(InvertedResidual(input_channel, exp_size, output_channel, k, s, use_se, use_hs))
            input_channel = output_channel
        if exp_size is None:
            # The head below is sized from the last block's expansion width;
            # without any block that width does not exist.
            raise ValueError("cfgs must contain at least one block configuration")
        self.features = nn.Sequential(*layers)

        # building last several layers
        self.conv = conv_1x1_bn(input_channel, exp_size)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        head_width = {'large': 1280, 'small': 1024}[mode]
        # The head is only widened, never narrowed, by width_mult.
        if width_mult > 1.0:
            head_width = _make_divisible(head_width * width_mult, 8)
        self.classifier = nn.Sequential(
            nn.Linear(exp_size, head_width),
            h_swish(),
            nn.Dropout(0.2),
            nn.Linear(head_width, num_classes),
        )
        self._initialize_weights()

    def forward(self, x):
        x = self.features(x)
        x = self.conv(x)
        x = self.avgpool(x)
        # Flatten everything but the batch dimension for the classifier.
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x

    def _initialize_weights(self):
        """Re-initialise every Conv2d, BatchNorm2d and Linear in the model."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # Normal init with std sqrt(2 / (kh * kw * out_channels)).
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()
def mobilenetv3_small(**kwargs):
    """
    Constructs a MobileNetV3-Small model

    :param kwargs: forwarded to ``MobileNetV3`` (e.g. ``num_classes``,
        ``width_mult``)
    :return: a ``MobileNetV3`` instance built with ``mode='small'``
    """
    default_cfgs = [
        # k, t, c, SE, HS, s
        [3, 1, 16, 1, 0, 2],
        [3, 4.5, 24, 0, 0, 2],
        [3, 3.67, 24, 0, 0, 1],
        [5, 4, 40, 1, 1, 2],
        [5, 6, 40, 1, 1, 1],
        [5, 6, 40, 1, 1, 1],
        [5, 3, 48, 1, 1, 1],
        [5, 3, 48, 1, 1, 1],
        [5, 6, 96, 1, 1, 2],
        [5, 6, 96, 1, 1, 1],
        [5, 6, 96, 1, 1, 1],
    ]
    # A module-level ``cfgs`` (the script section defines one) still takes
    # precedence so existing runs are unchanged; the built-in table makes the
    # factory usable when this file is imported and no such global exists.
    layer_cfgs = globals().get('cfgs', default_cfgs)
    return MobileNetV3(layer_cfgs, mode='small', **kwargs)
##################################################################################
# QUANTIZED ARCHITECTURE
##################################################################################
class h_sigmoid_quant(nn.Module):
    """
    Hard sigmoid, ``ReLU6(x + 3) * (1/6)``, with the scalar add and multiply
    routed through a ``FloatFunctional``.
    """

    def __init__(self, inplace=True):
        super().__init__()
        self.relu = nn.ReLU6(inplace=inplace)
        self.q_add = nn.quantized.FloatFunctional()

    def forward(self, x):
        shifted = self.q_add.add_scalar(x, 3.)
        clipped = self.relu(shifted)
        return self.q_add.mul_scalar(clipped, 1/6)
class h_swish_quant(nn.Module):
    """
    Hard swish: the input multiplied by its hard sigmoid, with the
    multiplication routed through a ``FloatFunctional``.
    """

    def __init__(self, inplace=True):
        super().__init__()
        self.sigmoid = h_sigmoid_quant(inplace=inplace)
        self.q_mul = nn.quantized.FloatFunctional()

    def forward(self, x):
        gate = self.sigmoid(x)
        return self.q_mul.mul(x, gate)
class SELayerQuant(nn.Module):
    """
    Squeeze-and-Excite block whose final channel-wise multiplication goes
    through a ``FloatFunctional``. The input is global-average-pooled, passed
    through Linear -> ReLU -> Linear -> h_sigmoid_quant, and the input is
    multiplied by the resulting per-channel weights.

    :param channel: number of input (and output) channels
    :param reduction: divisor applied to ``channel`` to size the bottleneck
        (the bottleneck width is then made divisible by 8)
    """

    def __init__(self, channel, reduction=4):
        super().__init__()
        squeezed = _make_divisible(channel // reduction, 8)
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(channel, squeezed),
            nn.ReLU(inplace=True),
            nn.Linear(squeezed, channel),
            h_sigmoid_quant(),
        )
        self.q_mul = nn.quantized.FloatFunctional()

    def forward(self, x):
        # Unpacking four dimensions requires a 4-D input.
        batch, channels, _, _ = x.size()
        pooled = self.avg_pool(x).view(batch, channels)
        weights = self.fc(pooled).view(batch, channels, 1, 1)
        return self.q_mul.mul(x, weights)
def conv_3x3_bn_quant(inp, oup, stride):
    """
    Build a bias-free 3x3 Conv2d (padding 1) -> BatchNorm2d -> h_swish_quant
    stack.

    :param inp: input channels
    :param oup: output channels
    :param stride: convolution stride
    :return: the three layers wrapped in an ``nn.Sequential``
    """
    conv = nn.Conv2d(inp, oup, kernel_size=3, stride=stride, padding=1, bias=False)
    norm = nn.BatchNorm2d(oup)
    return nn.Sequential(conv, norm, h_swish_quant())
def conv_1x1_bn_quant(inp, oup):
    """
    Build a bias-free 1x1 Conv2d (stride 1, no padding) -> BatchNorm2d ->
    h_swish_quant stack.

    :param inp: input channels
    :param oup: output channels
    :return: the three layers wrapped in an ``nn.Sequential``
    """
    conv = nn.Conv2d(inp, oup, kernel_size=1, stride=1, padding=0, bias=False)
    norm = nn.BatchNorm2d(oup)
    return nn.Sequential(conv, norm, h_swish_quant())
class InvertedResidualQuant(nn.Module):
    """
    Inverted-residual block built from the ``*_quant`` layers, with the
    residual addition routed through a ``FloatFunctional``.

    When ``inp == hidden_dim`` the block is depthwise conv -> BN ->
    activation -> (SE) -> pointwise-linear conv -> BN. Otherwise a pointwise
    expansion (conv -> BN -> activation) comes first and the order after the
    depthwise conv is BN -> (SE) -> activation -> pointwise-linear conv -> BN.
    The input is added to the output when ``stride == 1`` and ``inp == oup``.

    :param inp: input channels
    :param hidden_dim: channels of the depthwise convolution
    :param oup: output channels
    :param kernel_size: depthwise kernel size (padding is ``(k - 1) // 2``)
    :param stride: depthwise stride, 1 or 2
    :param use_se: truthy to insert an ``SELayerQuant``, otherwise
        ``nn.Identity``
    :param use_hs: truthy to use ``h_swish_quant``, otherwise in-place
        ``nn.ReLU``
    """

    def __init__(self, inp, hidden_dim, oup, kernel_size, stride, use_se, use_hs):
        super().__init__()
        assert stride in [1, 2]
        self.identity = stride == 1 and inp == oup
        self.q_add = nn.quantized.FloatFunctional()

        def activation():
            return h_swish_quant() if use_hs else nn.ReLU(inplace=True)

        def squeeze_excite():
            return SELayerQuant(hidden_dim) if use_se else nn.Identity()

        def depthwise():
            padding = (kernel_size - 1) // 2
            return nn.Conv2d(hidden_dim, hidden_dim, kernel_size, stride, padding,
                             groups=hidden_dim, bias=False)

        def pointwise(c_in, c_out):
            return nn.Conv2d(c_in, c_out, 1, 1, 0, bias=False)

        # Layers are created strictly in their final order so that both the
        # Sequential indices and the weight-initialisation order are stable.
        layers = []
        if inp != hidden_dim:
            # pw (expansion)
            layers += [pointwise(inp, hidden_dim), nn.BatchNorm2d(hidden_dim), activation()]
            # dw, then Squeeze-and-Excite before the activation
            layers += [depthwise(), nn.BatchNorm2d(hidden_dim)]
            layers += [squeeze_excite(), activation()]
        else:
            # dw, activation, then Squeeze-and-Excite
            layers += [depthwise(), nn.BatchNorm2d(hidden_dim), activation(), squeeze_excite()]
        # pw-linear
        layers += [pointwise(hidden_dim, oup), nn.BatchNorm2d(oup)]
        self.conv = nn.Sequential(*layers)

    def forward(self, x):
        out = self.conv(x)
        if not self.identity:
            return out
        return self.q_add.add(x, out)
class MobileNetV3_quant(nn.Module):
    """
    Quantization-ready MobileNetV3: the same layout as the float model but
    built from the ``*_quant`` layers and wrapped in a ``QuantStub`` /
    ``DeQuantStub`` pair.

    :param cfgs: iterable of rows ``(k, t, c, use_se, use_hs, s)``: depthwise
        kernel size, expansion multiplier applied to the block's input
        channels, output channels (before ``width_mult``), SE flag, h-swish
        flag and stride. Must contain at least one row.
    :param mode: ``'large'`` or ``'small'``; selects the classifier's hidden
        width (1280 or 1024)
    :param num_classes: size of the final Linear layer's output (default 1)
    :param width_mult: multiplier applied to channel counts
    :raises ValueError: if ``cfgs`` yields no rows
    """

    def __init__(self, cfgs, mode, num_classes=1, width_mult=1.):
        super().__init__()
        # setting of inverted residual blocks
        self.cfgs = cfgs
        assert mode in ['large', 'small']
        self.quant = torch.quantization.QuantStub()
        self.dequant = torch.quantization.DeQuantStub()

        # building first layer
        input_channel = _make_divisible(16 * width_mult, 8)
        layers = [conv_3x3_bn_quant(3, input_channel, 2)]

        # building inverted residual blocks
        exp_size = None
        for k, t, c, use_se, use_hs, s in self.cfgs:
            output_channel = _make_divisible(c * width_mult, 8)
            exp_size = _make_divisible(input_channel * t, 8)
            layers.append(InvertedResidualQuant(input_channel, exp_size, output_channel, k, s, use_se, use_hs))
            input_channel = output_channel
        if exp_size is None:
            # The head below is sized from the last block's expansion width;
            # without any block that width does not exist.
            raise ValueError("cfgs must contain at least one block configuration")
        self.features = nn.Sequential(*layers)

        # building last several layers
        self.conv = conv_1x1_bn_quant(input_channel, exp_size)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        head_width = {'large': 1280, 'small': 1024}[mode]
        # The head is only widened, never narrowed, by width_mult.
        if width_mult > 1.0:
            head_width = _make_divisible(head_width * width_mult, 8)
        self.classifier = nn.Sequential(
            nn.Linear(exp_size, head_width),
            h_swish_quant(),
            nn.Dropout(0.2),
            nn.Linear(head_width, num_classes),
        )
        self._initialize_weights()

    def forward(self, x):
        x = self.quant(x)
        x = self.features(x)
        x = self.conv(x)
        x = self.avgpool(x)
        # Flatten everything but the batch dimension for the classifier.
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        x = self.dequant(x)
        return x

    def _initialize_weights(self):
        """Re-initialise every Conv2d, BatchNorm2d and Linear in the model."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                # Normal init with std sqrt(2 / (kh * kw * out_channels)).
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()
def mobilenetv3_small_quant(**kwargs):
    """
    Constructs a quantization-ready MobileNetV3-Small model

    :param kwargs: forwarded to ``MobileNetV3_quant`` (e.g. ``num_classes``,
        ``width_mult``)
    :return: a ``MobileNetV3_quant`` instance built with ``mode='small'``
    """
    default_cfgs = [
        # k, t, c, SE, HS, s
        [3, 1, 16, 1, 0, 2],
        [3, 4.5, 24, 0, 0, 2],
        [3, 3.67, 24, 0, 0, 1],
        [5, 4, 40, 1, 1, 2],
        [5, 6, 40, 1, 1, 1],
        [5, 6, 40, 1, 1, 1],
        [5, 3, 48, 1, 1, 1],
        [5, 3, 48, 1, 1, 1],
        [5, 6, 96, 1, 1, 2],
        [5, 6, 96, 1, 1, 1],
        [5, 6, 96, 1, 1, 1],
    ]
    # A module-level ``cfgs`` (the script section defines one) still takes
    # precedence so existing runs are unchanged; the built-in table makes the
    # factory usable when this file is imported and no such global exists.
    layer_cfgs = globals().get('cfgs', default_cfgs)
    return MobileNetV3_quant(layer_cfgs, mode='small', **kwargs)
##################################################################################
# RUN COMPARISON
##################################################################################
def test_net(mimage, quant, runs=None, backend='qnnpack'):
    """
    Build a freshly initialised MobileNetV3-Small (float32 or converted to
    int8) and return its average forward-pass time on ``mimage``.

    :param mimage: input tensor passed to the model on every run
    :param quant: if truthy, build the quant model, run ``prepare_qat`` and
        ``convert`` on it and time the converted model; otherwise time the
        float32 model
    :param runs: number of timed forward passes; defaults to the module-level
        ``RUNS``
    :param backend: name passed to ``get_default_qat_qconfig``; the quantized
        engine is switched to it for the duration of the call when the
        running build lists it as supported
    :return: mean seconds per forward pass
    :raises ValueError: if ``runs`` is smaller than 1
    """
    if runs is None:
        runs = RUNS
    if runs < 1:
        raise ValueError("runs must be at least 1")

    # The qconfig and the engine executing the quantized kernels should
    # agree; switch only when needed and supported, and restore afterwards so
    # the call leaves global state as it found it.
    engine_cfg = torch.backends.quantized
    saved_engine = engine_cfg.engine
    switch_engine = bool(quant) and backend != saved_engine \
        and backend in engine_cfg.supported_engines
    if switch_engine:
        engine_cfg.engine = backend
    try:
        if quant:
            model = mobilenetv3_small_quant()
            model.qconfig = torch.quantization.get_default_qat_qconfig(backend)
            # NOTE(review): Conv2d + BatchNorm2d pairs are not fused before
            # prepare_qat, and no data is run through the observers. Unfused
            # BatchNorm and the FloatFunctional ops are a likely contributor
            # to slow int8 timings - confirm with a profiler.
            torch.quantization.prepare_qat(model, inplace=True)
            # Convert from eval mode, as the QAT workflow prescribes.
            model.eval()
            model = torch.quantization.convert(model)
        else:
            model = mobilenetv3_small()
        model.eval()
        model.to(torch.device("cpu"))

        with torch.no_grad():
            # One untimed pass so one-off first-call work is not averaged in.
            model(mimage)
            with tqdm(total=runs, ncols=100) as pbar:
                # perf_counter is monotonic and high resolution, unlike
                # time.time(), which follows the adjustable wall clock.
                t0 = time.perf_counter()
                for _ in range(runs):
                    model(mimage)
                    pbar.update()
                elapsed = time.perf_counter() - t0
    finally:
        if switch_engine:
            engine_cfg.engine = saved_engine
    return elapsed / runs
if __name__ == '__main__':
    # RUNS and cfgs are module-level globals: test_net and the
    # mobilenetv3_small* factories look them up by name.
    RUNS = 100
    cfgs = [
        # k, t, c, SE, HS, s
        [3, 1, 16, 1, 0, 2],
        [3, 4.5, 24, 0, 0, 2],
        [3, 3.67, 24, 0, 0, 1],
        [5, 4, 40, 1, 1, 2],
        [5, 6, 40, 1, 1, 1],
        [5, 6, 40, 1, 1, 1],
        [5, 3, 48, 1, 1, 1],
        [5, 3, 48, 1, 1, 1],
        [5, 6, 96, 1, 1, 2],
        [5, 6, 96, 1, 1, 1],
        [5, 6, 96, 1, 1, 1],
    ]
    # Single-threaded so both timings are taken under the same conditions.
    torch.set_num_threads(1)
    image = torch.rand(1, 3, 224, 224)
    # Float model first, then the quantized one; each result is printed as
    # soon as its benchmark finishes.
    for label, use_quant in (("float32", False), ("quant", True)):
        print(f"time {label}: {test_net(image, use_quant)}")