Performance drop when quantizing Efficientnet

Hi,
I’m trying to quantize a trained model of Efficientnet-Lite0, following the architectural changes detailed in this blog post.
I’m using the implementation from this repo and I get a significant accuracy drop (5-10%) after quantizing the model.
The full model after converting to 8-bit is:

 EfficientNet(
  (conv_stem): ConvReLU6(
    (0): QuantizedConv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), scale=0.36273476481437683, zero_point=57, padding=(1, 1))
    (1): QuantizedReLU6(inplace=True)
  )
  (bn1): Identity()
  (act1): Identity()
  (blocks): Sequential(
    (0): Sequential(
      (0): DepthwiseSeparableConv(
        (skip_add): QFunctional(
          scale=1.0, zero_point=0
          (activation_post_process): Identity()
        )
        (conv_dw): ConvReLU6(
          (0): QuantizedConv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), scale=0.6822086572647095, zero_point=56, padding=(1, 1), groups=32)
          (1): QuantizedReLU6(inplace=True)
        )
        (bn1): Identity()
        (act1): Identity()
        (conv_pw): QuantizedConv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), scale=0.7673127055168152, zero_point=65)
        (bn2): Identity()
        (act2): Identity()
      )
    )
    (1): Sequential(
      (0): InvertedResidual(
        (skip_add): QFunctional(
          scale=1.0, zero_point=0
          (activation_post_process): Identity()
        )
        (conv_pw): ConvReLU6(
          (0): QuantizedConv2d(16, 96, kernel_size=(1, 1), stride=(1, 1), scale=0.5392391085624695, zero_point=60)
          (1): QuantizedReLU6()
        )
        (bn1): Identity()
        (act1): Identity()
        (conv_dw): ConvReLU6(
          (0): QuantizedConv2d(96, 96, kernel_size=(3, 3), stride=(2, 2), scale=0.322853684425354, zero_point=57, padding=(1, 1), groups=96)
          (1): QuantizedReLU6()
        )
        (bn2): Identity()
        (act2): Identity()
        (conv_pwl): QuantizedConv2d(96, 24, kernel_size=(1, 1), stride=(1, 1), scale=0.7627326250076294, zero_point=63)
        (bn3): Identity()
      )
      (1): InvertedResidual(
        (skip_add): QFunctional(
          scale=0.8407724499702454, zero_point=62
          (activation_post_process): Identity()
        )
        (conv_pw): ConvReLU6(
          (0): QuantizedConv2d(24, 144, kernel_size=(1, 1), stride=(1, 1), scale=0.3213047683238983, zero_point=63)
          (1): QuantizedReLU6()
        )
        (bn1): Identity()
        (act1): Identity()
        (conv_dw): ConvReLU6(
          (0): QuantizedConv2d(144, 144, kernel_size=(3, 3), stride=(1, 1), scale=0.267162948846817, zero_point=67, padding=(1, 1), groups=144)
          (1): QuantizedReLU6()
        )
        (bn2): Identity()
        (act2): Identity()
        (conv_pwl): QuantizedConv2d(144, 24, kernel_size=(1, 1), stride=(1, 1), scale=0.6916980743408203, zero_point=53)
        (bn3): Identity()
      )
    )
    (2): Sequential(
      (0): InvertedResidual(
        (skip_add): QFunctional(
          scale=1.0, zero_point=0
          (activation_post_process): Identity()
        )
        (conv_pw): ConvReLU6(
          (0): QuantizedConv2d(24, 144, kernel_size=(1, 1), stride=(1, 1), scale=0.30310994386672974, zero_point=62)
          (1): QuantizedReLU6()
        )
        (bn1): Identity()
        (act1): Identity()
        (conv_dw): ConvReLU6(
          (0): QuantizedConv2d(144, 144, kernel_size=(5, 5), stride=(2, 2), scale=0.20994137227535248, zero_point=61, padding=(2, 2), groups=144)
          (1): QuantizedReLU6()
        )
        (bn2): Identity()
        (act2): Identity()
        (conv_pwl): QuantizedConv2d(144, 40, kernel_size=(1, 1), stride=(1, 1), scale=0.6519036889076233, zero_point=65)
        (bn3): Identity()
      )
      (1): InvertedResidual(
        (skip_add): QFunctional(
          scale=0.7288376092910767, zero_point=63
          (activation_post_process): Identity()
        )
        (conv_pw): ConvReLU6(
          (0): QuantizedConv2d(40, 240, kernel_size=(1, 1), stride=(1, 1), scale=0.20947812497615814, zero_point=52)
          (1): QuantizedReLU6()
        )
        (bn1): Identity()
        (act1): Identity()
        (conv_dw): ConvReLU6(
          (0): QuantizedConv2d(240, 240, kernel_size=(5, 5), stride=(1, 1), scale=0.24765455722808838, zero_point=83, padding=(2, 2), groups=240)
          (1): QuantizedReLU6()
        )
        (bn2): Identity()
        (act2): Identity()
        (conv_pwl): QuantizedConv2d(240, 40, kernel_size=(1, 1), stride=(1, 1), scale=0.4334663450717926, zero_point=61)
        (bn3): Identity()
      )
    )
    (3): Sequential(
      (0): InvertedResidual(
        (skip_add): QFunctional(
          scale=1.0, zero_point=0
          (activation_post_process): Identity()
        )
        (conv_pw): ConvReLU6(
          (0): QuantizedConv2d(40, 240, kernel_size=(1, 1), stride=(1, 1), scale=0.20177333056926727, zero_point=56)
          (1): QuantizedReLU6()
        )
        (bn1): Identity()
        (act1): Identity()
        (conv_dw): ConvReLU6(
          (0): QuantizedConv2d(240, 240, kernel_size=(3, 3), stride=(2, 2), scale=0.22160769999027252, zero_point=61, padding=(1, 1), groups=240)
          (1): QuantizedReLU6()
        )
        (bn2): Identity()
        (act2): Identity()
        (conv_pwl): QuantizedConv2d(240, 80, kernel_size=(1, 1), stride=(1, 1), scale=0.5097917914390564, zero_point=64)
        (bn3): Identity()
      )
      (1): InvertedResidual(
        (skip_add): QFunctional(
          scale=0.514493465423584, zero_point=64
          (activation_post_process): Identity()
        )
        (conv_pw): ConvReLU6(
          (0): QuantizedConv2d(80, 480, kernel_size=(1, 1), stride=(1, 1), scale=0.15477867424488068, zero_point=47)
          (1): QuantizedReLU6()
        )
        (bn1): Identity()
        (act1): Identity()
        (conv_dw): ConvReLU6(
          (0): QuantizedConv2d(480, 480, kernel_size=(3, 3), stride=(1, 1), scale=0.19667555391788483, zero_point=82, padding=(1, 1), groups=480)
          (1): QuantizedReLU6()
        )
        (bn2): Identity()
        (act2): Identity()
        (conv_pwl): QuantizedConv2d(480, 80, kernel_size=(1, 1), stride=(1, 1), scale=0.2826884686946869, zero_point=64)
        (bn3): Identity()
      )
      (2): InvertedResidual(
        (skip_add): QFunctional(
          scale=0.5448680520057678, zero_point=65
          (activation_post_process): Identity()
        )
        (conv_pw): ConvReLU6(
          (0): QuantizedConv2d(80, 480, kernel_size=(1, 1), stride=(1, 1), scale=0.12001236528158188, zero_point=67)
          (1): QuantizedReLU6()
        )
        (bn1): Identity()
        (act1): Identity()
        (conv_dw): ConvReLU6(
          (0): QuantizedConv2d(480, 480, kernel_size=(3, 3), stride=(1, 1), scale=0.1878129243850708, zero_point=79, padding=(1, 1), groups=480)
          (1): QuantizedReLU6()
        )
        (bn2): Identity()
        (act2): Identity()
        (conv_pwl): QuantizedConv2d(480, 80, kernel_size=(1, 1), stride=(1, 1), scale=0.23110872507095337, zero_point=61)
        (bn3): Identity()
      )
    )
    (4): Sequential(
      (0): InvertedResidual(
        (skip_add): QFunctional(
          scale=1.0, zero_point=0
          (activation_post_process): Identity()
        )
        (conv_pw): ConvReLU6(
          (0): QuantizedConv2d(80, 480, kernel_size=(1, 1), stride=(1, 1), scale=0.20795781910419464, zero_point=51)
          (1): QuantizedReLU6()
        )
        (bn1): Identity()
        (act1): Identity()
        (conv_dw): ConvReLU6(
          (0): QuantizedConv2d(480, 480, kernel_size=(5, 5), stride=(1, 1), scale=0.2575533390045166, zero_point=81, padding=(2, 2), groups=480)
          (1): QuantizedReLU6()
        )
        (bn2): Identity()
        (act2): Identity()
        (conv_pwl): QuantizedConv2d(480, 112, kernel_size=(1, 1), stride=(1, 1), scale=0.5269572138786316, zero_point=63)
        (bn3): Identity()
      )
      (1): InvertedResidual(
        (skip_add): QFunctional(
          scale=0.5629716515541077, zero_point=65
          (activation_post_process): Identity()
        )
        (conv_pw): ConvReLU6(
          (0): QuantizedConv2d(112, 672, kernel_size=(1, 1), stride=(1, 1), scale=0.16619464755058289, zero_point=58)
          (1): QuantizedReLU6()
        )
        (bn1): Identity()
        (act1): Identity()
        (conv_dw): ConvReLU6(
          (0): QuantizedConv2d(672, 672, kernel_size=(5, 5), stride=(1, 1), scale=0.2228115200996399, zero_point=69, padding=(2, 2), groups=672)
          (1): QuantizedReLU6()
        )
        (bn2): Identity()
        (act2): Identity()
        (conv_pwl): QuantizedConv2d(672, 112, kernel_size=(1, 1), stride=(1, 1), scale=0.3241402208805084, zero_point=63)
        (bn3): Identity()
      )
      (2): InvertedResidual(
        (skip_add): QFunctional(
          scale=0.642544686794281, zero_point=67
          (activation_post_process): Identity()
        )
        (conv_pw): ConvReLU6(
          (0): QuantizedConv2d(112, 672, kernel_size=(1, 1), stride=(1, 1), scale=0.13504581153392792, zero_point=60)
          (1): QuantizedReLU6()
        )
        (bn1): Identity()
        (act1): Identity()
        (conv_dw): ConvReLU6(
          (0): QuantizedConv2d(672, 672, kernel_size=(5, 5), stride=(1, 1), scale=0.2062821239233017, zero_point=73, padding=(2, 2), groups=672)
          (1): QuantizedReLU6()
        )
        (bn2): Identity()
        (act2): Identity()
        (conv_pwl): QuantizedConv2d(672, 112, kernel_size=(1, 1), stride=(1, 1), scale=0.25870615243911743, zero_point=63)
        (bn3): Identity()
      )
    )
    (5): Sequential(
      (0): InvertedResidual(
        (skip_add): QFunctional(
          scale=1.0, zero_point=0
          (activation_post_process): Identity()
        )
        (conv_pw): ConvReLU6(
          (0): QuantizedConv2d(112, 672, kernel_size=(1, 1), stride=(1, 1), scale=0.16723443567752838, zero_point=66)
          (1): QuantizedReLU6()
        )
        (bn1): Identity()
        (act1): Identity()
        (conv_dw): ConvReLU6(
          (0): QuantizedConv2d(672, 672, kernel_size=(5, 5), stride=(2, 2), scale=0.22132091224193573, zero_point=61, padding=(2, 2), groups=672)
          (1): QuantizedReLU6()
        )
        (bn2): Identity()
        (act2): Identity()
        (conv_pwl): QuantizedConv2d(672, 192, kernel_size=(1, 1), stride=(1, 1), scale=0.4806938171386719, zero_point=63)
        (bn3): Identity()
      )
      (1): InvertedResidual(
        (skip_add): QFunctional(
          scale=0.49192753434181213, zero_point=64
          (activation_post_process): Identity()
        )
        (conv_pw): ConvReLU6(
          (0): QuantizedConv2d(192, 1152, kernel_size=(1, 1), stride=(1, 1), scale=0.1888679713010788, zero_point=51)
          (1): QuantizedReLU6()
        )
        (bn1): Identity()
        (act1): Identity()
        (conv_dw): ConvReLU6(
          (0): QuantizedConv2d(1152, 1152, kernel_size=(5, 5), stride=(1, 1), scale=0.2976231873035431, zero_point=83, padding=(2, 2), groups=1152)
          (1): QuantizedReLU6()
        )
        (bn2): Identity()
        (act2): Identity()
        (conv_pwl): QuantizedConv2d(1152, 192, kernel_size=(1, 1), stride=(1, 1), scale=0.34456929564476013, zero_point=60)
        (bn3): Identity()
      )
      (2): InvertedResidual(
        (skip_add): QFunctional(
          scale=0.5567103624343872, zero_point=62
          (activation_post_process): Identity()
        )
        (conv_pw): ConvReLU6(
          (0): QuantizedConv2d(192, 1152, kernel_size=(1, 1), stride=(1, 1), scale=0.19077259302139282, zero_point=47)
          (1): QuantizedReLU6()
        )
        (bn1): Identity()
        (act1): Identity()
        (conv_dw): ConvReLU6(
          (0): QuantizedConv2d(1152, 1152, kernel_size=(5, 5), stride=(1, 1), scale=0.38248512148857117, zero_point=91, padding=(2, 2), groups=1152)
          (1): QuantizedReLU6()
        )
        (bn2): Identity()
        (act2): Identity()
        (conv_pwl): QuantizedConv2d(1152, 192, kernel_size=(1, 1), stride=(1, 1), scale=0.2738204598426819, zero_point=65)
        (bn3): Identity()
      )
      (3): InvertedResidual(
        (skip_add): QFunctional(
          scale=0.6205083727836609, zero_point=62
          (activation_post_process): Identity()
        )
        (conv_pw): ConvReLU6(
          (0): QuantizedConv2d(192, 1152, kernel_size=(1, 1), stride=(1, 1), scale=0.15164275467395782, zero_point=59)
          (1): QuantizedReLU6()
        )
        (bn1): Identity()
        (act1): Identity()
        (conv_dw): ConvReLU6(
          (0): QuantizedConv2d(1152, 1152, kernel_size=(5, 5), stride=(1, 1), scale=0.29384535551071167, zero_point=80, padding=(2, 2), groups=1152)
          (1): QuantizedReLU6()
        )
        (bn2): Identity()
        (act2): Identity()
        (conv_pwl): QuantizedConv2d(1152, 192, kernel_size=(1, 1), stride=(1, 1), scale=0.24689887464046478, zero_point=63)
        (bn3): Identity()
      )
    )
    (6): Sequential(
      (0): InvertedResidual(
        (skip_add): QFunctional(
          scale=1.0, zero_point=0
          (activation_post_process): Identity()
        )
        (conv_pw): ConvReLU6(
          (0): QuantizedConv2d(192, 1152, kernel_size=(1, 1), stride=(1, 1), scale=0.20717555284500122, zero_point=64)
          (1): QuantizedReLU6()
        )
        (bn1): Identity()
        (act1): Identity()
        (conv_dw): ConvReLU6(
          (0): QuantizedConv2d(1152, 1152, kernel_size=(3, 3), stride=(1, 1), scale=0.3554805517196655, zero_point=68, padding=(1, 1), groups=1152)
          (1): QuantizedReLU6()
        )
        (bn2): Identity()
        (act2): Identity()
        (conv_pwl): QuantizedConv2d(1152, 320, kernel_size=(1, 1), stride=(1, 1), scale=0.2588821351528168, zero_point=63)
        (bn3): Identity()
      )
    )
  )
  (conv_head): ConvReLU6(
    (0): QuantizedConv2d(320, 1280, kernel_size=(1, 1), stride=(1, 1), scale=0.2839420437812805, zero_point=80)
    (1): QuantizedReLU6(inplace=True)
  )
  (bn2): Identity()
  (act2): Identity()
  (global_pool): SelectAdaptivePool2d (output_size=1, pool_type=avg)
  (quant): Quantize(scale=tensor([0.0374]), zero_point=tensor([57]), dtype=torch.quint8)
  (dequant): DeQuantize()
  (classifier): QuantizedLinear(in_features=1280, out_features=1000, scale=0.14930474758148193, zero_point=34, qscheme=torch.per_channel_affine)
)

Is there anything I’m missing? I can provide the conversion code and other information if needed.

Thanks in advance!

We just released Numeric Suite as prototype feature in PyTorch 1.6 to support quantization debugging, you can try it out to see which layer is problematic. The tutorial can be found at: https://pytorch.org/tutorials/prototype/numeric_suite_tutorial.html

Thanks!
I tried checking the quantization error for each of the layers and got the following:

conv_stem.0.weight tensor(44.4819)
blocks.0.0.conv_dw.0.weight tensor(45.0884)
blocks.0.0.conv_pw.weight tensor(42.8196)
blocks.1.0.conv_pw.0.weight tensor(43.1310)
blocks.1.0.conv_dw.0.weight tensor(46.9183)
blocks.1.0.conv_pwl.weight tensor(43.1703)
blocks.1.1.conv_pw.0.weight tensor(44.4646)
blocks.1.1.conv_dw.0.weight tensor(45.7783)
blocks.1.1.conv_pwl.weight tensor(39.9211)
blocks.2.0.conv_pw.0.weight tensor(44.0625)
blocks.2.0.conv_dw.0.weight tensor(45.3749)
blocks.2.0.conv_pwl.weight tensor(41.9430)
blocks.2.1.conv_pw.0.weight tensor(43.8883)
blocks.2.1.conv_dw.0.weight tensor(42.4965)
blocks.2.1.conv_pwl.weight tensor(40.5602)
blocks.3.0.conv_pw.0.weight tensor(43.9803)
blocks.3.0.conv_dw.0.weight tensor(47.7440)
blocks.3.0.conv_pwl.weight tensor(41.9959)
blocks.3.1.conv_pw.0.weight tensor(43.2630)
blocks.3.1.conv_dw.0.weight tensor(45.7537)
blocks.3.1.conv_pwl.weight tensor(41.7492)
blocks.3.2.conv_pw.0.weight tensor(43.5795)
blocks.3.2.conv_dw.0.weight tensor(45.5840)
blocks.3.2.conv_pwl.weight tensor(41.2215)
blocks.4.0.conv_pw.0.weight tensor(42.7768)
blocks.4.0.conv_dw.0.weight tensor(41.5424)
blocks.4.0.conv_pwl.weight tensor(41.2056)
blocks.4.1.conv_pw.0.weight tensor(43.2486)
blocks.4.1.conv_dw.0.weight tensor(43.3677)
blocks.4.1.conv_pwl.weight tensor(41.5483)
blocks.4.2.conv_pw.0.weight tensor(43.2695)
blocks.4.2.conv_dw.0.weight tensor(43.2045)
blocks.4.2.conv_pwl.weight tensor(41.8538)
blocks.5.0.conv_pw.0.weight tensor(42.5763)
blocks.5.0.conv_dw.0.weight tensor(46.0717)
blocks.5.0.conv_pwl.weight tensor(41.6060)
blocks.5.1.conv_pw.0.weight tensor(42.4102)
blocks.5.1.conv_dw.0.weight tensor(44.6428)
blocks.5.1.conv_pwl.weight tensor(40.9154)
blocks.5.2.conv_pw.0.weight tensor(42.4992)
blocks.5.2.conv_dw.0.weight tensor(44.1465)
blocks.5.2.conv_pwl.weight tensor(40.3739)
blocks.5.3.conv_pw.0.weight tensor(42.2826)
blocks.5.3.conv_dw.0.weight tensor(44.1184)
blocks.5.3.conv_pwl.weight tensor(40.7068)
blocks.6.0.conv_pw.0.weight tensor(42.2656)
blocks.6.0.conv_dw.0.weight tensor(47.4642)
blocks.6.0.conv_pwl.weight tensor(41.3921)
conv_head.0.weight tensor(42.7725)
classifier._packed_params._packed_params tensor(39.3391)

I’m not sure if these are large values or standard for this type of quantization, but it doesn’t seem that a specific layer is significantly worse than others.
Maybe it is something regarding the AdaptiveAvgPool2d?
I saw there were changes to it in the release notes.

I just ran the activation comparison suggested in the guide provided, and got:

conv_stem.0.stats tensor(28.3666)
conv_stem.1.stats tensor(28.3666)
blocks.0.0.conv_dw.0.stats tensor(16.1361)
blocks.0.0.conv_dw.1.stats tensor(16.1361)
blocks.0.0.conv_pw.stats tensor(8.5438)
blocks.1.0.conv_pw.0.stats tensor(7.0812)
blocks.1.0.conv_pw.1.stats tensor(10.7929)
blocks.1.0.conv_dw.0.stats tensor(10.3284)
blocks.1.0.conv_dw.1.stats tensor(11.8796)
blocks.1.0.conv_pwl.stats tensor(6.0492)
blocks.1.1.conv_pw.0.stats tensor(9.7360)
blocks.1.1.conv_pw.1.stats tensor(11.2618)
blocks.1.1.conv_dw.0.stats tensor(8.9654)
blocks.1.1.conv_dw.1.stats tensor(9.1349)
blocks.1.1.conv_pwl.stats tensor(5.3888)
blocks.2.0.conv_pw.0.stats tensor(8.9415)
blocks.2.0.conv_pw.1.stats tensor(10.1787)
blocks.2.0.conv_dw.0.stats tensor(12.5325)
blocks.2.0.conv_dw.1.stats tensor(14.5331)
blocks.2.0.conv_pwl.stats tensor(5.8452)
blocks.2.1.conv_pw.0.stats tensor(10.9424)
blocks.2.1.conv_pw.1.stats tensor(11.9166)
blocks.2.1.conv_dw.0.stats tensor(10.7086)
blocks.2.1.conv_dw.1.stats tensor(11.7042)
blocks.2.1.conv_pwl.stats tensor(3.9516)
blocks.3.0.conv_pw.0.stats tensor(7.9058)
blocks.3.0.conv_pw.1.stats tensor(8.7798)
blocks.3.0.conv_dw.0.stats tensor(13.6778)
blocks.3.0.conv_dw.1.stats tensor(15.0221)
blocks.3.0.conv_pwl.stats tensor(7.0661)
blocks.3.1.conv_pw.0.stats tensor(11.2245)
blocks.3.1.conv_pw.1.stats tensor(12.1855)
blocks.3.1.conv_dw.0.stats tensor(10.3169)
blocks.3.1.conv_dw.1.stats tensor(7.3186)
blocks.3.1.conv_pwl.stats tensor(5.9016)
blocks.3.2.conv_pw.0.stats tensor(10.9814)
blocks.3.2.conv_pw.1.stats tensor(12.2782)
blocks.3.2.conv_dw.0.stats tensor(11.5729)
blocks.3.2.conv_dw.1.stats tensor(6.8540)
blocks.3.2.conv_pwl.stats tensor(4.0227)
blocks.4.0.conv_pw.0.stats tensor(9.5918)
blocks.4.0.conv_pw.1.stats tensor(10.4552)
blocks.4.0.conv_dw.0.stats tensor(11.8454)
blocks.4.0.conv_dw.1.stats tensor(12.2951)
blocks.4.0.conv_pwl.stats tensor(4.5780)
blocks.4.1.conv_pw.0.stats tensor(9.8242)
blocks.4.1.conv_pw.1.stats tensor(9.5439)
blocks.4.1.conv_dw.0.stats tensor(12.6775)
blocks.4.1.conv_dw.1.stats tensor(10.9211)
blocks.4.1.conv_pwl.stats tensor(2.9198)
blocks.4.2.conv_pw.0.stats tensor(9.9729)
blocks.4.2.conv_pw.1.stats tensor(9.4751)
blocks.4.2.conv_dw.0.stats tensor(14.5569)
blocks.4.2.conv_dw.1.stats tensor(12.2109)
blocks.4.2.conv_pwl.stats tensor(3.3256)
blocks.5.0.conv_pw.0.stats tensor(10.7336)
blocks.5.0.conv_pw.1.stats tensor(9.2929)
blocks.5.0.conv_dw.0.stats tensor(19.4747)
blocks.5.0.conv_dw.1.stats tensor(21.1074)
blocks.5.0.conv_pwl.stats tensor(8.3158)
blocks.5.1.conv_pw.0.stats tensor(12.8702)
blocks.5.1.conv_pw.1.stats tensor(12.2446)
blocks.5.1.conv_dw.0.stats tensor(14.1980)
blocks.5.1.conv_dw.1.stats tensor(12.0078)
blocks.5.1.conv_pwl.stats tensor(7.1764)
blocks.5.2.conv_pw.0.stats tensor(13.4789)
blocks.5.2.conv_pw.1.stats tensor(12.8941)
blocks.5.2.conv_dw.0.stats tensor(15.1403)
blocks.5.2.conv_dw.1.stats tensor(13.3021)
blocks.5.2.conv_pwl.stats tensor(6.3677)
blocks.5.3.conv_pw.0.stats tensor(13.3304)
blocks.5.3.conv_pw.1.stats tensor(13.2739)
blocks.5.3.conv_dw.0.stats tensor(16.0722)
blocks.5.3.conv_dw.1.stats tensor(14.6379)
blocks.5.3.conv_pwl.stats tensor(8.0309)
blocks.6.0.conv_pw.0.stats tensor(12.9786)
blocks.6.0.conv_pw.1.stats tensor(13.6662)
blocks.6.0.conv_dw.0.stats tensor(16.3897)
blocks.6.0.conv_dw.1.stats tensor(17.3638)
blocks.6.0.conv_pwl.stats tensor(6.5583)
conv_head.0.stats tensor(3.8746)
conv_head.1.stats tensor(3.8746)
quant.stats tensor(34.5170)
classifier.stats tensor(6.9768)

Is there anything suspicious here?

I also calculated the cosine similarity of the activations in each of the layers:

conv_stem.0.stats tensor(0.9992)
conv_stem.1.stats tensor(0.9992)
blocks.0.0.conv_dw.0.stats tensor(0.9882)
blocks.0.0.conv_dw.1.stats tensor(0.9882)
blocks.0.0.conv_pw.stats tensor(0.9376)
blocks.1.0.conv_pw.0.stats tensor(0.9126)
blocks.1.0.conv_pw.1.stats tensor(0.9569)
blocks.1.0.conv_dw.0.stats tensor(0.9549)
blocks.1.0.conv_dw.1.stats tensor(0.9677)
blocks.1.0.conv_pwl.stats tensor(0.8856)
blocks.1.1.conv_pw.0.stats tensor(0.9488)
blocks.1.1.conv_pw.1.stats tensor(0.9625)
blocks.1.1.conv_dw.0.stats tensor(0.9364)
blocks.1.1.conv_dw.1.stats tensor(0.9385)
blocks.1.1.conv_pwl.stats tensor(0.8623)
blocks.2.0.conv_pw.0.stats tensor(0.9364)
blocks.2.0.conv_pw.1.stats tensor(0.9518)
blocks.2.0.conv_dw.0.stats tensor(0.9711)
blocks.2.0.conv_dw.1.stats tensor(0.9819)
blocks.2.0.conv_pwl.stats tensor(0.8685)
blocks.2.1.conv_pw.0.stats tensor(0.9585)
blocks.2.1.conv_pw.1.stats tensor(0.9671)
blocks.2.1.conv_dw.0.stats tensor(0.9565)
blocks.2.1.conv_dw.1.stats tensor(0.9647)
blocks.2.1.conv_pwl.stats tensor(0.7922)
blocks.3.0.conv_pw.0.stats tensor(0.9168)
blocks.3.0.conv_pw.1.stats tensor(0.9344)
blocks.3.0.conv_dw.0.stats tensor(0.9773)
blocks.3.0.conv_dw.1.stats tensor(0.9831)
blocks.3.0.conv_pwl.stats tensor(0.8967)
blocks.3.1.conv_pw.0.stats tensor(0.9597)
blocks.3.1.conv_pw.1.stats tensor(0.9683)
blocks.3.1.conv_dw.0.stats tensor(0.9532)
blocks.3.1.conv_dw.1.stats tensor(0.8986)
blocks.3.1.conv_pwl.stats tensor(0.8574)
blocks.3.2.conv_pw.0.stats tensor(0.9549)
blocks.3.2.conv_pw.1.stats tensor(0.9664)
blocks.3.2.conv_dw.0.stats tensor(0.9599)
blocks.3.2.conv_dw.1.stats tensor(0.8697)
blocks.3.2.conv_pwl.stats tensor(0.7916)
blocks.4.0.conv_pw.0.stats tensor(0.9387)
blocks.4.0.conv_pw.1.stats tensor(0.9521)
blocks.4.0.conv_dw.0.stats tensor(0.9650)
blocks.4.0.conv_dw.1.stats tensor(0.9685)
blocks.4.0.conv_pwl.stats tensor(0.8268)
blocks.4.1.conv_pw.0.stats tensor(0.9460)
blocks.4.1.conv_pw.1.stats tensor(0.9414)
blocks.4.1.conv_dw.0.stats tensor(0.9698)
blocks.4.1.conv_dw.1.stats tensor(0.9566)
blocks.4.1.conv_pwl.stats tensor(0.7595)
blocks.4.2.conv_pw.0.stats tensor(0.9490)
blocks.4.2.conv_pw.1.stats tensor(0.9423)
blocks.4.2.conv_dw.0.stats tensor(0.9809)
blocks.4.2.conv_dw.1.stats tensor(0.9683)
blocks.4.2.conv_pwl.stats tensor(0.7715)
blocks.5.0.conv_pw.0.stats tensor(0.9567)
blocks.5.0.conv_pw.1.stats tensor(0.9359)
blocks.5.0.conv_dw.0.stats tensor(0.9930)
blocks.5.0.conv_dw.1.stats tensor(0.9949)
blocks.5.0.conv_pwl.stats tensor(0.9064)
blocks.5.1.conv_pw.0.stats tensor(0.9649)
blocks.5.1.conv_pw.1.stats tensor(0.9595)
blocks.5.1.conv_dw.0.stats tensor(0.9741)
blocks.5.1.conv_dw.1.stats tensor(0.9583)
blocks.5.1.conv_pwl.stats tensor(0.8771)
blocks.5.2.conv_pw.0.stats tensor(0.9700)
blocks.5.2.conv_pw.1.stats tensor(0.9648)
blocks.5.2.conv_dw.0.stats tensor(0.9810)
blocks.5.2.conv_dw.1.stats tensor(0.9709)
blocks.5.2.conv_pwl.stats tensor(0.8566)
blocks.5.3.conv_pw.0.stats tensor(0.9687)
blocks.5.3.conv_pw.1.stats tensor(0.9679)
blocks.5.3.conv_dw.0.stats tensor(0.9842)
blocks.5.3.conv_dw.1.stats tensor(0.9755)
blocks.5.3.conv_pwl.stats tensor(0.8879)
blocks.6.0.conv_pw.0.stats tensor(0.9637)
blocks.6.0.conv_pw.1.stats tensor(0.9655)
blocks.6.0.conv_dw.0.stats tensor(0.9802)
blocks.6.0.conv_dw.1.stats tensor(0.9842)
blocks.6.0.conv_pwl.stats tensor(0.8535)
conv_head.0.stats tensor(0.6853)
conv_head.1.stats tensor(0.6853)
quant.stats tensor(0.9998)
classifier.stats tensor(0.7695)

it seems as if the conv_pwl layers are mostly different, along with the conv_head and classifier.
What might cause that?

@kfir_goldberg Did you solve it? I’m planning to quantize an EfficientNet-Lite0 for smartphone deployment. I chose EfficientNet-Lite as it seems to be quantization friendly, or at least, that’s what Google claims.

Thank you

Hi,
Unfortunately, I didn’t solve it. One of the issues that bothered me is that fusing Conv-Bn-Act is not possible with Relu6 as of now so I had to implement it myself, and I’m unsure if it worked right.
The best I managed to do is get about a 5% drop in accuracy (75% to 70%), and I eventually stopped trying.

we typically replace relu6 with relu, e.g.: https://github.com/pytorch/vision/blob/master/torchvision/models/quantization/mobilenet.py#L75

also did you try quantization aware training? https://pytorch.org/tutorials/advanced/static_quantization_tutorial.html#quantization-aware-training

@jerryzh168 I don’t see how replacing relu6 with relu doesn’t translate to a performance drop. It may work with MobileNetV2 but it isn’t a general approach.

Also, I don’t understand why PyTorch doesn’t offer ConvReLU2d-style modules for relu6 and hardswish. They are a common pattern for nets that run on low-end or IoT devices like smartphones. At least a ConvReLU6_2d for relu6, as it behaves like a relu.

Moreover, I think it’s worth adding both patterns since, as the RegNet paper says, “We find that Swish outperforms ReLU at low flops, but ReLU is better at high flops. Interestingly, if g is restricted to be 1 (depthwise conv), Swish performs much better than ReLU. This suggests that depthwise conv and Swish interact favorably, although the underlying reason is not at all clear.” So, for lower flops and quantized networks you may want to use hardswish instead of relu as the activation.

1 Like

I see, I’m not sure why we don’t support ConvReLU6_2d, maybe @raghuramank100 knows.

Hi @kfir_goldberg,
I’m about to quantize EfficientNet-Lite as well. Could you please share the conversion code? Maybe I’ll find something. That would save me some time on the conversion so that I could concentrate on getting similar accuracy for both versions.

Thanks
Tomek

This is the code I used to support ConvBnReLU6 and ConvReLU6

def fuse_model(model):
    """Fuse Conv-BN-(ReLU6) sequences of an EfficientNet-Lite model in place.

    Walks the module tree and fuses each block's conv/bn/activation triplet
    (or conv/bn pair for the projection convs, which have no activation)
    using the ReLU6-aware ``fuse_known_modules_mod`` fuser, then fuses the
    stem and head convolutions at the top level.

    Args:
        model: an EfficientNet-Lite ``nn.Module``. The modules should all be
            in the same train/eval mode; BN folding only happens in eval
            mode (see ``fuse_conv_bn_relu6``).
    """
    for m in model.modules():
        # Exact type match (not isinstance) so subclasses with a different
        # layer layout are not fused by accident.
        if type(m) == DepthwiseSeparableConv:
            torch.quantization.fuse_modules(m, ['conv_dw', 'bn1', 'act1'], inplace=True,
                                            fuser_func=fuse_known_modules_mod)
            # conv_pw has no fused activation here: only the conv/bn pair.
            torch.quantization.fuse_modules(m, ['conv_pw', 'bn2'], inplace=True,
                                            fuser_func=fuse_known_modules_mod)
        elif type(m) == InvertedResidual:
            torch.quantization.fuse_modules(m, ['conv_pw', 'bn1', 'act1'], inplace=True,
                                            fuser_func=fuse_known_modules_mod)
            torch.quantization.fuse_modules(m, ['conv_dw', 'bn2', 'act2'], inplace=True,
                                            fuser_func=fuse_known_modules_mod)
            # Linear projection conv: conv/bn only, no activation follows.
            torch.quantization.fuse_modules(m, ['conv_pwl', 'bn3'], inplace=True, fuser_func=fuse_known_modules_mod)
    torch.quantization.fuse_modules(model, ['conv_head', 'bn2', 'act2'], inplace=True,
                                    fuser_func=fuse_known_modules_mod)
    torch.quantization.fuse_modules(model, ['conv_stem', 'bn1', 'act1'], inplace=True,
                                    fuser_func=fuse_known_modules_mod)


def fuse_known_modules_mod(mod_list):
    """Custom ``fuser_func`` that maps a module sequence to its fused form.

    Supported patterns: conv+bn, conv+bn+relu, conv+bn+relu6 (plain or
    same-padding conv), conv+relu, conv+relu6 and linear+relu. The fused
    module occupies the first slot of the returned list; every remaining
    slot becomes ``nn.Identity`` so the parent module keeps its structure.

    Raises:
        NotImplementedError: if the sequence of module types is not one of
            the supported patterns.
    """

    fuser_map = {
        (torch.nn.Conv2d, torch.nn.BatchNorm2d): fuse_conv_bn,
        (torch.nn.Conv2d, torch.nn.BatchNorm2d, torch.nn.ReLU): fuse_conv_bn_relu,
        (torch.nn.Conv2d, torch.nn.BatchNorm2d, torch.nn.ReLU6): fuse_conv_bn_relu6,
        (Conv2dSame, torch.nn.BatchNorm2d, torch.nn.ReLU6): fuse_conv_bn_relu6,
        (torch.nn.Conv2d, torch.nn.ReLU): torch.nn.intrinsic.ConvReLU2d,
        (torch.nn.Conv2d, torch.nn.ReLU6): ConvReLU6,
        (torch.nn.Linear, torch.nn.ReLU): torch.nn.intrinsic.LinearReLU,
    }

    key = tuple(type(m) for m in mod_list)
    fuser_method = fuser_map.get(key)
    if fuser_method is None:
        raise NotImplementedError("Cannot fuse modules: {}".format(key))

    # Slot 0 carries the fused op; the rest become identities whose
    # training flag mirrors the original sequence.
    fused = [torch.nn.Identity() for _ in mod_list]
    fused[0] = fuser_method(*mod_list)
    for identity in fused[1:]:
        identity.training = mod_list[0].training

    return fused


class ConvReLU6(nn.Sequential):
    """Sequential pairing of a conv layer and a ReLU6 activation.

    Plays the role of ``torch.nn.intrinsic.ConvReLU2d`` for ReLU6, so the
    pair can be treated as one fusable unit during quantization.
    """

    def __init__(self, conv, relu6):
        super().__init__(conv, relu6)


class ConvBnReLU6(torch.nn.Sequential):
    """Sequential container for a Conv-BatchNorm-ReLU6 triplet.

    Used as the unfused (training-mode) form; in eval mode
    ``fuse_conv_bn_relu6`` folds the BN into the conv and produces a
    ``ConvReLU6`` instead.
    """

    def __init__(self, conv, bn, relu6):
        super().__init__(conv, bn, relu6)


def fuse_conv_bn_relu6(conv, bn, relu6):
    """Fuse a Conv2d + BatchNorm2d + ReLU6 triple into a single module.

    In eval mode the BN statistics are folded into the conv weights via
    ``torch.nn.utils.fusion.fuse_conv_bn_eval`` and a ``ConvReLU6`` pair is
    returned; in training mode no folding happens and the three modules are
    simply wrapped in a ``ConvBnReLU6`` container.

    Args:
        conv: the convolution module.
        bn: the batch-norm module that follows it.
        relu6: the ReLU6 activation that follows the batch norm.

    Returns:
        ``ConvBnReLU6`` in training mode, ``ConvReLU6`` in eval mode.

    Raises:
        AssertionError: if the three modules are not all in the same
            train/eval mode.
    """
    # Original message only mentioned Conv and BN although all three
    # modules are checked; the message now matches the condition.
    assert (conv.training == bn.training == relu6.training), \
        "Conv, BN and ReLU6 must all be in the same mode (train or eval)."

    if conv.training:
        return ConvBnReLU6(conv, bn, relu6)
    else:
        return ConvReLU6(
            torch.nn.utils.fusion.fuse_conv_bn_eval(conv, bn), relu6)
1 Like

Thanks @kfir_goldberg,
do I understand correctly that you’ve created a new model class that inherited from timm.models.efficientnet_lite0 and 1) placed your fuse_model() there 2) modified forward method to add quantization support? Can you share code of this class? I’m not sure where exactly to put quant/dequant.

Then you made:

model_.qconfig = torch.quantization.get_default_qconfig('fbgemm')
torch.quantization.prepare(model_, inplace=True)

and finally

evaluate(model_, criterion, train_loader, neval_batches=num_calibration_batches)
torch.quantization.convert(model_, inplace=True)

to calibrate the model with training set and convert, right?

I’m asking because when calibrating model (evaluate) I got:

RuntimeError: Could not run 'quantized::conv2d.new' with arguments from the 'CPU' backend. 'quantized::conv2d.new' is only available for these backends: [QuantizedCPU].

and I found here that this is probably connected with the fact that QuantStub is not placed in the right place.

Have you followed this tutorial? They say that when performance drops per channel quantization may be needed or Quantization-aware training.

Many thanks!