How to export a correct quantized model to onnx format

# Two loaders over the same validation set: one for the PyTorch runs and one
# for the ONNX Runtime run. They differ only in the worker count.
print('args.workers = {}'.format(args.workers))
val_loader = torch.utils.data.DataLoader(
    val_dataset, batch_size=1000, shuffle=False,
    num_workers=args.workers, pin_memory=True, sampler=val_sampler)

val_loader_onnx = torch.utils.data.DataLoader(
    val_dataset, batch_size=1000, shuffle=False,
    num_workers=16, pin_memory=True, sampler=val_sampler)


criterion = nn.CrossEntropyLoss().to(device)


# Baseline: evaluate the QAT model before conversion.
print('QAT Model In Pytorch (SimQuant):')
validate(val_loader, model, criterion, args)

# Convert a CPU copy so the original `model` is left untouched.
# NOTE(review): `model.module` implies `model` is a wrapper such as
# DataParallel/DistributedDataParallel -- confirm against the setup code.
quantized_eval_model = copy.deepcopy(model.module)
quantized_eval_model.eval()
quantized_eval_model.to(torch.device("cpu"))
torch.ao.quantization.convert(quantized_eval_model, inplace=True)


# Reference accuracy of the converted (quantized) model in PyTorch on CPU.
print('Converted Model in Pytorch (Quantized):')
acc1_quantized = validate_cpu(val_loader, quantized_eval_model, criterion, args)

############################################### Here we convert quantized_model to onnx format ########
import onnx
import onnxsim

# Despite its name, `input_size` is the example input tensor used for tracing;
# its shape is also what the simplifier is told to assume for 'input'.
input_size = torch.randn(1000, 3, 224, 224)

# One forward pass on the example input before exporting (result is discarded).
quantized_eval_model(input_size)

# Export with the trained weights embedded. `dynamic_axes` is not passed, so
# the exported graph uses the shape of the example input
# (see torch.onnx.export docs -- TODO confirm a fixed batch of 1000 is intended).
torch.onnx.export(
    quantized_eval_model,
    input_size,
    './quantized_mobilenetv3_qat.onnx',
    export_params=True,
    opset_version=18,
    input_names=['input'],
    output_names=['output'],
)

# Reload the exported file, validate it, then simplify it with onnxsim.
model_onnx = onnx.load('./quantized_mobilenetv3_qat.onnx')
onnx.checker.check_model(model_onnx)
model_onnx, check = onnxsim.simplify(
    model_onnx,
    dynamic_input_shape=False,
    overwrite_input_shapes={'input': list(input_size.shape)},
)
assert check, 'assert check failed'

# The simplified graph is written to a separate file; the raw export is kept.
onnx.save(model_onnx, 'quantized_mobilenetv3_sim_qat.onnx')
print(" ")
print('Model has been converted to onnx successfully.')
#######################################################################################################

################################## do onnx-runtime inference #########################################
import onnx
import onnxruntime as ort
import numpy as np

print('onnx runtime gpu with CUDA')

# Graph optimizations are fully disabled for this session.
sessionOption = ort.SessionOptions()
sessionOption.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL

# Load the simplified model; the CUDA provider is listed first, CPU second.
sess = ort.InferenceSession(
    "./quantized_mobilenetv3_sim_qat.onnx",
    sess_options=sessionOption,
    providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
input_name = sess.get_inputs()[0].name
output_name = sess.get_outputs()[0].name
output_shape = sess.get_outputs()[0].shape

print('Converted Model in Onnx (Quantized):')
validate_onnx(val_loader_onnx, sess, criterion, args)

#######################################################################################################

def validate_onnx(val_loader, ort_session, criterion, args):
    """Evaluate an ONNX Runtime session over ``val_loader``.

    Each batch is moved to the CPU, converted to a NumPy array and fed to
    ``ort_session``. The first returned output is turned back into a tensor
    and scored with ``criterion`` and ``accuracy``.

    Args:
        val_loader: Iterable of ``(images, target)`` batches; ``len()`` is
            taken for the progress display.
        ort_session: Object providing ``get_inputs()``, ``get_outputs()`` and
            ``run()``; only its first input and first output are used.
        criterion: Callable ``criterion(output, target)`` whose result
            supports ``.item()``.
        args: Object providing ``print_freq``.

    Returns:
        ``top1.avg`` of the 'Acc@1' meter after all batches are processed.
    """
    batch_time = AverageMeter('Time', ':6.3f', Summary.NONE)
    losses = AverageMeter('Loss', ':.4e', Summary.NONE)
    top1 = AverageMeter('Acc@1', ':6.2f', Summary.AVERAGE)
    top5 = AverageMeter('Acc@5', ':6.2f', Summary.AVERAGE)
    progress = ProgressMeter(
        len(val_loader),
        [batch_time, losses, top1, top5],
        prefix='Test_ONNX: ')

    # The graph's I/O names do not change between batches, so look them up once.
    input_name = ort_session.get_inputs()[0].name
    output_name = ort_session.get_outputs()[0].name

    def run_validate(loader, base_progress=0):
        with torch.no_grad():
            end = time.time()
            for i, (images, target) in enumerate(loader):
                i = base_progress + i
                images = images.to('cpu')
                target = target.to('cpu')

                # compute output: the session is fed a NumPy array
                input_data = images.detach().cpu().numpy()
                res = ort_session.run([output_name], {input_name: input_data})

                # `res` holds one entry per requested output; only one was requested.
                output_tensor = torch.from_numpy(res[0])

                loss = criterion(output_tensor, target)

                # measure accuracy and record loss
                acc1, acc5 = accuracy(output_tensor, target, topk=(1, 5))
                losses.update(loss.item(), images.size(0))
                top1.update(acc1[0], images.size(0))
                top5.update(acc5[0], images.size(0))

                # measure elapsed time
                batch_time.update(time.time() - end)
                end = time.time()

                if i % args.print_freq == 0:
                    progress.display(i + 1)

    run_validate(val_loader)
    progress.display_summary()

    return top1.avg

def validate(val_loader, model, criterion, args):
    """Evaluate ``model`` over ``val_loader`` and return the Acc@1 average.

    Batches are moved to CUDA when ``torch.cuda.is_available()`` is true;
    ``model`` itself is not moved here.

    Args:
        val_loader: Iterable of ``(images, target)`` batches; ``len()`` is
            taken for the progress display.
        model: Callable module; put into eval mode before the loop.
        criterion: Callable ``criterion(output, target)`` whose result
            supports ``.item()``.
        args: Object providing ``print_freq``.

    Returns:
        ``avg`` of the 'Acc@1' meter after all batches are processed.
    """
    time_meter = AverageMeter('Time', ':6.3f', Summary.NONE)
    loss_meter = AverageMeter('Loss', ':.4e', Summary.NONE)
    top1_meter = AverageMeter('Acc@1', ':6.2f', Summary.AVERAGE)
    top5_meter = AverageMeter('Acc@5', ':6.2f', Summary.AVERAGE)
    progress = ProgressMeter(
        len(val_loader),
        [time_meter, loss_meter, top1_meter, top5_meter],
        prefix='Test: ')

    def run_validate(loader, base_progress=0):
        with torch.no_grad():
            tick = time.time()
            for step, (images, target) in enumerate(loader, start=base_progress):
                if torch.cuda.is_available():
                    images = images.cuda()
                    target = target.cuda()

                # Forward pass and loss.
                output = model(images)
                loss = criterion(output, target)

                # Record loss and top-1 / top-5 accuracy, weighted by batch size.
                batch_size = images.size(0)
                acc1, acc5 = accuracy(output, target, topk=(1, 5))
                loss_meter.update(loss.item(), batch_size)
                top1_meter.update(acc1[0], batch_size)
                top5_meter.update(acc5[0], batch_size)

                # Time spent on this batch.
                time_meter.update(time.time() - tick)
                tick = time.time()

                if step % args.print_freq == 0:
                    progress.display(step + 1)

    # Evaluation mode for the whole run.
    model.eval()

    run_validate(val_loader)
    progress.display_summary()

    return top1_meter.avg

def validate_cpu(val_loader, model, criterion, args):
    """Evaluate ``model`` on the CPU over ``val_loader``.

    The model and every batch are moved to ``'cpu'`` before inference.

    Args:
        val_loader: Iterable of ``(images, target)`` batches; ``len()`` is
            taken for the progress display.
        model: Module supporting ``.to()``, ``.eval()`` and being called on a
            batch of images.
        criterion: Callable ``criterion(output, target)`` whose result
            supports ``.item()``.
        args: Object providing ``print_freq``.

    Returns:
        ``avg`` of the 'Acc@1' meter after all batches are processed.
    """
    time_meter = AverageMeter('Time', ':6.3f', Summary.NONE)
    loss_meter = AverageMeter('Loss', ':.4e', Summary.NONE)
    top1_meter = AverageMeter('Acc@1', ':6.2f', Summary.AVERAGE)
    top5_meter = AverageMeter('Acc@5', ':6.2f', Summary.AVERAGE)
    progress = ProgressMeter(
        len(val_loader),
        [time_meter, loss_meter, top1_meter, top5_meter],
        prefix='Test_quantized: ')

    def run_validate(loader, base_progress=0):
        with torch.no_grad():
            tick = time.time()
            for step, (images, target) in enumerate(loader, start=base_progress):
                images = images.to('cpu')
                target = target.to('cpu')

                # Forward pass and loss.
                output = model(images)
                loss = criterion(output, target)

                # Record loss and top-1 / top-5 accuracy, weighted by batch size.
                batch_size = images.size(0)
                acc1, acc5 = accuracy(output, target, topk=(1, 5))
                loss_meter.update(loss.item(), batch_size)
                top1_meter.update(acc1[0], batch_size)
                top5_meter.update(acc5[0], batch_size)

                # Time spent on this batch.
                time_meter.update(time.time() - tick)
                tick = time.time()

                if step % args.print_freq == 0:
                    progress.display(step + 1)

    # Run on the CPU in evaluation mode.
    model.to('cpu')
    model.eval()

    run_validate(val_loader)
    progress.display_summary()

    return top1_meter.avg

The log is as follows:

QAT Model In Pytorch (SimQuant):
Test: [ 1/50] Time 19.430 (19.430) Loss 4.4070e-01 (4.4070e-01) Acc@1 87.70 ( 87.70) Acc@5 97.40 ( 97.40)
Test: [11/50] Time 0.282 ( 2.129) Loss 7.0866e-01 (7.4780e-01) Acc@1 78.10 ( 80.42) Acc@5 97.00 ( 94.98)
Test: [21/50] Time 0.278 ( 1.248) Loss 1.1263e+00 (7.6287e-01) Acc@1 71.70 ( 79.76) Acc@5 91.20 ( 95.11)
Test: [31/50] Time 0.282 ( 0.936) Loss 1.0681e+00 (8.8677e-01) Acc@1 74.10 ( 77.07) Acc@5 90.90 ( 93.68)
Test: [41/50] Time 0.279 ( 0.776) Loss 1.1727e+00 (9.6547e-01) Acc@1 71.50 ( 75.39) Acc@5 90.10 ( 92.52)

  • Acc@1 74.776 Acc@5 92.252

Converted Model in Pytorch (Quantized):
Test_quantized: [ 1/50] Time 17.260 (17.260) Loss 4.7875e-01 (4.7875e-01) Acc@1 88.20 ( 88.20) Acc@5 97.00 ( 97.00)
Test_quantized: [11/50] Time 2.513 ( 4.091) Loss 7.3136e-01 (7.6658e-01) Acc@1 79.30 ( 80.25) Acc@5 96.70 ( 94.70)
Test_quantized: [21/50] Time 2.554 ( 3.346) Loss 1.2043e+00 (7.8450e-01) Acc@1 68.50 ( 79.26) Acc@5 90.70 ( 94.75)
Test_quantized: [31/50] Time 2.561 ( 3.079) Loss 1.0992e+00 (9.1164e-01) Acc@1 74.10 ( 76.61) Acc@5 90.00 ( 93.15)
Test_quantized: [41/50] Time 2.560 ( 2.950) Loss 1.2187e+00 (9.9316e-01) Acc@1 69.40 ( 74.74) Acc@5 89.70 ( 92.14)

  • Acc@1 74.184 Acc@5 91.934
    ============= Diagnostic Run torch.onnx.export version 2.0.1+cu117 =============
    verbose: False, log level: Level.ERROR
    ======================= 0 NONE 0 NOTE 0 WARNING 0 ERROR ========================

Model has been converted to onnx successfully.
onnx runtime gpu with CUDA
Converted Model in Onnx (Quantized):
Test_ONNX: [ 1/50] Time 24.736 (24.736) Loss 4.9879e-01 (4.9879e-01) Acc@1 86.10 ( 86.10) Acc@5 97.10 ( 97.10)
Test_ONNX: [11/50] Time 7.599 (10.646) Loss 8.6524e-01 (8.7809e-01) Acc@1 75.10 ( 77.11) Acc@5 95.30 ( 93.49)
Test_ONNX: [21/50] Time 7.718 ( 9.352) Loss 1.2917e+00 (8.8729e-01) Acc@1 64.70 ( 76.34) Acc@5 90.10 ( 93.80)
Test_ONNX: [31/50] Time 8.425 ( 8.953) Loss 1.1920e+00 (1.0216e+00) Acc@1 71.10 ( 73.64) Acc@5 89.40 ( 92.00)
Test_ONNX: [41/50] Time 7.646 ( 8.619) Loss 1.2383e+00 (1.0992e+00) Acc@1 68.30 ( 72.02) Acc@5 89.50 ( 90.94)

  • Acc@1 71.432 Acc@5 90.598

How do I export a correct quantized model to ONNX? The ONNX Runtime result (Acc@1 71.432) is about 3% lower than the quantized model in PyTorch (Acc@1 74.184). Why? Can anyone help me figure this out? Thanks in advance.

I figured out that the bias quantization needs care. Here the bias is the final bias, i.e. the bias from the fused CNN+BN layer.

It looks like the quantization part is working but the ONNX export is what's causing the issue. You may have better luck asking some of the ONNX folks, or opening a GitHub issue and tagging the onnx oncall, since I don't think anyone on the quantization team has much ONNX experience. If you want to debug this further, I'd probably export each module individually, convert it to ONNX, and compare the outputs.

1 Like

Thanks, I figured out what the problem was. It is an issue with the bias quantization: if the scale is extremely small, the quantized bias goes beyond the range of int32.

I encountered the same problem when exporting quantized parameters to ONNX. We can obtain the integer weights of the fused ConvBnReLU layer, but the bias remains float. Do you have any ideas?

@jinshubai, what was your fix? Manually upscaling, or forcing a custom bias for the quantization?