Does ONNX increase inference efficiency compared to pytorch model?

alireza_khatami · January 7, 2024, 12:54pm

So I have been using Hugginface wave2vecCTC for speech recognition. I want to do as much optimization as possible. One way I have found during my searches was to turn the model into ONNX. So I wrote a Python log script to keep track of GPU, CPU, and runtime duration, with different settings ( Half options-float16-, CPU or GPU, and different batch sizes). Oddly, the Pytorch model outperforms ONNX one. So my question is, is this normal, I thought ONNX is much more efficient when it comes to optimization and inference time. Here is the code I have been using for benchmarking( If you think I am assessing the performance wrong or something is wrong with my code please let me know) :

# from onnxconverter_common import auto_mixed_precision
import os
import onnxruntime
import torch
import psutil
import time
import threading
import pandas as pd
import librosa
import numpy as np
import matplotlib.pyplot as plt
from onnxconverter_common import float16
import onnx
import GPUtil
import argparse
import subprocess
def get_gpu_stats():
    command = "nvidia-smi --query-gpu=utilization.gpu,memory.used,memory.total --format=csv,nounits,noheader"#
    result = subprocess.run(command, stdout=subprocess.PIPE, shell=True, text=True)
    output_lines = result.stdout.strip().split('\n')
    gpu_stats = []
    for line in output_lines:
        values = line.split(',')
        utilization = float(values[0])
        memory_used = int(values[1])
        memory_total = int(values[2])
    # return output_lines[0].split(',')
    return [utilization , memory_used, memory_total]
def load_and_prepare_model_inputs (model_path, inputs_, export_onnx, halved, device):
    # Load the model from ONNX if it exists; otherwise, export and load it
    model = Wav2Vec2ForCTC.from_pretrained(model_path )

    temp  =  "halved" if halved else ""
    ONNX_Half_Dir = os.path.join("ONNX-Models",  f"model" + temp + ".onnx")
    ONNXmodelDir = os.path.join("ONNX-Models",  f"model.onnx")
    dtype = torch.float16 if halved else torch.float32
    inputs = inputs_.input_values
    masks = inputs_.attention_mask
    model.eval()

    if export_onnx :
        # dataType= torch.float16 if halved else  torch.float32

        if not os.path.exists(ONNXmodelDir):
            os.makedirs("ONNX-Models",exist_ok = True)
            dummy_input = torch.unsqueeze(inputs[0] , dim=0) 
            dummy_masks = torch.unsqueeze(masks[0].to(dummy_input.dtype) , dim=0) 
            torch.onnx.export(
                model,
                (dummy_input, dummy_masks),
                ONNXmodelDir,
                # f"model.onnx",
                input_names=["input", "masks"],
                output_names=["output"],
                dynamic_axes={"input": {0: "batch_size", 1: "sequence_length"},
                            "masks": {0: "batch_size", 1: "sequence_length"},
                            "output": {0: "batch_size", 1: "sequence_length"}},
                opset_version=11,  # You can adjust the opset version based on your needs
            )
            print ("Finished saving the ONNX model")
            if halved and not os.path.exists(ONNX_Half_Dir):
                model = onnx.load(ONNXmodelDir)
                model_fp16 = float16.convert_float_to_float16(model)
                onnx.save(model_fp16, ONNX_Half_Dir)
                del model_fp16
        options = onnxruntime.SessionOptions()
        options.enable_profiling = True
        provider  = 'CPUExecutionProvider' if device == 'cpu' else 'CUDAExecutionProvider'
        Dir = ONNX_Half_Dir if halved else ONNXmodelDir
        model = onnxruntime.InferenceSession(Dir
                                            , sess_options=options
                                            , providers= [provider])
        input_name = model.get_inputs()[0].name
        mask_name = model.get_inputs()[1].name
        output_name = model.get_outputs()[0].name
        inputs = [[output_name], {input_name: inputs.to(dtype).numpy(),
                    mask_name: inputs.to(dtype).numpy()}]
        print("#############################\n Finished loading  the ONNX model and the Inputs\n#############################")
    #transfer the model and data to cpu or gpu
    else:
        # if not halved:
        #
model = model.to(torch.float32)
        model = model.half() if halved else model 
        inputs = inputs.half() if halved else inputs 
        masks = masks.half() if halved else masks 
        model = model.to(device)
        inputs = [inputs.to(device), masks.to(device)]
        print("#############################\nFinished loading  the Pytorch model and the Inputs\n#############################")

    return model, inputs

# Function to perform inference and measure time and memory usage
def inference(model, inputs, device, halved, batch_size, export_onnx=False):
    fileNames , inputs = inputs
    gpu_first_stat = get_gpu_stats()
    result = {  'time': 0,
                # 'gpu_memory_reserved_amount': [getMB(torch.cuda.memory_reserved()) if device == 'cuda' else 0],
                # 'gpu_memory_util_amount': [getMB(torch.cuda.memory_allocated()) if device == 'cuda' else 0],
                'gpu_utilization':[gpu_first_stat[0]],
                'gpu_memory_used':[gpu_first_stat[1]],
                'gpu_memory_total':[gpu_first_stat[2]],
                'cpu_memory_util_percent': [round(psutil.virtual_memory().percent,3)],
                'cpu_util_percent': [round(psutil.cpu_percent(),3)],
                'cpu_memory_util_amount': [getMB(psutil.virtual_memory().used)],
                'decoder_time':[],
                'done': False}
    # Initialize the thread for memory monitoring
    mode =  "onnx" if export_onnx else "pytorch"
    memory_thread = threading.Thread(target=monitor_memory, args=(device, result))
    memory_thread.start()
    model_time_track = []
    decoder_time_track = []
    # Inference
    if export_onnx:
        for _ in range(5):
            # Perform inference 10 times for more accurate timing
            start_time = time.time()
            logits = model.run(inputs[0], inputs[1] )
            model_time_track.append(time.time() - start_time)
            predicted_ids = logits[0].argmax(axis=-1)
            decoder_start_time  = time.time()
            predicted_sentences = processor.batch_decode(predicted_ids)
            decoder_time_track.append(time.time() - decoder_start_time)

    else:
        inputs, masks = inputs 
        for _ in range(5): 
            with torch.no_grad():
                # Perform inference 10 times for more accurate timing
                mdoel_start_time = time.time()
                logits = model(inputs, attention_mask =masks).logits
                model_time_track.append(time.time() - mdoel_start_time)
                decoder_start_time  = time.time()
                predicted_ids = torch.argmax(logits, dim=-1)
                predicted_sentences = processor.batch_decode(predicted_ids)
                decoder_time_track.append(time.time() -decoder_start_time)
        del masks

    result['outputs'] = predicted_sentences
    result["filenames"] = fileNames

    # Stop memory monitoring thread
    result
    result['done'] = True
    del model
    del inputs
    memory_thread.join()
    model_time, decoder_time = round(np.median(model_time_track[2:]),4) ,round(np.median(decoder_time_track),4) 

    # Calculate time and save results
    result = {key: value for key,value  in result.items() if isinstance(value, list) and len(value) > 1}
    result = manage_dictionary_lengths(result)
    overall_stats =  [
                                      round(interval*np.sum(result['cpu_util_percent']),2)
                                    , round(interval*np.sum(result['gpu_memory_used']),1)
                                    , round (interval*np.sum(result['gpu_utilization']), 2)
                                    , round(interval*np.sum(result['cpu_memory_util_percent']),1)
                                    , round(interval*np.sum(result['cpu_memory_util_amount']),1 )
                        ]
    folder_name = f'Inference_logs_{mode}_{device}_halved_{halved}_batch_{batch_size}'
    overall_results[folder_name] = [  mode, device, halved, batch_size
                                    , model_time, decoder_time 
                                    ,
overall
_stats[0]
                                    , overall_stats[1]
                                    , overall_stats[2]
                                    , overall_stats[3]
                                    , overall_stats[4]
                                    , round(overall_stats[0] / batch_size, 4)
                                    , round(overall_stats[1] / batch_size, 4)
                                    , round(overall_stats[2] / batch_size, 4)
                                    , round(overall_stats[3] / batch_size, 4)
                                    , round(overall_stats[4] / batch_size, 4)
                                    , round(model_time / batch_size, 5)
                                    , round(decoder_time  / batch_size, 5)
                                ]
    plot_individual_results(result,mode,  device, halved, batch_size, [model_time_track], [decoder_time_track] ,os.getcwd())


# Example usage
if name == "__main__":
    # parser = argparse.ArgumentParser(description="ONNX Inference Benchmark Script")
    # parser.add_argument("--modelDir", type=str, default="../ImanSavedData", help="Location of the model folder")
    # args = parser.parse_args()
    # model_dir = args.modelDir
    test_path = "Samples"
    speech_arrays = []
    fileNames = [f for f in os.listdir(test_path) if f.endswith('.wav') or f.endswith('.mp3')]
    # fileNames = ['11.wav']
    paths = [os.path.join(test_path,f) for f in  fileNames]
    speech_arrays = [librosa.load(path, sr=16_000)[0] for path in paths]
    # Define configurations
    # devices = ['cuda'] if torch.cuda.is_available() else ['cpu']
    devices = ['cuda']#,'cpu']
    halved_options = [ False]#True, 
    batch_sizes = [1,5,11]  # Adjust based on your needs
    model_dir = "./model/"
    # Perform inference for different configurations
    for onnxmode in [ True, False]:#
        for device in devices:
            for halved in halved_options:
                for batch_size in batch_sizes:
                    model_dir = "./model/"
                    model_dir =  model_dir#+'_half' if halved else model_dir
                    processor = Wav2Vec2Processor.from_pretrained(model_dir)
                    inputs = processor(speech_arrays[:batch_size], sampling_rate=16_000, return_tensors="pt", padding="max_length",max_length =240320, truncation= True  )
                    # Load and prepare the model
                    halved = False if device=='cpu' else halved
                    model, inputs = load_and_prepare_model_inputs(model_dir, inputs, export_onnx=onnxmode, halved=halved, device=device)
                    # Perform inference
                    fileNamesTemp = fileNames[:batch_size]
                    inference(model, [fileNamesTemp , inputs], device, halved, batch_size, export_onnx=onnxmode)

    custom_index = [
               "mode","device", "halved", "batch-size" , "model-time" , "decoder-time" , 
               "cpu-util-percent", "gpu-memory-used", "gpu-utilization", 
               "cpu-memory-util-percent", "cpu-memory-util-amount",
               'cpu-util-percent-batch-size-gain', "gpu-memory-used-batch-size-gain" , "gpu-utilization-batch-size-gain",
               "cpu-memory-util-percent-batch-size-gain", "cpu-memory-util-amount-batch-size-gain",
               "model-time-batch-size-gain" ,  "decoder-time-batch-size-gain"   
               ]
    df = pd.DataFrame(overall_results, index=custom_index)
    df.to_excel('overall_logs.xlsx',index=True ) #float_format='%.4f' ,index=True )
    print(f"**** device:{device}-halfed:{halved}-batchsize:{batch_size}, isONNX:{onnxmode}-- Done  *****\n\n")

and Here are the results

smth · January 7, 2024, 1:44pm

The ONNX runtime isn’t always more efficient. It is more efficient for things that the ONNX team optimized for. In the same way, TensorRT is more efficient for many cases where the TRT team optimized for.
Fundamentally, there isn’t a technical advantage to ONNX or to TRT relative to the PyTorch runtime.

alireza_khatami · January 9, 2024, 6:56am

Thank you @smth , Is there any resources out there regarding which architecture or layer blocks are supported for increased efficiency in ONNX?

marksaroufim · January 24, 2024, 6:39am

Generally a good signal is to google for blog posts that the ONNX or TRT teams have published since those will typically contain the results a team is proudest of