Expected resnet50 CPU throughput?

Hi,

I’m fairly new to pytorch so this will probably seem like a silly question, but here we go: I’m curious about the expected throughput of inference on CPUs while using various modes of pytorch. Using one of the pretrained models I benchmarked it on an 8-core ryzen machine with the below script but I’m seeing times that seem rather slow (around ~2.4 seconds for a batch size of 16, plus or minus a bit depending on eager/trace/script mode). Could someone tell me what I’m doing wrong? Much appreciated :slight_smile:

import torchvision
import torch

import pprint

import timeit
from timeit import default_timer as timer

def benchmark_pytorch_cpu(repeat_runs, model, example):
    """Time `repeat_runs` forward passes of `model` on `example`.

    Three untimed warmup iterations run first so one-time costs (lazy
    initialization, TorchScript profiling/optimization passes) do not
    skew the measurements.

    Args:
        repeat_runs: number of timed iterations.
        model: callable taking `example` (e.g. an nn.Module in eval mode).
        example: input tensor fed to the model.

    Returns:
        List of per-iteration wall-clock times in seconds.
    """
    results = []

    # Fix: the warmup previously ran OUTSIDE torch.no_grad(), building
    # autograd graphs that are never used and not matching the timed path.
    # Wrapping both loops once also avoids re-entering the context per run.
    with torch.no_grad():
        for _ in range(3):
            model(example)

        for _ in range(repeat_runs):
            start = timer()
            model(example)
            end = timer()
            results.append(end - start)

    return results

def pytorch_eager_mode(config):
    """Benchmark a pretrained ResNet-50 on CPU in plain (eager) mode.

    Reads 'repeat_runs' and 'batch_size' from `config` and returns the
    list of per-run times produced by benchmark_pytorch_cpu.
    """
    dummy_input = torch.rand(config['batch_size'], 3, 224, 224)

    net = torchvision.models.resnet50(pretrained=True)
    net.eval()

    return benchmark_pytorch_cpu(config['repeat_runs'], net, dummy_input)

def pytorch_script_mode(config):
    """Benchmark a pretrained ResNet-50 compiled via TorchScript scripting.

    Fixes vs. the original:
    - `torch.jit.script` takes only the module; its second positional
      parameter is the deprecated `optimize` flag, NOT an example input,
      so passing `example` there was wrong.
    - `eval()` is called before compilation so inference-mode behavior
      (batchnorm running stats, dropout disabled) is what gets measured.
    """
    repeat_runs = config['repeat_runs']
    example = torch.rand(config['batch_size'], 3, 224, 224)

    model = torchvision.models.resnet50(pretrained=True)
    model.eval()  # switch to inference mode before compiling
    with torch.jit.optimized_execution(True):
        model = torch.jit.script(model)  # script() takes no example inputs
        return benchmark_pytorch_cpu(repeat_runs, model, example)

def pytorch_trace_mode(config):
    """Benchmark a pretrained ResNet-50 compiled via TorchScript tracing.

    Bug fix: `eval()` must be called *before* `torch.jit.trace`.  Tracing
    records the module's behavior in its current mode, so tracing in
    train mode bakes in batch-statistic batchnorm and active dropout;
    calling `eval()` on the traced module afterwards does not undo that.
    """
    repeat_runs = config['repeat_runs']
    example = torch.rand(config['batch_size'], 3, 224, 224)

    model = torchvision.models.resnet50(pretrained=True)
    model.eval()  # must precede trace: the trace captures the current mode
    with torch.jit.optimized_execution(True):
        model = torch.jit.trace(model, example)
        return benchmark_pytorch_cpu(repeat_runs, model, example)

# Shared benchmark configuration for all three execution modes.
config = {
    'repeat_runs': 30,
    'batch_size': 16,
    'mode': 'all',
}

# Run eager, trace, and script benchmarks (in that order) and print the
# per-run timings for comparison.
result = {}
result['eager_mode'] = pytorch_eager_mode(config)
result['trace_mode'] = pytorch_trace_mode(config)
result['script_mode'] = pytorch_script_mode(config)
pprint.pprint(result)

I think the times are ok for a ResNet50 running in cpu.

edit: I ran your code on an AMD FX-8150, and this is the result

{'eager_mode': [3.1236065919983957,
                2.8740777429993614,
                3.215031881998584,
                3.2408665209986793,
                3.0817273770007887,
                3.182126468000206,
                3.217468533999636,
                3.0495879129994137,
                3.2823124830028974,
                2.98734099399735,
                3.04414951400031,
                3.1191899989971716,
                3.2561472420020436,
                2.989001477999409,
                2.8834749839988945,
                3.2728907790005906,
                2.7947310479976295,
                2.7229248879993975,
                3.1714364620020206,
                3.187919829000748,
                2.7909267199975147,
                2.7637479750010243,
                3.2621708949991444,
                3.2539125790026446,
                2.758795009998721,
                3.099266881999938,
                3.2731154520006385,
                3.321376304000296,
                3.2308814560019528,
                3.0906896560009045],
 'script_mode': [5.663022714001272,
                 3.1545001340018644,
                 3.20035366799857,
                 3.2082225320009456,
                 3.0376815849995182,
                 3.23846422600036,
                 3.1248215429986885,
                 3.2207794620007917,
                 3.1486177900005714,
                 2.992629713000497,
                 3.273916879999888,
                 3.2852682630000345,
                 2.965044250999199,
                 3.24503594899943,
                 2.7061276380009076,
                 3.1414647249985137,
                 3.2591017490012746,
                 3.071839774002001,
                 3.269749651000893,
                 3.2680227549972187,
                 2.9813807400023506,
                 3.1570755319989985,
                 3.2570094219991006,
                 3.2274487190006766,
                 3.2254650680006307,
                 3.1624261510005454,
                 3.307589273001213,
                 3.277479943997605,
                 3.0594900569994934,
                 3.168268078003166],
 'trace_mode': [3.331946746002359,
                3.454950250998081,
                3.384993150997616,
                3.2832865930031403,
                3.296919559001253,
                3.3446977690000494,
                3.334649086002173,
                3.267488012003014,
                3.3727113850000023,
                3.276030206998257,
                3.2170279210004082,
                3.447721204996924,
                3.373897843001032,
                3.3251263899983314,
                3.363704613002483,
                3.4962456700013718,
                3.459224797999923,
                3.4048159969970584,
                3.380363905002014,
                3.399266141001135,
                3.4834778490003373,
                3.366177444000641,
                3.4348102459989605,
                3.3493297409986553,
                3.402436749998742,
                3.3380109899990202,
                3.4338676759980444,
                3.3019120470016787,
                3.317763601000479,
                3.3335056399992027]}
1 Like

Great, thank you for the sanity check!

1 Like

I’m using an Intel(R) Core(TM) i7-5820K CPU @ 3.30GHz and get these results:

{'eager_mode': [0.5560707160038874,
                0.542530203005299,
                0.5525323470355943,
                0.5797108679544181,
                0.6222631660057232,
                0.5825122060487047,
                0.5474026179872453,
                0.5619657889474183,
                0.6081688110716641,
                0.5623527530115098,
                0.5286695619579405,
                0.53724156296812,
                0.5400190929649398,
                0.5279522869968787,
                0.5264217259828001,
                0.537107459967956,
                0.5290657769655809,
                0.5312061479780823,
                0.6145813700277358,
                0.6153471870347857,
                0.5620701040606946,
                0.5770812450209633,
                0.5714077320881188,
                0.5512035259744152,
                0.5524422349408269,
                0.5311381409410387,
                0.5539593469584361,
                0.5258807480568066,
                0.5210533021017909,
                0.5369119530078024],
 'script_mode': [0.5929233940551057,
                 0.5274949680315331,
                 0.5209233119385317,
                 0.622672584024258,
                 0.6760230989893898,
                 0.5266846240265295,
                 0.5286901260260493,
                 0.5306539159500971,
                 0.5264731040224433,
                 0.5290824898984283,
                 0.5209352619713172,
                 0.51720414403826,
                 0.5615924299927428,
                 0.5254444689489901,
                 0.5163348630303517,
                 0.5389109910465777,
                 0.6820635860785842,
                 0.7724095960147679,
                 0.5444801079574972,
                 0.5194973460165784,
                 0.5303034379612654,
                 0.5160098229534924,
                 0.52997019700706,
                 0.5386869909707457,
                 0.5330231039552018,
                 0.660333115956746,
                 0.7001192759489641,
                 0.5578745729289949,
                 0.5785470649134368,
                 0.5504308270756155],
 'trace_mode': [0.754324565990828,
                0.739053672994487,
                0.7153798349900171,
                0.5243895800085738,
                0.5526426800061017,
                0.5383491329848766,
                0.5282454779371619,
                0.5380610059946775,
                0.5243045300012454,
                0.5218377009732649,
                0.6819699059706181,
                0.7381546709220856,
                0.7484542910242453,
                0.7030549009796232,
                0.5683261760277674,
                0.5502430080669001,
                0.5257022560108453,
                0.535970926983282,
                0.5225016670301557,
                0.523550137062557,
                0.535585105069913,
                0.5230692230397835,
                0.5247276799054816,
                0.536925571039319,
                0.5437257509911433,
                0.5220493579981849,
                0.6128906130325049,
                0.527433141018264,
                0.545051172026433,
                0.562636021990329]}

Hmm okay that did seem too weird to be true - can I ask if you’re using Linux/what package versions you’re on? I reran the script on an Intel® Core™ i7-8550U CPU @ 1.80 GHz and got similar results as before…

OS: Windows 10
pytorch version: 1.4.0+cpu
torchvision version: 0.5.0+cpu
python version: 3.5.6 (Anaconda)

My results:

{'eager_mode': [2.7745824,
                2.6500037999999995,
                2.6533983,
                2.6689963999999993,
                2.6377159999999993,
                2.6842899000000013,
                2.5568492999999997,
                2.597765599999999,
                2.6008908999999996,
                2.5944234999999978],
 'script_mode': [5.247614200000001,
                 2.586303700000002,
                 2.549111499999995,
                 2.592132399999997,
                 2.5910770000000127,
                 2.729830899999996,
                 2.8766263000000123,
                 2.5445603999999946,
                 2.596778499999999,
                 2.6283513999999997],
 'trace_mode': [3.0458361999999966,
                3.0095516000000018,
                2.896102599999999,
                2.869847600000007,
                2.8926903999999922,
                2.875762299999991,
                2.919233399999996,
                2.9229768999999948,
                2.954907900000009,
                3.0075180000000046]}

I’m on Ubuntu18.04, PyTorch 1.5.0.dev20200208, torchvision 0.6.0a0+e2573a7 and Python 3.6.

I’m not too familiar with the CPU path, but I assume some acceleration might be used, e.g. Intel MKL-DNN.

These are the results for Intel(R) Xeon(R) CPU E5-2698 v4 @ 2.20GHz:

{'eager_mode': [0.41640684998128563,
                0.7120672950986773,
                1.2824941909639165,
                1.3912344899727032,
                0.40768929792102426,
                0.4139971520053223,
                0.38925268698949367,
                0.402484250953421,
                0.3884121150476858,
                0.4785128979710862,
                0.4067832449218258,
                0.545752897975035,
                0.5732599410694093,
                0.4258149970555678,
                0.3968868920346722,
                0.48566950601525605,
                0.43550701497588307,
                0.4440762479789555,
                0.7503554919967428,
                0.38166197296231985,
                0.3821106309769675,
                0.44661023293156177,
                0.38337251101620495,
                0.3819082749541849,
                0.37819257692899555,
                1.1893695519538596,
                1.5388466010335833,
                1.525529143982567,
                0.48465632600709796,
                0.45776070607826114],
'script_mode': [0.4976730769267306,
                 0.43659254198428243,
                 0.773980311001651,
                 0.478324937983416,
                 0.4924765909090638,
                 0.5786028039874509,
                 0.4377919939579442,
                 0.4363677880028263,
                 0.5326749039813876,
                 0.4483696899842471,
                 0.4391987449489534,
                 0.5024611499393359,
                 1.5762734409654513,
                 1.595031026052311,
                 0.8721332249697298,
                 0.450302682002075,
                 0.4429084720322862,
                 0.44220093509647995,
                 0.4409501899499446,
                 0.44036652601789683,
                 0.441320842015557,
                 0.48320439900271595,
                 0.4452905689831823,
                 0.44349693809635937,
                 0.4408123919274658,
                 0.4539205359760672,
                 0.4490093319909647,
                 0.44213455193676054,
                 0.44304988998919725,
                 0.75286894896999],
 'trace_mode': [0.3898515000473708,
                0.3109738149214536,
                0.40837487496901304,
                0.3113917140290141,
                0.30807038000784814,
                0.30801420705392957,
                0.39118307502940297,
                0.3760412859264761,
                0.3090534210205078,
                0.3969404149102047,
                0.30979375808965415,
                0.31236658501438797,
                0.31614084099419415,
                0.3127995830727741,
                0.31163546594325453,
                0.34516514698043466,
                0.31197512603830546,
                0.3150515320012346,
                0.31840068206656724,
                0.324663263047114,
                0.31104419799521565,
                0.35024457494728267,
                0.31061191798653454,
                0.311683111009188,
                0.31441817595623434,
                0.31754313490819186,
                0.3199888019589707,
                0.3113331060158089,
                0.3175580579554662,
                0.449888464063406]}

Could you check out the latest nightly binaries, as (if I’m not mistaken) MKL-DNN was recently activated for Windows.

1 Like

Thank you for your help! I’m inclined to believe it might be an OS specific difference given all of this - I updated to a set of recent nightly build packages (torch 1.5.0.dev20200301+cpu, torchvision 0.6.0.dev20200302+cpu) and while I am still seeing better throughput, it’s still not close (run on a Intel(R) Xeon(R) CPU E5-1650 v4 @ 3.50 GHz):

{'eager_mode': [1.3861324000000002,
                1.3435306,
                1.3823757999999997,
                1.3703496,
                1.3936329,
                1.4234957000000001,
                1.3723590000000012,
                1.3078323000000012,
                1.4537896999999997,
                1.2923495999999997],
 'script_mode': [1.4593779999999938,
                 1.3242908,
                 1.3686729,
                 1.3443377999999981,
                 1.3476385000000022,
                 1.3000840999999994,
                 1.3655263999999931,
                 1.3246092999999917,
                 1.365785399999993,
                 1.3713202999999936],
 'trace_mode': [1.5728131000000012,
                1.5735659999999996,
                1.5698326999999992,
                1.544288299999998,
                1.5589613,
                1.5596863000000027,
                1.5536323000000039,
                1.568505100000003,
                1.6192179000000024,
                1.593777799999998]}