VGG16 faster than MobileNetV2 at batch size 1 inference (PyTorch 1.4.0, CUDA 10.0)

I used PyTorch to measure the inference time of VGG16 and MobileNetV2, with a batch size of 1.

With batch size 1, my VGG16 result is Avg time: 0.0051 s (197.2 fps).

The MobileNetV2 result is Avg time: 0.0063 s (157.8 fps).

I wonder why MobileNetV2 is slower than VGG16 at batch size 1, even though it has far fewer FLOPs.

With batch size 12, VGG16 gives Avg time: 0.0349 s (28.7 fps),

while MobileNetV2 gives Avg time: 0.0064 s (156.7 fps).
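
Converted to images per second (the fps my script prints is 1/dt, i.e. batches per second, so per-image throughput is batch_size/dt):

batch = 12
vgg_dt, mob_dt = 0.0349, 0.0064        # avg s/batch from the runs above
print(f"VGG16:       {batch / vgg_dt:.0f} img/s")   # ~344 img/s
print(f"MobileNetV2: {batch / mob_dt:.0f} img/s")   # ~1875 img/s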

Is there something wrong with my code? Why is VGG16 so fast, or MobileNetV2 so slow? Or is batch size 1 simply not suitable for evaluating runtime?
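
To check whether the gap is just a batch size artifact, I am thinking of sweeping batch sizes and comparing per-image throughput. A minimal sketch (same warm-up and synchronize pattern as my script below; both models taken from torchvision):

import time
import torch
from torchvision import models

def throughput(m, batchsize, shape=224, iters=100):
    # Average s/batch over `iters` timed runs after a 10-run warm-up.
    m.eval()
    x = torch.rand(batchsize, 3, shape, shape).to('cuda')
    with torch.no_grad():
        for _ in range(10):           # warm-up: CUDA context, cuDNN plans
            m(x)
        torch.cuda.synchronize()
        t0 = time.time()
        for _ in range(iters):
            m(x)
        torch.cuda.synchronize()      # wait for all kernels to finish
    dt = (time.time() - t0) / iters
    print(f"bs={batchsize:3d}: {dt:.4f} s/batch, {batchsize / dt:6.0f} img/s")

for name in ('vgg16', 'mobilenet_v2'):
    print(name)
    net = getattr(models, name)(pretrained=True).to('cuda')
    for bs in (1, 2, 4, 8, 16, 32):
        throughput(net, bs)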

Thanks.

My environment:

Python 3.6.10
PyTorch 1.4.0
torchvision 0.5.0
CUDA 10.0.130
cuDNN 7603 (i.e. 7.6.3)
GPU: RTX 2080 Ti
CPU: Intel® Xeon® E3-1231 v3 @ 3.40GHz

Here is my code:

import argparse
import time

import torch
from torchvision import models

if __name__ == "__main__":
    device = torch.device('cuda')
    # Fix cuDNN algorithm selection so timings are reproducible
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    parser = argparse.ArgumentParser()
    parser.add_argument('--backbone', help='the backbone model', type=str, default="Mobile_v2")
    args = parser.parse_args()

    shape_r, shape_c = 224, 224
    backbone_name = args.backbone
    model_name = f'{backbone_name}_pre'
    print(model_name)

    if backbone_name == 'VGG16':
        backbone = models.vgg16(pretrained=True).to(device)
    elif backbone_name == 'Mobile_v2':
        backbone = torch.hub.load('pytorch/vision:v0.5.0', 'mobilenet_v2', pretrained=True).to(device)
    else:
        raise ValueError(f'unknown backbone: {backbone_name}')
    # Time it: average latency over 100 timed runs after 10 warm-up runs
    def time_cal2(m):
        times = []
        batchsize = 1
        m.eval()
        with torch.no_grad():
            # Warm-up: the first iterations pay for CUDA context / cuDNN setup
            for i in range(10):
                x_i = torch.rand(batchsize, 3, shape_r, shape_c).to(device)
                y = m(x_i)
            print(y.size())
            for i in range(100):
                x_i = torch.rand(batchsize, 3, shape_r, shape_c).to(device)
                torch.cuda.synchronize()  # input transfer done before timing starts
                t0 = time.time()
                y = m(x_i)
                torch.cuda.synchronize()  # wait for the forward pass to finish
                times.append(time.time() - t0)
        dt = sum(times) / len(times)
        print(f"Avg time: {dt:.4f} s ({1 / dt:.1f} fps)")
        return dt

    print(backbone_name)
    time_cal2(backbone)
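
I run the script like this (benchmark.py is just my local filename):

python benchmark.py --backbone VGG16
python benchmark.py --backbone Mobile_v2

If per-operator times would help diagnose this, I could also wrap a forward pass in the autograd profiler (available in PyTorch 1.4); a sketch of what I would add at the end of the script:

    with torch.no_grad():
        with torch.autograd.profiler.profile(use_cuda=True) as prof:
            backbone(torch.rand(1, 3, shape_r, shape_c).to(device))
    print(prof.key_averages().table(sort_by="cuda_time_total"))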