I am trying to increase the inference rate for a pre-trained network. The code for the inference is as follows:
import argparse
import torch
import skimage.transform
from skimage.io import imsave
import torchvision
from PIL import Image
import imageio
import torch.optim
import RedNet_model
from utils import utils
from utils.utils import load_ckpt
from torch import nn
import numpy as np
import os
import glob
from torch.utils.data import DataLoader
import RedNet_data
import time
import torch.backends.cudnn as cudnn
parser = argparse.ArgumentParser(description='Semantic Segmentation')
parser.add_argument('--data-dir', default=None, metavar='DIR',
                    help='path to Data Directory')
parser.add_argument('-o', '--output', default='', metavar='DIR',
                    help='path to output')
parser.add_argument('--cuda', action='store_true', default=False,
                    help='enables CUDA training')
parser.add_argument('--last-ckpt', default='', type=str, metavar='PATH',
                    help='path to latest checkpoint (default: none)')
parser.add_argument('-b', '--batch-size', default=10, type=int,
                    metavar='N', help='mini-batch size (default: 10)')
args = parser.parse_args()
device = torch.device("cuda:0" if args.cuda and torch.cuda.is_available() else "cpu")
image_w = 640
image_h = 480
def inference():
    test_data = RedNet_data.InferenceData(phase_train=False, data_dir=args.data_dir)
    test_loader = DataLoader(test_data, batch_size=args.batch_size, shuffle=False, num_workers=1, pin_memory=True)
    num_test = len(test_data)
    model = RedNet_model.RedNet(pretrained=False)
    model = nn.DataParallel(model)  # needed because the checkpoint was saved from multi-GPU training
    load_ckpt(model, None, args.last_ckpt, device)
    model.eval()
    model.to(device)
    cudnn.benchmark = True
    start = time.time()
    torch.set_grad_enabled(False)  # a bare torch.no_grad() call has no effect; disable autograd explicitly
    for batch_idx, (sample, idx, img_paths, depth_paths) in enumerate(test_loader):
        image = sample['image'].numpy()
        depth = sample['depth'].numpy()
        if batch_idx % 1 == 0:
            print('No. of Batches Done: [{0}/{1}]\t'.format(batch_idx, len(test_loader)))
        i = 0
        for im, d in zip(image, depth):
            # Bilinear resize for the RGB image
            im = skimage.transform.resize(im, (image_h, image_w), order=1,
                                          mode='reflect', preserve_range=True)
            # Nearest-neighbor resize for the depth map
            d = skimage.transform.resize(d, (image_h, image_w), order=0,
                                         mode='reflect', preserve_range=True)
            fileName1 = os.path.basename(img_paths[i])
            im = im / 255
            im = torch.from_numpy(im).float()
            d = torch.from_numpy(d).float()
            im = im.permute(2, 0, 1)
            d.unsqueeze_(0)
            im = torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])(im)
            d = torchvision.transforms.Normalize(mean=[19050], std=[9650])(d)
            im = im.to(device).unsqueeze_(0)
            d = d.to(device).unsqueeze_(0)
            pred = model(im, d)
            output = utils.color_label(torch.max(pred, 1)[1] + 1)[0]
            # move the result back to the CPU before converting to numpy
            imageio.imsave(args.output + fileName1, output.cpu().numpy().transpose((1, 2, 0)))
            i += 1
    end = time.time()
    elapsed = end - start
    print("Time elapsed in seconds:", elapsed)
    print("Inference Rate (Images per second): ", num_test / elapsed)
if __name__ == '__main__':
    inference()
Now, I noticed something weird when running this inference on the cloud (4× Tesla V100 GPUs) and on my local machine (1× Nvidia GTX 1080 Ti): it actually runs faster on my local machine! (4 images/sec on the cloud versus 6.6 images/sec locally.) I am still new to deep learning and I can't see how this could be possible.

A follow-up question: if I load a batch of images via the DataLoader class, shouldn't the model be able to process all of those images simultaneously?

And lastly, how can I optimize this further? I need to improve the inference rate by almost 200% so that I can perform the inference in real time. Thanks.
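To make the follow-up question concrete, this is roughly what I imagined "processing the whole batch at once" would look like: preprocess each image/depth pair, stack the results into batch tensors, and call the model once per batch instead of once per image. This is only a sketch (run_batch is a made-up helper, it reuses the normalization transforms from the script above, and it assumes the images and depth maps have already been resized to image_h × image_w); I haven't verified it against RedNet.

import torch
import torchvision

# Same normalization transforms as in the per-image loop above
rgb_norm = torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
depth_norm = torchvision.transforms.Normalize(mean=[19050], std=[9650])

def run_batch(model, images, depths, device):
    # images: iterable of HxWx3 numpy arrays, depths: iterable of HxW numpy arrays
    ims, ds = [], []
    for im, d in zip(images, depths):
        im = torch.from_numpy(im).float().permute(2, 0, 1) / 255
        ims.append(rgb_norm(im))
        ds.append(depth_norm(torch.from_numpy(d).float().unsqueeze(0)))
    im_batch = torch.stack(ims).to(device)   # (N, 3, H, W)
    d_batch = torch.stack(ds).to(device)     # (N, 1, H, W)
    with torch.no_grad():
        pred = model(im_batch, d_batch)      # one forward pass for the whole batch
    return torch.max(pred, 1)[1]             # per-pixel class indices, shape (N, H, W)

Is something along these lines what the DataLoader batching is supposed to enable, or does the per-image loop I have now already get the same benefit?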