Hello all,
I am using a TorchVision model with pre-trained weights and checking its accuracy on the original ImageNet validation set (downloaded from Kaggle). I am also using the latest API for reproducibility.
The problem is that I am not getting exactly the same accuracy as reported in the documentation (see "Models and pre-trained weights — Torchvision main documentation" or "Introducing TorchVision’s New Multi-Weight Support API | PyTorch").
The code is pretty straightforward:
import os
from torch.utils.data import Dataset
from PIL import Image
import json
from torch.utils.data import DataLoader
from torchvision import transforms
import torch
import torchvision
from tqdm import tqdm
FOLDER = "/home/Documents/ImageNet"
class ImageNetKaggle(Dataset):
    """Map-style dataset over an ImageNet folder laid out like the Kaggle download.

    Files read from ``root``:

    * ``imagenet_class_index.json``: maps a class index (as a string) to a
      list whose first element is the synset id.
    * ``ILSVRC2012_val_labels.json``: maps a validation file name to its
      synset id.
    * ``ILSVRC/Data/CLS-LOC/<split>/``: the images. For ``"train"`` each
      entry is a synset folder containing images; for ``"val"`` each entry
      is an image file.

    Args:
        root: Directory containing the files listed above.
        split: Either ``"train"`` or ``"val"``.
        transform: Optional callable applied to each RGB ``PIL.Image``.

    Raises:
        ValueError: If ``split`` is neither ``"train"`` nor ``"val"``.
    """

    _SPLITS = ("train", "val")

    def __init__(self, root, split, transform=None):
        # Fail early: an unsupported split would otherwise produce a silently
        # empty dataset whenever its directory happens to exist.
        if split not in self._SPLITS:
            raise ValueError(
                f"Unsupported split {split!r}; expected one of {self._SPLITS}"
            )

        self.samples = []
        self.targets = []
        self.transform = transform

        # synset id -> integer class index
        self.syn_to_class = {}
        with open(os.path.join(root, "imagenet_class_index.json"), "rb") as f:
            json_file = json.load(f)
        for class_id, v in json_file.items():
            self.syn_to_class[v[0]] = int(class_id)

        # validation file name -> synset id
        with open(os.path.join(root, "ILSVRC2012_val_labels.json"), "rb") as f:
            self.val_to_syn = json.load(f)

        samples_dir = os.path.join(root, "ILSVRC/Data/CLS-LOC", split)
        # os.listdir returns entries in arbitrary order; sorting makes the
        # sample order (and therefore every index) reproducible.
        for entry in sorted(os.listdir(samples_dir)):
            if split == "train":
                # Each entry is a synset folder holding that class's images.
                target = self.syn_to_class[entry]
                syn_folder = os.path.join(samples_dir, entry)
                for sample in sorted(os.listdir(syn_folder)):
                    self.samples.append(os.path.join(syn_folder, sample))
                    self.targets.append(target)
            else:
                # split == "val": each entry is an image file whose label
                # comes from the validation label mapping.
                target = self.syn_to_class[self.val_to_syn[entry]]
                self.samples.append(os.path.join(samples_dir, entry))
                self.targets.append(target)

    def __len__(self):
        """Return the number of samples in the selected split."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(image, target)`` for the sample at position ``idx``.

        The image is loaded as RGB and passed through ``transform`` when one
        was supplied; ``target`` is the integer class index.
        """
        # The context manager closes the underlying file promptly instead of
        # leaving it open until garbage collection; convert() returns a new,
        # fully loaded image that does not depend on the file.
        with Image.open(self.samples[idx]) as img:
            x = img.convert("RGB")
        if self.transform is not None:
            x = self.transform(x)
        return x, self.targets[idx]
# Pre-trained ResNet50 in evaluation mode on the GPU.
weights = torchvision.models.ResNet50_Weights.DEFAULT  # or IMAGENET1K_V2
model = torchvision.models.resnet50(weights=weights)
model.eval().cuda()  # Needs CUDA, don't bother on CPUs

# Validation split, preprocessed with the transforms bundled with the weights.
dataset = ImageNetKaggle(FOLDER, "val", weights.transforms())
dataloader = DataLoader(
    dataset,
    batch_size=64,  # may need to reduce this depending on your GPU
    num_workers=8,  # may need to reduce this depending on your num of CPUs and RAM
    shuffle=False,
    drop_last=False,
    pin_memory=True,
)

# Run inference, counting top-1 and top-K hits over the whole loader.
K = 5
top1 = 0
top5 = 0
total = 0
with torch.no_grad():
    for images, labels in tqdm(dataloader):
        labels = labels.cuda()
        scores = model(images.cuda())

        # Top-1: the highest-scoring class must equal the label.
        top1 += (scores.argmax(dim=1) == labels).sum().item()

        # Top-K: the label must appear among the K highest-scoring classes.
        # Broadcasting compares each sample's K candidates with its own label.
        topk_classes = scores.topk(k=K, dim=1).indices
        top5 += (topk_classes == labels.unsqueeze(dim=1)).sum().item()

        total += labels.size(0)

print(f"Acc@1: {100 * top1 / total:.3f}% Acc@5: {100 * top5 / total:.3f}%")
However, the accuracies are slightly different from those indicated in the official documentation, which are 80.858% top-1 and 95.434% top-5 for this ResNet50 model.
$ python inference.py
y
100%|█████████████████████████████████████| 782/782 [01:51<00:00, 7.02it/s]
Acc@1: 80.854% Acc@5: 95.438%
Is this any kind of issue, or am I doing something wrong? Could this just depend on the specific hardware that runs the inference?
I get the same accuracy values across multiple executions, and I have observed the same behavior for other models such as GoogLeNet.
I am using PyTorch 1.13.1+cu116 and TorchVision 0.14.1+cu116.