ResNext is performing weaker than Vgg-16 on CIFAR10, is this normal?

Hi everyone, as a beginner in deep learning, I am trying to recover classical neural networks manually.

Since imagenet is too large, I chose the smaller CIFAR10, I take the designed model from torchvision.models and modify its final fully connected layer to fit the output.

First I tried alexnet and achieved 70.5% classification accuracy after convergence. Then I tried vgg16(in the order of models being published), and gratifyingly it improved the accuracy to 84.5%. I tried migration learning using torchvision’s pre-trained model, which increased the accuracy by an additional two percentage points.

Finally I tried ResNext, which I think is a relatively newer network and should yield better results. But after 20 epochs of training, it converged to 62% accuracy, even worse than Alexnet.

I would like to know what caused this and if it was due to my use error. Here’s the full code:

import os,sys,time,pickle,random
import matplotlib.pyplot as plt
import numpy as np 
import torch
from torch import nn
from torchvision import datasets, models
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torchvision.transforms import ToTensor, Lambda, Resize, Compose, InterpolationMode

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using {} device".format(device))
torch.backends.cudnn.benchmark=True

# Download training data from open datasets.
training_data = datasets.CIFAR10(
    root=".\\data\\cifar10",
    train=True,
    download=True,
    transform=Compose([
        Resize((64, 64), InterpolationMode.BICUBIC),
        ToTensor()
    ])
)

# Download test data from open datasets.
test_data = datasets.CIFAR10(
    root=".\\data\\cifar10",
    train=False,
    download=True,
    transform=Compose([
        Resize((64, 64), InterpolationMode.BICUBIC),
        ToTensor()
    ])
)

def imshow(training_data):
    labels_map = {
        0: "plane",
        1: "car",
        2: "bird",
        3: "cat",
        4: "deer",
        5: "dog",
        6: "frog",
        7: "horse",
        8: "ship",
        9: "truck",
    }
    cols, rows = 3, 3
    figure = plt.figure(figsize=(8,8))
    for i in range(1, cols * rows + 1):
        sample_idx = torch.randint(len(training_data), size=(1,)).item()
        img, label = training_data[sample_idx]
        img = img.swapaxes(0,1)
        img = img.swapaxes(1,2) 
        figure.add_subplot(rows, cols, i)
        plt.title(labels_map[label])
        plt.axis("off")
        plt.imshow(img)
    plt.show()

# imshow(training_data)

def train_loop(dataloader, net, loss_fn, optimizer):
    size = len(dataloader)
    train_loss = 0
    for batch_idx, (X, tag) in enumerate(dataloader):
        X, tag = X.to(device), tag.to(device)
        pred = net(X)
        loss = loss_fn(pred, tag)
        train_loss += loss.item()

        # Back propagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    train_loss /= size 
    return train_loss

def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    return test_loss, correct

net = models.resnext50_32x4d().to(device)
net.fc = nn.Linear(2048, 10).to(device)

learning_rate = 0.01
batch_size = 128
weight_decay = 0.0005

train_dataloader = DataLoader(training_data, batch_size = batch_size)
test_dataloader = DataLoader(test_data, batch_size = batch_size)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), lr = learning_rate)

epochs = 20
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    st_time = time.time()
    train_loss = train_loop(train_dataloader, net, loss_fn, optimizer)
    test_loss, correct = test_loop(test_dataloader, net, loss_fn)
    print(f"Train loss: {train_loss:>8f}, Test loss: {test_loss:>8f}, Accuracy: {(100*correct):>0.1f}%, Epoch time: {time.time() - st_time:.2f}s\n")
print("Done!")
torch.save(net.state_dict(), 'resnext1-50_32x4d.model')

I don’t know how well models pretrained on ImageNet would generalize to CIFAR10. Given that the resolution of the images is different, I would expect to see some gaps. Also note that you are not normalizing the images to a zero mean and a unit variance (which is done during the ImageNet pretraining), so you might want to add it and see if this could improve the results.