Resnet50 gets only 93% accuracy on MNIST

Hi, I recently trained a ResNet50 on MNIST, but I only reached 93% accuracy, while other results on the web report around 99%. Could anyone check my code to see whether there is anything wrong? Thank you very much.

Here is my code:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory

# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk("/kaggle/input"):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"

# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


from pdb import set_trace

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from sklearn.metrics import confusion_matrix
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader


# Use the first CUDA GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


# NOTE(review): this section was truncated in the post -- the Compose list was
# never closed and the `transform=` / loader arguments were missing. The values
# below are a reconstruction; confirm them against the original notebook.

# Light augmentation for training: small random rotations and shifts, then
# conversion of the PIL image to a float tensor.
train_augment_transforms = transforms.Compose([
    transforms.RandomAffine(degrees=10, translate=(0.1, 0.1)),
    transforms.ToTensor(),
])

# Evaluation data is only converted to a tensor, never augmented.
test_transforms = transforms.Compose([
    transforms.ToTensor(),
])

# To run on FashionMNIST instead, replace torchvision.datasets.MNIST with
# torchvision.datasets.FashionMNIST in the two lines below.
train_set = torchvision.datasets.MNIST("./data", download=True, train=True, transform=train_augment_transforms)
test_set = torchvision.datasets.MNIST("./data", download=True, train=False, transform=test_transforms)

# Assumption: the original batch size is not visible in the post -- TODO confirm.
BATCH_SIZE = 100

# Shuffle the training data every epoch so consecutive mini-batches are not
# drawn in a fixed order; the evaluation order does not matter.
train_loader = torch.utils.data.DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False)


def output_label(label):
    """Return the class name for a numeric label.

    The mapping holds the FashionMNIST class names (the script was adapted
    from a FashionMNIST example); for MNIST digits the label is already the
    digit itself.

    Args:
        label: An ``int`` in ``0..9`` or a single-element ``torch.Tensor``.

    Returns:
        The class name as a string.

    Raises:
        KeyError: If the label is not one of the ten known classes.
    """
    output_mapping = {
        0: "T-shirt/Top",
        1: "Trouser",
        2: "Pullover",
        3: "Dress",
        4: "Coat",
        5: "Sandal",
        6: "Shirt",
        7: "Sneaker",
        8: "Bag",
        9: "Ankle Boot",
    }
    # Tensors are not usable as dict keys for this lookup; unwrap to a Python int.
    key = label.item() if isinstance(label, torch.Tensor) else label
    return output_mapping[key]

# Optional visual sanity check of one batch, kept disabled. Every line stays
# commented out -- including the loop body, which would otherwise run at
# module level and fail because `label` is undefined.
#demo_loader = torch.utils.data.DataLoader(train_set, batch_size=10)
#batch = next(iter(demo_loader))
#images, labels = batch

#grid = torchvision.utils.make_grid(images, nrow=10)

#plt.figure(figsize=(15, 20))
#plt.imshow(np.transpose(grid, (1, 2, 0)))
#print("labels: ", end=" ")
#for i, label in enumerate(labels):
#    print(output_label(label), end=", ")


def plot_grad_flow(named_parameters):
    """Plot the mean absolute gradient of each trainable, non-bias parameter.

    Intended to be called after ``loss.backward()``, e.g.
    ``plot_grad_flow(model.named_parameters())``, to spot layers whose
    gradients vanish.

    Args:
        named_parameters: Iterable of ``(name, parameter)`` pairs.
    """
    ave_grads = []
    layers = []
    for n, p in named_parameters:
        # Skip parameters that have not received a gradient yet: there is
        # nothing to plot for them and ``p.grad`` would be None.
        if p.requires_grad and "bias" not in n and p.grad is not None:
            layers.append(n)
            # .item() yields a plain float so plotting also works for
            # parameters that live on the GPU.
            ave_grads.append(p.grad.abs().mean().item())
    plt.plot(ave_grads, alpha=0.3, color="b")
    plt.hlines(0, 0, len(ave_grads) + 1, linewidth=1, color="k")
    plt.xticks(range(0, len(ave_grads), 1), layers, rotation="vertical")
    plt.xlim(left=0, right=len(ave_grads))
    plt.ylabel("average gradient")
    plt.title("Gradient flow")


class BasicConv2d(nn.Module):
    """Convolution -> batch norm -> ReLU -> 2x2 max-pool building block.

    Args:
        input_channels: Number of channels of the incoming feature map.
        output_channels: Number of channels produced by the convolution.
        **kwargs: Forwarded to ``nn.Conv2d`` (e.g. ``kernel_size``, ``padding``).
    """

    def __init__(self, input_channels, output_channels, **kwargs):
        # nn.Module must be initialised before sub-modules can be registered.
        super().__init__()
        # The conv bias is disabled because the batch norm that follows has
        # its own learnable shift.
        self.conv = nn.Conv2d(input_channels, output_channels, bias=False, **kwargs)
        self.bn = nn.BatchNorm2d(output_channels)
        self.relu = nn.ReLU(inplace=True)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

    def forward(self, x):
        """Apply the block; the pooling halves the spatial resolution."""
        x = self.conv(x)
        x = self.bn(x)
        x = self.relu(x)
        x = self.pool(x)

        return x

class Flatten(nn.Module):
    """Collapse every dimension after the first (batch) dimension into one."""

    def forward(self, input):
        """Return ``input`` reshaped to ``(input.size(0), -1)``."""
        # reshape (rather than view) also accepts non-contiguous tensors,
        # e.g. the result of a transpose.
        return input.reshape(input.size(0), -1)

def multi_acc(y_pred, y_test):
    """Return the classification accuracy of a batch as a rounded percentage.

    Args:
        y_pred: Class scores of shape ``(batch, num_classes)``. A 1-D tensor
            is treated as the scores of a single sample.
        y_test: Integer class labels with one entry per sample.

    Returns:
        A 0-dim tensor holding the accuracy in percent, rounded to the
        nearest integer.
    """
    # A batch of one arrives as a 1-D vector when the caller squeezes the
    # model output; restore the batch dimension so dim=1 is the class axis.
    if y_pred.dim() == 1:
        y_pred = y_pred.unsqueeze(0)
    # The predicted class is the arg-max over the class axis. Softmax is
    # monotonic per row, so it is not needed here -- and normalising over
    # dim=0 (the batch axis) would change which class wins in each row.
    y_pred_tags = torch.argmax(y_pred, dim=1)
    correct_pred = (y_pred_tags == y_test).float()
    acc = correct_pred.sum() / len(correct_pred)
    acc = torch.round(acc * 100)
    return acc

def make_train_step(model, model_name, loss_fn, optimizer):
    """Build a function that performs one optimisation step on a mini-batch.

    Args:
        model: The network to train.
        model_name: When equal to ``"inception_v3"`` the model is expected to
            return ``(main_output, aux_output)`` and the auxiliary loss is
            added with weight 0.4.
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a scalar
            tensor.
        optimizer: Optimizer that updates ``model``'s parameters.

    Returns:
        ``perform_train_step(X_train_batch, y_train_batch)``, which returns
        ``(accuracy, loss)`` for the batch as Python floats.
    """
    def perform_train_step(X_train_batch, y_train_batch):
        # Training mode enables batch-norm statistics updates (and dropout).
        model.train()
        # Gradients accumulate by default, so clear those of the previous step.
        optimizer.zero_grad()

        # Auxiliary-output handling follows the PyTorch tutorial
        # "Finetuning Torchvision Models".
        if model_name == "inception_v3":
            y_train_pred, aux_outputs = model(X_train_batch)
            y_train_pred = y_train_pred.squeeze()
            aux_outputs = aux_outputs.squeeze()
            train_loss_1 = loss_fn(y_train_pred, y_train_batch)
            train_loss_2 = loss_fn(aux_outputs, y_train_batch)
            train_loss = train_loss_1 + 0.4*train_loss_2
        else:
            y_train_pred = model(X_train_batch).squeeze()
            train_loss = loss_fn(y_train_pred, y_train_batch)

        train_acc = multi_acc(y_train_pred, y_train_batch)

        # Back-propagate and update the weights; without these two calls the
        # parameters never change.
        train_loss.backward()
        optimizer.step()
        return (train_acc.item(), train_loss.item())

    return perform_train_step

def make_val_step(model, model_name, loss_fn, optimizer):
    """Build a function that evaluates the model on one mini-batch.

    Args:
        model: The network to evaluate.
        model_name: Unused; kept so the signature mirrors ``make_train_step``.
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a scalar
            tensor.
        optimizer: Unused; kept so the signature mirrors ``make_train_step``.

    Returns:
        ``perform_val_step(X_val_batch, y_val_batch)``, which returns
        ``(accuracy, loss)`` for the batch as Python floats.
    """
    def perform_val_step(X_val_batch, y_val_batch):
        # Evaluate with running batch-norm statistics instead of per-batch
        # ones, then restore whatever mode the model was in so this step has
        # no lasting effect on training.
        was_training = model.training
        model.eval()
        try:
            with torch.no_grad():
                y_val_pred = model(X_val_batch).squeeze()
                val_acc = multi_acc(y_val_pred, y_val_batch)
                val_loss = loss_fn(y_val_pred, y_val_batch)
        finally:
            model.train(was_training)
        return (val_acc.item(), val_loss.item())

    return perform_val_step

def mini_batch(device, data_loader, step_fn):
    """Run ``step_fn`` over every batch of ``data_loader`` and average the results.

    Args:
        device: Device (or device string) each batch is moved to before the step.
        data_loader: Iterable yielding ``(X_batch, y_batch)`` tensor pairs.
        step_fn: Callable ``step_fn(X_batch, y_batch)`` returning
            ``(accuracy, loss)`` for the batch.

    Returns:
        ``(acc, loss)``: the mean accuracy and mean loss over all batches.
    """
    mini_batch_acc_list = []
    mini_batch_loss_list = []
    for (X_batch, y_batch) in data_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        (mini_batch_acc, mini_batch_loss) = step_fn(X_batch, y_batch)
        mini_batch_acc_list.append(mini_batch_acc)
        mini_batch_loss_list.append(mini_batch_loss)

    loss = np.mean(mini_batch_loss_list)
    acc = np.mean(mini_batch_acc_list)

    return (acc, loss)

import torch
import torch.nn as nn
import torch.nn.functional as F

class Bottleneck(nn.Module):
    """Residual bottleneck block: 1x1 -> 3x3 -> 1x1 convolutions plus a shortcut.

    The block outputs ``out_channels * expansion`` channels.

    Args:
        in_channels: Channels of the incoming feature map.
        out_channels: Channels of the two inner convolutions.
        i_downsample: Optional module applied to the shortcut so that its
            shape matches the main branch.
        stride: Stride of the 3x3 convolution.
    """
    expansion = 4

    def __init__(self, in_channels, out_channels, i_downsample=None, stride=1):
        super().__init__()

        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
        self.batch_norm1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        self.batch_norm2 = nn.BatchNorm2d(out_channels)
        self.conv3 = nn.Conv2d(out_channels, out_channels*self.expansion, kernel_size=1, stride=1, padding=0)
        self.batch_norm3 = nn.BatchNorm2d(out_channels*self.expansion)
        self.i_downsample = i_downsample
        self.stride = stride
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``relu(main_branch(x) + shortcut(x))``."""
        identity = x
        x = self.relu(self.batch_norm1(self.conv1(x)))
        x = self.relu(self.batch_norm2(self.conv2(x)))
        x = self.conv3(x)
        x = self.batch_norm3(x)
        # Project the shortcut when its shape differs from the main branch.
        if self.i_downsample is not None:
            identity = self.i_downsample(identity)
        # The residual connection: without this addition the block is a plain
        # stack of convolutions and the shortcut is computed for nothing.
        x = x + identity
        x = self.relu(x)
        return x

class Block(nn.Module):
    """Basic residual block: two 3x3 convolutions plus a shortcut.

    Args:
        in_channels: Channels of the incoming feature map.
        out_channels: Channels produced by both convolutions.
        i_downsample: Optional module applied to the shortcut so that its
            shape matches the main branch.
        stride: Stride of the first convolution.
    """
    expansion = 1

    def __init__(self, in_channels, out_channels, i_downsample=None, stride=1):
        super().__init__()

        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, stride=stride, bias=False)
        self.batch_norm1 = nn.BatchNorm2d(out_channels)
        # Only the first convolution strides. Striding again here would shrink
        # the main branch twice while the shortcut is downsampled once, so the
        # two could not be added.
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1, stride=1, bias=False)
        self.batch_norm2 = nn.BatchNorm2d(out_channels)

        self.i_downsample = i_downsample
        self.stride = stride
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``relu(main_branch(x) + shortcut(x))``."""
        identity = x

        # Each convolution is followed by its own batch norm.
        x = self.relu(self.batch_norm1(self.conv1(x)))
        x = self.batch_norm2(self.conv2(x))

        if self.i_downsample is not None:
            identity = self.i_downsample(identity)
        x = x + identity
        x = self.relu(x)
        return x

class ResNet(nn.Module):
    """ResNet backbone with a linear classification head.

    Args:
        ResBlock: Residual block class. It must expose an ``expansion`` class
            attribute and accept
            ``(in_channels, out_channels, i_downsample=None, stride=1)``.
        layer_list: Number of blocks in each of the four stages.
        num_classes: Size of the output layer.
        num_channels: Channels of the input images.
    """

    def __init__(self, ResBlock, layer_list, num_classes, num_channels=3):
        super().__init__()
        # Channel count fed to the next block; updated by _make_layer.
        self.in_channels = 64

        self.conv1 = nn.Conv2d(num_channels, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.batch_norm1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU()
        self.max_pool = nn.MaxPool2d(kernel_size = 3, stride=2, padding=1)

        self.layer1 = self._make_layer(ResBlock, layer_list[0], planes=64)
        self.layer2 = self._make_layer(ResBlock, layer_list[1], planes=128, stride=2)
        self.layer3 = self._make_layer(ResBlock, layer_list[2], planes=256, stride=2)
        self.layer4 = self._make_layer(ResBlock, layer_list[3], planes=512, stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1,1))
        self.fc = nn.Linear(512*ResBlock.expansion, num_classes)

    def forward(self, x):
        """Return class scores of shape ``(batch, num_classes)``."""
        x = self.relu(self.batch_norm1(self.conv1(x)))
        x = self.max_pool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = x.reshape(x.shape[0], -1)
        x = self.fc(x)
        return x

    def _make_layer(self, ResBlock, blocks, planes, stride=1):
        """Build one stage of ``blocks`` residual blocks with ``planes`` inner channels."""
        ii_downsample = None
        layers = []

        # The shortcut needs a projection whenever the first block changes the
        # spatial size (stride) or the channel count.
        if stride != 1 or self.in_channels != planes*ResBlock.expansion:
            ii_downsample = nn.Sequential(
                nn.Conv2d(self.in_channels, planes*ResBlock.expansion, kernel_size=1, stride=stride),
                nn.BatchNorm2d(planes*ResBlock.expansion),
            )

        layers.append(ResBlock(self.in_channels, planes, i_downsample=ii_downsample, stride=stride))
        self.in_channels = planes*ResBlock.expansion

        # The remaining blocks keep the shape, so they need no projection.
        for i in range(blocks-1):
            layers.append(ResBlock(self.in_channels, planes))

        return nn.Sequential(*layers)

def ResNet50(num_classes, channels=3):
    """Build a ResNet made of Bottleneck blocks in a [3, 4, 6, 3] stage layout.

    Args:
        num_classes: Size of the classification layer.
        channels: Number of channels of the input images.

    Returns:
        A freshly initialised ``ResNet`` instance.
    """
    return ResNet(Bottleneck, [3, 4, 6, 3], num_classes, channels)

# Ten output classes and single-channel input images.
model = ResNet50(10, channels=1)
# The parameters must live on `device`, the same device that is handed to
# mini_batch for the data; otherwise the forward pass fails on a GPU with a
# device mismatch.
model = model.to(device)

error = nn.CrossEntropyLoss()

learning_rate = 0.001
# Create the optimizer once, after the model has been moved to its device.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


num_epochs = 300
# The model name only matters when it equals "inception_v3"; any other value
# selects the single-output training path.
train_step_fn = make_train_step(model, "FashionCNN", error, optimizer)
val_step_fn = make_val_step(model, "FashionCNN", error, optimizer)

def perform_training(model, train_loader, val_loader, num_epochs = 20):
    """Train for ``num_epochs`` epochs, validating and logging after each one.

    Relies on the module-level ``device``, ``train_step_fn`` and
    ``val_step_fn``; the step functions already hold the model they were
    built with.

    Args:
        model: Not used directly; kept for interface compatibility.
        train_loader: Iterable of training batches.
        val_loader: Iterable of validation batches.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        ``(accuracy_stats, loss_stats)``: two dicts with ``"train"`` and
        ``"val"`` lists holding one value per epoch.
    """
    accuracy_stats = {"train": [], "val": []}
    loss_stats = {"train": [], "val": []}

    for epoch in range(num_epochs):
        (train_epoch_acc, train_epoch_loss) = mini_batch(device, train_loader, train_step_fn)
        # No gradients are needed while validating.
        with torch.no_grad():
            (val_epoch_acc, val_epoch_loss) = mini_batch(device, val_loader, val_step_fn)

        accuracy_stats["train"].append(train_epoch_acc)
        accuracy_stats["val"].append(val_epoch_acc)
        loss_stats["train"].append(train_epoch_loss)
        loss_stats["val"].append(val_epoch_loss)

        print("Epoch: %d, train loss: %f, val loss: %f, train acc: %f, val acc: %f"  % (epoch, train_epoch_loss, val_epoch_loss, train_epoch_acc, val_epoch_acc))

    return accuracy_stats, loss_stats

# Start training; test_loader is passed as the validation loader.
perform_training(model, train_loader, test_loader, num_epochs=num_epochs)

If I recall correctly, models that reach around 99% accuracy start from pre-trained weights and only replace the model head for fine-tuning. See here for fine-tuning methods:

TLDR: A CNN classifier that has ‘seen’ a lot more than just the current dataset is better equipped to deal with outliers.

1 Like

Hi, I was wondering whether my code has a bug in it. Previously I tried FashionMNIST, and for some reason the train accuracy was higher than the test accuracy. I just tried MNIST using pretrained weights and the test accuracy reaches around 98%, so I hope it's okay.

1 Like