Resnet Loss barely changes, accuracy stays near 0

I am new to pytorch, and have been using it for a school project. My overall goal for the project is to build a resnet50 to identify the images in imagenet. I went through some tutorials and built a simple resnet for analysing the MNIST dataset, but now that I have moved to a more complicated resnet, my training accuracy and loss are stuck near 0 and I am not sure why.

I have tried varying the momentum from 0.9 to 0.1 and learning rate from 0.001 to sqrt(0.1) but I still am unable to train my model. I suspect my dataloader is incorrect, as the dimensions on my xb variable in my training loop are [64, 3, 224, 224] which is different from the [64, 784] dimensional input I saw when training on MNIST but I am not sure. Any help with this would be greatly appreciated. I have inserted my code and a clip of the structure of the resnet my project is trying to emulate. The resnet in the image is for a video classifier, but we were told to just modify it so it worked for image analysis by removing the time dimension. Thank you in advance for your help.

import torch 
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

from torch import optim
from import TensorDataset
from import DataLoader
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from import TensorDataset
import as data

from datetime import datetime
import math
import torchvision
import time

initialOutputSize = 56
block1Size = 32
convBlock2Size = 64
convBlock3Size = 128
convBlock4Size = 256
convBlock5Size = 512

def outSizeCalculator(inSize, padding, kernelSize, stride):
  return floor((inSize+2*padding - kernelSize)/stride + 1)

class InitialConvblock(nn.Module):
  def __init__(self, in_size, out_size): #In size of 224, outsize of 56
    self.kernelSize = 5
    self.stride = 2
    self.pad = 2
    self.conv1 = nn.Conv2d(in_size, out_size, self.kernelSize, self.stride, self.pad)
    self.batchnorm1 = nn.BatchNorm2d(out_size)
    self.maxpool = nn.AdaptiveMaxPool2d(initialOutputSize)

  def InitialConvblock(self, x):    
    x = self.maxpool(F.relu(self.batchnorm1(self.conv1(x))))
    return x

  def forward(self, x): return self.InitialConvblock(x)

class ResBlock(nn.Module):
  def __init__(self, in_size, hidden_layer_size, out_size, cutSize=False):
    self.kernelSize1 = 1
    self.kernelSize3 = 3
    self.stride1 = 1
    self.stride2 = 2
    if cutSize:
    self.pad0 = 0
    self.pad1 = 1
    self.conv1 = nn.Conv2d(in_size, hidden_layer_size, self.kernelSize1, self.initialStride, self.pad0)
    self.conv2 = nn.Conv2d(hidden_layer_size, hidden_layer_size, self.kernelSize3, self.stride1, self.pad0)
    self.conv3 = nn.Conv2d(hidden_layer_size, out_size, self.kernelSize1, self.stride1, self.pad1)

    self.convShortcut = nn.Conv2d(in_size, out_size, self.kernelSize1, self.stride1, self.pad0)
    self.convShortcutResize = nn.Conv2d(in_size, out_size, self.kernelSize1, self.stride2, self.pad0) #Use if Final shortcut for block
    self.batchnorm1 = nn.BatchNorm2d(hidden_layer_size)
    self.batchnorm2 = nn.BatchNorm2d(out_size)

  def resblock(self, x, resizeShortcut=False):
    if resizeShortcut:
      shortcut = self.convShortcutResize(x)
      shortcut = self.convShortcut(x)
    blockOutput = F.relu(self.batchnorm2(self.conv3(F.relu(self.batchnorm1(self.conv2(F.relu(self.batchnorm1(self.conv1(x))))))))) 

    return shortcut + blockOutput

  def forward(self, x, resizeShortcut=False): return self.resblock(x, resizeShortcut)

class ResNet(nn.Module):

  def __init__(self, n_classes=1000):

    self.nClasses = n_classes
    self.convBlock1 = InitialConvblock(numChannels, block1Size)
    self.block2a = ResBlock(block1Size, convBlock2Size, convBlock2Size*4)
    self.block2b = ResBlock(convBlock2Size*4, convBlock2Size,convBlock2Size*4)
    self.block3a = ResBlock(convBlock3Size*2, convBlock3Size, convBlock3Size*4, True)
    self.block3b = ResBlock(convBlock3Size*4, convBlock3Size, convBlock3Size*4)

    self.block4a = ResBlock(convBlock4Size*2, convBlock4Size, convBlock4Size*4, True)
    self.block4b = ResBlock(convBlock4Size*4, convBlock4Size, convBlock4Size*4)

    self.block5a = ResBlock(convBlock5Size*2, convBlock5Size, convBlock5Size*4, True)
    self.block5b = ResBlock(convBlock5Size*4, convBlock5Size, convBlock5Size*4)

    self.globalAvgPool = nn.AvgPool2d(kernel_size=7, stride=1)
    self.linearLayer = nn.Linear(2048, self.nClasses) #num classes
    self.softmax = nn.Softmax(dim=1)

  def forward(self, x):
    x = self.convBlock1(x)
    x = self.block2b(self.block2b(self.block2a(x)))
    x = self.block3b(self.block3b(self.block3b(self.block3a(x, True))))
    x = self.block4b(self.block4b(self.block4b(self.block4b(self.block4b(self.block4a(x, True))))))
    x = self.block5b(self.block5b(self.block5a(x, True)))
    x = self.globalAvgPool(x)
    x = x.view(-1, 2048)
    x = F.relu(self.linearLayer(x))
    x = self.softmax(x)

    return x

def loss_batch(model, loss_func, xb, yb, opt=None, scheduler=None):  

  loss = loss_func(model(xb), yb.long())
  acc = accuracy(model(xb), yb)
  print("Batch Loss: ", loss.item())
  print("Batch Accuracy: ", acc.item())
  if opt is not None:
      if scheduler is not None:
  return acc, loss.item(), len(xb)

def accuracy(out, yb):  
  preds = torch.argmax(out, dim=1)
  return (preds == yb).float().mean() #Need to convert to float for mean to work

def train_model(epochs, model, loss_func, opt, train_dl, test_dl, device, savePath, scheduler=None):
  for epoch in range(epochs):
    now =
    current_time = now.strftime("%H:%M:%S")
    print("Beginning Epoch at ", current_time)
    i = 0
    running_loss = 0
    displayInterval = 50

    for xb, yb, in train_dl: #xb and yb could be wrong size?
      xb =
      yb =
      loss, acc, num = loss_batch(model, loss_func, xb, yb, opt, scheduler)
      running_loss += loss

      if i % displayInterval == displayInterval - 1:
        print("Batch Number: ", i)
        currLoss = running_loss/displayInterval
        print("Running Loss: ", currLoss)
        running_loss = 0
      i += 1

    print("Evaluating Model")
    #No gradient computation for evaluation mode
    with torch.no_grad():
      accs = []
      losses = []
      nums = []
      for xb, yb in test_dl:
        acc, loss, num = loss_batch(model, loss_func,,
      val_loss = np.sum(np.multiply(losses, nums)) / np.sum(nums)
      val_acc = np.sum(np.multiply(accs, nums)) / np.sum(nums)

      print("Epoch:", epoch+1)
      print("Loss: ", val_loss)
      print("Accuracy: ", val_acc), savePath)

def get_train_dataset(dir):
  imageSize = 224
  trainTransforms = transforms.Compose([
    transforms.Normalize(mean=[0.485, 0.455, 0.405], std=[0.229, 0.224, 0.225]) #Check if std deviation should be set

  trainDataset = datasets.ImageFolder(dir, trainTransforms)

  return trainDataset

def get_val_dataset(dir):
  valTransforms = transforms.Compose([    
  transforms.Normalize(mean=[0.485, 0.455, 0.405], std=[0.229, 0.224, 0.225]) #Check if std deviation should be set
  valDataset = datasets.ImageFolder(dir, valTransforms)
  return valDataset

def getDataloaders(dataDir, valDir, batchSize, workers=0, pin_memory=False): #Can also add pin_memory 
  trainDataset = get_train_dataset(dataDir)
  valDataset = get_val_dataset(valDir)

  trainLoader =, batch_size = batchSize, shuffle=True, num_workers=workers, pin_memory=pin_memory, sampler=None)
  valLoader =, batch_size = batchSize, shuffle=False, num_workers=workers, pin_memory=pin_memory)
  return trainLoader, valLoader

bs = 64 #Max batch size is 128
lr = math.sqrt(0.1)
n_epochs = 2
loss_func = F.cross_entropy
numChannels = 3
numClasses = 200
outputFile = '/home/mcvi0001/A^2_Net_Code/Imagenet_resnet.pth'

trainDir = "/home/mcvi0001/A^2_Net_Code/tiny-imagenet-200/train"
testDir = "/home/mcvi0001/A^2_Net_Code/tiny-imagenet-200/test"
valDir = "/home/mcvi0001/A^2_Net_Code/tiny-imagenet-200/val"

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

trainDL, testDL = getDataloaders(trainDir, testDir, bs)

model = ResNet(numClasses)
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)

if torch.cuda.device_count() > 1:
  print("Let's use", torch.cuda.device_count(), "GPUs!")
  model = nn.DataParallel(model)

train_model(n_epochs, model, loss_func, optimizer, trainDL, testDL, device, outputFile)

now =

current_time = now.strftime("%H:%M:%S")
print("Finished Training at", current_time), outputFile)
print("Finished Saving")

F.cross_entropy and its module equivalent nn.CrossEntropyLoss expect raw logits as the model output, so remove the last softmax layer.

Hi Peter,

Thank you for your help. I removed that I’m still seeing the same issue. After three epochs I see an accuracy of
epoch1: loss=7.85208056640625 acc=0
epoch2: loss=5.965506981658936 acc=0.0125
epoch3: loss=23.37786726989746 acc=0.0004

I tried adding batch normalization to my shortcut functions (as I wasn’t doing that before for the convolutions there) but I am still seeing lower accuracy than I would expect for these epochs. Do you see anything else I might change? After further investigating I think there is something wrong with my overall structure but I’m not sure as to what it is.

That might be the case and you could compare and debug your model architecture using working implementations such as the torchvision.models.resnet**.

Ok, looks like using this line
summary(models.resnet50(False).cuda(), (3, 224, 224))
gives me some differences from my architecture. Thank you for the suggestion, I’ll investigate further.

Hmm I think my architecture is ok. Here is the first few lines but the rest follows the same pattern

           Conv2d-1         [-1, 64, 112, 112]           4,864
       BatchNorm2d-2         [-1, 64, 112, 112]             128
              ReLU-3         [-1, 64, 112, 112]               0
 AdaptiveMaxPool2d-4           [-1, 64, 56, 56]               0
  InitialConvblock-5           [-1, 64, 56, 56]               0
            Conv2d-6          [-1, 256, 56, 56]          16,640
       BatchNorm2d-7          [-1, 256, 56, 56]             512
            Conv2d-8           [-1, 64, 56, 56]           4,160
       BatchNorm2d-9           [-1, 64, 56, 56]             128
             ReLU-10           [-1, 64, 56, 56]               0
           Conv2d-11           [-1, 64, 54, 54]          36,928
      BatchNorm2d-12           [-1, 64, 54, 54]             128
             ReLU-13           [-1, 64, 54, 54]               0
           Conv2d-14          [-1, 256, 56, 56]          16,640
      BatchNorm2d-15          [-1, 256, 56, 56]             512
             ReLU-16          [-1, 256, 56, 56]               0
         ResBlock-17          [-1, 256, 56, 56]               0
           Conv2d-18          [-1, 256, 56, 56]          65,792
      BatchNorm2d-19          [-1, 256, 56, 56]             512

The strange thing is I am able to train MNIST on this model to a good degree of accuracy. It only seems to be with imagenet that I run into difficulty. Can you think of any other reason I might not be seeing my loss decrease/what I can do to fix my training?

Looks like I was able to figure it out. I made the loss very small, 0.00001, and it began converging. I also made sure to set the bias=False in the shortcut convolution layers, although it may have worked even without that. Thank you for your help Peter.

