Discrepancy of validation loss during training run and separate evaluation

Hi,

I am training an image classification model that runs a validation pass at the end of every epoch, and I save out the model weights with the best validation loss. However, when I do a separate evaluation by loading the saved weights, the calculated validation loss is higher than the validation loss at the time the checkpoint was saved. The validation accuracy, on the other hand, is consistent between the two.

Could you advise on the discrepancy? Is there something I’ve overlooked in my code?

My training code train.py:

import time
import math
import copy
import argparse
import numpy as np
from pathlib import Path

import torch
from torch import nn, optim

from mask_model import build_model
from data_loader import get_data_loaders

def main(root=None, save_dir='weights', arch='resnet18', context='model', lr=0.01, batch_size=32, num_epochs=25, device='cuda:0', class_weighted_loss=False, verbose_step=500, load=False):
    # use the device argument rather than hard-coding 'cuda:0'
    model = build_model(arch, pretrained=True, num_classes=2, device=device)

    if load:
        load_pkl = 'datasets.pkl'
    else:
        load_pkl = None
    dataloaders, class_names, class_weights = get_data_loaders(batch_size=batch_size, load=load_pkl)

    if class_weighted_loss:
        print('Cross Entropy Loss weights @ {}'.format(class_weights))
        class_weights = torch.Tensor(class_weights).to(device) # tensor of size num_classes
        criterion_train = nn.CrossEntropyLoss(weight=class_weights)
    else:
        criterion_train = nn.CrossEntropyLoss(weight=None)
    
    criterion_valid = nn.CrossEntropyLoss(weight=None)
    # Observe that all parameters are being optimized
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    # Decay LR by a factor of 0.1 every 10 epochs
    exp_lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

    # optimizer = optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)

    model, best_acc, best_loss, best_epoch, total_epoch = train_model(model, dataloaders, criterion_train, criterion_valid, optimizer, exp_lr_scheduler, num_epochs=num_epochs, device=device, verbose_step=verbose_step)

    save_path = Path(save_dir) / '{}_bestval_loss{:0.3f}_acc{:0.3f}_ep{}of{}_{}.pth'.format(arch, best_loss, best_acc, best_epoch, total_epoch, context)
    torch.save(model.state_dict(), save_path)
    print('Best val weights saved to {}'.format(save_path))

def train_model(model, dataloaders, criterion_train, criterion_valid, optimizer, scheduler, num_epochs=25, device='cuda:0', verbose_step=500, early_stopping_threshold=3):
    phases = ['train', 'valid']
    since = time.time()

    dataset_sizes = { s : len(dataloaders[s].dataset) for s in phases }

    total_steps_per_epoch = len(dataloaders['train'])  # number of batches per epoch

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    best_loss = math.inf
    best_epoch = 1
    early_stopping_strike = 0
    try:
        for epoch in range(num_epochs):
            print('-' * 10)
            print('Epoch {}/{}'.format(epoch+1, num_epochs))
            print('-' * 10)

            # Each epoch has a training and validation phase
            for phase in phases:
                if phase == 'train':
                    model.train()  # Set model to training mode
                    criterion = criterion_train
                else:
                    model.eval()   # Set model to evaluate mode
                    criterion = criterion_valid

                running_loss = 0.0
                running_corrects = 0

                # Iterate over data.
                for step, data in enumerate(dataloaders[phase]):
                    inputs, labels = data
                    inputs = inputs.to(device)
                    labels = labels.to(device)

                    # zero the parameter gradients
                    optimizer.zero_grad()

                    # forward
                    # track history if only in train
                    with torch.set_grad_enabled(phase == 'train'):
                        outputs = model(inputs)
                        _, preds = torch.max(outputs, 1)
                        loss = criterion(outputs, labels)

                        # backward + optimize only if in training phase
                        if phase == 'train':
                            loss.backward()
                            optimizer.step()

                    # statistics
                    running_loss += loss.item() * inputs.size(0)
                    running_corrects += torch.sum(preds == labels.data)
                
                    if phase == 'train' and (step+1) % verbose_step == 0:
                        num_imgs_so_far = (step+1)*dataloaders['train'].batch_size
                        verbose_loss = running_loss / num_imgs_so_far
                        verbose_acc = running_corrects.double() /num_imgs_so_far

                        print('[{}] Step: {}/{} | Loss: {:.4f} Acc: {:.4f}'.format(phase, step+1, total_steps_per_epoch, verbose_loss, verbose_acc))
                
                if phase == 'train':
                    scheduler.step()
                    # lr_now = scheduler.get_lr()
                    lr_now = scheduler.get_last_lr()
                    print('LR:', lr_now)

                epoch_loss = running_loss / dataset_sizes[phase]
                epoch_acc = running_corrects.double() / dataset_sizes[phase]

                print('[{}] Loss: {:.4f} Acc: {:.4f}'.format(
                    phase, epoch_loss, epoch_acc))

                # deep copy the model
                if phase == 'valid': 
                    if epoch_loss < best_loss:
                        best_acc = epoch_acc
                        best_loss = epoch_loss
                        best_model_wts = copy.deepcopy(model.state_dict())
                        best_epoch = epoch + 1
                        early_stopping_strike = 0 # reset
                        print('Best val checkpointed.')
                    else:
                        early_stopping_strike += 1
                        print('Val not best, strike:{}/{}'.format(early_stopping_strike, early_stopping_threshold))

            print()
            if early_stopping_strike >= early_stopping_threshold:
                print('Terminating training as val not best for {} strikes'.format(early_stopping_strike))
                break

    except KeyboardInterrupt:
        print('Training interrupted manually!')
    finally:
        time_elapsed = time.time() - since
        print('Training complete in {:.0f}m {:.0f}s'.format(
            time_elapsed // 60, time_elapsed % 60))
        print('Best val acc: {:.4f}'.format(best_acc))
        print('Best val loss: {:.4f}'.format(best_loss))
        print('achieved at epoch {}/{}'.format(best_epoch, epoch + 1))

        # load best model weights
        model.load_state_dict(best_model_wts)
        return model, best_acc, best_loss, best_epoch, epoch + 1

My evaluation code eval.py:

import argparse
from tqdm import tqdm
from pathlib import Path

import pandas as pd
import seaborn as sn
from sklearn import metrics
import matplotlib.pyplot as plt

import torch
from torch import nn

from mask_model import build_model
from data_loader import get_data_loaders

ap = argparse.ArgumentParser()
ap.add_argument('arch', help='resnet18, resnet50 or resnext50', type=str)
ap.add_argument('weights', help='Path to weights file')
args = ap.parse_args()

batch_size = 32
# batch_size = 16 
device = 'cuda:0'

model = build_model(args.arch, pretrained=False, num_classes=2, device=device, inference=True)

state_dict = torch.load(args.weights, map_location=torch.device(device))
model.load_state_dict(state_dict)
model.eval()

criterion_valid = nn.CrossEntropyLoss(weight=None)

dataloaders, class_names, class_weights = get_data_loaders(batch_size=batch_size, load='datasets.pkl')

dataloader = dataloaders['valid']

dataset_size = len(dataloader.dataset)

running_loss = 0.0
running_corrects = 0
for imgs, labels in tqdm(dataloader):
    labels = labels.to(device)
    imgs = imgs.to(device)

    with torch.no_grad():
        res = model(imgs)
        _, preds = torch.max(res, 1)

        loss = criterion_valid(res, labels)
    
    running_loss += loss.item() * labels.size(0)
    running_corrects += torch.sum(preds == labels.data)

epoch_loss = running_loss / dataset_size
loss_str = 'Validation Loss: {}'.format(epoch_loss)
print(loss_str)

epoch_acc = running_corrects.double() / dataset_size
print('Validation Acc: {}'.format(epoch_acc))

For example, I have a resnet18 model with a validation loss of 0.161 saved during training, but when I ran the separate evaluation, the validation loss came out to 0.375 on the same validation set. The validation accuracy, however, is the same for both, at 0.945.

Thank you!

It’s a bit weird that the losses are different while you get the same accuracy.
Based on this it seems that the state_dict is properly loaded, but somehow the loss calculation might be different.

Could you run a test and store the losses (unreduced) for all batches in separate lists, and compare them for the two scripts?
I cannot see any obvious issue in the code at the moment.
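Something like this snippet could work as a starting point (a rough sketch, assuming the same model, device and dataloaders['valid'] objects as in your scripts, and a non-shuffled validation loader so the per-sample losses line up):

import torch
from torch import nn

# reduction='none' returns one loss value per sample instead of the batch mean
criterion_unreduced = nn.CrossEntropyLoss(reduction='none')

per_sample_losses = []
model.eval()
with torch.no_grad():
    for imgs, labels in dataloaders['valid']:
        imgs, labels = imgs.to(device), labels.to(device)
        outputs = model(imgs)
        per_sample_losses.append(criterion_unreduced(outputs, labels).cpu())

per_sample_losses = torch.cat(per_sample_losses)
# save the tensor so the runs from train.py and eval.py can be compared directly,
# e.g. via torch.allclose(losses_train, losses_eval)
torch.save(per_sample_losses, 'per_sample_losses_eval.pt')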

So sorry! I just found the mistake while writing out the losses and outputs; it was an oversight on my part. In eval.py, building the model with the inference=True argument slaps a softmax layer onto the end of the resnet. nn.CrossEntropyLoss expects raw logits and applies log-softmax internally, so the already-softmaxed outputs get squashed a second time and the computed validation loss is inflated. Since softmax doesn’t change the argmax, the predictions (and hence the accuracy) stay the same, which is why only the loss differed!
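For reference, here is a tiny standalone sketch (made-up logits, not my actual model) that reproduces the effect:

import torch
from torch import nn
import torch.nn.functional as F

torch.manual_seed(0)
logits = torch.randn(8, 2)           # stand-in for raw outputs of the resnet head
labels = torch.randint(0, 2, (8,))
criterion = nn.CrossEntropyLoss()

loss_on_logits = criterion(logits, labels)                      # correct usage
loss_on_softmax = criterion(F.softmax(logits, dim=1), labels)   # softmax applied twice

# softmax is monotonic, so the predictions (and accuracy) are identical...
print(torch.equal(logits.argmax(1), F.softmax(logits, dim=1).argmax(1)))
# ...but the loss values differ
print(loss_on_logits.item(), loss_on_softmax.item())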

All is good now after fixing that part. Thank you for your reply!