Hi,
I am training an image classification model that runs a validation pass at the end of every epoch, and I save the model weights that achieve the best validation loss. However, when I run a separate evaluation by loading the saved weights, the computed validation loss is higher than the validation loss recorded when the weights were saved. The validation accuracy, on the other hand, is consistent between the two runs.
Could you advise on the discrepancy? Is there something I’ve overlooked in my code?
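To restate the flow as code, this is the round trip I am doing (a condensed sketch using the same build_model / get_data_loaders helpers as the full scripts below; the checkpoint path is illustrative):

```python
# Condensed round trip: train -> save best-val weights -> reload -> re-evaluate.
import torch
from torch import nn
from mask_model import build_model
from data_loader import get_data_loaders

device = 'cuda:0'
ckpt = 'weights/best.pth'  # illustrative path

# ... the training loop checkpoints the weights with the lowest validation loss ...
# torch.save(best_model_wts, ckpt)

# Later, in a fresh process, reload the weights and recompute the loss:
model = build_model('resnet18', pretrained=False, num_classes=2, device=device, inference=True)
model.load_state_dict(torch.load(ckpt, map_location=device))
model.eval()

dataloaders, _, _ = get_data_loaders(batch_size=32, load='datasets.pkl')
criterion = nn.CrossEntropyLoss()
running_loss = 0.0
with torch.no_grad():
    for imgs, labels in dataloaders['valid']:
        out = model(imgs.to(device))
        running_loss += criterion(out, labels.to(device)).item() * labels.size(0)
print(running_loss / len(dataloaders['valid'].dataset))  # comes out higher than the loss at save time
```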
My training code train.py:

```python
import time
import math
import copy
import argparse
import numpy as np
from pathlib import Path
import torch
from torch import nn, optim
from mask_model import build_model
from data_loader import get_data_loaders
def main(root=None, save_dir='weights', arch='resnet18', context='model', lr=0.01,
         batch_size=32, num_epochs=25, device='cuda:0', class_weighted_loss=False,
         verbose_step=500, load=False):
    model = build_model(arch, pretrained=True, num_classes=2, device=device)
    if load:
        load_pkl = 'datasets.pkl'
    else:
        load_pkl = None
    dataloaders, class_names, class_weights = get_data_loaders(batch_size=batch_size, load=load_pkl)
    if class_weighted_loss:
        print('Cross Entropy Loss weights @ {}'.format(class_weights))
        class_weights = torch.Tensor(class_weights).to(device)  # tensor of size num_classes
        criterion_train = nn.CrossEntropyLoss(weight=class_weights)
    else:
        criterion_train = nn.CrossEntropyLoss(weight=None)
    criterion_valid = nn.CrossEntropyLoss(weight=None)
    # Observe that all parameters are being optimized
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    # Decay LR by a factor of 0.1 every 10 epochs
    exp_lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
    # optimizer = optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
    model, best_acc, best_loss, best_epoch, total_epoch = train_model(
        model, dataloaders, criterion_train, criterion_valid, optimizer, exp_lr_scheduler,
        num_epochs=num_epochs, device=device, verbose_step=verbose_step)
    save_path = Path(save_dir) / '{}_bestval_loss{:0.3f}_acc{:0.3f}_ep{}of{}_{}.pth'.format(
        arch, best_loss, best_acc, best_epoch, total_epoch, context)
    torch.save(model.state_dict(), save_path)
    print('Best val weights saved to {}'.format(save_path))
def train_model(model, dataloaders, criterion_train, criterion_valid, optimizer, scheduler,
                num_epochs=25, device='cuda:0', verbose_step=500, early_stopping_threshold=3):
    phases = ['train', 'valid']
    since = time.time()
    dataset_sizes = {s: len(dataloaders[s].dataset) for s in phases}
    total_steps_per_epoch = math.ceil(dataset_sizes['train'] / dataloaders['train'].batch_size)
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    best_loss = math.inf
    best_epoch = 1
    early_stopping_strike = 0
    try:
        for epoch in range(num_epochs):
            print('-' * 10)
            print('Epoch {}/{}'.format(epoch + 1, num_epochs))
            print('-' * 10)
            # Each epoch has a training and a validation phase
            for phase in phases:
                if phase == 'train':
                    model.train()  # set model to training mode
                    criterion = criterion_train
                else:
                    model.eval()  # set model to evaluation mode
                    criterion = criterion_valid
                running_loss = 0.0
                running_corrects = 0
                # Iterate over data.
                for step, data in enumerate(dataloaders[phase]):
                    inputs, labels = data
                    inputs = inputs.to(device)
                    labels = labels.to(device)
                    # zero the parameter gradients
                    optimizer.zero_grad()
                    # forward; track history only in train
                    with torch.set_grad_enabled(phase == 'train'):
                        outputs = model(inputs)
                        _, preds = torch.max(outputs, 1)
                        loss = criterion(outputs, labels)
                        # backward + optimize only in the training phase
                        if phase == 'train':
                            loss.backward()
                            optimizer.step()
                    # statistics
                    running_loss += loss.item() * inputs.size(0)
                    running_corrects += torch.sum(preds == labels.data)
                    if phase == 'train' and (step + 1) % verbose_step == 0:
                        num_imgs_so_far = (step + 1) * dataloaders['train'].batch_size
                        verbose_loss = running_loss / num_imgs_so_far
                        verbose_acc = running_corrects.double() / num_imgs_so_far
                        print('[{}] Step: {}/{} | Loss: {:.4f} Acc: {:.4f}'.format(
                            phase, step + 1, total_steps_per_epoch, verbose_loss, verbose_acc))
                if phase == 'train':
                    scheduler.step()
                    lr_now = scheduler.get_last_lr()
                    print('LR:', lr_now)
                epoch_loss = running_loss / dataset_sizes[phase]
                epoch_acc = running_corrects.double() / dataset_sizes[phase]
                print('[{}] Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
                # deep copy the model if validation loss improved
                if phase == 'valid':
                    if epoch_loss < best_loss:
                        best_acc = epoch_acc
                        best_loss = epoch_loss
                        best_model_wts = copy.deepcopy(model.state_dict())
                        best_epoch = epoch + 1
                        early_stopping_strike = 0  # reset
                        print('Best val checkpointed.')
                    else:
                        early_stopping_strike += 1
                        print('Val not best, strike: {}/{}'.format(
                            early_stopping_strike, early_stopping_threshold))
            print()
            if early_stopping_strike >= early_stopping_threshold:
                print('Terminating training as val not best for {} strikes'.format(early_stopping_strike))
                break
    except KeyboardInterrupt:
        print('Training interrupted manually!')
    finally:
        time_elapsed = time.time() - since
        print('Training complete in {:.0f}m {:.0f}s'.format(
            time_elapsed // 60, time_elapsed % 60))
        print('Best val acc: {:.4f}'.format(best_acc))
        print('Best val loss: {:.4f}'.format(best_loss))
        print('achieved at epoch {}/{}'.format(best_epoch, epoch + 1))
    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, best_acc, best_loss, best_epoch, epoch + 1
```
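(The CLI wiring for train.py isn’t shown above; as a minimal sketch, an entry point forwarding flags to main() would look roughly like this, with only a few of its keyword arguments exposed:)

```python
if __name__ == '__main__':
    # Rough sketch of an entry point; only a subset of main()'s kwargs is wired up.
    ap = argparse.ArgumentParser()
    ap.add_argument('--arch', default='resnet18', help='resnet18, resnet50 or resnext50')
    ap.add_argument('--lr', type=float, default=0.01)
    ap.add_argument('--num-epochs', type=int, default=25)
    args = ap.parse_args()
    main(arch=args.arch, lr=args.lr, num_epochs=args.num_epochs)
```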
My evaluation code eval.py:

```python
import argparse
from tqdm import tqdm
from pathlib import Path
import pandas as pd
import seaborn as sn
from sklearn import metrics
import matplotlib.pyplot as plt
import torch
from torch import nn
from mask_model import build_model
from data_loader import get_data_loaders
ap = argparse.ArgumentParser()
ap.add_argument('arch', help='resnet18, resnet50 or resnext50', type=str)
ap.add_argument('weights', help='Path to weights file')
args = ap.parse_args()
batch_size = 32
# batch_size = 16
device = 'cuda:0'
model = build_model(args.arch, pretrained=False, num_classes=2, device=device, inference=True)
state_dict = torch.load(args.weights, map_location=torch.device(device))
model.load_state_dict(state_dict)
model.eval()
criterion_valid = nn.CrossEntropyLoss(weight=None)
dataloaders, class_names, class_weights = get_data_loaders(batch_size=batch_size, load='datasets.pkl')
dataloader = dataloaders['valid']
dataset_size = len(dataloader.dataset)
running_loss = 0.0
running_corrects = 0
for imgs, labels in tqdm(dataloader):
    imgs = imgs.to(device)
    labels = labels.to(device)
    with torch.no_grad():
        res = model(imgs)
        _, preds = torch.max(res, 1)
        loss = criterion_valid(res, labels)
    running_loss += loss.item() * labels.size(0)
    running_corrects += torch.sum(preds == labels.data)
epoch_loss = running_loss / dataset_size
print('Validation Loss: {}'.format(epoch_loss))
epoch_acc = running_corrects.double() / dataset_size
print('Validation Acc: {}'.format(epoch_acc))
```
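(For reference, the evaluation is run as `python eval.py resnet18 weights/<checkpoint>.pth`, where the placeholder stands in for the saved weights filename.)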
For example, I have a resnet18 model saved during training with a validation loss of 0.161, but when I ran the separate evaluation the validation loss came out as 0.375 on the same validation set. The validation accuracy, however, is the same for both at 0.945.
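One structural difference between the two scripts is that eval.py builds the model with inference=True while train.py does not. A quick check for whether that flag changes the values fed into the loss is to build the model both ways from the same checkpoint and compare outputs on a dummy batch (a sketch; the 224x224 RGB input size and the checkpoint path are assumptions):

```python
import torch
from mask_model import build_model

device = 'cuda:0'
ckpt = 'weights/best.pth'  # illustrative path

# Model as constructed during training (no inference flag)
m_train = build_model('resnet18', pretrained=False, num_classes=2, device=device)
m_train.load_state_dict(torch.load(ckpt, map_location=device))
m_train.eval()

# Model as constructed in eval.py (inference=True)
m_eval = build_model('resnet18', pretrained=False, num_classes=2, device=device, inference=True)
m_eval.load_state_dict(torch.load(ckpt, map_location=device))
m_eval.eval()

x = torch.randn(4, 3, 224, 224, device=device)  # dummy batch; 224x224 RGB assumed
with torch.no_grad():
    print(torch.allclose(m_train(x), m_eval(x)))  # False => the loss sees different inputs
```

Since nn.CrossEntropyLoss applies log-softmax internally and expects raw logits, any extra squashing of the model outputs would change the loss value while leaving the argmax, and hence the accuracy, unchanged.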
Thank you!