Model performs well on both validation and testing dataset, when saved and load, performs poor on the same dataset

Hello all, I have inistalised the resnet50 model and default weights, and have trained/validated/tested the model to distinguish 2 classes using the following code

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split, Dataset
from torchvision.datasets import CocoDetection
from torchvision.transforms import ToTensor, Compose, RandomHorizontalFlip, RandomResizedCrop, Normalize, Resize
import torchvision.models as models
import matplotlib.pyplot as plt
from pycocotools.coco import COCO
from PIL import Image
import os

class CocoDetection(Dataset):
    def __init__(self, root, ann_file, transform=None):
        """
        Args:
            root (str): Root directory where images are stored.
            ann_file (str): Path to the annotation file in COCO format.
            transform (callable, optional): A function/transform to apply to the images.
        """
        self.root = root
        self.coco = COCO(ann_file)
        self.ids = list(self.coco.imgs.keys())
        self.transform = transform

    def __getitem__(self, index):
        """
        Args:
            index (int): Index of the item to fetch.

        Returns:
            tuple: (image, target) where target is the class label.
        """
        coco = self.coco
        img_id = self.ids[index]
        img_info = coco.loadImgs(img_id)[0]
        path = img_info['file_name']
        img = Image.open(os.path.join(self.root, path)).convert('RGB')

        # Get annotations for the image
        ann_ids = coco.getAnnIds(imgIds=img_id)
        annotations = coco.loadAnns(ann_ids)

        # Extract class ids from annotations
        class_ids = [ann['category_id'] for ann in annotations]
        # Assuming only one class per image; adjust if necessary
        label = int(class_ids[0] - 1) if class_ids else -1

        if self.transform is not None:
            img = self.transform(img)

        return img, torch.tensor(label, dtype=torch.long)

    def __len__(self):
        return len(self.ids)



img_dir = "datasets/maybegotthisright/images/default"
ann_file = "datasets/maybegotthisright/annotations/instances_default.json"
num_classes = 2

transform = Compose([
    # RandomResizedCrop(224),
    RandomResizedCrop(448),
    RandomHorizontalFlip(),
    ToTensor(),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# no random trasnform
# transform = Compose([
#     Resize((224, 224)),
#     ToTensor(),
# ])

dataset = CocoDetection(img_dir, ann_file, transform=transform)

num_samples = len(dataset)
train_size = int(0.75 * num_samples)  # 75% for training
val_size = int(0.15 * num_samples)  # 15% for validation
test_size = num_samples - train_size - val_size  # 10% for testing

train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, num_classes)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

train_losses = []
val_losses = []

num_epochs = 125
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    
    for inputs, labels in train_loader:
        # print("INPUTS")
        # print(inputs)
        # print("LABELS")
        # print(labels)
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step() 
        
        running_loss += loss.item()
    
    train_losses.append(running_loss / len(train_loader))
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(train_loader):.4f}')

    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    val_losses.append(val_loss / len(val_loader))
    print(f'Validation Loss: {val_loss / len(val_loader):.4f}, Accuracy: {100 * correct / total:.2f}%')

    # scheduler.step()

plt.figure(figsize=(10, 5))
plt.plot(train_losses, label='Training Loss')
plt.plot(val_losses, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss over Epochs')
plt.legend()
plt.savefig("loss_graph.png")
plt.show()

model.eval()
correct = 0
total = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Accuracy of the model on the test set: {100 * correct / total:.2f}%')

torch.save(model, 'models/model.pt')

this gives a great result after ~40 epochs, here is a snippet of the terminal output.

Epoch [102/125], Loss: 0.0573
Validation Loss: 0.0557, Accuracy: 98.11%
Epoch [103/125], Loss: 0.0789
Validation Loss: 0.0785, Accuracy: 96.23%
Epoch [104/125], Loss: 0.0650
Validation Loss: 0.2081, Accuracy: 92.45%
Epoch [105/125], Loss: 0.1045
Validation Loss: 0.0638, Accuracy: 98.11%
Epoch [106/125], Loss: 0.0852
Validation Loss: 0.0914, Accuracy: 98.11%
Epoch [107/125], Loss: 0.0682
Validation Loss: 0.0433, Accuracy: 98.11%
Epoch [108/125], Loss: 0.0801
Validation Loss: 0.0335, Accuracy: 98.11%
Epoch [109/125], Loss: 0.1045
Validation Loss: 0.0615, Accuracy: 98.11%
Epoch [110/125], Loss: 0.0748
Validation Loss: 0.0561, Accuracy: 98.11%
Epoch [111/125], Loss: 0.0646
Validation Loss: 0.0196, Accuracy: 100.00%
Epoch [112/125], Loss: 0.0842
Validation Loss: 0.2006, Accuracy: 86.79%
Epoch [113/125], Loss: 0.0603
Validation Loss: 0.0755, Accuracy: 96.23%
Epoch [114/125], Loss: 0.0832
Validation Loss: 0.0231, Accuracy: 100.00%
Epoch [115/125], Loss: 0.0733
Validation Loss: 0.0333, Accuracy: 100.00%
Epoch [116/125], Loss: 0.1071
Validation Loss: 0.0361, Accuracy: 98.11%
Epoch [117/125], Loss: 0.0858
Validation Loss: 0.0154, Accuracy: 100.00%
Epoch [118/125], Loss: 0.0508
Validation Loss: 0.1709, Accuracy: 90.57%
Epoch [119/125], Loss: 0.0678
Validation Loss: 0.0307, Accuracy: 100.00%
Epoch [120/125], Loss: 0.1346
Validation Loss: 0.1028, Accuracy: 96.23%
Epoch [121/125], Loss: 0.0780
Validation Loss: 0.0486, Accuracy: 98.11%
Epoch [122/125], Loss: 0.0649
Validation Loss: 0.0445, Accuracy: 98.11%
Epoch [123/125], Loss: 0.0357
Validation Loss: 0.1038, Accuracy: 96.23%
Epoch [124/125], Loss: 0.0707
Validation Loss: 0.1060, Accuracy: 98.11%
Epoch [125/125], Loss: 0.0881
Validation Loss: 0.0482, Accuracy: 96.23%

Accuracy of the model on the test set: 97.30%

However, when i load the model (have tried both state_dict and full model methods), the model is unable to make any reasonable inferences on an image from the dataset used to train the model, it tends to get it right ~20% of the time, i believe i am handling loading the model correctly, but how would the performance be so poor, if both validation and testing provided great results, and the images used for inferencing are from the complete dataset used to train the model?

Here is my inference code.

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision.datasets import CocoDetection
from torchvision.transforms import ToTensor
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image

from enum import Enum

class ClassLabel(Enum):
    redacted1 = 0
    redacted2 = 1

    @staticmethod
    def get_label(index):
        print(index)
        for label in ClassLabel:
            if label.value == index:
                return label.name.replace('_', ' ')
        return "Unknown"

# model_path = "models/model.pth"

# num_classes = 2
# model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
# num_ftrs = model.fc.in_features
# model.fc = nn.Linear(num_ftrs, num_classes)

# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# model = model.to(device)
# model.load_state_dict(torch.load(model_path))
# model.eval()  # Set model to evaluation mode

model = torch.load("models/model.pt")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)

model.eval()

def infer_image(image_path):
    image = Image.open(image_path).convert("RGB")
    transform = transforms.Compose([
        # transforms.Resize((224, 224)),
        transforms.Resize((448, 448)),
        transforms.ToTensor(),
    ])
    image = transform(image)
    image = image.unsqueeze(0)
    image = image.to(device)

    # Perform inference
    with torch.no_grad():
        outputs = model(image)
        probabilities = torch.softmax(outputs, dim=1)
        _, predicted = torch.max(outputs.data, 1)
    
    print(f'Raw output: {outputs}')
    predicted_class_index = predicted.item()
    probability = probabilities[0, predicted_class_index].item()
    print(f'Probability: {probability:.2f}')
    class_label = ClassLabel.get_label(predicted_class_index)
    return class_label

Any help would be greatly appreciated, I sadly cannot provide the dataset due to legal constraints, but I hope the code provides enough detail.

I think the problem can be the way you are saving the model. It seems like you are saving just the model and not the weights.
Try this way saving the state dictionary:

torch.save(obj=model.state_dict(),
           f='models/model.pt)

Give it a try and let me know.

Hello Eduardo, thank you for the reply. I have tried to use the state dictionary/weights before with no luck, but was not verbose with obj=model.state_dict().

I have tried using the state dictionary as defined in your reply. Here is the code i have used for saving/loading.

torch.save(obj=model.state_dict(), f='models/model.pt')
model_path = "models/model.pth"

num_classes = 2
model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, num_classes)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.load_state_dict(torch.load(model_path))
model.eval()

Same result, performs well (>95%) on both validation and test, but still performs poor when saved to file, loaded, and inference images from the train-validate-test dataset.

1 Like

Maybe the error is in the path/type you are using to load? You are saving in

'models/model.pt'

but loading with:

"models/model.pth"

There is a difference, the saved one is .pt and the loaded one is .pth

Check if a static input (e.g. torch.ones) is producing approximately the same outputs before and after saving the model. If so, check the data processing next.

Sorry Eduardo, I have tried using the correct filenames, with no sucess, I decided I needed to try something different and used a faster rcnn model and re-worked the dataset to have bounding boxes, and have achieved ~99% accuracy on inference after saving and loading the model.

I have reworked my dataset to have labeled bounding boxes and switched to a faster rcnn model, this has worked well with ~99% accuracy. I will call this sorted, thank you anyways.

1 Like