Resnet Model always predicting same label

I am trying to successfully attempt facial recognition on a custom dataset of 48 classes with over 5000 images using Resnet50.
In the code below when I do NOT use transforms to create the image datasets, then i achieve 99% accuracy on training and validation set. After training, when I attempt to use the predict_image function below in order to classify a handful of images, i always seem to get the same label. Would you know what the reason for this is? Furthermore, if do consider using transform to create the datasets, i achieve accuracies in the order of 0.04. I do not know why this is? I would appreciate if any one could help me?

from torchvision import datasets, transforms, models
import torch
from torch.utils.data import DataLoader, Dataset
import torchvision

from torch import Tensor
from torch.autograd import Variable

import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
import numpy as np
import os
import cv2
from PIL import Image

from matplotlib import pyplot as plt


from sklearn.model_selection import train_test_split

#from sklearn.neural_network import MLPClassifier



from untilities import create_directory


from dataset_loader import DatasetLoader
from face_extractor import FaceExtracter
from data_augmentation import DataAugmentation



# loading dataset
base_dir = os.path.dirname(os.getcwd())
resources_absolute_path = os.path.join(base_dir, 'resources')
image_dir = os.path.join(resources_absolute_path, 'samples')

model_outputs_dir = os.path.join(resources_absolute_path,'model_outputs')

if(not os.path.exists(model_outputs_dir)):
    create_directory(model_outputs_dir)

# Create directory 'dataset' if it does not exist
labelled_dir = os.path.join(resources_absolute_path,'dataset', 'labelled3')
        
#unlabelled directory for exracted faces from group images/videos
unlabelled_dir = os.path.join(resources_absolute_path, 'dataset','unlabelled')         

##Pre-processing, detecting and extracting faces
if(not os.path.exists(os.path.join(resources_absolute_path, 'dataset'))):
    extracter = FaceExtracter(image_dir,resources_absolute_path)
    extracter.extract()
    augmenter = DataAugmentation()
    augmenter.augment(labelled_dir)
    #augmenter.remove_augmented_files(labelled_dir)


#loading extracted faces and labels
dataset = DatasetLoader(labelled_dir)
images,labels = dataset.load()

images_array = np.asarray(images)
labels_array = np.asarray(labels)

#del images,labels
labelencoder = LabelEncoder()

X_train, X_val, y_train, y_val = train_test_split(images_array, labels_array, test_size=0.2, random_state=1)
del images_array,labels_array

##label encode ytrainn and ytest
y_train_encoded = labelencoder.fit_transform(y_train)
y_val_encoded = labelencoder.transform(y_val)




########################################pytorch ########################################


import torchvision.transforms.functional as TF
class CustomTensorDataset(Dataset):
    """TensorDataset with support of transforms.
    """
    def __init__(self, tensors, transform=None):
        #assert all(tensors[0].size(0) == tensor.size(0) for tensor in tensors)
        self.tensors = tensors
        self.transform = transform

    def __getitem__(self, index):
        x = self.tensors[0][index]

        if self.transform:
            x = TF.to_pil_image(x)
            x = self.transform(x)

        y = self.tensors[1][index]

        return x, y

    def __len__(self):
        return self.tensors[0].size(0)
    
data_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomRotation(degrees=15),
        transforms.ColorJitter(),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

#image_datasets = {x: CustomTensorDataset(tensors=(eval("X_{}".format(x)), eval("y_{}".format(x)), transform=data_transforms[x] for x in ['train', 'val']}


image_datasets = {'train':CustomTensorDataset(tensors=(Tensor(X_train), Tensor(y_train_encoded)), transform=data_transforms['train']),
                  'val':CustomTensorDataset(tensors=(Tensor(X_val), Tensor(y_val_encoded)), transform=data_transforms['val']) }

dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=5,shuffle=True,num_workers=0)
              for x in ['train', 'val']}

dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}


###Visualising Dataset


import numpy as np

def imshow(image, ax=None, title=None, normalize=True):
    if ax is None:
        fig, ax = plt.subplots()
    image = image.permute(0,2,1)
    image = image.numpy().transpose((1, 2, 0))
    
    if normalize:
        mean = np.array([0.485, 0.456, 0.406])
        std = np.array([0.229, 0.224, 0.225])
        image = std * image + mean
        image = np.clip(image, 0, 1)

    ax.imshow(image)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['bottom'].set_visible(False)
    ax.tick_params(axis='both', length=0)
    ax.set_xticklabels('')
    ax.set_yticklabels('')

    return ax

train_iter = iter(dataloaders['val'])
images, labels = train_iter.next()

fig, axes = plt.subplots(figsize=(12, 12), ncols=5)
print('training images')
for i in range(5):
    axe1 = axes[i] 
    imshow(images[i], ax=axe1, normalize=False)

print(images[0].size())



"""mean = 0.
std = 0.
nb_samples = len(X_train)
for data,_ in dataloaders['train']:
    batch_samples = data.size(0)
    data = data.view(batch_samples, data.size(1), -1)
    mean += data.mean(2).sum(0)
    std += data.std(2).sum(0)
mean /= nb_samples
std /= nb_samples

dataiter = iter(dataloaders['train'])
i, l = dataiter.next()"""


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
import time
from torch import optim
from torch.optim import lr_scheduler
import copy
def train_model(model, criterion, optimizer, scheduler = None, num_epochs=25):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                # it-disables-dropout
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                #inputs = inputs.permute(0,3,1,2)
                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels.long())

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.long().data)
            if phase == 'train' and scheduler:
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model



model_ft = models.resnet50(pretrained=True)

num_ftrs = model_ft.fc.in_features


# Here the size of each output sample is set to 48.
# Alternatively, it can be generalized to nn.Linear(num_ftrs, len(class_names)).
#model_ft.fc = nn.Linear(num_ftrs, len(set(y_train)))

####RESNET Classifier ################
model_ft.fc = nn.Sequential(
                      nn.Linear(num_ftrs, 256), 
                      nn.ReLU(), 
                      nn.Dropout(0.4),
                      nn.Linear(256, len(set(y_train))),                   
                      nn.LogSoftmax(dim=1))
model_ft = model_ft.to(device)
#model_ft.load_state_dict(torch.load(os.path.join(model_outputs_dir,'resnet2.pt')))


#model_vg.load_state_dict(torch.load(os.path.join(model_outputs_dir,'vgg162.pt')))
criterion = nn.CrossEntropyLoss()

#Optimization algorithm
optimizer = torch.optim.Adam(model_ft.parameters(), lr = 0.0001)# lr: learning rate

# Observe that all parameters are being optimized
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.0001, momentum=0.9)

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

model_ft = train_model(model_ft, criterion, optimizer_ft,exp_lr_scheduler,5)

torch.save(model_ft.state_dict(), os.path.join(model_outputs_dir,'resnet2.pt'))




image = Image.open(os.path.join(labelled_dir,'09','IMG_6884.JPG'))

def predict_image(image,model):
    #modelX=torch.load('trained_models/facial_.pth')
    model.eval()

    
    test_transforms = transforms.Compose([transforms.Resize([224,224]),
                          transforms.Grayscale(num_output_channels=3),
                           transforms.ToTensor(),
                           ])

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    image_tensor = test_transforms(image).float()
    image_tensor = image_tensor.unsqueeze_(0)
    input = Variable(image_tensor)
    input = input.to(device)
    output = model(input)
    index = output.data.cpu().numpy().argmax()
    return index

print(labelencoder.inverse_transform([predict_image(image, model_ft)]))

Multiple things here. The default values for some of the transforms are really bad. I’d look at removing RandomResizedCrop or setting appropriate values for each default parameter, especially scale. Rotations of 15 degrees also seem excessive.

Then, in predict_image, you are removing the color from input (unsure if your data is already grayscale), but more importantly, ToTensor() will divide values by 255, which might not be accounted for when you train without applying the transforms.

Hope that helps.

Hi @futscdav

Thank you for your suggestions. I have removed RandomResizedCrop and set the rotation to 5 degrees. The data i have are face images 224x224 in color, so i have removed the grayscale transform within predict_image.I have re-run the training and observe accuracies of 0.30 for training and validation for 10 epochs. I want improve this upto 0.9. What can you recommend? When i perform a prediction on a number of images, i am still experiencing same indexes/labels being outputted.Any other suggestions please?

Thank you in advance.