I am trying to perform facial recognition on a custom dataset of 48 classes with over 5000 images, using ResNet-50.
In the code below, when I do NOT use transforms to create the image datasets, I achieve 99% accuracy on the training and validation sets. After training, when I use the predict_image
function below to classify a handful of images, I always seem to get the same label. Does anyone know what the reason for this could be? Furthermore, if I do use transforms to create the datasets, I achieve accuracies on the order of 0.04, and I do not understand why. I would appreciate it if anyone could help me.
from torchvision import datasets, transforms, models
import torch
from torch.utils.data import DataLoader, Dataset
import torchvision
from torch import Tensor
from torch.autograd import Variable
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
import numpy as np
import os
import cv2
from PIL import Image
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
#from sklearn.neural_network import MLPClassifier
from untilities import create_directory
from dataset_loader import DatasetLoader
from face_extractor import FaceExtracter
from data_augmentation import DataAugmentation
# --- Load and prepare the face dataset ---------------------------------------
# Paths are resolved relative to the parent of the current working directory;
# this assumes the script is launched from a sub-directory of the project root.
base_dir = os.path.dirname(os.getcwd())
resources_absolute_path = os.path.join(base_dir, 'resources')
image_dir = os.path.join(resources_absolute_path, 'samples')
model_outputs_dir = os.path.join(resources_absolute_path,'model_outputs')
if(not os.path.exists(model_outputs_dir)):
    create_directory(model_outputs_dir)
# Directory of labelled face crops, organised one sub-folder per identity.
labelled_dir = os.path.join(resources_absolute_path,'dataset', 'labelled3')
# Unlabelled directory for extracted faces from group images/videos.
unlabelled_dir = os.path.join(resources_absolute_path, 'dataset','unlabelled')
# Pre-processing: detect/extract faces and augment only on the first run,
# i.e. when the 'dataset' directory does not exist yet.
if(not os.path.exists(os.path.join(resources_absolute_path, 'dataset'))):
    extracter = FaceExtracter(image_dir,resources_absolute_path)
    extracter.extract()
    augmenter = DataAugmentation()
    augmenter.augment(labelled_dir)
    #augmenter.remove_augmented_files(labelled_dir)
# Load the extracted face images and their string labels.
# NOTE(review): the element type/shape of `images` is defined by the project's
# DatasetLoader — presumably HxWxC arrays from cv2 (BGR, 0-255); confirm,
# because it determines how the tensors must be permuted/scaled downstream.
dataset = DatasetLoader(labelled_dir)
images,labels = dataset.load()
images_array = np.asarray(images)
labels_array = np.asarray(labels)
#del images,labels
labelencoder = LabelEncoder()
# Hold out 20% for validation; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified — with 48 classes some identities
# may be under-represented in validation; consider stratify=labels_array.
X_train, X_val, y_train, y_val = train_test_split(images_array, labels_array, test_size=0.2, random_state=1)
del images_array,labels_array
# Label-encode the string identities to integer class ids; the encoder is
# fitted on training labels only and reused for validation and prediction.
y_train_encoded = labelencoder.fit_transform(y_train)
y_val_encoded = labelencoder.transform(y_val)
########################################pytorch ########################################
import torchvision.transforms.functional as TF
class CustomTensorDataset(Dataset):
    """Dataset wrapping an (inputs, targets) tensor pair with optional
    torchvision-style transforms applied per sample.

    When a transform is given, each raw input tensor is first converted to a
    PIL image and then passed through the transform pipeline, which is
    expected to return whatever the model consumes (typically a tensor).
    """

    def __init__(self, tensors, transform=None):
        # tensors: (inputs, targets), both indexed along dimension 0.
        self.tensors = tensors
        self.transform = transform

    def __getitem__(self, index):
        inputs, targets = self.tensors
        sample = inputs[index]
        if self.transform:
            # Convert to PIL first so torchvision transforms can be applied.
            sample = self.transform(TF.to_pil_image(sample))
        return sample, targets[index]

    def __len__(self):
        # Number of samples = size of the leading dimension of the inputs.
        return self.tensors[0].size(0)
# Transform pipelines: random augmentation for training, deterministic
# resize + centre-crop for validation; both normalise with the ImageNet
# statistics that the pretrained ResNet-50 backbone expects.
data_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomRotation(degrees=15),
        transforms.ColorJitter(),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}
# NOTE(review): if X_* come straight from cv2/numpy, Tensor(X_train) is laid
# out HxWxC with 0-255 values, but TF.to_pil_image in CustomTensorDataset
# expects CxHxW (and [0,1] values for float tensors) — the PIL images fed to
# these transforms would then be scrambled, which would explain the ~0.04
# accuracy seen when transforms are enabled. Confirm the array layout and
# permute/scale before wrapping the tensors in the dataset.
image_datasets = {'train':CustomTensorDataset(tensors=(Tensor(X_train), Tensor(y_train_encoded)), transform=data_transforms['train']),
                  'val':CustomTensorDataset(tensors=(Tensor(X_val), Tensor(y_val_encoded)), transform=data_transforms['val']) }
dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=5,shuffle=True,num_workers=0)
               for x in ['train', 'val']}
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
###Visualising Dataset
import numpy as np
def imshow(image, ax=None, title=None, normalize=True):
    """Display a channel-first image tensor on a matplotlib axis.

    Parameters:
        image: torch.Tensor in (C, H, W) layout.
        ax: matplotlib axis to draw on; a new figure/axis is created if None.
        title: unused, kept for backward compatibility.
        normalize: when True, undo the ImageNet mean/std normalisation so the
            image displays with natural colours.

    Returns:
        The matplotlib axis the image was drawn on.
    """
    if ax is None:
        fig, ax = plt.subplots()
    # Convert (C, H, W) -> (H, W, C) for matplotlib.
    # BUG FIX: the previous `image.permute(0, 2, 1)` swapped H and W before
    # the transpose, so every image was displayed transposed (flipped about
    # the diagonal); a single transpose is all that is needed.
    image = image.numpy().transpose((1, 2, 0))
    if normalize:
        mean = np.array([0.485, 0.456, 0.406])
        std = np.array([0.229, 0.224, 0.225])
        image = std * image + mean
        image = np.clip(image, 0, 1)
    ax.imshow(image)
    # Hide the frame and ticks so only the image itself is shown.
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.spines['bottom'].set_visible(False)
    ax.tick_params(axis='both', length=0)
    ax.set_xticklabels('')
    ax.set_yticklabels('')
    return ax
# --- Quick visual sanity check of one validation batch -----------------------
train_iter = iter(dataloaders['val'])
# BUG FIX: iterator objects in Python 3 (and modern DataLoader iterators)
# have no `.next()` method; use the built-in next() instead.
images, labels = next(train_iter)
fig, axes = plt.subplots(figsize=(12, 12), ncols=5)
print('training images')
for i in range(5):
    axe1 = axes[i]
    imshow(images[i], ax=axe1, normalize=False)
print(images[0].size())
# Dead code kept for reference: per-channel mean/std computation over the
# training loader (it also uses the removed `.next()` iterator API).
"""mean = 0.
std = 0.
nb_samples = len(X_train)
for data,_ in dataloaders['train']:
    batch_samples = data.size(0)
    data = data.view(batch_samples, data.size(1), -1)
    mean += data.mean(2).sum(0)
    std += data.std(2).sum(0)
mean /= nb_samples
std /= nb_samples
dataiter = iter(dataloaders['train'])
i, l = dataiter.next()"""
# Train on the first GPU when available, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
import time
from torch import optim
from torch.optim import lr_scheduler
import copy
def train_model(model, criterion, optimizer, scheduler = None, num_epochs=25):
    """Train `model` and return it loaded with the best-validation weights.

    Runs `num_epochs` epochs, each consisting of a 'train' phase (gradients
    enabled, optimizer steps) followed by a 'val' phase (gradients disabled).
    Relies on the module-level `dataloaders`, `dataset_sizes` and `device`.

    Parameters:
        model: network to optimise (expected to already be on `device`).
        criterion: loss function; invoked as criterion(outputs, labels.long()).
        optimizer: optimiser over model.parameters().
        scheduler: optional LR scheduler, stepped once per epoch immediately
            after the training phase.
        num_epochs: number of full passes over the data.

    Returns:
        The model with the state dict that achieved the best 'val' accuracy.
    """
    since = time.time()
    # Snapshot the initial weights so there is always a "best" to restore.
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                # eval() disables dropout and switches batch-norm to
                # running statistics.
                model.eval()  # Set model to evaluate mode
            running_loss = 0.0
            running_corrects = 0
            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)
                #inputs = inputs.permute(0,3,1,2)
                # zero the parameter gradients
                optimizer.zero_grad()
                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    # labels arrive as float tensors (see Tensor(y_*_encoded)),
                    # hence the .long() cast for the loss.
                    loss = criterion(outputs, labels.long())
                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                # statistics (loss is averaged per batch, so re-weight by
                # the batch size before accumulating)
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.long().data)
            if phase == 'train' and scheduler:
                scheduler.step()
            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))
            # deep copy the model whenever validation accuracy improves
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
        print()
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))
    # load best model weights
    model.load_state_dict(best_model_wts)
    return model
# --- Build and train the ResNet-50 classifier --------------------------------
model_ft = models.resnet50(pretrained=True)
num_ftrs = model_ft.fc.in_features
# Replace the ImageNet head with a small classifier over the face identities
# (one output per class in y_train). The head ends in LogSoftmax, so the
# matching loss below is NLLLoss.
model_ft.fc = nn.Sequential(
    nn.Linear(num_ftrs, 256),
    nn.ReLU(),
    nn.Dropout(0.4),
    nn.Linear(256, len(set(y_train))),
    nn.LogSoftmax(dim=1))
model_ft = model_ft.to(device)
#model_ft.load_state_dict(torch.load(os.path.join(model_outputs_dir,'resnet2.pt')))
#model_vg.load_state_dict(torch.load(os.path.join(model_outputs_dir,'vgg162.pt')))
# BUG FIX: the model already outputs log-probabilities via LogSoftmax, but
# CrossEntropyLoss applies log_softmax internally, so it was being applied
# twice, flattening the loss surface and hurting training. NLLLoss is the
# correct pairing with a LogSoftmax output layer.
criterion = nn.NLLLoss()
# NOTE(review): this Adam optimizer is created but never passed to
# train_model (the SGD optimizer below is); kept only in case other code
# references the module-level name `optimizer`.
optimizer = torch.optim.Adam(model_ft.parameters(), lr = 0.0001)  # lr: learning rate
# Observe that all parameters are being optimized
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.0001, momentum=0.9)
# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)
model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, 5)
torch.save(model_ft.state_dict(), os.path.join(model_outputs_dir,'resnet2.pt'))
# Sample image used for the prediction sanity check below.
image = Image.open(os.path.join(labelled_dir,'09','IMG_6884.JPG'))
def predict_image(image, model):
    """Predict the encoded class index for a single PIL image.

    BUG FIX: inference must preprocess exactly like the validation pipeline
    (resize, centre-crop, ImageNet normalisation). The previous version
    skipped Normalize and converted to 3-channel grayscale — neither of
    which was done during training — so every input landed in the same
    region of feature space and the model returned the same label for all
    images.

    Parameters:
        image: PIL.Image (RGB) to classify.
        model: trained classifier producing per-class (log-)scores.

    Returns:
        int: argmax class index, suitable for labelencoder.inverse_transform.
    """
    model.eval()
    # Mirror the 'val' pipeline used when the model was trained.
    test_transforms = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_batch = test_transforms(image).float().unsqueeze(0).to(device)
    # Variable is deprecated; torch.no_grad() avoids building the autograd
    # graph during inference.
    with torch.no_grad():
        output = model(input_batch)
    return int(output.argmax(dim=1).item())
# Map the predicted integer class id back to the original identity label.
print(labelencoder.inverse_transform([predict_image(image, model_ft)]))