Hello,
I am training a PyTorch model for sign language classification with 29 classes, using torchvision for augmentation. For weeks I have been trying to train the model, but the loss keeps hovering around its starting value and the accuracy stays where it started (no better than picking a label at random). I first tried smaller models, then bigger ones, and now PyTorch's built-in models, all of which give the same result. Based on what I read online I also tried weight decay and different hyperparameters, none of which seem to help.
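To isolate the training loop itself, a quick check would be whether the model can overfit a single batch. Below is a minimal sketch of what I mean (it reuses model, criterion, train_dtldr and device from the code further down; lr=1e-3 here is just a typical Adam value, not the one I train with):

xb, yb = next(iter(train_dtldr))
xb, yb = xb.to(device), yb.to(device)
overfit_opt = optim.Adam(model.parameters(), lr=1e-3)
for i in range(200):
    overfit_opt.zero_grad()
    loss = criterion(model(xb), yb)  # a healthy setup should drive this toward 0
    loss.backward()
    overfit_opt.step()
    if i % 50 == 0:
        print(i, loss.item())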
Here is the code:
!unzip /content/drive/MyDrive/asl.zip -d /content/data
from torch.utils.data import Dataset, DataLoader
import torch
import torchvision.transforms as T
import cv2 as cv
import glob
import numpy as np
import random
import os
from PIL import Image
import torchvision.models as models
os.rename('/content/data/asl_alphabet_train/asl_alphabet_train', '/content/data/train')
os.rename('/content/data/asl_alphabet_test/asl_alphabet_test', '/content/data/test')
os.removedirs('/content/data/asl_alphabet_train')
os.removedirs('/content/data/asl_alphabet_test')
train_transf = T.Compose(
[
T.GaussianBlur(9),
T.RandomRotation((0, 5)),
T.RandomPerspective(),
T.RandomHorizontalFlip(),
T.RandomVerticalFlip(),
T.ToTensor(),
T.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
]
)
test_transf = T.Compose(
[
T.GaussianBlur(9),
T.RandomRotation((0, 180)),
T.RandomHorizontalFlip(),
T.RandomVerticalFlip(),
        T.ToTensor(),  # ToTensor must come before Normalize, which expects a tensor
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
)
train_data_path ='/content/data/train'
test_data_path ='/content/data/test'
train_img_paths = []
for path in glob.glob(train_data_path + '/*'):
    for img_path in glob.glob(path + '/*'):
        train_img_paths.append(img_path)
idx_to_class = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
'del', 'nothing', 'space']
class_to_idx = {cls: idx for idx, cls in enumerate(idx_to_class)}  # e.g. 'A' -> 0
class DS(Dataset):
    def __init__(self, image_paths, transforms):
        self.image_paths = image_paths
        self.transforms = transforms

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        filename = self.image_paths[idx]
        try:
            image = Image.open(filename)
        except OSError:
            # unreadable file: fall back to the next image, wrapping at the end of the list
            filename = self.image_paths[(idx + 1) % len(self.image_paths)]
            image = Image.open(filename)
        # the class name is the parent directory, e.g. .../train/A/A1.jpg -> 'A'
        label = class_to_idx[filename.split('/')[-2]]
        if self.transforms is not None:
            image = self.transforms(image)
        return image, label
import matplotlib.pyplot as plt
import copy
%matplotlib inline
def vis_augment(dataset, index=0, samples=10, cols=5, rand_img=False):
    dataset = copy.deepcopy(dataset)
    # visualise without ToTensor/Normalize so imshow receives PIL images
    dataset.transforms = T.Compose(
        [
            T.GaussianBlur(9),
            T.RandomRotation((0, 10)),
            T.RandomHorizontalFlip(),
            T.RandomVerticalFlip(),
        ]
    )
    rows = samples // cols
    fig, ax = plt.subplots(nrows=rows, ncols=cols, figsize=(12, 8))
    for i in range(samples):
        # without this else-branch, idx was undefined whenever rand_img was False
        idx = np.random.randint(0, len(dataset)) if rand_img else index
        image, lab = dataset[idx]
        ax.ravel()[i].imshow(image)
        ax.ravel()[i].set_axis_off()
        ax.ravel()[i].set_title(idx_to_class[lab])
    plt.tight_layout(pad=1)
    plt.show()
train_ds = DS(train_img_paths, train_transf)
vis_augment(train_ds, rand_img=True)
lr = 0.1
weight_decay = 0.1
batch_size = 1
num_epochs = 5
num_classes = 29
train_dtldr = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
class CNN(nn.Module):
    def __init__(self, input_size=(200, 200), num_classes=29):
        super(CNN, self).__init__()
        # SqueezeNet backbone, trained from scratch (random init, no pretrained weights)
        self.backbone = models.SqueezeNet(num_classes=num_classes)
        # SqueezeNet's final classifier: a 1x1 conv mapping 512 channels to num_classes
        self.backbone.classifier[1] = nn.Conv2d(512, num_classes, kernel_size=(1, 1), stride=(1, 1))

    def forward(self, x):
        return self.backbone(x)
model = CNN((200, 200), 29).to(device)
criterion = nn.CrossEntropyLoss()
adam = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
total_right = 0
step = 1
model.train()
for i in range(num_epochs):
    for x, y in train_dtldr:
        x = x.to(device)
        y = y.to(device)
        adam.zero_grad()
        scores = model(x)
        loss = criterion(scores, y)
        # count correct predictions in the batch: argmax over the class dimension
        total_right += int(torch.sum(torch.argmax(scores, dim=1) == y))
        if step % 100 == 0:
            print(f'epoch:{i}\tstep:{step}')
            print(scores.data, y)
            print(f'-------------\npred = {torch.argmax(scores, dim=1)}\nreal = {y}\nloss = {loss.item()}\ntotal_right = {total_right} accuracy = {(total_right / (step * batch_size)) * 100.0}%')
        loss.backward()
        adam.step()
        step += 1