Loss and accuracy are not changing significantly in my model

I’m very new to PyTorch. I have built a basic CNN model to recognize the TMNIST characters. I have tried many things, but my loss and accuracy are not changing or improving. Can someone please help?
Below is my code.

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
import pandas as pd
from sklearn import preprocessing


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 8, 3)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(8, 16, 3)
        self.fc1 = nn.Linear(400, 128)
        self.fc2 = nn.Linear(128, 94)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 400)
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.fc2(x)
        return x
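# shape check: a 28x28 input gives 26x26 after conv1 (3x3), 13x13 after pooling,
# 11x11 after conv2 (3x3) and 5x5 after the second pooling, so the flattened
# feature size is 16 * 5 * 5 = 400, which matches fc1's in_features.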

model = Net()

class traindataset(Dataset):
    def __init__(self, data, train_end_idx, augmentation=None):
        '''
        data: pandas dataframe with the encoded labels in column 1 and the
        flattened 784-pixel images in columns 2 onwards
        '''
        self.data = data
        self.augmentation = augmentation
        self.train_end = train_end_idx
        self.target = self.data.iloc[:self.train_end, 1].values
        self.image = self.data.iloc[:self.train_end, 2:].values  # full training split

    def __len__(self):
        return len(self.target)

    def __getitem__(self, idx):
        ima = self.image[idx].reshape(1, 784)  # only the selected sample
        if self.augmentation is not None:
            ima = self.augmentation(ima)
        return torch.tensor(self.target[idx]), torch.tensor(ima)
class valdataset(Dataset):
    def __init__(self, data, val_start_idx, val_end_idx, augmentation=None):
        self.data = data
        self.augmentation = augmentation
        self.val_start_idx = val_start_idx
        self.val_end_idx = val_end_idx
        self.target = self.data.iloc[self.val_start_idx:self.val_end_idx, 1].values
        self.image = self.data.iloc[self.val_start_idx:self.val_end_idx, 2:].values

    def __len__(self):
        return len(self.target)

    def __getitem__(self, idx):
        ima = self.image[idx].reshape(1, 784)
        if self.augmentation is not None:
            ima = self.augmentation(ima)
        return torch.tensor(self.target[idx]), torch.tensor(ima)

error = nn.CrossEntropyLoss()
learning_rate = 0.001
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# data is stored in a csv file of shape (10000, 786); unnecessary columns are removed
# and the labels are encoded. The image pixels are stored in col1, ..., col784 and are
# later reshaped to (28, 28).
data = pd.read_csv(path, nrows=10000)
le = preprocessing.LabelEncoder()
data.labels = le.fit_transform(data.labels)
  
trainds=traindataset(data,8000)
train_loader=DataLoader(trainds,batch_size=64,shuffle=False)
valds=valdataset(data,8001,10000)
val_loader=DataLoader(valds,batch_size=64,shuffle=False)
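# quick sanity check of one batch (sketch): the dataset returns (label, image),
# so labels should have shape [64] and images shape [64, 1, 784] here
labels, images = next(iter(train_loader))
print(labels.shape, images.shape)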

#training and val loop
num_epochs=20
for epoch in range(num_epochs):
    correct = 0
    total = 0
    train_loss=0
    train_acc=0
    test_acc=0
    for i, (labels, images) in enumerate(train_loader):
        images = images / 255.0
        train = Variable(images.view(-1, 1, 28, 28))
        labels = Variable(labels)
        model.train()

        optimizer.zero_grad()
        outputs = model(train)
        loss = error(outputs, labels)
        loss.backward()
        optimizer.step()

        pred = torch.max(outputs.data, 1)[1]
        train_loss += loss.item()
        train_acc += (pred == labels).sum()
        
    avg_tl=train_loss/len(train_loader)
    avg_ta=train_acc/len(trainds)
    #print(epoch,"------>","train loss---->",train_loss,"averagetl----->",avg_tl)
    print(epoch,"------>","averagetl----->",avg_tl,"avg trainacc---->",avg_ta)
    
    for labels,images in val_loader:
        images=images/255.0
        test = Variable(images.view(-1,1,28,28))
        model.eval()
        
        outputs = model(test)
        
        predicted = torch.max(outputs.data, 1)[1]
        test_acc += (predicted == labels).sum()
    avgtest_acc=test_acc/len(valds)
    print("val_accuracy----->",avgtest_acc)

You can get the full runnable code here; you can even edit and run it without downloading any dataset: Test_tmnist_pytorch | Kaggle
Here is the log:

0 ------> averagetl-----> 4.545530921936035 avg trainacc----> tensor(0.0155)
val_accuracy-----> tensor(0.0235)
1 ------> averagetl-----> 4.545183544158935 avg trainacc----> tensor(0.0156)
val_accuracy-----> tensor(0.0235)
2 ------> averagetl-----> 4.544704376220703 avg trainacc----> tensor(0.0166)
val_accuracy-----> tensor(0.0235)
3 ------> averagetl-----> 4.544133281707763 avg trainacc----> tensor(0.0156)
val_accuracy-----> tensor(0.0235)
4 ------> averagetl-----> 4.544296829223633 avg trainacc----> tensor(0.0159)
val_accuracy-----> tensor(0.0235)
5 ------> averagetl-----> 4.543637203216552 avg trainacc----> tensor(0.0166)
val_accuracy-----> tensor(0.0235)
6 ------> averagetl-----> 4.543308197021484 avg trainacc----> tensor(0.0166)
val_accuracy-----> tensor(0.0235)
7 ------> averagetl-----> 4.543105251312256 avg trainacc----> tensor(0.0172)
val_accuracy-----> tensor(0.0235)
8 ------> averagetl-----> 4.542947734832763 avg trainacc----> tensor(0.0161)
val_accuracy-----> tensor(0.0225)
9 ------> averagetl-----> 4.542420799255371 avg trainacc----> tensor(0.0169)
val_accuracy-----> tensor(0.0225)
10 ------> averagetl-----> 4.54195539855957 avg trainacc----> tensor(0.0168)
val_accuracy-----> tensor(0.0225)
11 ------> averagetl-----> 4.541777168273926 avg trainacc----> tensor(0.0168)
val_accuracy-----> tensor(0.0220)
12 ------> averagetl-----> 4.541561840057373 avg trainacc----> tensor(0.0172)
val_accuracy-----> tensor(0.0215)
13 ------> averagetl-----> 4.541049587249756 avg trainacc----> tensor(0.0178)
val_accuracy-----> tensor(0.0220)
14 ------> averagetl-----> 4.540925842285156 avg trainacc----> tensor(0.0184)
val_accuracy-----> tensor(0.0220)
15 ------> averagetl-----> 4.54046138381958 avg trainacc----> tensor(0.0164)
val_accuracy-----> tensor(0.0220)
16 ------> averagetl-----> 4.540088047027588 avg trainacc----> tensor(0.0170)
val_accuracy-----> tensor(0.0215)
17 ------> averagetl-----> 4.540048648834229 avg trainacc----> tensor(0.0165)
val_accuracy-----> tensor(0.0215)
18 ------> averagetl-----> 4.539735778808594 avg trainacc----> tensor(0.0181)
val_accuracy-----> tensor(0.0215)
19 ------> averagetl-----> 4.5394417114257815 avg trainacc----> tensor(0.0170)
val_accuracy-----> tensor(0.0215)

I can’t see any obvious issues in your code besides the usage of the deprecated Variable and the unsupported usage of the .data attribute, so you should remove them.
Once this is done, try to overfit a small dataset, e.g. just 10 samples by playing around with some hyperparameters. Once your model is able to overfit this dataset, try to scale up the use case again.
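For example, a minimal overfitting check could look roughly like this (just a sketch, reusing the Net, traindataset, and CrossEntropyLoss setup from the post above; Adam and the epoch count are only one choice of hyperparameters to play with):

from torch.utils.data import Subset, DataLoader

tiny_ds = Subset(trainds, range(10))                  # just 10 training samples
tiny_loader = DataLoader(tiny_ds, batch_size=10, shuffle=True)

model = Net()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

model.train()
for epoch in range(200):
    for labels, images in tiny_loader:
        # plain tensors instead of Variable
        images = (images.float() / 255.0).view(-1, 1, 28, 28)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    if epoch % 50 == 0:
        with torch.no_grad():                         # no .data attribute, just no_grad + argmax
            acc = (outputs.argmax(dim=1) == labels).float().mean().item()
        print(epoch, loss.item(), acc)

# If the loss does not drop towards 0 and the accuracy towards 1.0 on these
# 10 samples, the problem is in the model/training code rather than in the data.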

Thank you for the suggestions @ptrblck. I tried the same code with a few tweaks and it worked fine, so I applied augmentation using the torchvision transforms, and the model started doing worse; the accuracy is back to the original state where it is not improving or changing. So I started debugging, and one interesting thing I noticed along the way is that after applying the augmentation the images look completely distorted. I’m attaching the before and after augmentation pictures as well as the augmentation code. Can you please help me figure out where I’m going wrong?

Augmentation

torchvision_transform = transforms.Compose([
    np.uint8,
    transforms.ToPILImage(),
    transforms.Resize((28,28)),
    transforms.RandomRotation([45,135]),
    #transforms.RandomHorizontalFlip(),
    transforms.ToTensor()
    ])

Code used to display the augmented image:

transformed=torchvision_transform(x)
plt.imshow(transformed.squeeze().numpy(), interpolation='nearest')
plt.show()

Code used to display the image without augmentation:

x=data.iloc[:1,2:].values
plt.imshow(x.reshape(28,28), interpolation='nearest')
plt.show()

without_augmentation1
with_augmentation1
The first image is without augmentation and the second is with augmentation.

The transformations look reasonable in my setup:

import numpy as np
import matplotlib.pyplot as plt
from torchvision import datasets, transforms

torchvision_transform = transforms.Compose([
    np.uint8,
    transforms.ToPILImage(),
    transforms.Resize((28, 28)),
    transforms.RandomRotation([45, 135]),
    transforms.ToTensor()
])

dataset_trans = datasets.MNIST(root="./data", download=False, transform=torchvision_transform)
dataset = datasets.MNIST(root="./data", download=False, transform=transforms.ToTensor())

x, y = dataset[0]
plt.imshow(x[0].numpy())

x, y = dataset_trans[0]
plt.imshow(x[0].numpy())

No augmentation:
[image]

Augmentation:
[image]

@ptrblck I have tried Albumentations instead of the torchvision transforms and it’s working fine / as expected. I’m wondering what the issue with torchvision.transforms is.
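For reference, the Albumentations pipeline I used is roughly along these lines (a sketch; the rotation range mirrors the torchvision version, and note that ToTensorV2 does not rescale to [0, 1] the way transforms.ToTensor() does):

import numpy as np
import albumentations as A
from albumentations.pytorch import ToTensorV2

albu_transform = A.Compose([
    A.Rotate(limit=(45, 135), p=1.0),   # same rotation range as the torchvision transform
    ToTensorV2(),                       # HWC uint8 numpy array -> CHW tensor, no /255 scaling
])

img = data.iloc[0, 2:].values.reshape(28, 28, 1).astype(np.uint8)
augmented = albu_transform(image=img)["image"]       # tensor of shape (1, 28, 28)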

I don’t know, as I’m unable to reproduce the issue.
Are you seeing unexpected results using my code snippet, or do you see any difference that could explain why it works as expected in my setup?

@ptrblck I have copied your augmentation code and used it as-is, but no luck; I’m getting the same result as before. The only difference between my code and yours is that I use a different dataset, which is very similar to MNIST but has more labels, that’s all.
You can edit the code here.

@ptrblck I have reshaped the array to 28x28 before applying the augmentation, and the augmentation is working fine now. The interesting thing, though, is that after applying the augmentation the model performs worse and the loss is not decreasing at all. I tried the same augmentation parameters with Albumentations and the model does really well with them; even without augmentation the model does well. Check the same link for the code.
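In case it helps anyone else, the reshaping change was roughly this in __getitem__ (a sketch following the dataset classes above):

def __getitem__(self, idx):
    # reshape the flat 784-pixel row to a 28x28 uint8 image *before* ToPILImage,
    # so the transform sees a real 2D image instead of a 1x784 strip
    img = self.image[idx].reshape(28, 28).astype(np.uint8)
    if self.augmentation is not None:
        img = self.augmentation(img)     # ToTensor() returns a (1, 28, 28) float tensor in [0, 1]
    else:
        img = torch.tensor(img, dtype=torch.float32).unsqueeze(0)
    return torch.tensor(self.target[idx]), img

One thing to watch afterwards: transforms.ToTensor() already scales the pixels to [0, 1], so dividing by 255 again in the training loop would shrink the augmented inputs much further than the non-augmented ones.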