Why do the train loss and train acc never change?

The code runs, but the train loss and train acc never change:
train_loss = 0.69, train_acc = 0.5
I think the model is not being trained, but I can't find my mistake.
I have tried every solution I could find: changing the lr, reset_parameters, normalization, and so on. :frowning:
Maybe it's the preprocessing? The images are grayscale.

I used model.named_parameters() and found grad_requirs: True, but the weights and grads do not change.
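Here is roughly the check I run after an epoch (a minimal sketch, assuming the model defined in the code below; it prints the mean of each weight tensor and of its gradient, which is what the logs further down show):

# rough sketch of the per-parameter check; run after loss.backward()
# so that param.grad is populated
for name, param in model.named_parameters():
  print('-->name:', name,
        '-->grad_requirs:', param.requires_grad,
        '--weight', torch.mean(param.data),
        '-->grad_value:', torch.mean(param.grad))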

CODE:

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torchvision.models as models
from torchvision import transforms, utils
from torch.utils.data import Dataset, DataLoader
from torch.optim import lr_scheduler
import torch.optim as optim
import shutil
import time
import os

from PIL import Image

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

def gaussian1(x):
  # Gaussian (GNCNN-style) activation: exp(-(x - mean)^2 / std^2)
  mean = torch.mean(x)
  std = torch.std(x)
  return torch.exp(-((x - mean) ** 2) / std ** 2)

def gaussian2(x):
  # same Gaussian activation, scaled by 0.5
  mean = torch.mean(x)
  std = torch.std(x)
  return 0.5 * torch.exp(-((x - mean) ** 2) / std ** 2)

# "KV" 5x5 high-pass kernel, a residual filter commonly used in image
# steganalysis; applied below as a fixed (non-trainable) preprocessing convolution
KV = torch.tensor([[-1,  2,  -2,  2, -1],
                   [ 2, -6,   8, -6,  2],
                   [-2,  8, -12,  8, -2],
                   [ 2, -6,   8, -6,  2],
                   [-1,  2,  -2,  2, -1]]) / 12.
KV = KV.view(1, 1, 5, 5).to(device=device, dtype=torch.float)
KV = torch.autograd.Variable(KV, requires_grad=False)  # Variable is deprecated; a plain tensor suffices

class GNCNN(nn.Module):
  def __init__(self):
    super(GNCNN, self).__init__()
    self.gaussian1 = gaussian1
    self.gaussian2 = gaussian2
    self.conv1 = nn.Conv2d(1, 16, kernel_size=5, stride=1, padding=0, bias=True)
    self.avg_pool1 = nn.AvgPool2d(kernel_size=3, stride=2, padding=1)

    self.conv2 = nn.Conv2d(16, 16, kernel_size=3, stride=1, padding=0, bias=True)
    self.avg_pool2 = nn.AvgPool2d(kernel_size=3, stride=2, padding=1)

    self.conv3 = nn.Conv2d(16, 16, kernel_size=3, stride=1, padding=0, bias=True)
    self.avg_pool3 = nn.AvgPool2d(kernel_size=3, stride=2, padding=0)

    self.conv4 = nn.Conv2d(16, 16, kernel_size=3, stride=1, padding=0, bias=True)

    self.avg_pool4 = nn.AvgPool2d(kernel_size=3, stride=2, padding=0)

    self.conv5 = nn.Conv2d(16, 16, kernel_size=5, stride=1, padding=0, bias=True)
    self.avg_pool5 = nn.AvgPool2d(kernel_size=3, stride=2, padding=0)

    self.fc1 = nn.Linear(16*4*4, 128)
    self.fc2 = nn.Linear(128, 128)
    self.fc3 = nn.Linear(128, 2)
    self.reset_parameters()

  def forward(self, x):
    prep = F.conv2d(x, KV, padding=2)
    out = self.avg_pool1(gaussian1(self.conv1(prep)))
    out = self.avg_pool2(gaussian2(self.conv2(out)))
    out = self.avg_pool3(gaussian2(self.conv3(out)))
    out = self.avg_pool4(gaussian2(self.conv4(out)))
    out = self.avg_pool5(gaussian2(self.conv5(out)))
    out = out.view(out.size(0), -1)
    out = F.relu(self.fc1(out))
    out = F.relu(self.fc2(out))
    out = self.fc3(out)
    return out

  def reset_parameters(self):
    for mod in self.modules():
      if isinstance(mod, nn.Conv2d):
        nn.init.xavier_uniform_(mod.weight)
      elif isinstance(mod, nn.Linear):
        nn.init.kaiming_normal_(mod.weight.data)



def accuracy(outputs, labels):
  _, argmax = torch.max(outputs, 1)
  return (labels == argmax.squeeze()).float().mean()
    
def default_loader(path):
  try:
    img = Image.open(path)
    return img.convert('RGB')
  except Exception:
    print("Cannot read image: {}".format(path))

class BOSSBaseDataset(Dataset):
  def __init__(self, txt, transforms, loader=default_loader):
    super(BOSSBaseDataset, self).__init__()
    imgs = []
    # each line of the txt file: <image_path>#<label>
    with open(txt, 'r') as fh:
      for line in fh:
        line = line.strip('\n')
        words = line.split('#')
        imgs.append((words[0], int(words[1])))
    self.imgs = imgs
    self.transforms = transforms
    self.loader = loader

  def __getitem__(self, index):
    fn, label = self.imgs[index]
    img = self.loader(fn) 
    if self.transforms is not None:
      img = self.transforms(img) 
    return img, label

  def __len__(self):
    return len(self.imgs)


def train_model(model, criterion, optimizer, num_epochs, batch_size, use_gpu):
  since = time.time()

  best_model_wts = model.state_dict()
  best_acc = 0.0

  val_acc_history = []
  val_loss_history = []
  is_best = False
  for epoch in range(num_epochs):
    begin_time = time.time()
    print('Epoch {}/{}'.format(epoch, num_epochs - 1))
    print('-' * 10)

    count_batch = 0

    epoch_loss = 0.
    epoch_accuracy = 0.
    running_loss = 0.
    running_accuracy = 0.
    val_loss = 0.
    val_accuracy = 0.
    test_loss = 0.
    test_accuracy = 0.

    
    for data in train_loader:
      # count_batch += 1
      inputs, labels = data


      # move the batch to the GPU if available (Variable wrapping is no longer needed)
      if use_gpu:
        inputs, labels = inputs.cuda(), labels.cuda()
      
      
      optimizer.zero_grad()
      
      outputs = model(inputs)
      # accuracys = accuracy(outputs, labels).item()
      # running_accuracy += accuracys
      # epoch_accuracy += accuracys
      loss = criterion(outputs, labels)
      running_loss += loss.item()   # .data.item() is deprecated; .item() suffices
      epoch_loss += loss.item()
      loss.backward()
      optimizer.step()


    epoch_loss /= (train_sizes / batch_size)
    train_loss_history.append(epoch_loss)
    # epoch_accuracy /= train_sizes
    # train_acc_history.append(epoch_accuracy)
    # print('\nTrain: Epoch [{}] Loss: {:.4f} Acc: {:.4f}%'.format(epoch, epoch_loss, 100*epoch_accuracy))
    print('\nTrain: Epoch [{}] Loss: {:.4f}'.format(epoch, epoch_loss))

  return model  # without this, `model = train_model(...)` below would be None


if __name__ == '__main__':

  use_gpu = torch.cuda.is_available()

  batch_size = 180
  learning_rate = 0.001

  atransforms = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.Grayscale(1),
    # transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize([0.485, ], [0.229, ])
  ])

  train_sets = BOSSBaseDataset(txt='/content/drive/My Drive/Dataset/train.txt',
                               transforms=atransforms)

  train_loader = DataLoader(train_sets, batch_size=batch_size, shuffle=True, num_workers=16)

  train_sizes = len(train_sets)

  model = GNCNN()
  model.reset_parameters()  # already called in __init__, but harmless

  if use_gpu:
    model = model.cuda()

  # CrossEntropyLoss has no parameters, so it does not need to be moved to the GPU
  criterion = nn.CrossEntropyLoss()

  optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

  train_acc_history = []
  train_loss_history = []

  model = train_model(model=model, criterion=criterion, optimizer=optimizer,
                      num_epochs=50, batch_size=batch_size, use_gpu=use_gpu)

SOME RESULTS:

Train: Epoch [7] Loss: 0.6954
-->name: conv1.weight -->grad_requirs: True --weight tensor(0.0042, device='cuda:0')  -->grad_value: tensor(-0.0005, device='cuda:0')
-->name: conv1.bias -->grad_requirs: True --weight tensor(-0.0086, device='cuda:0')  -->grad_value: tensor(-6.9849e-10, device='cuda:0')
-->name: conv2.weight -->grad_requirs: True --weight tensor(-0.0001, device='cuda:0')  -->grad_value: tensor(7.2851e-05, device='cuda:0')
-->name: conv2.bias -->grad_requirs: True --weight tensor(-0.0114, device='cuda:0')  -->grad_value: tensor(1.1059e-09, device='cuda:0')
-->name: conv3.weight -->grad_requirs: True --weight tensor(0.0007, device='cuda:0')  -->grad_value: tensor(9.7395e-06, device='cuda:0')
-->name: conv3.bias -->grad_requirs: True --weight tensor(-0.0105, device='cuda:0')  -->grad_value: tensor(-1.3388e-09, device='cuda:0')
-->name: conv4.weight -->grad_requirs: True --weight tensor(-0.0002, device='cuda:0')  -->grad_value: tensor(3.7657e-05, device='cuda:0')
-->name: conv4.bias -->grad_requirs: True --weight tensor(-0.0098, device='cuda:0')  -->grad_value: tensor(2.4447e-09, device='cuda:0')
-->name: conv5.weight -->grad_requirs: True --weight tensor(-0.0001, device='cuda:0')  -->grad_value: tensor(-2.4289e-06, device='cuda:0')
-->name: conv5.bias -->grad_requirs: True --weight tensor(-0.0004, device='cuda:0')  -->grad_value: tensor(6.9849e-10, device='cuda:0')
-->name: fc1.weight -->grad_requirs: True --weight tensor(8.4385e-05, device='cuda:0')  -->grad_value: tensor(6.8088e-05, device='cuda:0')
-->name: fc1.bias -->grad_requirs: True --weight tensor(-0.0016, device='cuda:0')  -->grad_value: tensor(0.0002, device='cuda:0')
-->name: fc2.weight -->grad_requirs: True --weight tensor(-0.0012, device='cuda:0')  -->grad_value: tensor(-2.2372e-05, device='cuda:0')
-->name: fc2.bias -->grad_requirs: True --weight tensor(0.0015, device='cuda:0')  -->grad_value: tensor(-0.0001, device='cuda:0')
-->name: fc3.weight -->grad_requirs: True --weight tensor(-0.0153, device='cuda:0')  -->grad_value: tensor(-4.8385e-10, device='cuda:0')
-->name: fc3.bias -->grad_requirs: True --weight tensor(0.0303, device='cuda:0')  -->grad_value: tensor(-7.4506e-09, device='cuda:0')
Epoch 8/49
----------

Train: Epoch [8] Loss: 0.6957
-->name: conv1.weight -->grad_requirs: True --weight tensor(0.0042, device='cuda:0')  -->grad_value: tensor(0.0004, device='cuda:0')
-->name: conv1.bias -->grad_requirs: True --weight tensor(-0.0086, device='cuda:0')  -->grad_value: tensor(3.6380e-10, device='cuda:0')
-->name: conv2.weight -->grad_requirs: True --weight tensor(-0.0001, device='cuda:0')  -->grad_value: tensor(0.0002, device='cuda:0')
-->name: conv2.bias -->grad_requirs: True --weight tensor(-0.0114, device='cuda:0')  -->grad_value: tensor(-3.4197e-10, device='cuda:0')
-->name: conv3.weight -->grad_requirs: True --weight tensor(0.0007, device='cuda:0')  -->grad_value: tensor(1.5599e-05, device='cuda:0')
-->name: conv3.bias -->grad_requirs: True --weight tensor(-0.0105, device='cuda:0')  -->grad_value: tensor(-1.2224e-09, device='cuda:0')
-->name: conv4.weight -->grad_requirs: True --weight tensor(-0.0002, device='cuda:0')  -->grad_value: tensor(3.0600e-06, device='cuda:0')
-->name: conv4.bias -->grad_requirs: True --weight tensor(-0.0098, device='cuda:0')  -->grad_value: tensor(-2.3865e-09, device='cuda:0')
-->name: conv5.weight -->grad_requirs: True --weight tensor(-0.0001, device='cuda:0')  -->grad_value: tensor(-3.1800e-06, device='cuda:0')
-->name: conv5.bias -->grad_requirs: True --weight tensor(-0.0004, device='cuda:0')  -->grad_value: tensor(3.6089e-09, device='cuda:0')
-->name: fc1.weight -->grad_requirs: True --weight tensor(8.2432e-05, device='cuda:0')  -->grad_value: tensor(-1.3619e-05, device='cuda:0')
-->name: fc1.bias -->grad_requirs: True --weight tensor(-0.0016, device='cuda:0')  -->grad_value: tensor(-4.5320e-05, device='cuda:0')
-->name: fc2.weight -->grad_requirs: True --weight tensor(-0.0012, device='cuda:0')  -->grad_value: tensor(2.6668e-05, device='cuda:0')
-->name: fc2.bias -->grad_requirs: True --weight tensor(0.0015, device='cuda:0')  -->grad_value: tensor(0.0001, device='cuda:0')
-->name: fc3.weight -->grad_requirs: True --weight tensor(-0.0153, device='cuda:0')  -->grad_value: tensor(-1.9463e-10, device='cuda:0')
-->name: fc3.bias -->grad_requirs: True --weight tensor(0.0303, device='cuda:0')  -->grad_value: tensor(0., device='cuda:0')

Thanks a lot!

But your losses from epoch 7 and epoch 8 aren't exactly the same.

Can you print a small part of your training data, as well as the training data after it has been passed through the gaussian1(x) and gaussian2(x) functions?

Thanks

These are 10 input images and their labels:

tensor([[[[-0.0745, -0.0745, -0.0745,  ...,  0.3804,  0.3804,  0.3804],
          [-0.0667, -0.0667, -0.0667,  ...,  0.3804,  0.3804,  0.3804],
          [-0.0667, -0.0667, -0.0667,  ...,  0.3804,  0.3804,  0.3804],
          ...,
          [ 0.2392,  0.2235,  0.2157,  ..., -0.7020, -0.7020, -0.7020],
          [ 0.2235,  0.2000,  0.2314,  ..., -0.7020, -0.7098, -0.7020],
          [ 0.2235,  0.2235,  0.2706,  ..., -0.6941, -0.7020, -0.7176]]],


        [[[ 0.2784, -0.0745, -0.2863,  ..., -0.0196, -0.0667, -0.0824],
          [-0.0353, -0.2627, -0.0118,  ...,  0.0588,  0.0196,  0.0118],
          [-0.3098, -0.0902,  0.1922,  ...,  0.1294,  0.1059,  0.1059],
          ...,
          [-0.2706, -0.4039, -0.5922,  ..., -0.0275, -0.1608,  0.0902],
          [-0.4431, -0.3882, -0.5294,  ..., -0.0039, -0.1922,  0.0902],
          [-0.5216, -0.3176, -0.4275,  ...,  0.0353, -0.1216,  0.0902]]],


        [[[ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000,  1.0000],
          [ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000,  1.0000],
          [ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000,  1.0000],
          ...,
          [-0.5059, -0.4980, -0.4667,  ..., -0.4667, -0.4510, -0.4588],
          [-0.5294, -0.5294, -0.5216,  ..., -0.4588, -0.4431, -0.4431],
          [-0.5294, -0.5137, -0.4902,  ..., -0.4824, -0.4980, -0.5059]]],


        ...,


        [[[-0.7961, -0.7961, -0.7961,  ..., -0.6627, -0.6549, -0.6549],
          [-0.7961, -0.7882, -0.7882,  ..., -0.6549, -0.6549, -0.6549],
          [-0.7882, -0.7882, -0.7882,  ..., -0.6471, -0.6549, -0.6549],
          ...,
          [-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
          [-1.0000, -0.9922, -1.0000,  ..., -0.9922, -1.0000, -1.0000],
          [-1.0000, -0.9922, -1.0000,  ..., -0.9922, -1.0000, -1.0000]]],


        [[[ 0.0902,  0.0353, -0.0039,  ...,  0.0980,  0.1137,  0.1059],
          [ 0.0902,  0.0588,  0.0275,  ...,  0.0745,  0.1059,  0.1137],
          [-0.0039,  0.0039, -0.0039,  ...,  0.0588,  0.0980,  0.0902],
          ...,
          [-0.6314, -0.6314, -0.6314,  ..., -0.5608, -0.5765, -0.5922],
          [-0.6392, -0.6314, -0.6157,  ..., -0.5843, -0.5922, -0.6078],
          [-0.6471, -0.6392, -0.6314,  ..., -0.5922, -0.5922, -0.5765]]],


        [[[-0.5765, -0.5765, -0.5765,  ..., -0.0902, -0.0980, -0.1059],
          [-0.5765, -0.5765, -0.5765,  ..., -0.0824, -0.0824, -0.0902],
          [-0.5843, -0.5843, -0.5843,  ..., -0.0667, -0.0667, -0.0667],
          ...,
          [-0.9686, -0.9686, -0.9686,  ..., -0.9451, -0.9373, -0.9294],
          [-0.9608, -0.9765, -0.9686,  ..., -0.9451, -0.9373, -0.9294],
          [-0.9608, -0.9765, -0.9765,  ..., -0.9451, -0.9451, -0.9294]]]],
       device='cuda:0')
tensor([0, 1, 0, 0, 0, 0, 1, 1, 0, 1], device='cuda:0')

And when I made the dataset smaller, I found that the loss started to change, going from ≈0.80 down to 0.69.
But it still gets stuck at 0.69 :sob:

Sorry for the late reply.
The thing is, your loss isn't exactly the same on every epoch; it's just approximately 0.69.
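A quick way to see why 0.69 in particular is suspicious: it is almost exactly ln 2, the cross-entropy of a two-class model that always predicts 50/50, i.e. pure chance:

import math
# cross-entropy of a 2-class classifier that assigns probability 0.5 to each class
print(-math.log(0.5))  # 0.6931... == ln 2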
What I suggest is that you initialize your weights with Xavier init.
Also, since the ReLU activation only outputs values in the range (0, +inf), it leaves no room for negative inputs: it maps every negative input to zero. So I suggest you remove the ReLU activation for now and see how it works out.
It's possible that your neural network is not able to pick out the patterns in your data, because your data mostly consists of negative values and ReLU maps all of them to zero, so the network ends up seeing a lot of zeros instead of the numbers that actually carry the image pattern.
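For example, here is the clamping on a made-up tensor:

import torch
import torch.nn.functional as F

x = torch.tensor([-0.7, -0.1, 0.3, 0.9])
print(F.relu(x))  # tensor([0.0000, 0.0000, 0.3000, 0.9000]) -- every negative value becomes 0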

This is just my speculation.
Hope this helps you :slightly_smiling_face:.

Also, since the cross-entropy loss is a combination of LogSoftmax and negative log-likelihood (NLLLoss), it doesn't really make sense to add a logit-style activation on top for a probability score.
So don't add any activation at all after the last layer, and let's see how it goes.
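To illustrate what I mean (the tensors here are just made-up examples), cross-entropy on raw logits already does the log-softmax internally, so these two losses are identical:

import torch
import torch.nn.functional as F

logits = torch.randn(4, 2)             # raw fc3 outputs, no activation
target = torch.tensor([0, 1, 1, 0])

loss_a = F.cross_entropy(logits, target)
loss_b = F.nll_loss(F.log_softmax(logits, dim=1), target)
print(torch.allclose(loss_a, loss_b))  # True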
Looking forward to hearing your feedback :upside_down_face:

Thanks for the reply :smile: This is my first time training a model myself, so there may be a few mistakes. Thanks for reading my code :slightly_smiling_face:


I have used Xavier init before, but the result still doesn't seem to change.
The init code:

class GNCNN(nn.Module):
  def __init__(self):
    super(GNCNN, self).__init__()
    self.gaussian1 = gaussian1
    ... ...
    self.fc3 = nn.Linear(128, 2)
    self.reset_parameters()

  def forward(self, x):
    ... ...

  # model init
  def reset_parameters(self):
    for mod in self.modules():
      if isinstance(mod, nn.Conv2d):
        nn.init.xavier_uniform_(mod.weight.data)
        mod.bias.data.fill_(0.2)
      elif isinstance(mod, nn.Linear):
        nn.init.kaiming_normal_(mod.weight.data)
        mod.bias.data.zero_()

Today I tried changing my dataset from label 0 : label 1 = 1 : 1 to label 0 : label 1 = 2 : 1.
Then the result changed: the loss became 0.64 and the acc became 67%. I think that proves my model is not being trained at all :sob: (the result always changes with the dataset; see the quick check after the results below).

Epoch 35/99
----------

Train: Epoch [35] Loss: 0.6493 Acc: 66.6667%

Epoch 36/99
----------

Train: Epoch [36] Loss: 0.6493 Acc: 66.6667%
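A quick sanity check on those numbers: a model that always predicts the majority class of a 2:1 dataset scores exactly 2/3 = 66.67% accuracy, and the best constant prediction gives a cross-entropy equal to the label entropy, which is close to the stuck loss:

import math

p0, p1 = 2 / 3, 1 / 3  # label ratio 2:1
entropy = -(p0 * math.log(p0) + p1 * math.log(p1))
print(entropy)  # ~0.6365, close to the stuck loss of ~0.649
print(2 / 3)    # 0.6667, exactly the stuck accuracy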

Then I tried deleting the ReLU functions, but the result is still not good:
the loss changes randomly and finally settles at 0.64 again (after changing my dataset).
Code:

  def forward(self, x):
    prep = F.conv2d(x, KV, padding=2)
    out = self.avg_pool1(gaussian1(self.conv1(prep)))
    out = self.avg_pool2(gaussian2(self.conv2(out)))
    out = self.avg_pool3(gaussian2(self.conv3(out)))
    out = self.avg_pool4(gaussian2(self.conv4(out)))
    out = self.avg_pool5(gaussian2(self.conv5(out)))
    out = out.view(out.size(0), -1)
    out = self.fc1(out)
    out = self.fc2(out)
    out = self.fc3(out)
    return out

Result:

Epoch 2/99
Train: Epoch [2] Loss: 0.6372 Acc: 66.6667%
----------

Epoch 4/99
Train: Epoch [4] Loss: 0.8944
----------

Epoch 6/99
Train: Epoch [6] Loss: 0.6770
----------

Epoch 18/99
Train: Epoch [18] Loss: 0.6513
----------

Epoch 30/99
Train: Epoch [30] Loss: 0.6474

Thanks again!

Hmmm 🤔
Have you tried removing the Gaussian transformations?

Remove the Gaussian transformation and try one of these:

1. Just scale your training data to the range (0, 1) and use it as the input data.

2. Scale the data to the range (0, 1), then normalize it (subtract the mean and divide by the standard deviation), as in the sketch below.
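For option 2, a minimal sketch with torchvision transforms (the 0.5 / 0.25 statistics are placeholders; compute the real mean and std from your own training set):

from torchvision import transforms

# ToTensor() already scales pixels to (0, 1); Normalize() then subtracts
# the mean and divides by the std (placeholder values below).
atransforms = transforms.Compose([
  transforms.Grayscale(1),
  transforms.ToTensor(),                # option 1 stops here
  transforms.Normalize([0.5], [0.25]),  # option 2 adds this step
])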

Did you find any solution? I am facing the same problem.

Try to train in pairs; it worked for me.

And why are you resetting the parameters after every epoch??

May I ask what you mean by "train in pairs"?
Because I am facing the same problem.
However, I've tried all the other solutions above and still haven't seen an improvement.
Thanks.