MNIST CNN doesn't improve loss

import time as t
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from matplotlib import pyplot as plt


train = datasets.MNIST("",train=True,download=True,
                       transform = transforms.Compose([transforms.ToTensor()]))
test = datasets.MNIST("",train=False,download=True,
                       transform = transforms.Compose([transforms.ToTensor()]))

bs = 64
trainset = torch.utils.data.DataLoader(train, batch_size = bs,
                                       shuffle = True)
testset = torch.utils.data.DataLoader(test, batch_size = 1,
                                       shuffle = True)


def MSE(a,b):
   t1 = (a-b)**2     # element-wise squared error
   return t1.mean()  # averaged over every element
   
class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.mp = nn.MaxPool2d(2)
        self.fc = nn.Linear(320, 1)

    def forward(self, x):
        in_size = x.size(0)
        x = F.relu(self.mp(self.conv1(x)))
        x = F.relu(self.mp(self.conv2(x)))
        x = x.view(in_size, -1)  # flatten the tensor
        x = self.fc(x).relu()
        return x
     
net = Net().cuda()     
optimizer = optim.Adam(net.parameters(), lr=(1.0e-3))
print('net created')
losses = []
for epoch in range(20):
   net.train()  # training mode
   for data in trainset:
      t1 = t.time()  # batch start time (t1 is never used afterwards)
      x,y = data
      x = x.cuda()
      y = y.cuda()
      optimizer.zero_grad()
      output = net(x)
      loss = MSE(output, y)
      #print(round(float(output),2),float(y),round(float(loss),2))
      print(float(loss))
      loss.backward()
      optimizer.step()
      losses.append(float(loss))

The CNN returns a single output, which is used to compute the error with MSE. One thing to note: when I change the net to have a 10-neuron output layer, I get much better results. In that case I use nll_loss as my loss function and also apply a softmax to the output layer. Theoretically I feel this should work, but this is my first crack at CNNs. The loss stays constant at 7-9 for me; even after training for 200 batches, I see no learning. Any info on how to fix this would be great!

For a classification problem such as MNIST, MSE loss should be avoided. The suitable loss is CrossEntropyLoss (or, equivalently, log_softmax followed by nll_loss). That's why your first variant, with a 10-neuron output layer, gives much better accuracy.
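
For concreteness, a small sketch of that equivalence (the shapes here are just illustrative: logits of shape (batch, num_classes) and integer class targets):

import torch
import torch.nn as nn
import torch.nn.functional as F

logits = torch.randn(4, 10)           # raw scores, shape (batch, num_classes)
targets = torch.tensor([3, 0, 7, 1])  # integer class labels, shape (batch,)

loss_a = nn.CrossEntropyLoss()(logits, targets)
loss_b = F.nll_loss(F.log_softmax(logits, dim=1), targets)
print(torch.allclose(loss_a, loss_b))  # True: both compute the same loss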

Also, remove the relu on the final fc layer. It will clamp any negative logit to 0, throwing away information the loss needs.
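
A quick illustration with a made-up logits tensor:

import torch

logits = torch.tensor([[-2.0, 0.5, -1.0]])
print(logits.relu())  # tensor([[0.0000, 0.5000, 0.0000]]) -- negative scores collapse to 0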

Ok, CrossEntropyLoss, got it!
I tried it out and I get this error:

bool value of Tensor with more than one value is ambiguous

From what I understand, CEL takes in a tensor of outputs and a tensor of targets, which is what I'm feeding it.

I'm super new to this, so forgive me for the noob mistake.
Could you guide me on how to fix this?

The last layer (the prediction layer) yielding the logits must have num_classes as its output size; here, 10.
I just modified your code a little bit. The following should work properly.

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from matplotlib import pyplot as plt


train = datasets.MNIST("",train=True,download=True,
                       transform = transforms.Compose([transforms.ToTensor()]))
test = datasets.MNIST("",train=False,download=True,
                       transform = transforms.Compose([transforms.ToTensor()]))

bs = 64
trainset = torch.utils.data.DataLoader(train, batch_size = bs,
                                       shuffle = True)
testset = torch.utils.data.DataLoader(test, batch_size = 1,
                                       shuffle = True)


class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.mp = nn.MaxPool2d(2)
        self.fc = nn.Linear(320, 10) # Change output_size to the # of classes

    def forward(self, x):
        in_size = x.size(0)
        x = F.relu(self.mp(self.conv1(x)))
        x = F.relu(self.mp(self.conv2(x)))
        x = x.view(in_size, -1)  # flatten the tensor
        x = self.fc(x)
        return x
     
net = Net().cuda()
optimizer = optim.Adam(net.parameters(), lr=1.0e-3)
criterion = nn.CrossEntropyLoss()

print('net created')
losses=[]
for epoch in range(1):
    for iters, (x, y) in enumerate(trainset):
        x = x.cuda()
        y = y.cuda()
        optimizer.zero_grad()
        output = net(x)
        loss = criterion(output, y)
        
        if iters % 100 == 0:
            print(float(loss))
        
        loss.backward()
        optimizer.step()
        losses.append(float(loss))
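
Once training finishes, a minimal sketch for checking test accuracy could look like the following (it assumes the testset loader wraps the test dataset, as fixed above, and reuses the net and test names from this thread):

net.eval()  # eval mode (good practice, even though this net has no dropout/batchnorm)
correct = 0
with torch.no_grad():  # no gradients needed during evaluation
    for x, y in testset:
        x, y = x.cuda(), y.cuda()
        pred = net(x).argmax(dim=1)  # largest logit = predicted class
        correct += (pred == y).sum().item()
print(f"test accuracy: {correct / len(test):.4f}")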

Thank you so much!
I noticed that if I do what you did, defining criterion = nn.CrossEntropyLoss(), it works perfectly.
But if I just do loss = nn.CrossEntropyLoss(output, y), it throws the error I mentioned. Must be some strange implementation thing, I guess…
Once again, thank you so much.

I see. First, you have to initialize a loss “function” like criterion = nn.CrossEntropyLoss(). This actually creates an object named criterion, which is responsible for computing the loss later; nn.CrossEntropyLoss itself is not a direct function for computing the loss. Calling nn.CrossEntropyLoss(output, y) instead passes your tensors as constructor arguments (the first one is interpreted as the per-class weight tensor), which is what leads to that ambiguous-bool error.
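
To make the distinction concrete, here is the module form next to the functional form (F.cross_entropy is a function you can call directly, if you prefer not to construct an object):

import torch
import torch.nn as nn
import torch.nn.functional as F

output = torch.randn(8, 10)     # logits for a batch of 8 samples
y = torch.randint(0, 10, (8,))  # integer class targets

# Module form: construct once, then call like a function
criterion = nn.CrossEntropyLoss()
loss = criterion(output, y)

# Functional form: a plain function, no construction step
loss_f = F.cross_entropy(output, y)

# Wrong: this passes output and y as constructor arguments,
# which raises the ambiguous-bool error
# loss = nn.CrossEntropyLoss(output, y)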