LeNet5 on MNIST classification not learning

Hello guys, I am trying to build a LeNet5 network to train a classifier for the MNIST dataset, but the network does not seem to be learning. I have printed the gradients; they are not always zero, but they are very small. Both the loss curve and the accuracy curve are almost flat when the learning rate is small (1e-1), and they behave randomly when the learning rate is very large (1e2).

Here is my code for the model:

import torch.nn as nn
from torch.nn import functional as F

from keras.datasets import mnist
from torch.optim import SGD, Adam
from torch.utils.data import TensorDataset, DataLoader
import torch
import time
import matplotlib.pyplot as plt

class LeNet5_MNIST(nn.Module):
    """LeNet-5 style convolutional classifier that outputs 10 class scores.

    Two 5x5 convolutions (1 -> 6 channels with padding 2, then 6 -> 16
    channels), each followed by ReLU and 2x2 max pooling, feed three fully
    connected layers (16*5*5 -> 120 -> 84 -> 10).  The 16*5*5 input size of
    ``fc1`` is what 28x28 images produce after the two conv/pool stages.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 6, 5, padding=2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Return the raw (un-normalised) class scores, shape ``(N, 10)``.

        Args:
            x: Batch of single-channel images, either ``(N, H, W)`` without
                a channel axis or ``(N, 1, H, W)`` with one.
        """
        # Add the channel axis only when the caller has not supplied it, so
        # both raw image stacks and conventional NCHW batches are accepted.
        if x.dim() == 3:
            x = x.unsqueeze(1)
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        x = F.max_pool2d(F.relu(self.conv2(x)), (2, 2))
        # Collapse everything except the batch axis before the linear layers.
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

    def num_flat_features(self, x):
        """Return the number of elements per sample (all axes but the first)."""
        return x.size()[1:].numel()

And for training:

# Build the network and load MNIST through Keras.
net = LeNet5_MNIST()
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Fall back to the CPU so the script also runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
net.to(device)

n_epochs = 100
batch_size = 64
momentum = 0.9  # only used by the commented-out SGD alternative below
# NOTE(review): 1e-1 is two orders of magnitude above Adam's documented
# default step size (1e-3); consider lowering it if training stalls.
lr = 1e-1

# The full train and test sets are placed on `device` up front.
# NOTE(review): pixels are cast to float without rescaling -- presumably they
# are still in the raw image range; confirm and consider normalising.
X_train = torch.tensor(X_train, device=device, dtype=torch.float)
y_train = torch.tensor(y_train, device=device, dtype=torch.int64)

X_test = torch.tensor(X_test, device=device, dtype=torch.float)
y_test = torch.tensor(y_test, device=device, dtype=torch.int64)

train_ds = TensorDataset(X_train, y_train)
train_dl = DataLoader(train_ds, batch_size=batch_size)

valid_ds = TensorDataset(X_test, y_test)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

optimizer = Adam(net.parameters(), lr=lr)
#optimizer = SGD(net.parameters(), lr=lr, momentum=momentum)
loss_func = nn.CrossEntropyLoss()

split_line = '-' * 50
split_line_bold = '=' * 50

start_time = time.time()
# Per-epoch histories, stored as plain Python floats so they can be plotted
# directly regardless of which device the tensors live on.
losses = []
valid_losses = []
accuracy = []
print(f"Training on {device}")
print(split_line_bold)

for epoch in range(n_epochs):
    # ---- training pass ----
    batch_loss = []
    net.train()
    for xb, yb in train_dl:
        optimizer.zero_grad()
        loss = loss_func(net(xb), yb)
        loss.backward()
        optimizer.step()
        batch_loss.append(loss.item())

    train_loss = sum(batch_loss) / len(batch_loss)
    losses.append(train_loss)
    print(f"Epoch {epoch+1}:")
    print(f"Training loss {train_loss}")

    # ---- validation pass: loss and accuracy from one batched sweep ----
    net.eval()
    valid_loss_sum = 0.0
    n_correct = 0
    with torch.no_grad():
        for xb, yb in valid_dl:
            out = net(xb)
            valid_loss_sum += loss_func(out, yb).item()
            n_correct += (torch.argmax(out, dim=1) == yb).sum().item()

    # Mean of the per-batch mean losses, and fraction of correct predictions.
    valid_loss = valid_loss_sum / len(valid_dl)
    acc = n_correct / len(valid_ds)
    valid_losses.append(valid_loss)
    accuracy.append(acc)

    print(f"Validation loss: {valid_loss}")
    print(f"Validation accuracy: {acc}")
    print(split_line)

end_time = time.time()

Evaluation code and results:

# Report the wall-clock duration of the training loop.
training_time = end_time - start_time
print("Training finished in %.2f seconds" % training_time)

# Shared x-axis: one point per epoch.
epochs = list(range(n_epochs))

# Figure 1: training and validation loss per epoch.
fig, ax = plt.subplots()
for series, series_label in ((losses, "Training loss"),
                             (valid_losses, "Validation loss")):
    ax.plot(epochs, series, label=series_label)
ax.set(xlabel='Epoch', ylabel='Loss')
ax.legend()
fig.show()

# Figure 2: validation accuracy per epoch.
fig, ax = plt.subplots()
ax.plot(epochs, accuracy)
ax.set(xlabel='Epoch', ylabel='Accuracy', title='Validation accuracy')
fig.show()

You might want to play around with some hyperparameters and e.g. lower the learning rate to ~1e-3, as I cannot see any obvious errors in your code.

Thanks for the reply! I’ve tried a smaller learning rate and it did work! Plus, with a smaller learning rate the performance (judged by the loss curve) is even better. I don’t yet understand the reason though, as it does not look like an overfitting problem …