Training Loss: nan

Arvind_Subramaniam · October 8, 2019, 3:20pm

I am training a simple conv layer on cifar10 and I keep getting a high loss during training:

Epoch 1/50 - Training loss: nan
Epoch 2/50 - Training loss: nan

Conv Network

import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
  def __init__(self):
    super(Net, self).__init__()
    self.conv1 = nn.Conv2d(3, 6, 5)
    self.pool = nn.MaxPool2d(2, 2)
    self.conv2 = nn.Conv2d(6, 16, 5)
    self.conv3 = nn.Conv2d(16, 32, 5)
    self.conv4 = nn.Conv2d(32, 64, 5)
    self.conv5 = nn.Conv2d(64, 64, 1)
    self.fc1 = nn.Linear(64 * 1 * 1, 120)
    self.fc2 = nn.Linear(120, 84)
    self.fc3 = nn.Linear(84, 10)

  def forward(self, x):
    x = self.pool(F.relu(self.conv1(x)))
    x = F.relu(self.conv2(x))
    #print(x.shape)
    x = F.relu(self.conv3(x))
    x = F.relu(self.conv4(x))
    x = self.pool(F.relu(self.conv5(x)))
    x = x.view(-1, 64 * 1 * 1)
    x = F.relu(self.fc1(x))
    x = F.relu(self.fc2(x))
    x = self.fc3(x)
    return x

Here are the training and validation functions:

def train_conv(model):
  minimum = 1000
  model = model.cuda()
  optimizer = optim.SGD(model.parameters(), lr=0.003, momentum=0.9)
  criterion =nn.NLLLoss()
  epochs = 50
  for epoch in range(epochs):  # loop over the dataset multiple times
    running_loss = 0.0
    for inputs, labels in train_loader:
      # get the inputs; data is a list of [inputs, labels]
      inputs, labels = inputs.cuda(),labels.cuda()
      # zero the parameter gradients
      optimizer.zero_grad()

      # forward + backward + optimize
      outputs = model(inputs)
      loss = criterion(outputs, labels)
      loss.backward()
      optimizer.step()

          # print statistics
      running_loss += loss.item()
    print('[%d, %5d] loss: %.3f' %
                    (epoch + 1, i + 1, running_loss / len(train_loader)))
  return (running_loss/len(train_loader))

def validation_conv(model):
  correct = 0
  total = 0
  model = model.cuda()
  with torch.no_grad():
    for data in val_loader:
      images, labels = data
      images, labels = images.cuda(),labels.cuda()
      outputs = model(images)
      _, predicted = torch.max(outputs.data, 1)
      total += labels.size(0)
      correct += (predicted == labels).sum().item()

  print('Accuracy of the network on the 10000 test images: %d %%' % (
      100 * correct / total))

I am not sure how to change the train_conv function to get the correct training loss.

ptrblck · October 11, 2019, 3:41pm

nn.NLLLoss expects log probabilities as the model outputs, so use F.log_softmax on your output before passing it to the criterion or alternatively keep your current model architecture and use nn.CrossEntropyLoss instead, which will internally use F.log_softmax + nn.NLLLoss.