Getting NaN as the loss when I use SGD as the optimizer of my CNN model

As the title describes, the loss is computed as NaN when I use SGD as the optimizer of my CNN model. By the way, there are no issues at all when I use Adam as the optimizer of the same network.

Here is the whole code:

from time import time

import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset

num_epochs = 20  # 1000
batch_size = 8
learning_rate = 0.0001
momentum = 0.5  # for SGD
log_interval = 50

class AndroModel(nn.Module):

    def __init__(self, input_size):
        super(AndroModel, self).__init__()

        self.first_time_log = False

        self.kernel_size = 3
        self.padding = 0
        self.stride = 1
        self.input_size = input_size

        self.conv1 = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=16, kernel_size=self.kernel_size, padding=self.padding,
                      stride=self.stride, bias=False),
            nn.ReLU(inplace=True)
        )
        self.conv2 = nn.Sequential(
            nn.Conv1d(in_channels=16, out_channels=32, kernel_size=self.kernel_size, padding=self.padding,
                      stride=self.stride, bias=False),
            nn.ReLU(inplace=True)
        )
        self.conv3 = nn.Sequential(
            nn.Conv1d(in_channels=32, out_channels=64, kernel_size=self.kernel_size, padding=self.padding,
                      stride=self.stride, bias=False),
            nn.ReLU(inplace=True)
        )
        self.conv4 = nn.Sequential(
            nn.Conv1d(in_channels=64, out_channels=128, kernel_size=self.kernel_size, padding=self.padding,
                      stride=self.stride, bias=False),
            nn.ReLU(inplace=True)
        )
        self.conv5 = nn.Sequential(
            nn.Conv1d(in_channels=128, out_channels=256, kernel_size=self.kernel_size, padding=self.padding,
                      stride=self.stride, bias=False),
            nn.ReLU(inplace=True)
        )

        self.num_conv_layers = 5
        last_conv_layer = self.conv5
        new_input_size = self.calculate_new_width(self.input_size, self.kernel_size, self.padding, self.stride,
                                                  self.num_conv_layers, max_pooling=None)

        out_channels = last_conv_layer[0].out_channels
        dimension = out_channels * new_input_size
        self.fc1 = nn.Sequential(
            nn.Linear(in_features=dimension, out_features=3584),
            nn.Dropout(0.5))
        self.fc2 = nn.Sequential(
            nn.Linear(in_features=3584, out_features=1792),
            nn.Dropout(0.5))
        self.fc3 = nn.Sequential(
            nn.Linear(in_features=1792, out_features=448),
            nn.Dropout(0.5))
        self.fc4 = nn.Sequential(
            nn.Linear(in_features=448, out_features=112),
            nn.Dropout(0.5))
        self.fc5 = nn.Sequential(
            nn.Linear(in_features=112, out_features=28),
            nn.Dropout(0.5))
        self.fc6 = nn.Sequential(
            nn.Linear(in_features=28, out_features=6),
            nn.Dropout(0.5))
        self.fc7 = nn.Sequential(
            nn.Linear(in_features=6, out_features=2))

    def forward(self, x):
        if not self.first_time_log:
            print(f'Initial shape: {x.shape}')
        x = x.reshape((-1, 1, self.input_size))
        if not self.first_time_log:
            print(f'Shape after reshape: {x.shape}')
        output = self.conv1(x)
        if not self.first_time_log:
            print(f'Shape after conv1: {output.shape}')
        output = self.conv2(output)
        if not self.first_time_log:
            print(f'Shape after conv2: {output.shape}')
        output = self.conv3(output)
        if not self.first_time_log:
            print(f'Shape after conv3: {output.shape}')
        output = self.conv4(output)
        if not self.first_time_log:
            print(f'Shape after conv4: {output.shape}')
        output = self.conv5(output)
        if not self.first_time_log:
            print(f'Shape after conv5: {output.shape}')

        output = output.view(output.size(0), -1)
        if not self.first_time_log:
            print(f'Shape after flattening: {output.shape}')
        output = self.fc1(output)
        if not self.first_time_log:
            print(f'Shape after fc1: {output.shape}')
        output = self.fc2(output)
        if not self.first_time_log:
            print(f'Shape after fc2: {output.shape}')
        output = self.fc3(output)
        if not self.first_time_log:
            print(f'Shape after fc3: {output.shape}')
        output = self.fc4(output)
        if not self.first_time_log:
            print(f'Shape after fc4: {output.shape}')
        output = self.fc5(output)
        if not self.first_time_log:
            print(f'Shape after fc5: {output.shape}')
        output = self.fc6(output)
        if not self.first_time_log:
            print(f'Shape after fc6: {output.shape}')
        output = self.fc7(output)
        if not self.first_time_log:
            print(f'Shape after fc7: {output.shape}')

        self.first_time_log = True

        return output

    @staticmethod
    def calculate_new_width(input_size, kernel_size, padding, stride, num_conv_layers, max_pooling=2):
        new_input_size = input_size
        for _ in range(num_conv_layers):
            new_input_size = ((new_input_size - kernel_size + 2 * padding) // stride) + 1
            if max_pooling is not None:
                new_input_size //= max_pooling
        return new_input_size


class AndroDataset(Dataset):
    def __init__(self, features_as_ndarray, classes_as_ndarray):
        # torch.autograd.Variable is deprecated (a no-op since PyTorch 0.4); plain tensors suffice
        self.features = torch.from_numpy(features_as_ndarray).float()
        self.classes = torch.from_numpy(classes_as_ndarray).float()

    def __getitem__(self, index):
        return self.features[index], self.classes[index]

    def __len__(self):
        return len(self.features)


def main():
    start = time()

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print('Device: {}'.format(device))
    if torch.cuda.is_available():
        print('GPU Model: {}'.format(torch.cuda.get_device_name(0)))

    csv_data = pd.read_csv('android_binary.csv')
    print('\nData Features (Columns):')
    print(csv_data.columns)

    num_of_features = csv_data.shape[1] - 1
    x = csv_data.iloc[:, :-1].values
    y = csv_data.iloc[:, -1].values
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    training_data = AndroDataset(x_train, y_train)
    test_data = AndroDataset(x_test, y_test)

    print('\n~~~~~~~~ TRAINING HAS STARTED ~~~~~~~~')
    print('# of training instances: {}'.format(len(training_data)))

    train_loader = DataLoader(dataset=training_data, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=True)

    model = AndroModel(num_of_features)
    model = model.to(device)

    print('Model Overview:')
    print(model, '\n')

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(device)

    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum)

    losses_in_epochs = []
    # training
    total_step = len(train_loader) 

    # switch to train mode
    model.train()
    for epoch in range(num_epochs):
        losses_in_current_epoch = []
        for i, (features, classes) in enumerate(train_loader):
            # Transfer data to GPU
            features, classes = features.to(device), classes.to(device, dtype=torch.int64)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            output = model(features)
            loss = criterion(output, classes)
            loss.backward()
            optimizer.step()

            if (i + 1) % log_interval == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(epoch + 1, num_epochs, i + 1, total_step,
                                                                         loss.item()))
                losses_in_current_epoch.append(loss.item())

        avg_loss_current_epoch = sum(losses_in_current_epoch) / len(losses_in_current_epoch)
        print('End of the epoch #{}, avg. loss: {:.4f}'.format(epoch + 1, avg_loss_current_epoch))
        losses_in_epochs.append(avg_loss_current_epoch)

    print('Avg. loss of the last epoch: {:.4f}'.format(losses_in_epochs[-1]))
    print(f'Training Duration (in minutes): {(time() - start) / 60}')

    print('\n~~~~~~~~ TEST HAS STARTED ~~~~~~~~')
    print('# of test instances: {}'.format(len(test_data)))
    # test
    accuracy = 0

    # switch to evaluate mode
    model.eval()
    with torch.no_grad():
        correct = 0
        for features, classes in test_loader:
            # Transfer data to GPU
            features, classes = features.to(device), classes.to(device, dtype=torch.int64)
            output = model(features)
            _, predicted = torch.max(output, 1)
            correct += (predicted == classes).sum().item()

        accuracy = 100 * correct / len(test_loader.dataset)
        print('Accuracy of the model on the {} test instances: {:.4f} %'.format(len(test_loader.dataset), accuracy))


if __name__ == '__main__':
    main()

Do you see the NaN from the first iteration or after a while?
In the latter case, could you check the loss values, as they might just blow up?
If that’s the case, check the gradients (or their norm).
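
If you don't see anything obvious, here is a rough sketch (it assumes the model and loss from your training loop above) that prints the gradient norms right after loss.backward():

# inspect the per-parameter gradient norms to see if they blow up
total_norm = 0.0
for name, p in model.named_parameters():
    if p.grad is not None:
        param_norm = p.grad.detach().norm(2).item()
        total_norm += param_norm ** 2
        print(f'{name}: grad norm = {param_norm:.4f}')
print(f'Total grad norm: {total_norm ** 0.5:.4f}')

# if the norms explode, clipping before optimizer.step() can help
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)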

Is the input to your model normalized? If not, try normalizing it, as large input values might also blow up your output.
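
A minimal sketch for the normalization, assuming x_train / x_test are the NumPy feature arrays from your code (the statistics should come from the training split only):

# standardize to zero mean / unit variance; the epsilon guards against constant columns
mean, std = x_train.mean(axis=0), x_train.std(axis=0) + 1e-8
x_train = (x_train - mean) / std
x_test = (x_test - mean) / std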


The loss has been NaN from the very first iteration; I haven't seen any other value.

My input was not normalized. After your suggestion I normalized it, and now the loss is computed as expected (thanks a lot for that), but the accuracy has significantly decreased. Here is how I normalized the input:

import pandas as pd
from sklearn import preprocessing

scaler = preprocessing.StandardScaler()
scaled_df = scaler.fit_transform(df)
scaled_df = pd.DataFrame(scaled_df, columns=df.columns)
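
One caveat, assuming df above contains both the training and test rows: fit_transform on the whole frame computes the mean/std from the test rows too, which leaks test statistics into training. Fitting the scaler on the training split only avoids that:

scaler = preprocessing.StandardScaler()
x_train = scaler.fit_transform(x_train)  # statistics come from the training data only
x_test = scaler.transform(x_test)        # reuse those statistics for the test data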