PyTorch Multi Class Classification using CrossEntropyLoss - not converging

Lucy_Jackson · July 1, 2020, 7:20am

I am trying to get a simple network to output the probability that a number is in one of three classes. These are, smaller than 1.1, between 1.1 and 1.5 and bigger than 1.5. I am using cross entropy loss with class labels of 0, 1 and 2, but cannot solve the problem.

Every time I train, the network outputs the maximum probability for class 2, regardless of input. The lowest loss I seem to be able to achieve is 0.9ish. Any advice on where I am going wrong would be greatly appreciated!! All code is below.

class gating_net(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(gating_net, self).__init__()
        self.linear1 = nn.Linear(input_dim, 32)
        self.linear2 = nn.Linear(32, output_dim)

    def forward(self, x):
        # The original input (action) is used as the residual.
        x = F.relu(self.linear1(x))
        x = F.sigmoid(self.linear2(x))
        return x

learning_rate = 0.01
batch_size = 64
epochs = 500
test = 1

gating_network = gating_net(1,3)

optimizer = torch.optim.SGD(gating_network.parameters(), lr=learning_rate, momentum=0.9)
scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=20, verbose=True)

for epoch in range (epochs):
    input_ = []
    label_ = []

    for i in range (batch_size):
        scale = random.randint(10,20)/10

        input = scale
        if scale < 1.1:
            label = np.array([0])
        elif 1.1 < scale < 1.5:
            label = np.array([1])
        else:
            label = np.array([2])

        input_.append(np.array([input]))
        label_.append(label)

    optimizer.zero_grad()

    # get output from the model, given the inputs
    output = gating_network.forward(torch.FloatTensor(input_))
    old_label  = torch.FloatTensor(label_)

    # get loss for the predicted output
    loss = nn.CrossEntropyLoss()(output, old_label.squeeze().long())

    # get gradients w.r.t to parameters
    loss.backward()
    # update parameters
    optimizer.step()
    scheduler.step(loss)

    print('epoch {}, loss {}'.format(epoch, loss.item()))

    if loss.item() < 0.01:
        print("########## Solved! ##########")
        torch.save(mod_network.state_dict(), './supervised_learning/run_{}.pth'.format(test))
        break

    # save every 500 episodes
    if epoch % 100 == 0:
        torch.save(gating_network.state_dict(), './run_{}.pth'.format(test))

ptrblck · July 1, 2020, 9:37am

nn.CrossEntropyLoss expects the output to contain raw logits not probabilities, thus you would have to remove the last sigmoid.

Also, unrelated to the issue, but you should directly call the model via model(x) instead of the .forward method. Otherwise hooks could be ignored (not important for your current code, but for future use cases).

After removing the sigmoid, switching to optim.Adam and increasing the epochs to >3000, your model seems to work fine.

Lucy_Jackson · July 1, 2020, 9:54am

Brilliant, I have it working now, thank you very much for you help and advice!!

Xtian · February 24, 2023, 5:11pm

import torch, torchvision
from torch import nn, optim
import torch.nn.functional as F
import random
import numpy as np
import time
from torchmetrics import ConfusionMatrix

class gating_net(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(gating_net, self).__init__()
        self.linear1 = nn.Linear(input_dim, 32)
        self.linear2 = nn.Linear(32, output_dim)

    def forward(self, x):
        # The original input (action) is used as the residual.
        x = F.relu(self.linear1(x))
        x = self.linear2(x)
        return x

device = torch.device("cuda" if torch.cuda.is_available()else "cpu")
print(f'device: {device}')
# device = 'cpu'

learning_rate = 0.1
batch_size = 32
epochs = 1500
test = 1

gating_network = gating_net(1,3).to(device)
print(gating_network)



optimizer = torch.optim.Adam(gating_network.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.9, patience=128, verbose=True)

cm = ConfusionMatrix(task = 'multiclass', num_classes = 3).to(device)


t0 = time.time()
for epoch in range (epochs):
    input_ = []
    label_ = []
    cm_train = torch.zeros(3,3).type(torch.LongTensor).to(device)

    for i in range (batch_size):
        scale = random.randint(10,20)/10

        input = scale
        if scale < 1.1:
            label = np.array([0])
        elif 1.1 < scale < 1.5:
            label = np.array([1])
        else:
            label = np.array([2])

        input_.append(np.array([input]))
        label_.append(label)

    optimizer.zero_grad()

    # get output from the model, given the inputs
    
    # print('input_: ', torch.FloatTensor(input_).shape)
    input_ = torch.FloatTensor(input_)
    input_ = input_.to(device)
    output = gating_network(input_)
    
    old_label  = torch.FloatTensor(label_).to(device)

    # get loss for the predicted output
    # print('output: ' , output.shape)
    # print('old_label: ', old_label.shape)
    # print('old_label.squeeze().long(): ', old_label.squeeze().long().shape)
    
    # time.sleep(5)
    
    loss = nn.CrossEntropyLoss()(output, old_label.squeeze().long())
    prec = torch.sum(torch.argmax(output, axis=1) == old_label.squeeze().long())/batch_size
    cm_train = cm(torch.argmax(output, axis=1),old_label.squeeze().long())

    # get gradients w.r.t to parameters
    loss.backward()
    # update parameters
    optimizer.step()
    scheduler.step(loss)

    

    if loss.item() < 0.001:
        print("########## Solved! ##########")
        torch.save(gating_network, './supervised_learning/run_{}.pth'.format(test))
        break

    # save every 500 episodes
    if epoch % 100 == 0:
        print(f'epoch {epoch}, loss {round(loss.item(),3)}, precision {round(prec.item()*100,2)}%')
        print(cm_train)
        torch.save(gating_network, './run_{}.pth'.format(test))

print(f'Tiempo en finalizar con device: {device}: {round(time.time()-t0,2)} s')

Hi, for this, I change the code:

removed sigmoid from model
criterion SGD for Adam
lr_start 0.01 for 0.10
scheduler: factor=0.9 and patience=128

I used device ‘cuda’, but if you dont have one use ‘cpu’

Hyperparameters have great influence. I recommend using a gridsearch or genetic algorithms