Why is my loss not decreasing?

I’m working on a model that takes in some custom CSV data and is supposed to make predictions based on a lot of parameters; the labels are integers up to 30. My concern is that when I run the training, the loss doesn’t decrease.

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from datetime import datetime
from torch.utils.data import Dataset
from torch.utils.data.sampler import SubsetRandomSampler
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.tensorboard import SummaryWriter

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

class HorseDataset(Dataset):
    def __init__(self, file_name):
        super().__init__()
        # read the CSV and replace missing values with 0
        file_out = pd.read_csv(file_name, float_precision='round_trip').fillna(value=0)
        x = file_out.iloc[0:7292, 0:673]   # 673 feature columns
        y = file_out.iloc[0:7292, 673]     # label column
        # build the tensors directly from the numpy values, instead of
        # re-wrapping an existing tensor with torch.tensor()
        self.X_train = torch.tensor(x.values, dtype=torch.float32)
        self.Y_train = torch.tensor(y.values).type(torch.LongTensor)
    def __len__(self):
        return len(self.Y_train)
    
    def __getitem__(self, idx):
        return self.X_train[idx], self.Y_train[idx]
    
class NeuralNet(nn.Module):
    def __init__(self):
        super(NeuralNet, self).__init__()
        hidden = 673 * 2 // 3  # hidden width: two thirds of the input size
        self.l1 = nn.Linear(673, hidden)
        self.relu = nn.ReLU()
        self.l2 = nn.Linear(hidden, hidden)
        self.l3 = nn.Linear(hidden, 31)
    def forward(self, x):
        output = self.l1(x)
        output = self.relu(output)
        output = self.l2(output)
        output = self.relu(output)
        output = self.l3(output)
        return output

dataset = HorseDataset('./export_labelled_correct.csv')
batch_size = 64
testing_split = .2
loss_fn = torch.nn.CrossEntropyLoss()

dataset_size = len(dataset)
indices = list(range(dataset_size))
split = int(np.floor(testing_split * dataset_size))

train_indices, test_indices = indices[split:], indices[:split]
neural_net = NeuralNet()
neural_net = neural_net.to(device)

train_sampler = SubsetRandomSampler(train_indices)
test_sampler = SubsetRandomSampler(test_indices)
train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                           sampler=train_sampler)
test_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                          sampler=test_sampler)
optimizer = torch.optim.SGD(neural_net.parameters(), lr=0.01, momentum=0.9)
scheduler = ReduceLROnPlateau(optimizer, 'min')

def train_one_epoch(epoch_index, tb_writer):
    running_loss = 0.
    last_loss = 0.

    for i, data in enumerate(train_loader):
        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()

        outputs = neural_net(inputs)

        loss = loss_fn(outputs, labels)
        loss.backward()

        optimizer.step()

        running_loss += loss.item()
        if i % 10000 == 9999:
            last_loss = running_loss / batch_size # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            tb_x = epoch_index * len(train_loader) + i + 1
            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0.

    return last_loss


timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter('runs/oddsgenie_trainer_{}'.format(timestamp))
epoch_number = 0

EPOCHS = 10000

best_vloss = 1_000_000.

for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))

    # Make sure gradient tracking is on, and do a pass over the data
    neural_net.train(True)
    avg_loss = train_one_epoch(epoch_number, writer)

    running_vloss = 0.0
    neural_net.eval()

    with torch.no_grad():
        for i, vdata in enumerate(test_loader):
            vinputs, vlabels = vdata
            vinputs = vinputs.to(device)
            vlabels = vlabels.to(device)
            voutputs = neural_net(vinputs)
            vloss = loss_fn(voutputs, vlabels)
            scheduler.step(vloss)
            running_vloss += vloss
    
    avg_vloss = running_vloss / (i + 1)
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))

    writer.add_scalars('Training vs. Validation Loss',
                    { 'Training' : avg_loss, 'Validation' : avg_vloss },
                    epoch_number + 1)
    writer.flush()

    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        model_path = 'optimal_model'
        torch.save(neural_net.state_dict(), model_path)

    epoch_number += 1

I am quite new to PyTorch, with only a theoretical understanding of some of the basic machine learning concepts, so there might be a lot of bugs here, but I would appreciate it if somebody could point out why the output during training looks like this:

LOSS train 0.0 valid 3.0965657234191895

It seems like the avg_loss variable is always returning 0, and the validation loss just sticks at about 3.08-3.09.

Thanks in advance!

Did you make sure the if i % 10000 == 9999: branch is ever taken?
If not, last_loss will stay initialized to 0.0 and will never be updated.
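
With roughly 7292 rows, an 80/20 split, and a batch size of 64, the loader yields only about 92 batches per epoch, so i never reaches 9999. Here is a minimal sketch of a version that averages over however many batches the loader actually produces, assuming the same train_loader, neural_net, optimizer, loss_fn, and device as in your code:

def train_one_epoch(epoch_index, tb_writer):
    running_loss = 0.

    for i, data in enumerate(train_loader):
        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = neural_net(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # average over the batches the loader actually produced,
    # instead of waiting for a batch index that never occurs
    avg_loss = running_loss / len(train_loader)
    tb_writer.add_scalar('Loss/train', avg_loss, epoch_index)
    return avg_loss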

Yeah, that’s a good point @ptrblck.

I also noticed that the loss function is returning NaN. That’s probably linked to the whole issue of the loss not decreasing. Do you have any idea why loss_fn would be outputting NaN here? I am not normalizing the data, so I was going to try that, but I’m not sure whether that’s a completely separate issue.

        loss = loss_fn(outputs, labels)
        print(f"Loss: {loss}")

No, I don’t know why the loss is NaN; you would need to narrow it down by isolating which iteration causes the invalid loss value and which operation inside the model creates the first invalid value.
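
A rough way to narrow it down (a sketch, assuming your existing train_loader, neural_net, loss_fn, and device) is to enable anomaly detection and check each stage for non-finite values:

# make backward raise at the first operation that produces NaN/Inf
torch.autograd.set_detect_anomaly(True)

for i, (inputs, labels) in enumerate(train_loader):
    inputs = inputs.to(device)
    labels = labels.to(device)

    # rule out bad input data before suspecting the model
    if not torch.isfinite(inputs).all():
        print(f'non-finite input in batch {i}')
        break

    outputs = neural_net(inputs)
    if not torch.isfinite(outputs).all():
        print(f'non-finite model output in batch {i}')
        break

    loss = loss_fn(outputs, labels)
    if not torch.isfinite(loss):
        print(f'non-finite loss in batch {i}: {loss.item()}')
        break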