Why does the loss stay constant?

Hi there. I was wondering why the loss in my self-built LeNet5 stays constant.
Below is the code.

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# LeNet5
class LeNet5(nn.Module):

    def __init__(self):
        super(LeNet5, self).__init__()
        self.model = nn.Sequential(
            nn.Conv2d(3, 6, kernel_size=3, stride=1),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.ReLU(),
            nn.Conv2d(6, 16, kernel_size=3, stride=1),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.ReLU(),

            nn.Flatten(),
            # MLP
            nn.Linear(400, 120, bias=True),
            nn.ReLU(),
            nn.Linear(120, 84, bias=True),
            nn.ReLU(),
            nn.Dropout(keep_prob),
            nn.Linear(84, 2),
        )

    def forward(self, x):
        x = self.model(x)
        x_softmax = F.softmax(x, dim=1)
        return x_softmax

# instance
model = LeNet5()
#  optimizer & loss function
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)
loss_function = nn.CrossEntropyLoss()

# training loop
def train(
    model,
    data_loader,
    optimizer,
    loss_function,
    epoch,
    log_interval=200    # print out info every 200 batches
):
    model.train()
    for batch_idx, (x, label) in enumerate(data_loader):
        
        optimizer.zero_grad()
        output = model(x)
        loss = loss_function(output, label)
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0:
            print("Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                epoch, batch_idx*len(x), len(data_loader.dataset),
                100. * batch_idx/len(data_loader), loss.data.item()
                # "batch_idx/len(data_loader)" = 当前的batch index除以batch总数
            ))
# validation
def validate(
    model, 
    data_loader,
    loss_function,
    epoch,
    loss_vector,
    accuracy_vector
):
    '''applied to the training set as well as the validation set'''
    model.eval()
    loss, correct = 0, 0
    for x, label in data_loader: # retrieve tensor dataset and label
        output = model(x)
        loss += loss_function(output, label).data.item()   # "loss_function" is the criterion defined earlier
        pred = output.data.max(1)[1]
        correct += pred.eq(label.data).cpu().sum()
    
    # record average loss and accuracy
    loss /= len(data_loader)
#     print(data_loader.dataset)
    loss_vector.append(loss) # for plotting
    accuracy = 100. * correct.to(torch.float32) / len(data_loader.dataset)
    accuracy_vector.append(accuracy)
    print('Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        loss, correct, len(data_loader.dataset), accuracy
    ))
# main training / validation loop
def main(model, train_loader, validate_loader, optimizer, loss_function, num_epochs):
    lossv, accv = [], []
    train_lossv, train_accv = [], []
    for epoch in range(1, num_epochs+1):
        train(model, train_loader, optimizer, loss_function, epoch)
        print("Train set:")
        validate(model, train_loader, loss_function, epoch, train_lossv, train_accv)
        print("Validate set:")
        validate(model, validate_loader, loss_function, epoch, lossv, accv)
    return {'validate':[lossv, accv], 'train':[train_lossv, train_accv]}

I’d really appreciate it if someone could give me a hint!

Hi Hongpu!

Using softmax() with CrossEntropyLoss will significantly degrade your
training (although it is unlikely to cause your loss to remain exactly constant).
Try removing the softmax() and see if that helps.

CrossEntropyLoss expects its input (the predictions you pass it) to be
“unnormalized” log-probabilities (for example, the output of your final
Linear layer). Passing them through softmax() converts them to actual
probabilities – not what CrossEntropyLoss wants.
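
As a minimal sketch of that change (everything else in your class staying the
same), forward() would just return the raw logits, and you would only apply
softmax() outside the loss when you actually want probabilities:

    def forward(self, x):
        # return raw logits; CrossEntropyLoss applies log_softmax() internally
        return self.model(x)

and, if you do want actual probabilities for reporting, compute them outside
the loss, e.g. probs = F.softmax(model(x), dim=1).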

Also, you don’t show us what your learning_rate is. If it happened to be
zero, your loss would remain constant.
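
For what it's worth, you can check what the optimizer is actually using with:

print(optimizer.param_groups[0]['lr'])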

If these ideas don’t help, try passing a single sample through your network
and check that the output does depend on your input. If that works, perform
a single backward pass and check that you get non-zero gradients. If that
also works, perform a single optimizer.step() and check that your model
parameters do get changed. If they do, check that the updated parameters
produce a different output for the original input. If they do, check that the
different output does, in fact, lead to a different loss.
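
A rough sketch of that checklist, using the model, loss_function, and
optimizer from your code (x_sample / label_sample is just a single batch
pulled from your training loader – the names are placeholders):

x_sample, label_sample = next(iter(train_loader))

out1 = model(x_sample)                       # 1) output should depend on the input
loss = loss_function(out1, label_sample)
loss.backward()                              # 2) single backward pass

# 3) check for non-zero gradients
print(any(p.grad is not None and p.grad.abs().sum() > 0 for p in model.parameters()))

params_before = [p.detach().clone() for p in model.parameters()]
optimizer.step()                             # 4) single optimizer step

# 5) check that the parameters actually changed
print(any(not torch.equal(p_new, p_old)
          for p_new, p_old in zip(model.parameters(), params_before)))

# 6) check that the updated parameters give a different output and a different loss
out2 = model(x_sample)
print(not torch.equal(out1, out2), loss_function(out2, label_sample).item())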

(As an aside, it looks like your model predicts two values – your final
Linear layer has out_features = 2. It would likely be marginally more
efficient to treat your problem as a binary-classification problem, rather than
as a two-class multi-class problem, have your final layer be Linear(84, 1),
and use BCEWithLogitsLoss as your loss criterion. Note, this isn’t the
cause of your problem – it’s just a suggestion for a slight improvement.)
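
A rough sketch of that variant, assuming your labels are 0/1 integers
(BCEWithLogitsLoss wants float targets with the same shape as the logits):

# final layer: nn.Linear(84, 1) instead of nn.Linear(84, 2)
criterion = nn.BCEWithLogitsLoss()

logits = model(x).squeeze(1)                 # shape: (batch,)
loss = criterion(logits, label.float())      # targets must be float
pred = (torch.sigmoid(logits) > 0.5).long()  # predicted class, if you need it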

Good luck.

K. Frank