Loss not updating during training

I am trying to train a simple CNN classifier for a binary version of MNIST (.png files containing 0’s and 1’s). My code is as follows:

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class NetTwo(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(6 * 14 * 14, 84)
        self.fc2 = nn.Linear(84, 1)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))  # ensures range between 0 and 1 
        return x


net = NetTwo()


# confirm model works 
for mini_batch in trainloader:
    # predict on one batch  
    # mini_batch is a size 2 list where 
    # [0] is the samples and [1] is the labels 
    # batch size 32 
    sample_points = mini_batch[0]
    sample_labels = mini_batch[1]
    probs = net(sample_points)  # sigmoid outputs in (0, 1), i.e. probabilities, not raw logits
    print("Raw output: ", probs[21])
    print("Predicted label (rounded): ", torch.round(probs[21]))
    break 



# Define a Loss function and optimizer 


criterion = nn.BCELoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)


# Train the net 


for epoch in range(20):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs = data[0]
        labels = data[1]

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = torch.squeeze(net(inputs))
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print("Loss (epoch " + str(idx) + "): ", running_loss)
    idx += 1
print('Finished Training')


# testing 

num_correct = 0
num_total = 0
for mini_batch in testloader:
    sample_points = mini_batch[0]
    sample_labels = mini_batch[1]
    probs = torch.squeeze(net(sample_points))  # shape (batch_size,), one probability per sample

    for i in range(len(probs)):
        pred = torch.round(probs[i]).item()
        real = sample_labels[i].item()
        if pred == real:
            num_correct += 1
        num_total += 1


print("accuracy: ", num_correct/num_total)

But the loss printed out during training is:

Loss (epoch 1):  169.0617936849594
Loss (epoch 2):  169.0617936849594
Loss (epoch 3):  169.0617936849594
Loss (epoch 4):  169.0617936849594
Loss (epoch 5):  169.0617936849594
Loss (epoch 6):  169.0617936849594
Loss (epoch 7):  169.0617936849594
Loss (epoch 8):  169.0617936849594
Loss (epoch 9):  169.0617936849594
Loss (epoch 10):  169.0617936849594
Loss (epoch 11):  169.0617936849594
Loss (epoch 12):  169.0617936849594
Loss (epoch 13):  169.0617936849594
Loss (epoch 14):  169.0617936849594
Loss (epoch 15):  169.0617936849594
Loss (epoch 16):  169.0617936849594
Loss (epoch 17):  169.0617936849594
Loss (epoch 18):  169.0617936849594
Loss (epoch 19):  169.0617936849594
Loss (epoch 20):  169.0617936849594
Finished Training

And the accuracy on the test set is the same when run before and after training, so training is clearly not updating the model parameters. Why is this? I feel pretty directionless about debugging this. Any tips on what may be wrong?

My guess is the ReLU you apply right before the sigmoid: the minimum of ReLU is 0, and sigmoid(0) = 0.5, so when the ReLU output collapses to 0 the network cannot compute gradients that help optimize the loss. Check without the ReLU.
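
A minimal sketch of that experiment, keeping the rest of the architecture and just dropping the ReLU after fc1 (whether this changes anything is exactly what the test would show):

# Hypothetical variant of the asker's forward() with the ReLU before
# the sigmoid removed, to test whether it is blocking useful gradients.
def forward(self, x):
    x = F.relu(self.conv1(x))
    x = self.pool(x)
    x = torch.flatten(x, 1)
    x = self.fc1(x)                 # no ReLU here
    x = torch.sigmoid(self.fc2(x))  # still maps the output into (0, 1)
    return x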

I cannot reproduce the issue with your code after mapping all targets > 1 to 1 for the sake of debugging (this mapping might of course not make sense for your use case, but the loss is at least decreasing):

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

class NetTwo(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(6 * 14 * 14, 84)
        self.fc2 = nn.Linear(84, 1)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        x = torch.flatten(x, 1) # flatten all dimensions except batch
        x = F.relu(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))  # ensures range between 0 and 1 
        return x


device = 'cuda'
net = NetTwo().to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

dataset = datasets.MNIST(root="data", transform=transforms.ToTensor())
dataset.targets[dataset.targets>1] = 1 # set all targets > 1 to 1
loader = DataLoader(dataset, batch_size=32, shuffle=True)

for epoch in range(20):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, data in enumerate(loader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs = data[0].to(device)
        labels = data[1].to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = torch.squeeze(net(inputs))
        loss = criterion(outputs, labels.float())
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print("Loss (epoch " + str(idx) + "): ", running_loss)
    idx += 1
print('Finished Training')

Output:

Loss (epoch 1):  198.56596073810942
Loss (epoch 2):  64.87397073610919
Loss (epoch 3):  57.281974435201846
Loss (epoch 4):  54.00418394652661
Loss (epoch 5):  51.572434037341736
Loss (epoch 6):  48.94695136777591
Loss (epoch 7):  46.908603859410505
Loss (epoch 8):  45.338596298286575
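
If the loss still stays perfectly constant on your side, a quick sanity check (a sketch, assuming the net, criterion, optimizer, and trainloader objects from your question) is to print a gradient norm and snapshot a weight before and after a single optimizer step; if the tensor is unchanged, the gradients are not flowing:

# Sketch: verify that one optimizer step actually changes the weights.
# Assumes net, criterion, optimizer, and trainloader from the question.
inputs, labels = next(iter(trainloader))
before = net.fc2.weight.detach().clone()

optimizer.zero_grad()
loss = criterion(torch.squeeze(net(inputs)), labels.float())
loss.backward()
print("grad norm of fc2.weight:", net.fc2.weight.grad.norm().item())
optimizer.step()

print("fc2.weight changed:", not torch.equal(before, net.fc2.weight))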