Training Issues (loss isn't decreasing)

Hi all,

I’m trying to solve a binary classification problem where the input to the CNN is a CT slice of a patient’s chest. The dataset is from GitHub: UCSD-AI4H/COVID-CT (COVID-CT-Dataset: A CT Scan Dataset about COVID-19).

I’ve written a custom dataset class that converts all the images to 224x224 RGB (some of the original images are grayscale or RGBA). Here’s the code for defining my net and the training loop:

class RONANet(nn.Module):
    """Small four-stage CNN for 3-channel images with a selectable head.

    The feature extractor is four ``Conv2d -> (BatchNorm2d) -> ReLU ->
    MaxPool2d`` stages followed by global average pooling, so it always
    emits a ``(batch, 256, 1, 1)`` tensor. Because of the four 2x2 max-pools
    the input must be at least 16x16 pixels.

    Args:
        classifier_type: Selects the head. A value containing ``'fc'`` builds
            a three-layer MLP that returns ``(batch, 2)`` logits (one per
            class, e.g. for ``nn.CrossEntropyLoss``). Otherwise a value
            containing ``'conv'`` builds a 1x1 convolution whose output is
            flattened to ``(batch,)`` logits (one per image, e.g. for
            ``nn.BCEWithLogitsLoss``).

    Raises:
        TypeError: If ``classifier_type`` is ``None``.
        ValueError: If ``classifier_type`` names neither ``'fc'`` nor
            ``'conv'``.
    """

    def __init__(self, classifier_type=None):
        super(RONANet, self).__init__()
        self.classifier_type = classifier_type
        # ReLU and MaxPool2d hold no parameters, so one instance of each can
        # safely be reused at every stage.
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.classifier = self.compose_classifier()
        self.conv_layers = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(32),
            self.relu,
            self.maxpool,
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            self.relu,
            self.maxpool,
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(128),
            self.relu,
            self.maxpool,
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            self.relu,
            self.maxpool,
            # Collapse whatever spatial size is left to 1x1.
            nn.AdaptiveAvgPool2d(output_size=(1, 1)),
        )

    def compose_classifier(self):
        """Build and return the head selected by ``self.classifier_type``."""
        if self.classifier_type is None:
            raise TypeError(
                "classifier_type must be 'fc' or 'conv', got None")

        if 'fc' in self.classifier_type:
            return nn.Sequential(
                nn.Flatten(),
                # The extractor ends in a 1x1 global average pool, so the
                # flattened feature vector has exactly 256 entries.
                nn.Linear(256, 256),
                self.relu,
                nn.Linear(256, 128),
                self.relu,
                nn.Linear(128, 2))

        if 'conv' in self.classifier_type:
            return nn.Sequential(
                nn.Conv2d(256, 1, kernel_size=1, stride=1))

        raise ValueError(
            "classifier_type must contain 'fc' or 'conv', "
            f"got {self.classifier_type!r}")

    def forward(self, x):
        """Return logits for a batch of images ``x`` of shape (batch, 3, H, W)."""
        features = self.conv_layers(x)
        out = self.classifier(features)
        if 'conv' in self.classifier_type:
            # (batch, 1, 1, 1) -> (batch,): one logit per image.
            out = out.reshape([-1])
        return out

        
# Build both model variants and move each one to the GPU
# (``Module.cuda()`` returns the module itself, so the calls can be chained).
RONANetv1 = RONANet(classifier_type='conv').cuda()
RONANetv2 = RONANet(classifier_type='fc').cuda()
# This dataset is benchmarked by F1, AUC, and accuracy

def get_scores(model, dataloader):
    """Evaluate a single-logit binary classifier over an entire dataloader.

    Predictions and labels from every batch are collected before the
    metrics are computed, so the scores describe the whole dataset rather
    than one batch.

    Args:
        model: Module returning one raw logit per image. The logits are
            passed through a sigmoid here, so the model must not apply one
            itself.
        dataloader: Iterable of ``(image, label)`` batches with binary labels.

    Returns:
        Tuple ``(roc_auc, f1, accuracy)``; F1 and accuracy use a 0.5
        probability threshold.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.eval()
    # Run on whatever device the model lives on instead of assuming CUDA.
    first_param = next(model.parameters(), None)

    batch_probs = []
    batch_labels = []
    with torch.no_grad():
        for image, label in dataloader:
            if first_param is not None:
                image = image.to(first_param.device)
            logits = model(image)
            # Convert logits to probabilities so that the 0.5 threshold
            # below is meaningful.
            batch_probs.append(torch.sigmoid(logits).reshape(-1).cpu().numpy())
            batch_labels.append(label.reshape(-1).cpu().numpy())

    if not batch_labels:
        raise ValueError('dataloader yielded no batches to score')

    probabilities = np.concatenate(batch_probs)
    labels = np.concatenate(batch_labels)
    binary_prediction = np.where(probabilities < 0.5, 0, 1)
    return (
        metrics.roc_auc_score(labels, probabilities),
        metrics.f1_score(labels, binary_prediction),
        metrics.accuracy_score(labels, binary_prediction),
    )

# metrics.f1_score(labels, predictions), metrics.accuracy_score(labels, binary_predictions)   
# Train RONANetv1 with binary cross-entropy on its single output logit and
# checkpoint the weights whenever the validation AUC improves.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(RONANetv1.parameters(), lr=0.001)
num_epochs = 2
best_auc = 0.5  # only keep checkpoints that beat a random classifier's AUC
scores = {}
# Send every batch to the device the model already lives on.
device = next(RONANetv1.parameters()).device

for epoch in range(num_epochs):
    RONANetv1.train()
    print(f'Current Epoch: {epoch+1}')
    epoch_loss = 0
    num_batches = 0

    for images, labels in train_dataloader:
        images = images.to(device)
        # BCEWithLogitsLoss needs float targets with the same shape as the
        # logits, which the 'conv' head of RONANet flattens to (batch,).
        labels = labels.to(device).float().reshape(-1)

        optimizer.zero_grad()
        out = RONANetv1(images)
        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1

    # epoch_loss is a sum of per-batch mean losses, so it scales with the
    # number of batches; the mean is the comparable figure (ln 2 ~= 0.693
    # corresponds to chance-level predictions).
    print(f'Mean batch loss: {epoch_loss / max(num_batches, 1)}')
    print(f'Loss this epoch: {epoch_loss}\n')

    current_val_auc, current_val_f1, current_val_acc = get_scores(RONANetv1, val_dataloader)
    if current_val_auc > best_auc:
        best_auc = current_val_auc
        torch.save(RONANetv1.state_dict(), 'RONANetv1.pth')
        scores['AUC'] = current_val_auc
        scores['f1'] = current_val_f1
        scores['Accuracy'] = current_val_acc
        print(scores)

I have experimented with changing the learning rate (10e-5 to 0.1) and I’ve let it run for 100 epochs, but the loss will not change. It just hovers at ~38. I checked for the usual culprits: zeroing the optimizer gradients, calling loss.backward() and optimizer.step(), etc. If you have any thoughts or recommendations, I’d love to hear them. Thanks!