My model won't learn, no matter what I do!

Hey! I’m having some issues with a CNN model I’m trying to recreate from a paper, and I’m hoping somebody could help me out :slight_smile: No matter what I do, the model just doesn’t learn, and I’m not sure why. I’ve tried different optimizers (Adam and SGD with momentum), but the loss and accuracy values just don’t change at all. Here’s a small example of how stuck it is (the loss stays at ~2.080 and barely moves, no matter the number of epochs):

Epoch 0: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:30<00:00,  0.53it/s, v_num=4, train_loss_step=2.070, val_loss=2.070, train_loss_epoch=3.170]

Epoch 1: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:05<00:00,  2.97it/s, v_num=4, train_loss_step=2.080, val_loss=2.070, train_loss_epoch=2.080]

Epoch 2: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:05<00:00,  2.77it/s, v_num=4, train_loss_step=2.060, val_loss=2.060, train_loss_epoch=2.080]

Epoch 3: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:05<00:00,  2.77it/s, v_num=4, train_loss_step=2.060, val_loss=2.060, train_loss_epoch=2.080]

Epoch 4: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:05<00:00,  2.77it/s, v_num=4, train_loss_step=2.060, val_loss=2.060, train_loss_epoch=2.080]

Here’s the model, for reference:

class Modelo(L.LightningModule):
    def __init__(self):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(negative_slope=0.05, inplace=True),
            nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(negative_slope=0.05, inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(negative_slope=0.05, inplace=True),
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(negative_slope=0.05, inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(negative_slope=0.05, inplace=True),
            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(negative_slope=0.05, inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(negative_slope=0.05, inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(negative_slope=0.05, inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(negative_slope=0.05, inplace=True),
        )

        self.classifier = nn.Sequential(
             nn.Dropout(p=0.5),
             nn.Linear(256 * 5 * 5, 256),
             nn.ReLU(inplace=True),
             nn.Dropout(p=0.5),
             nn.Linear(256, 256),
             nn.ReLU(inplace=True),
             #nn.Dropout(p=0.5),
             nn.Linear(256, 8)
        )

        for layer in self.features:
            if type(layer) == nn.Conv2d:
                nn.init.xavier_uniform_(layer.weight)
        for layer in self.classifier:
            if type(layer) == nn.Linear:
                nn.init.xavier_uniform_(layer.weight)


    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)

        return F.log_softmax(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.cross_entropy(logits, y)
        self.log('train_loss', loss, on_epoch=True, on_step=True,prog_bar=True)

        accu = ((argmax(logits, dim=1) == y).sum()/x.shape[0]).item()
        self.log('train_accu', accu, on_epoch=True, on_step=True, prog_bar=True)

        f1 = F1Score(task="multiclass", num_classes=8)
        f1_score = f1(logits, y)
        self.log('train_f1', f1_score, on_epoch=True, on_step=True, prog_bar=True)
        return loss

    def configure_optimizers(self):
        #return optim.SGD(self.parameters(), lr = 0.0001, momentum = 0.9, weight_decay = 0.0001)
        return optim.Adam(self.parameters(), lr=0.001)

    def validation_step(self, val_batch, batch_idx):
        x, y = val_batch
        logits = self(x)

        loss = F.cross_entropy(logits, y)
        self.log('val_loss', loss, on_epoch=True, on_step=False,prog_bar=True)

        accu = ((argmax(logits, dim=1) == y).sum()/x.shape[0]).item()
        self.log('val_accu', accu, on_epoch=True, on_step=True, prog_bar=True)

        f1 = F1Score(task="multiclass", num_classes=8)
        f1_score = f1(logits, y)
        self.log('val_f1', f1_score, on_epoch=True, on_step=True, prog_bar=True)

And training code:

        train_transform = transforms.Compose([
                transforms.Grayscale(num_output_channels=1),
                transforms.Resize((42, 42)),
                transforms.RandomHorizontalFlip(),
                transforms.RandomRotation(45),
                transforms.RandomResizedCrop(42, scale=(0.875, 1.125), ratio=(1.0, 1.0)),
                transforms.ToTensor()
        ])
        val_transform = transforms.Compose([
                transforms.Grayscale(num_output_channels=1),
                transforms.Resize((42, 42)),
                transforms.ToTensor()
        ])

        train_dataset = ImageFolder(root=train_dataset_dir, transform=train_transform)
        val_dataset   = ImageFolder(root=val_dataset_dir,   transform=val_transform)

        batch_size = 128

        train_loader = DataLoader(train_dataset, shuffle=True,batch_size=batch_size)
        val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)

        model = module.Modelo()

        early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=0.001, patience=5, verbose=False, mode="max")
        trainer = L.Trainer(accelerator="tpu", devices="auto", strategy="auto", callbacks=[early_stop_callback],max_epochs=2000,default_root_dir=model_dir)
        trainer.fit(model, train_loader, val_loader)

I’m really lost here, so if anyone could help me out I’d greatly appreciate it!

Try to overfit a small subset of the data, e.g. just 10 samples, and remove the F.log_softmax from forward, since F.cross_entropy expects raw logits (it applies log_softmax internally).
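
Roughly something like this, reusing your existing train_dataset, model, and model_dir from your training script (the subset indices and epoch count here are just placeholders):

from torch.utils.data import DataLoader, Subset

# In Modelo.forward, return the classifier output directly and drop the
# F.log_softmax call; F.cross_entropy already applies log_softmax internally.

# Then check that the model can memorize a handful of samples:
tiny_train = Subset(train_dataset, list(range(10)))   # just 10 images
tiny_loader = DataLoader(tiny_train, batch_size=10, shuffle=True)

overfit_trainer = L.Trainer(max_epochs=200, default_root_dir=model_dir)
overfit_trainer.fit(model, tiny_loader)
# train_loss should drop towards ~0; if it stays around ~2.08, the problem
# is in the model/optimization setup rather than in the amount of data.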

I’m getting even worse values… it’s very strange. Here’s what it looks like after 50 epochs:

Epoch 49/49 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1/1 0:00:00 • 0:00:00 0.00it/s
v_num: 12.000  train_loss_step: 2.061  train_accu_step: 0.000  train_f1_step: 0.000
val_accu_step: 0.000  val_f1_step: 0.000  val_loss: 2.110
val_accu_epoch: 0.000  val_f1_epoch: 0.000  train_loss_epoch: 2.061
train_accu_epoch: 0.000  train_f1_epoch: 0.000

(note that I changed the progress bar but everything else is the same)

This is weird, since your model should not be worse than pure random chance.
E.g. even if your model predicted the same label for all samples, the accuracy would equal that class’s frequency. If all classes are balanced, that would be 1/8 in your case.
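
As a quick sanity check (synthetic numbers, not from your run): with 8 balanced classes, a constant/uniform prediction already gives a cross-entropy of ln(8) ≈ 2.079, which is exactly the ~2.08 plateau you are seeing, and an accuracy of about 1/8:

import torch
import torch.nn.functional as F

y = torch.randint(0, 8, (1000,))     # roughly balanced random labels
logits = torch.zeros(1000, 8)        # uniform prediction for every sample
print(F.cross_entropy(logits, y))    # ~2.079 = ln(8), i.e. chance-level loss
print((logits.argmax(dim=1) == y).float().mean())  # ~0.125 = 1/8 accuracy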

I’m also able to overfit random noise perfectly using your model:

import torch
import torch.nn.functional as F

# Modelo is your class from above
device = "cuda"
model = Modelo().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
x, y = torch.randn(64, 1, 42, 42, device=device), torch.randint(0, 8, (64,), device=device)

for i in range(300):
    optimizer.zero_grad()
    logits = model(x)
    loss = F.cross_entropy(logits, y)
    loss.backward()
    optimizer.step()
    
    acc = ((torch.argmax(logits, dim=1) == y).sum()/x.shape[0]).item()
    print("iter {}, loss {:.3f}, acc {}".format(i, loss.item(), acc))
    
# iter 0, loss 2.080, acc 0.0625
# iter 1, loss 2.078, acc 0.15625
# iter 2, loss 2.075, acc 0.125
# iter 3, loss 2.068, acc 0.1875
# ...
# iter 100, loss 2.018, acc 0.1875
# iter 101, loss 1.995, acc 0.21875
# iter 102, loss 2.010, acc 0.1875
# iter 103, loss 2.006, acc 0.203125
# ...
# iter 296, loss 0.010, acc 1.0
# iter 297, loss 0.007, acc 1.0
# iter 298, loss 0.009, acc 1.0
# iter 299, loss 0.008, acc 1.0

so you might need to play around with the hyperparameters more.
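
For example, a quick (hypothetical) sweep over learning rates, reusing the random-noise setup above; the values below are just guesses to compare:

for lr in (1e-2, 1e-3, 1e-4, 1e-5):
    model = Modelo().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for i in range(300):
        optimizer.zero_grad()
        loss = F.cross_entropy(model(x), y)
        loss.backward()
        optimizer.step()
    print("lr {}: final loss {:.3f}".format(lr, loss.item()))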

Try adding a BN layer after each conv2d and before the relu, e.g. like the block below.
Based on my experience, if you face a model whose loss won't decrease, RELAX, IT IS COMMON!!! T…T
I often can't sleep when I encounter such a problem, and feel very bad.
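
For the first block in self.features, that would look something like this (the remaining blocks follow the same pattern; the BatchNorm placement is just the suggestion above, not something from the paper):

import torch.nn as nn

# First conv block with BatchNorm2d inserted between each Conv2d and its LeakyReLU
block = nn.Sequential(
    nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
    nn.BatchNorm2d(32),
    nn.LeakyReLU(negative_slope=0.05, inplace=True),
    nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1),
    nn.BatchNorm2d(32),
    nn.LeakyReLU(negative_slope=0.05, inplace=True),
    nn.MaxPool2d(kernel_size=2, stride=2),
)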