CNN model does not update weights

Hello,
I implemented a CNN for CIFAR-10 image classification, but my model does not seem to update its weights. I have already looked through several threads discussing the same symptoms, but my model seems to have a different problem.

Model

import torch
import torch.nn as nn

class CNNModel(ImageClassificationBase):
    def __init__(self):
        super().__init__()
        self.network = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2), # output: 64 x 16 x 16

            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2), # output: 128 x 8 x 8

            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2), # output: 256 x 4 x 4

            nn.Flatten(), 
            nn.Linear(256*4*4, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, 10))
        
    def forward(self, x):
        return self.network(x)

The model inherits from ImageClassificationBase:

class ImageClassificationBase(nn.Module):
    def training_step(self, batch, criterion):
        images, labels = batch
        images = images.to(device)
        labels = labels.to(device)
        out = self(images)              # Generate predictions
        loss = criterion(out, labels)   # Calculate loss
        return loss
    
    def validation_step(self, batch, criterion):
        images, labels = batch
        images = images.to(device)
        labels = labels.to(device)
        out = self(images)                    # Generate predictions
        loss = criterion(out, labels)         # Calculate loss
        acc = accuracy(out, labels)           # Calculate accuracy
        return {'val_loss': loss.detach(), 'val_acc': acc}
        
    def validation_epoch_end(self, outputs):
        batch_losses = [x['val_loss'] for x in outputs]
        epoch_loss = torch.stack(batch_losses).mean()   # Combine losses
        batch_accs = [x['val_acc'] for x in outputs]
        epoch_acc = torch.stack(batch_accs).mean()      # Combine accuracies
        return {'val_loss': epoch_loss.item(), 'val_acc': epoch_acc.item()}
    
    def epoch_end(self, epoch, result):
        print("Epoch [{}], train_loss: {:.4f}, val_loss: {:.4f}, val_acc: {:.4f}".format(
            epoch+1, result['train_loss'], result['val_loss'], result['val_acc']))

Training Loop

def fit(epochs, model, train_loader, val_loader, optimizer, criterion):
    history = []
    for epoch in range(epochs):
        # Training phase 
        model.train()
        train_losses = []
        for batch in train_loader:
            loss = model.training_step(batch, criterion)
            train_losses.append(loss.detach())  # detach so the full graph is not kept for every batch

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # Validation phase
        result = evaluate(model, val_loader, criterion)
        result['train_loss'] = torch.stack(train_losses).mean().item()
        model.epoch_end(epoch, result)
        history.append(result)
    return history

Further code used for training

def accuracy(output, labels):
    _, predictions = torch.max(output, dim=1)
    return torch.tensor(torch.sum(predictions == labels).item() / len(predictions))

def evaluate(model, val_loader, criterion):
    model.eval()
    with torch.no_grad():  # no gradients are needed during validation
        outputs = [model.validation_step(batch, criterion) for batch in val_loader]
    return model.validation_epoch_end(outputs)

I use CrossEntropyLoss as the criterion

criterion = nn.CrossEntropyLoss()

and Adam as the optimizer

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay, eps=eps)

In some other thread, someone suggested looking at

list(model.parameters())[0].grad

which returns a tensor completely filled with zeros.
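For reference, this is how the gradients of every parameter tensor can be inspected after a single backward pass, not just the first one (a minimal sketch, assuming model, criterion, train_loader, and device are defined as above):

model.train()
images, labels = next(iter(train_loader))
loss = criterion(model(images.to(device)), labels.to(device))
model.zero_grad()
loss.backward()
for name, p in model.named_parameters():
    # None means no gradient was computed for this tensor at all
    print(name, None if p.grad is None else p.grad.norm().item())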

Accuracy and loss stay the same over a couple of epochs (the tensor list shows the model's predictions for individual images).

In some other threads the problem was a combination of Softmax and CrossEntropyLoss. That is not the problem in my case, as my network ends in a plain Linear layer and outputs raw logits with no Softmax (see the illustration below).
I already tried SGD as well, and both optimizers with different learning rates (0.0001 to 0.001). Nothing worked.
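(For context: nn.CrossEntropyLoss combines LogSoftmax and NLLLoss internally, so it expects raw logits; adding an explicit Softmax in front of it squashes the gradients. A minimal illustration:)

import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss()
logits = torch.randn(8, 10)          # raw network outputs, no Softmax applied
labels = torch.randint(0, 10, (8,))
loss = criterion(logits, labels)     # log-softmax is applied inside the loss
print(loss.item())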

I really have no idea where the problem is…
Thanks for the help!

Could you post how you are creating the optimizer, and also the loss for each batch in the loader?

I initialize the optimizer as follows:

model = CNNModel().to(device)
optimizer = initialize_optimizer(model, "adam")

def initialize_optimizer(model, opt_func):
    # learning_rate, weight_decay, eps, lr_decay, and momentum are set globally
    if opt_func == 'adam':
        return torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay, eps=eps)
    elif opt_func == 'adagrad':
        return torch.optim.Adagrad(model.parameters(), lr=learning_rate, lr_decay=lr_decay, weight_decay=weight_decay, eps=eps)
    elif opt_func == 'sgd':
        return torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum, weight_decay=weight_decay)
    else:
        raise ValueError("Specified optimizer has not been implemented.")

Here are the losses for each batch in the third epoch (I left some out, since the loss is pretty much the same for every batch in every epoch):

Batch Loss: tensor(2.3031, device='cuda:0', grad_fn=<NllLossBackward0>)
Batch Loss: tensor(2.3029, device='cuda:0', grad_fn=<NllLossBackward0>)
Batch Loss: tensor(2.3038, device='cuda:0', grad_fn=<NllLossBackward0>)
Batch Loss: tensor(2.3028, device='cuda:0', grad_fn=<NllLossBackward0>)
....
Batch Loss: tensor(2.3024, device='cuda:0', grad_fn=<NllLossBackward0>)
Batch Loss: tensor(2.3024, device='cuda:0', grad_fn=<NllLossBackward0>)
Batch Loss: tensor(2.3040, device='cuda:0', grad_fn=<NllLossBackward0>)
Batch Loss: tensor(2.3011, device='cuda:0', grad_fn=<NllLossBackward0>)
Batch Loss: tensor(2.3009, device='cuda:0', grad_fn=<NllLossBackward0>)

I would sanity-check something like sum(p.sum() for p in model.parameters()) after each iteration. If this isn’t changing, it suggests that the gradient updates aren’t being applied. If it is changing, then it could be that the model is being repeatedly re-initialized somehow.
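For example, inside the training loop from fit above (a minimal sketch):

for batch in train_loader:
    loss = model.training_step(batch, criterion)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # If this checksum never changes, the optimizer step is having no effect
    checksum = sum(p.sum().item() for p in model.parameters())
    print("param checksum:", checksum)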


The model parameters do seem to change; they are going towards zero. Validation accuracy and training loss stay the same, though.

I don’t think that the model is repeatedly re-initialized though. This is exactly how I initiate training:

from torch.utils.data import DataLoader

# Initialize dataloaders
train_dl = DataLoader(train_ds, batch_size_target, num_workers=2, shuffle=True, pin_memory=True)
val_dl = DataLoader(val_ds, batch_size_target, num_workers=2, pin_memory=True)

# Initialize model
model = CNNModel().to(device)

# Initialize optimizer
optimizer = initialize_optimizer(model, opt_func)
criterion = nn.CrossEntropyLoss()

_ = fit(num_epochs, model, train_dl, val_dl, optimizer, criterion)

I re-coded initialize_optimizer to instantiate the optimizer first and then return the optimizer object, but that did not help either.

I found the problem! The cause seems to be the combination of weight_decay=0.005 and learning_rate=0.0005. When I set weight_decay=0, the model started to learn.

Still, I have to say I find it quite strange. I always thought weight_decay and learning_rate were independent of each other, and accordingly that weight_decay could also be greater than learning_rate.

Thanks for your help though!
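(A possible explanation, for anyone who finds this later: plain torch.optim.Adam implements weight_decay as an L2 penalty added to the gradient, i.e. g = dL/dw + weight_decay * w. Because Adam rescales gradients adaptively, the decay term can dominate the update whenever the loss gradient is weak, pulling every weight towards zero; all-zero weights produce a uniform prediction, which matches the constant loss of about 2.303 = ln(10) in the logs above. A toy experiment illustrating the effect, with the loss gradient forced to zero:)

import torch

# With a zero "data" gradient, any weight movement comes purely from weight_decay.
w = torch.nn.Parameter(torch.ones(4))
opt = torch.optim.Adam([w], lr=5e-4, weight_decay=5e-3)

for step in range(2000):
    opt.zero_grad()
    loss = (w * 0.0).sum()   # the loss is identically zero, so dL/dw = 0
    loss.backward()
    opt.step()

print(w.data)  # noticeably shrunk towards zero despite a zero loss gradient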