Constant validation loss and accuracy in CNN

I made a custom CNN architecture, and when I train the model the validation loss and accuracy do not improve, while the training accuracy improves only slightly. I also tried a pretrained model and it works fine, so the problem seems specific to my architecture.

The architecture is below.

class Custom(nn.Module):
    def __init__(self, num_classes):
        super(Custom, self).__init__()
        
        self.conv1 = nn.Conv2d(3, 128, kernel_size=3, stride=1, padding=2)
        self.conv2 = nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=0)
        self.conv3 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=0)
        self.conv4 = nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=0)
        self.conv5 = nn.Conv2d(512, 1024, kernel_size=3, stride=1, padding=0)
        self.conv6 = nn.Conv2d(1024, 1024, kernel_size=3, stride=1, padding=0)
        
        self.bn_relu_max1 = nn.Sequential(
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(4, 2, 1)
            )
        self.bn_relu_max2 = nn.Sequential(
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(4, 2, 1)
            )
        self.bn_relu_max3 = nn.Sequential(
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(4, 2, 1)
            )
        self.bn_relu_max4 = nn.Sequential(
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.MaxPool2d(4, 2, 1)
            )
        self.bn_relu_max5 = nn.Sequential(
            nn.BatchNorm2d(1024),
            nn.ReLU(),
            nn.MaxPool2d(4, 2, 1)
            )
        self.bn_relu_max6 = nn.Sequential(
            nn.BatchNorm2d(1024),
            nn.ReLU(),
            nn.MaxPool2d(4, 2, 1)
            )
        
        self.adaptiveavgpool = nn.AdaptiveAvgPool2d((1,1))
        
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Dropout2d(0.5),
            nn.Linear(1024, 128),
            nn.ReLU(),
            nn.Linear(128, num_classes))
        
    def forward(self, input):
        
        x = self.conv1(input)
        x = self.bn_relu_max1(x)
        
        x = self.conv2(x)
        x = self.bn_relu_max2(x)
        
        x = self.conv3(x)
        x = self.bn_relu_max3(x)
        
        x = self.conv4(x)
        x = self.bn_relu_max4(x)
        
        x = self.conv5(x)
        x = self.bn_relu_max5(x)
        
        x = self.conv6(x)
        x = self.bn_relu_max6(x)
        
        x = self.adaptiveavgpool(x)
        x = self.classifier(x)
        
        return x
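
For context, a quick forward-pass check of the architecture produces the expected [batch, num_classes] output. This is only a sketch: the 224x224 input size is an assumption, and num_classes=5 matches the dataset described further down.

import torch

# shape sanity check (sketch; the 224x224 input size is assumed, not taken from the post)
model = Custom(num_classes=5)
model.eval()
with torch.no_grad():
    out = model(torch.randn(2, 3, 224, 224))
print(out.shape)  # torch.Size([2, 5]); the six conv+pool stages reduce 224x224 to 1x1 before the adaptive pool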

The loss function, optimizer, scheduler, and AMP setup are below.

criterion = nn.CrossEntropyLoss()
optimizer = bnb.optim.Adam8bit(model.parameters(), lr=0.001)
use_amp = True
lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2)
scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

The training and validation loops are below.

total_step = len(loader_train)

def calculate_accuracy(y_pred, y):
    top_pred = y_pred.argmax(1, keepdim=True)
    correct = top_pred.eq(y.view_as(top_pred)).sum()
    acc = correct.float() / y.shape[0]
    return acc

def train(model, iterator, optimizer, criterion, device):
    
    liveloss = PlotLosses()
    epoch_loss = 0
    epoch_acc = 0
    losses = []
    train.lrs = []

    model.train()

    for (x, y) in tqdm(iterator, desc="Training", leave=False):
        logs = {}
        x = x.to(device)
        y = y.to(device)
        
        with autocast(enabled=use_amp):
            optimizer.zero_grad()            
            y_pred = model(x)

            loss = criterion(y_pred, y)
            losses.append(loss.item())

            acc = calculate_accuracy(y_pred, y)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()


            epoch_loss += loss.item()
            epoch_acc += acc.item()          


    train.lrs.append(optimizer.param_groups[0]['lr'])
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
def evaluate(model, iterator, criterion, device):

    epoch_loss = 0
    epoch_acc = 0
    losses = []
    evaluate.lrs_val = []

    model.eval()

    with torch.no_grad():

        for (x, y) in tqdm(iterator, desc="Evaluating", leave=False):

            x = x.to(device)
            y = y.to(device)
            
            #with autocast(enabled=use_amp):
            y_pred = model(x)

            loss = criterion(y_pred, y)
            losses.append(loss.item())

            acc = calculate_accuracy(y_pred, y)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
            
            mean_loss = sum(losses)/len(losses)
            
            lr_scheduler.step(mean_loss)
                            
    evaluate.lrs_val.append(optimizer.param_groups[0]['lr'])
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

EPOCHS=10
best_valid_loss = float('inf')

for epoch in trange(EPOCHS, desc="Epochs"):

    start_time = time.monotonic()

    train_loss, train_acc = train(model, loader_train, optimizer, criterion,device)
    valid_loss, valid_acc = evaluate(model, loader_valid, criterion,device)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut3-model.pt')

    end_time = time.monotonic()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

The dataset has 5 classes, and I have performed data augmentation using albumentations to tackle class imbalance. I'm not sure why this is happening. Any help would be appreciated.
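For reference, the augmentation pipeline looks roughly like the sketch below; the exact transforms, image size, and normalization values here are illustrative rather than my exact settings.

import albumentations as A
from albumentations.pytorch import ToTensorV2

# illustrative training augmentations (actual pipeline may differ)
train_transform = A.Compose([
    A.Resize(224, 224),
    A.HorizontalFlip(p=0.5),
    A.ShiftScaleRotate(p=0.3),
    A.RandomBrightnessContrast(p=0.2),
    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ToTensorV2(),
])
# applied per sample as: augmented = train_transform(image=image_np)["image"]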

Your code looks fine to me. Did you check the gradients? It might be worth a shot to look at the gradient flow.
Another trick to try is adding regularisation to your optimizer, or implementing it manually.
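For example, L2 regularisation can be added through the optimizer's weight_decay argument. This is a minimal sketch using torch.optim.AdamW; whether your bitsandbytes Adam8bit exposes the same argument is something to check, and 1e-4 is just an example value.

import torch.optim as optim

# sketch: L2 regularisation via weight decay (1e-4 is only an example value)
optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)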

I tried adding this function to check the gradient flow of the network, but it's giving me an AttributeError.

def plot_grad_flow(named_parameters):
    '''Plots the gradients flowing through different layers in the net during training.
    Can be used for checking for possible gradient vanishing / exploding problems.
    
    Usage: Plug this function in Trainer class after loss.backwards() as 
    "plot_grad_flow(self.model.named_parameters())" to visualize the gradient flow'''
    ave_grads = []
    max_grads= []
    layers = []
    for n, p in named_parameters:
        if(p.requires_grad) and ("bias" not in n):
            layers.append(n)
            ave_grads.append(p.grad.abs().mean().cpu().detach().numpy())
            max_grads.append(p.grad.abs().max().cpu().detach().numpy())
    plt.bar(np.arange(len(max_grads)), max_grads, alpha=0.1, lw=1, color="c")
    plt.bar(np.arange(len(max_grads)), ave_grads, alpha=0.1, lw=1, color="b")
    plt.hlines(0, 0, len(ave_grads)+1, lw=2, color="k" )
    plt.xticks(range(0,len(ave_grads), 1), layers, rotation="vertical")
    plt.xlim(left=0, right=len(ave_grads))
    plt.ylim(bottom = -0.001, top=0.02) # zoom in on the lower gradient regions
    plt.xlabel("Layers")
    plt.ylabel("average gradient")
    plt.title("Gradient flow")
    plt.grid(True)
    plt.legend([Line2D([0], [0], color="c", lw=4),
                Line2D([0], [0], color="b", lw=4),
                Line2D([0], [0], color="k", lw=4)], ['max-gradient', 'mean-gradient', 'zero-gradient'])

Error:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
/tmp/ipykernel_3753/3966350946.py in <cell line: 4>()
      6     start_time = time.monotonic()
      7 
----> 8     train_loss, train_acc = train(model, loader_train, optimizer, criterion,device)
      9     valid_loss, valid_acc = evaluate(model, loader_valid, criterion,device)
     10 

/tmp/ipykernel_3753/3118454070.py in train(model, iterator, optimizer, criterion, device)
     32 
     33             scaler.scale(loss).backward()
---> 34             plot_grad_flow(model.named_parameters())
     35             scaler.step(optimizer)
     36             scaler.update()

/tmp/ipykernel_3753/2366640368.py in plot_grad_flow(named_parameters)
     11         if(p.requires_grad) and ("bias" not in n):
     12             layers.append(n)
---> 13             ave_grads.append(p.grad.abs().mean().cpu().detach().numpy())
     14             max_grads.append(p.grad.abs().max().cpu().detach().numpy())
     15     plt.bar(np.arange(len(max_grads)), max_grads, alpha=0.1, lw=1, color="c")

AttributeError: 'NoneType' object has no attribute 'abs'

I call the function here, right after the backward pass:

scaler.scale(loss).backward()
plot_grad_flow(model.named_parameters())
scaler.step(optimizer)
scaler.update()

Is there something wrong with the function?

The error shows that some of your parameters have None gradients. It might be that those gradients are never computed, so the model ends up not learning.
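A quick way to confirm this is to list which parameters still have no gradient right after the backward call (a sketch, placed after scaler.scale(loss).backward() since you are using AMP):

# sketch: report parameters that never receive a gradient
for name, param in model.named_parameters():
    if param.requires_grad and param.grad is None:
        print(f"no gradient for: {name}")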

If I'm not mistaken, a learning rate of 1e-4 is widely recommended when using Adam, so it's worth a try. Another possibility is that your model overshoots and ends up in a poor local minimum early in training, especially since your model is quite deep.
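For example, keeping the rest of your setup unchanged (a sketch):

# sketch: same optimizer call as before, only with a lower learning rate
optimizer = bnb.optim.Adam8bit(model.parameters(), lr=1e-4)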

Yes, I did try that and a couple of other learning rates too, but no luck! :confused:

Also, is it right to have a Flatten layer right after the AdaptiveAvgPool2d? Could that be the problem?