CNN Multilabel classification architecture & hyperparameter tuning

Hi,
I am currently training a multilabel classification model that takes as input RGB 268x180 images ([3, 268, 180]), with labels that are one-hot encoded for each class ([7])

I would appreciate any recommendations on how to modify my architecture/hyperparameters to obtain lower train/validation loss and a higher F1_score for more accurate predictions

num_epochs = 1
eval_every = 100
total_step = len(train_loader) * num_epochs
best_val_loss = None
criterion = nn.BCELoss()
model = model.to(device)  # move the model first so the optimizer sees the on-device parameters
optimizer = optim.Adam(model.parameters(), lr=0.001)
save_path = 'cifar_net.pt'
TRAIN(model, train_loader, valid_loader, num_epochs, eval_every, total_step, criterion, optimizer, best_val_loss, device, save_path)

After training I obtain this output:

Epoch [1/1], Step [100/260], Train Loss: 0.7003, Train Acc: 0.1125, Valid Loss: 0.6889, Valid Acc: 4.4154
Model saved to ==> cifar_net.pt
Epoch [1/1], Step [200/260], Train Loss: 0.6902, Train Acc: 0.1528, Valid Loss: 0.6886, Valid Acc: 3.9231
Model saved to ==> cifar_net.pt
Finished Training

This is my “naive” architecture as of now:

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()

        self.cnn_layers = nn.Sequential(
            # Defining a 2D convolution layer
            nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            # Defining another 2D convolution layer
            nn.Conv2d(16, 3, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(3),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        self.linear_layers = nn.Sequential(
            nn.Linear(3*67*45, 100),
            nn.ReLU(),
            nn.BatchNorm1d(100),
            nn.Dropout(p=0.4),
            nn.Linear(100,50),
            nn.BatchNorm1d(50),
            nn.Dropout(p=0.3),
            nn.ReLU(),
            nn.Linear(50, 7),
            nn.ReLU(),
            nn.Sigmoid()
        )

    # Defining the forward pass    
    def forward(self, x):
        x = self.cnn_layers(x)
        x = x.view(x.size(0), -1)
        x = self.linear_layers(x)
        return x

Thank you in advance

Could you try to remove the last nn.ReLU before the nn.Sigmoid?
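That is, the end of your linear_layers would become:

    nn.Linear(50, 7),
    nn.Sigmoid()  # ReLU removed: it clamps negative logits to zero, so the sigmoid could only output values in [0.5, 1]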

Also, I assume your targets are multi-hot encoded, i.e. each sample might contain more than a single valid class?

That’s right, multi-hot encoding is more appropriate, as each image can have more than a single class, e.g. a movie can be both Drama and Adventure genre
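For instance, with a hypothetical genre ordering, such a target would look like:

# hypothetical genre order: [Action, Adventure, Comedy, Documentary, Drama, Horror, Romance]
label = torch.tensor([0., 1., 0., 0., 1., 0., 0.])  # multi-hot: Adventure and Drama both active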

Thanks for the confirmation.

Is the validation accuracy calculation printed as a percentage, i.e. multiplied by 100?
The training accuracy seems to be in the range [0, 1], while it’s higher than 1 for the validation set.

I’d say my main concern is finding a way to improve the architecture; I have tried understanding the ResNet and VGG architectures. The constraints are that the architecture must have at least one batchnorm, maxpool, and/or dropout layer, fewer than 10 Conv2d layers, and fewer than 1,000,000 trainable parameters. One possible layout that fits these constraints is sketched below.
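For reference, here is one VGG-style layout that would satisfy those constraints; the layer widths, the global average pooling, and the classifier head are my own assumptions rather than a tuned design, and the model comes in at roughly 242k trainable parameters:

import torch
import torch.nn as nn

class SmallVGG(nn.Module):
    # VGG-style sketch: 4 Conv2d layers, batchnorm + maxpool + dropout, well under 1M params
    def __init__(self, num_classes=7):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1), nn.BatchNorm2d(32), nn.ReLU(inplace=True),
            nn.MaxPool2d(2),                              # 268x180 -> 134x90
            nn.Conv2d(32, 64, kernel_size=3, padding=1), nn.BatchNorm2d(64), nn.ReLU(inplace=True),
            nn.MaxPool2d(2),                              # -> 67x45
            nn.Conv2d(64, 128, kernel_size=3, padding=1), nn.BatchNorm2d(128), nn.ReLU(inplace=True),
            nn.MaxPool2d(2),                              # -> 33x22
            nn.Conv2d(128, 128, kernel_size=3, padding=1), nn.BatchNorm2d(128), nn.ReLU(inplace=True),
            nn.AdaptiveAvgPool2d(1),                      # global average pooling keeps the head small
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Dropout(p=0.3),
            nn.Linear(128, num_classes),
            nn.Sigmoid(),                                 # per-class probabilities for multilabel targets
        )

    def forward(self, x):
        return self.classifier(self.features(x))

model = SmallVGG()
print(sum(p.numel() for p in model.parameters() if p.requires_grad))  # ~242k trainable parameters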

I think there is a bug in the validation accuracy printout, but it's not too relevant, as I am mainly using the loss to see how my model is doing. Ideally I would also use the F1_score, but I believe I can print that with a classification report library
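For example, a minimal sketch with scikit-learn (the all_labels/all_outputs tensors are assumed to have been collected over the whole validation set; they are not names from the code below):

from sklearn.metrics import classification_report, f1_score

# assumed: all_labels and all_outputs are (num_samples, 7) tensors gathered during validation
y_true = all_labels.cpu().numpy().astype(int)           # multi-hot targets
y_pred = (all_outputs.cpu().numpy() > 0.5).astype(int)  # thresholded sigmoid outputs

print(classification_report(y_true, y_pred, zero_division=0))
print('micro F1:', f1_score(y_true, y_pred, average='micro', zero_division=0))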

def save_checkpoint(save_path, model, optimizer, val_loss):
    if save_path is None:
        return
    state_dict = {'model_state_dict': model.state_dict(),
                  'optimizer_state_dict': optimizer.state_dict(),
                  'val_loss': val_loss}

    torch.save(state_dict, save_path)

    print(f'Model saved to ==> {save_path}')

def load_checkpoint(model, optimizer):
    save_path = 'cifar_net.pt'
    state_dict = torch.load(save_path)
    model.load_state_dict(state_dict['model_state_dict'])
    optimizer.load_state_dict(state_dict['optimizer_state_dict'])
    val_loss = state_dict['val_loss']
    print(f'Model loaded from <== {save_path}')
    
    return val_loss
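
# Usage sketch for resuming (assumes model/optimizer are built as in the setup above):
#   best_val_loss = load_checkpoint(model, optimizer)
#   TRAIN(model, train_loader, valid_loader, num_epochs, eval_every,
#         total_step, criterion, optimizer, best_val_loss, device, save_path)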



def TRAIN(net, train_loader, valid_loader, num_epochs, eval_every, total_step, criterion, optimizer, val_loss, device, save_name):

    running_loss = 0.0
    running_corrects = 0
    running_num = 0
    global_step = 0
    best_val_loss = float('inf') if val_loss is None else val_loss
    

    for epoch in range(num_epochs):  # loop over the dataset multiple times

        for i, (inputs, labels) in enumerate(train_loader):
            net.train()
            inputs = inputs.to(device)
            labels = labels.to(device)

            '''Training of the model'''
            # Forward pass
            outputs = net(inputs)
            threshold = 0.5
            preds = (outputs > threshold).int()
            # exact-match accuracy: a sample only counts as correct if all 7 labels match
            for k in range(len(preds)):
                running_corrects += torch.equal(preds[k], labels[k].int())
            labels = labels.float()
            loss = criterion(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            global_step += 1

            running_loss += loss.item()
            running_num += len(labels)

            '''Evaluating the model every x steps'''
            if global_step % eval_every == 0:
                with torch.no_grad():
                    net.eval()
                    val_running_loss = 0.0
                    val_running_corrects = 0
                    for val_inputs, val_labels in valid_loader:
                        val_inputs = val_inputs.to(device)
                        val_labels = val_labels.to(device)
                        val_outputs = net(val_inputs)
                        val_labels = val_labels.float()
                        val_loss = criterion(val_outputs, val_labels)
                        threshold = 0.5
                        val_preds = (val_outputs > threshold).int()
                        for j in range(len(val_preds)):
                            val_running_corrects += torch.equal(val_preds[j], val_labels[j].int())
                        val_running_loss += val_loss.item()

                    average_train_loss = running_loss / eval_every
                    average_val_loss = val_running_loss / len(valid_loader)
                    average_train_acc = running_corrects / float(running_num)
                    # NOTE: this divides the per-sample correct count by the number of
                    # batches, not the number of samples, which is why the printed
                    # validation accuracy can exceed 1
                    average_val_acc = val_running_corrects / float(len(valid_loader))
            

                    print('Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Train Acc: {:.4f}, Valid Loss: {:.4f}, Valid Acc: {:.4f}'
                          .format(epoch + 1, num_epochs, global_step, total_step, average_train_loss,
                                  average_train_acc, average_val_loss, average_val_acc))

                    # ...log the running losses (requires a SummaryWriter instance named `writer`)
                    writer.add_scalar('training loss', average_train_loss, global_step)
                    writer.add_scalar('validation loss', average_val_loss, global_step)

                    running_loss = 0.0
                    running_num = 0
                    running_corrects = 0

                    if average_val_loss < best_val_loss:
                        best_val_loss = average_val_loss
                        save_checkpoint(save_name, net, optimizer, best_val_loss)
                    

    print('Finished Training')
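
As a follow-up on the accuracy printout: the fix would be to accumulate a sample count during validation and divide by that instead of len(valid_loader). A minimal sketch, assuming the same net, valid_loader, and device:

val_running_corrects = 0
val_running_num = 0
with torch.no_grad():
    net.eval()
    for val_inputs, val_labels in valid_loader:
        val_inputs = val_inputs.to(device)
        val_labels = val_labels.to(device)
        val_preds = (net(val_inputs) > 0.5).int()
        for j in range(len(val_preds)):
            val_running_corrects += torch.equal(val_preds[j], val_labels[j].int())
        val_running_num += len(val_labels)  # count samples, not batches

# stays in [0, 1]: exact-match accuracy over all validation samples
average_val_acc = val_running_corrects / float(val_running_num)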