FCN ResNet18 low accuracy on SUNRGBD dataset

I am running an experiment for which the ResNet18-based network needs to be as accurate as possible. The idea is to deploy the network on the Jetson Nano embedded board mounted on my mobile robot. The robot's task is to move around a production environment and recognize machines and the parts located on them.

For this experiment I am using Python, specifically the PyTorch library.

What I have already tried:

* Varying the number of epochs (50, 100, 150, 400, 500, 750, 1000)
* Varying the learning rate (0.0001, 0.00001, 0.000001, 0.0000001, 0.00000001)
* Varying the optimizer (SGD, RMSprop, SGD with Nesterov momentum (NAG), Adam, AdamW)
* Learning rate schedulers (gamma=0.1 every 100 epochs, gamma=0.1 every 50 epochs, gamma=0.5 every 20 epochs)
* Dropout (0.2, 0.3, 0.5) and weight_decay (0.01, 0.005, 0.001)
* Gradient clipping (max norm 1, 5, 10, 25, 50)
* Data augmentation (in the code below the transforms are applied to the RGB image only; a sketch of joint image/label augmentation follows this list)
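
Regarding the joint augmentation mentioned above: here is a minimal sketch of how the geometric transforms could be applied to both the PIL image and the label mask inside __getitem__, before ToTensor/Normalize. The helper name joint_geometric_transform is only illustrative and not part of my current code:

import random
import torchvision.transforms.functional as TF
from torchvision.transforms import InterpolationMode

def joint_geometric_transform(image, label):
    # Apply the same random horizontal flip to the image and the label mask
    if random.random() < 0.5:
        image = TF.hflip(image)
        label = TF.hflip(label)
    # Apply the same random rotation to both; nearest-neighbour interpolation keeps label class ids intact
    angle = random.uniform(-15, 15)
    image = TF.rotate(image, angle, interpolation=InterpolationMode.BILINEAR)
    label = TF.rotate(label, angle, interpolation=InterpolationMode.NEAREST)
    return image, label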

What is my goal:

Currently my training accuracy is 72% and my validation accuracy is 47%. I want to improve both substantially, to at least 80%.

So my questions are:

Is this even achievable with this type of model (an FCN with a ResNet18 backbone)? If so, what can I do to get there?

My code:

import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from torchvision.models import resnet18
from tqdm import tqdm
import matplotlib.pyplot as plt

import time

s_time = time.time()

class FCNHead(nn.Sequential):
    def __init__(self, in_channels, channels):
        inter_channels = in_channels // 4
        layers = [
            nn.Conv2d(in_channels, inter_channels, 3, padding=1, bias=False),
            nn.BatchNorm2d(inter_channels),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Conv2d(inter_channels, channels, 1)
        ]
        super().__init__(*layers)

# Define normalization parameters
mean = [0.3903, 0.2816, 0.2673]
std = [0.3092, 0.2990, 0.2931]

# Custom dataset class for SUNRGBD_new 
class CustomSUNRGBDNew(Dataset):
    def __init__(self, root_dir, split='train', augment=False):
        self.root_dir = root_dir
        self.split = split
        self.augment = augment
        self.image_dir = os.path.join(root_dir, 'image', split)
        self.depth_dir = os.path.join(root_dir, 'depth', split)
        self.label_dir = os.path.join(root_dir, 'label13', split)
        self.image_files = sorted([f for f in os.listdir(self.image_dir) if f.endswith('.jpg')])

        # Define the augmentations if `augment` is set to True.
        # Note: these transforms are applied to the RGB image only; the label mask is not
        # transformed, so the geometric augmentations (flip, rotation) would misalign the
        # image and its label if enabled as written.
        if augment:
            self.image_transform = transforms.Compose([
                transforms.RandomHorizontalFlip(),
                transforms.RandomRotation(degrees=15),
                transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
                transforms.ToTensor(),
                transforms.Normalize(mean=mean, std=std)
            ])
        else:
            self.image_transform = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize(mean=mean, std=std)
            ])

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        img_name = os.path.join(self.image_dir, self.image_files[idx])
        depth_name = os.path.join(self.depth_dir, self.image_files[idx].replace('img-', '00').replace('.jpg', '.png'))
        label_name = os.path.join(self.label_dir, self.image_files[idx].replace('img-', 'img13labels-').replace('.jpg', '.png'))

        # Load image
        if not os.path.exists(img_name):
            raise FileNotFoundError(f'Image file not found: {img_name}')
        image = Image.open(img_name).convert('RGB')
        image = self.image_transform(image)  # Apply augmentations and normalization

        # Load depth map
        if not os.path.exists(depth_name):
            print(f'Depth file not found: {depth_name}')
            depth = torch.zeros((531, 730), dtype=torch.uint8)
        else:
            depth = Image.open(depth_name).convert('L')
            depth = torch.from_numpy(np.array(depth))

        # Load label map
        if not os.path.exists(label_name):
            print(f'Label file not found: {label_name}')
            label = torch.zeros((531, 730), dtype=torch.uint8)
        else:
            label = Image.open(label_name).convert('L')
            label = torch.from_numpy(np.array(label))
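        # Note: the depth map is returned together with the image and label below, but the
        # training and validation loops never feed it to the model; only the RGB image is used as input.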

        return image, label, depth

# Unnormalization function
def unnormalize_image(image, mean, std):
    """Unnormalize the image for visualization."""
    image_np = image.squeeze(0).cpu().permute(1, 2, 0).numpy()  # Reshape to HWC format
    mean = np.array(mean)
    std = np.array(std)
    image_np = (image_np * std + mean) * 255  # Unnormalize and convert to [0, 255]
    image_np = image_np.astype(np.uint8)
    return image_np

# Custom FCN model using ResNet18 backbone
class FCNResNet18(nn.Module):
    def __init__(self, num_classes):
        super(FCNResNet18, self).__init__()
        backbone = resnet18(pretrained=True)
        self.backbone = nn.Sequential(*list(backbone.children())[:-2])  # Remove the last two layers (pooling and FC)
        self.head = FCNHead(512, num_classes)  # FCN head for segmentation
        self.dropout = nn.Dropout(p=0.5)  # added for the dropout experiments (12 Nov)

    def forward(self, x):
        x = self.backbone(x)
        x = self.dropout(x)  # added for the dropout experiments (12 Nov)
        x = self.head(x)
        return x
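# Note: the ResNet18 backbone downsamples the input by a factor of 32, so the head outputs a coarse
# (H/32 x W/32) class map; the training loop below upsamples it back to the label size with bilinear interpolation.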

# Dataset loading
root_dir = '/home/ITSstudent_1/dataset/SUNRGBD_new'  # The new preprocessed dataset directory
train_dataset = CustomSUNRGBDNew(root_dir=root_dir, split='train')
val_dataset = CustomSUNRGBDNew(root_dir=root_dir, split='test')

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=4)  # try reducing to 16; maximum value 128
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, num_workers=4)  # reduce to 8; maximum value 128

# Load the custom FCN model with ResNet18 backbone and 13 classes
num_classes = 13
model = FCNResNet18(num_classes)

# Select the GPU if available
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')  # add ':0' if only a single GPU is used

# Move model to the selected device (GPU or CPU)
model = model.to(device)

# Class weights for loss function (assuming they are pre-saved in a file)
class_weights = torch.load('class_weights_newSUNRGBD.pth').to(device)
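# Illustrative sketch only: one common way such inverse-frequency weights could be computed from
# the training labels (my actual 'class_weights_newSUNRGBD.pth' was generated separately):
#   counts = torch.zeros(num_classes, dtype=torch.float64)
#   for _, label, _ in train_dataset:
#       counts += torch.bincount(label.flatten().long(), minlength=num_classes)[:num_classes].double()
#   class_weights = (counts.sum() / (num_classes * counts.clamp(min=1))).float()
#   torch.save(class_weights, 'class_weights_newSUNRGBD.pth')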

# Loss function and optimizer
criterion = nn.CrossEntropyLoss(weight=class_weights, ignore_index=13)  # pixels labelled 13 are excluded from the loss
optimizer = optim.RMSprop(model.parameters(), lr=0.0001, weight_decay=0.01)  # optimizer changed to RMSprop

# Learning rate scheduler
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.5)

# Training loop
num_epochs = 250
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for images, labels, depths in tqdm(train_loader):
        images = images.to(device)
        labels = labels.to(device).long()

        optimizer.zero_grad()
        outputs = model(images)

        # Upsample the output to match the label size
        outputs = torch.nn.functional.interpolate(outputs, size=labels.shape[1:], mode='bilinear', align_corners=False)

        loss = criterion(outputs, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=25.0)
        optimizer.step()

        running_loss += loss.item()
        
        # Calculate accuracy
        _, predicted = torch.max(outputs.data, 1)
        correct += (predicted == labels).sum().item()
        total += labels.numel()
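        # Note: this is overall pixel accuracy; every pixel counts equally (including pixels labelled
        # with the ignore_index 13 that the loss skips), so frequent classes dominate the reported number.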
        
    # Step the learning rate scheduler
    scheduler.step()
        
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / len(train_loader)}, Accuracy: {100 * correct / total:.2f}%")

    
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for images, labels, depths in tqdm(val_loader):
            images = images.to(device)
            labels = labels.to(device).long()

            outputs = model(images)

            # Upsample the output to match the label size
            outputs = torch.nn.functional.interpolate(outputs, size=labels.shape[1:], mode='bilinear', align_corners=False)

            loss = criterion(outputs, labels)
            val_loss += loss.item()

            _, predicted = torch.max(outputs.data, 1)
            val_correct += (predicted == labels).sum().item()
            val_total += labels.numel()

    print(f"Validation Loss: {val_loss / len(val_loader)}, Validation Accuracy: {100 * val_correct / val_total:.2f}%")

# Save the model after training
torch.save(model.state_dict(), 'fcn_resnet18_sunrgbd_new.pth')

# ------------------ Evaluation and Plotting Function with "check" Matrix ------------------ #
def visualize_prediction(model, dataset, index):
    """Visualize the image, label, predicted segmentation mask, and the check matrix."""
    model.eval()

    # Fetch image, label, and depth from the dataset at the specified index
    image, label, depth = dataset[index]
    image = image.unsqueeze(0).to(device)  # Add batch dimension and move to GPU

    # Make a prediction
    with torch.no_grad():
        output = model(image)

    # Get predicted class (argmax over the class dimension)
    pred = torch.argmax(output.squeeze(), dim=0).cpu().numpy()

    # "check" matrix: position of the max value in the class dimension for each pixel
    check = torch.argmax(output, dim=1).squeeze().cpu().numpy()  # Same as prediction, just highlighting max class

    # Unnormalize the image for visualization
    image_np = unnormalize_image(image, mean, std)

    label_np = label.cpu().numpy()  # Ground truth label

    # Plot the image, label, prediction, and "check" matrix
    fig, axes = plt.subplots(1, 4, figsize=(20, 5))

    # Plot the original image
    axes[0].imshow(image_np)
    axes[0].set_title('Image')
    axes[0].axis('off')

    # Plot the ground truth label
    axes[1].imshow(label_np, cmap='gray')
    axes[1].set_title('Ground Truth')
    axes[1].axis('off')

    # Plot the predicted segmentation mask
    axes[2].imshow(pred, cmap='gray')
    axes[2].set_title('Prediction')
    axes[2].axis('off')

    # Plot the "check" matrix
    axes[3].imshow(check, cmap='gray')
    axes[3].set_title('"Check" Matrix')
    axes[3].axis('off')

    plt.show()

e_time = time.time()

# Example usage: Visualize the prediction for the first sample in the validation dataset
for i in range(4):
    visualize_prediction(model, val_dataset, i)

print(f'Total runtime: {(e_time - s_time)/3600 :.4f} hours.')