Code Review for PyTorch classification training code

The following is my code for training an EfficientNet-B2 PyTorch model on 17 classes. I trained the exact same model in TensorFlow and got very good results, but I don't seem to get good results in PyTorch. Can someone help me figure out whether anything is wrong in the code, or whether I can make any improvements?

import glob
import os
from os.path import basename

import cv2
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torchvision import models, transforms
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

DECODED_IMAGE_DIRECTORY = '/raid/CLASSIFICATION/iter9_more_data/dcms_labelled'
MODEL_DIR = "/raid/classification_model/iter12_new_scipt"

BATCH_SIZE = 64
EPOCHS = 50
INPUT_SHAPE = (3, 150, 150)  # PyTorch convention: (channels, height, width)
CLASSES = ["uTurnRight", "straightRight", "left", "straightLeft", "straightUTurn", "uTurnLeft", "right", "UNKNOWN", "straight", "LeftRight", "mergeRight", "mergeLeft", "slightRight", "slightLeft", "straightLeftRight", "secondLeft", "secondRight"]


LABEL_TO_INT = {label: idx for idx, label in enumerate(CLASSES)}

class CustomDataset(torch.utils.data.Dataset):
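    """Loads an image from disk and returns an (image, integer label) pair."""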
    
    def __init__(self, filenames, labels, transform=None):
        self.filenames = filenames
        self.labels = labels
        self.transform = transform
        
    def __len__(self):
        return len(self.filenames)
    
    def __getitem__(self, index):
        
        img = cv2.imread(self.filenames[index])
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # OpenCV loads BGR; ImageNet-pretrained weights expect RGB
        
        label = self.labels[index]
        if self.transform:
            img = self.transform(img)  

        return img, label
    
def filenames_vs_labels(labelled_geojsons):
    filenames = []
    labels = []
    for filename in labelled_geojsons:
        # Filenames look like "<prefix>_<label>.png"; the label is the
        # substring after the last underscore in the stem.
        name = basename(filename).split(".")[0]
        idx1 = name.rfind('_')
        label = name[idx1 + 1:]
        filenames.append(filename)
        labels.append(label)
    return filenames, labels


image_paths = glob.glob(os.path.join(DECODED_IMAGE_DIRECTORY, "*.png"))
filenames, labels = filenames_vs_labels(image_paths)
labels = [LABEL_TO_INT[label] for label in labels]

x_train, x_test, y_train, y_test = train_test_split(filenames, labels, test_size=0.3, shuffle=True, stratify=labels)
data_transforms = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((INPUT_SHAPE[1], INPUT_SHAPE[2])),  # (height, width); index 0 is the channel count
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])  # ImageNet statistics

print("Filenames of images in test set:")
for filename in x_test:
    print(os.path.basename(filename))
    
train_dataset = CustomDataset(x_train, y_train, transform=data_transforms)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

test_dataset = CustomDataset(x_test, y_test, transform=data_transforms)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0, alpha=0.2):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha
        self.criterion = nn.CrossEntropyLoss(reduction='none')  # per-sample cross entropy

    def forward(self, outputs, labels):
        ce_loss = self.criterion(outputs, labels)
        pt = torch.exp(-ce_loss)  # model's probability for the true class
        focal_loss = (self.alpha * (1 - pt) ** self.gamma * ce_loss).mean()
        return focal_loss

num_classes = len(CLASSES)

class CustomModel(nn.Module):
    def __init__(self):
        super(CustomModel, self).__init__()
        self.backbone_model = models.efficientnet_b2(pretrained=True)
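        # Note: the torchvision backbone keeps its 1000-way ImageNet classifier,
        # so its output feeding fc1 below has 1000 features.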
        self.backbone_model._avg_pooling = nn.AdaptiveAvgPool2d(1)
        self.dropout1 = nn.Dropout(p=0.2)
        self.fc1 = nn.Linear(1000, 512)
        self.dropout2 = nn.Dropout(p=0.2)
        self.fc2 = nn.Linear(512, 256)
        self.fc_out = nn.Linear(256, num_classes)
    
    def forward(self, x):
        x = self.backbone_model(x)
        x = torch.flatten(x, 1)
        x = self.dropout1(x)
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        x = torch.relu(x)
        x = self.fc_out(x) 
        x = torch.softmax(x, dim=1)
        return x


criterion = FocalLoss(gamma=2.0, alpha=0.2)
backbone_model = CustomModel()
backbone_model.to(device)
optimizer = optim.Adam(backbone_model.parameters(), lr=0.001)

for epoch in range(EPOCHS):
    backbone_model.train()
    running_loss = 0.0
    running_val_loss = 0.0
    total = 0
    correct = 0
    for images, labels in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{EPOCHS}"):
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = backbone_model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * images.size(0)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    train_loss = running_loss / len(train_dataset)  # running_loss sums per-sample losses
    train_accuracy = 100 * correct / total

    # Evaluation
    backbone_model.eval()
    correct = 0
    total = 0
    with torch.no_grad():  # no gradients needed during evaluation
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = backbone_model(images)
            val_loss = criterion(outputs, labels)
            running_val_loss += val_loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    val_loss_avg = running_val_loss / len(test_loader)
    val_accuracy = 100 * correct / total

    print(f'Epoch [{epoch+1}/{EPOCHS}], Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%, '
          f'Validation Loss: {val_loss_avg:.4f}, Validation Accuracy: {val_accuracy:.2f}%')

torch.save(backbone_model.state_dict(), os.path.join(MODEL_DIR, "iter2_classification.pth"))

Your loss calculation looks wrong: you are applying a softmax before passing the outputs to nn.CrossEntropyLoss, which expects raw logits since it applies log_softmax internally. You then compute a focal loss on top of that, so also check how the loss calculation is done in your reference code.
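For illustration, a minimal sketch of that change based on your CustomModel above (an assumption about the intended fix, not a verified drop-in): return the raw logits from forward and let the loss handle the normalization. Accuracy computed via torch.max is unaffected, since softmax does not change the argmax.

# inside CustomModel
def forward(self, x):
    x = self.backbone_model(x)
    x = torch.flatten(x, 1)
    x = self.dropout1(x)
    x = torch.relu(self.fc1(x))
    x = self.dropout2(x)
    x = torch.relu(self.fc2(x))
    x = self.fc_out(x)
    return x  # raw logits; nn.CrossEntropyLoss applies log_softmax internally

The training step can stay as it is, because your FocalLoss already wraps nn.CrossEntropyLoss:

outputs = backbone_model(images)               # logits of shape (batch, num_classes)
loss = criterion(outputs, labels)              # focal loss computed from the logits
probabilities = torch.softmax(outputs, dim=1)  # only if probabilities are needed for reporting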

import tensorflow as tf
import efficientnet.tfkeras as enet  # assuming the qubvel efficientnet package
from tensorflow.keras import layers, models

# focal_loss(gamma, alpha) is a custom helper whose definition is not shown here;
# INPUT_SHAPE here is channels-last, e.g. (150, 150, 3)
backbone_model = enet.EfficientNetB2(include_top=False, input_shape=INPUT_SHAPE, pooling='avg', weights='imagenet')

backbone_model.trainable = True
model = models.Sequential()
model.add(backbone_model)
model.add(layers.Dropout(0.2, name="drop1"))
model.add(layers.Dense(512, activation='relu', name="fc1"))
model.add(layers.Dropout(0.2, name="drop2"))
model.add(layers.Dense(256, activation='relu', name="fc2"))
model.add(layers.Dense(len(CLASSES), activation="softmax", name="fc_out"))

model.summary()
model.compile(loss=focal_loss(gamma=2.0, alpha=0.2),
              optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

Thanks for your reply! The above is what the TensorFlow code looks like.