Mask R-CNN training on Cityscapes giving wrong results

I am fine-tuning Mask R-CNN following the PyTorch tutorial, with pretrained set to True, to train the model on the Cityscapes dataset. I haven't converted Cityscapes to COCO format; instead I wrote a dataset class that creates binary masks from the polygon annotations. I am using PyTorch Lightning for training. I monitored the val loss and val acc and they look fine over 30 epochs.
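
The idea is to rasterize each labelled polygon into a binary mask and take its bounding box from the polygon vertices. A minimal sketch of that step with a made-up polygon (the real ones come from the *_polygons.json files):

import numpy as np
import cv2

# Made-up polygon in (x, y) pixel coordinates, standing in for one entry
# of the "objects" list in a Cityscapes *_polygons.json file.
polygon = np.array([[100, 200], [180, 200], [180, 290], [100, 290]], dtype=np.int32)

height, width = 1024, 2048  # standard Cityscapes image size
mask = np.zeros((height, width), dtype=np.uint8)
cv2.fillPoly(mask, pts=[polygon], color=(255,))  # binary mask with values 0/255

# tight bounding box from the polygon vertices
x_min, y_min = polygon[:, 0].min(), polygon[:, 1].min()
x_max, y_max = polygon[:, 0].max(), polygon[:, 1].max()
print([x_min, y_min, x_max, y_max])  # -> [100, 200, 180, 290]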

My dataset class:

import glob
import json
import os

import cv2
import numpy as np
import torch
from torch.utils.data import Dataset


class Cityscapes(Dataset):
    def __init__(self, img_dir, ann_dir, split, transform=None):

        self.categories = {'person': 1, 'car': 2, 'rider': 3, 'bus': 4, 'train': 5, 'truck': 6, 'motorcycle': 7, 'bicycle': 8}
        #self.root_dir = root
        assert split in ["train", "val", "test"]
        img_dir = os.path.abspath(os.path.join(img_dir, split))
        ann_dir = os.path.abspath(os.path.join(ann_dir, split))
        self.ann_dir = ann_dir
        img_name = os.path.join(img_dir, "*", "*_leftImg8bit.png")
        #print(ann_dir)
        self.img_paths = sorted(glob.glob(img_name))
        #print(self.img_paths)
        #similarly for annotations (polygons.json path)
        ann_name = os.path.join(ann_dir, "*", "*_polygons.json")
        self.annots_paths = list(sorted(glob.glob(ann_name)))
        #print(annots_paths)
        assert len(self.img_paths) == len(self.annots_paths)
        self.transform = transform

    def __len__(self):
        return len(self.img_paths)

    def __getitem__(self, index):
        annots_file = self.annots_paths[index]
        images = self.img_paths[index]
        img_read = cv2.imread(images)
        # OpenCV reads images in BGR order, so convert to RGB
        rgb = cv2.cvtColor(img_read, cv2.COLOR_BGR2RGB)
        with open(annots_file, 'r') as f:
            data = json.load(f)
        height = data['imgHeight']
        width = data['imgWidth']
        bboxes = []
        labels = []
        masks = []
        for item in data['objects']:
            label = item['label']
            if label not in self.categories:
                continue
            label = self.categories[item['label']]
            # extract the bounding box coordinates from the polygon vertices
            poly = np.array(item['polygon'], dtype=np.int32)
            poly[poly < 0] = 0
            x_min = np.min(poly[:, 0])
            y_min = np.min(poly[:, 1])
            x_max = np.max(poly[:, 0])
            y_max = np.max(poly[:, 1])
            mask = np.zeros((height, width), dtype=np.uint8)
            bboxes.append([x_min, y_min, x_max, y_max])
            labels.append(label)
            # after collecting the bbox and label, use cv2.fillPoly to draw the binary mask for this object
            cv2.fillPoly(mask, pts=[poly], color=(255,))
            #plt.imshow(mask, cmap='gray')
            #plt.show()
            masks.append(mask)
        if len(masks) == 0:
            mask = np.zeros((height, width), dtype=np.uint8)
            masks.append(mask)

        bboxes = np.array(bboxes)
        masks = np.array(masks)

        if len(masks) > 0:
            transformed = self.transform(image=rgb, bboxes=bboxes, class_labels=labels, masks=masks)

            image_tr = transformed["image"] / 255.0
            bboxes = transformed["bboxes"]
            masks = transformed['masks']

            masks = torch.tensor(np.stack(masks, axis=0)) // 255  # stack transformed masks and map 0/255 -> 0/1

        if len(bboxes) > 0:
            bboxes = torch.stack([torch.tensor(item) for item in bboxes])
            labels = torch.stack([torch.tensor(item) for item in labels])
            #print(labels)
        else:
            bboxes = torch.zeros(0,4)

        return image_tr, masks, bboxes, labels
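
The transform I pass in is an Albumentations pipeline and collate_fn keeps per-image lists, roughly like this (a sketch: my exact augmentations may differ, and the paths are placeholders for the usual leftImg8bit/gtFine layout):

import albumentations as A
import torch
from albumentations.pytorch import ToTensorV2

# Sketch of the Albumentations pipeline the class expects; the exact augmentations
# may differ, but the bbox_params / class_labels / masks interface is this one.
train_transform = A.Compose(
    [A.HorizontalFlip(p=0.5), ToTensorV2()],
    bbox_params=A.BboxParams(format='pascal_voc', label_fields=['class_labels']),
)
val_transform = A.Compose(
    [ToTensorV2()],
    bbox_params=A.BboxParams(format='pascal_voc', label_fields=['class_labels']),
)

def collate_fn(batch):
    images, masks, boxes, labels = zip(*batch)
    # images all end up the same size, so they can be stacked;
    # masks/boxes/labels stay as per-image lists since object counts differ
    return torch.stack(images, dim=0), list(masks), list(boxes), list(labels)

# placeholder paths; the "train"/"val" split folders are appended inside the class
cityscapes_train = Cityscapes('leftImg8bit', 'gtFine', 'train', transform=train_transform)
cityscapes_val = Cityscapes('leftImg8bit', 'gtFine', 'val', transform=val_transform)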

My model loading and accuracy calculations:

import numpy as np
import torch
import torchvision
import pytorch_lightning as pl
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
from torchvision.models.detection._utils import Matcher


class MASKRCNN(pl.LightningModule):
    def __init__(self, n_classes, batchsize):
        super(MASKRCNN, self).__init__()
        self.n_classes = n_classes
        self.batchsize = batchsize

        self.detector = torchvision.models.detection.maskrcnn_resnet50_fpn(min_size=600, max_size=1200,
                                                                           weight_backbone=True)
        in_features = self.detector.roi_heads.box_predictor.cls_score.in_features
        self.detector.roi_heads.box_predictor = FastRCNNPredictor(in_features, n_classes)
        in_features_mask = self.detector.roi_heads.mask_predictor.conv5_mask.in_channels
        hidden_layer = 256
        # and replace the mask predictor with a new one
        self.detector.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask, hidden_layer, n_classes)

        self.best_val_acc = 0
        self.val_acc_stack = []
        self.training_step_outputs = []
        self.log('val_loss', 100000)
        self.log('val_acc', self.best_val_acc)

        self.lr = 1e-5  # Original base lr is 1e-4
        self.momentum = 0.9
        self.weight_decay = 0.0001

    def forward(self, imgs, targets=None):
        # Torchvision detection models return the losses during training
        # and the detections (boxes, labels, scores, masks) during eval
        self.detector.eval()
        return self.detector(imgs)

    def configure_optimizers(self):

        optimizer = torch.optim.SGD(self.parameters(), lr=self.lr)

        lr_scheduler = {
            'scheduler': torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.1, patience=5,
                                                                    threshold=0.0001, min_lr=0, eps=1e-08),
            'monitor': 'val_loss'}

        return [optimizer], [lr_scheduler]

    def on_before_zero_grad(self, optimizer):
        print("I am calling the optimizer now")

    def train_dataloader(self):
        num_train_sample_batches = len(cityscapes_train) // self.batchsize
        temp_indices = np.array([i for i in range(len(cityscapes_train))])
        np.random.shuffle(temp_indices)
        sample_indices = []
        for i in range(num_train_sample_batches):

            batch = temp_indices[self.batchsize * i:self.batchsize * (i + 1)]

            for index in batch:
                sample_indices.append(index)

        return torch.utils.data.DataLoader(cityscapes_train, batch_size=self.batchsize, sampler=sample_indices, shuffle=False,
                                           collate_fn=collate_fn)

    def training_step(self, batch, batch_idx):

        imgs = list(image.cuda() for image in batch[0])

        targets = []
        for mask, boxes, labels in zip(batch[1], batch[2], batch[3]):
            target = {}
            target["boxes"] = boxes.float().cuda()
            #print(len(target['boxes']))
            target["labels"] = torch.as_tensor(labels, dtype=torch.int64).cuda()
            target["masks"] = mask.cuda()

            targets.append(target)

        # Mask R-CNN takes both images and targets during training and
        # returns a dict of losses (detections are returned only in eval mode)

        # temp_loss = []
        # for index in range(len(imgs)):
        # detections = self.detector([imgs[index]], [targets[index]])
        # temp_loss.append(sum(loss1 for loss1 in detections.values()))

        loss_dict = self.detector(imgs, targets)
        #print(loss_dict)
        loss_classifier = loss_dict['loss_classifier']
        loss_box_reg = loss_dict['loss_box_reg']
        loss_mask = loss_dict['loss_mask']


        loss = sum(loss for loss in loss_dict.values())
        self.training_step_outputs.append(loss)
        print('train_loss: {}'.format(loss))

        return {"loss": loss}  # , "log": loss_dict.detach().cpu()}

    def on_train_epoch_end(self):
        all_losses = torch.mean(torch.stack(self.training_step_outputs))
        #epoch_loss = torch.mean(all_losses)
        #self.log('train_loss', all_losses, on_step=True, on_epoch=True)
        print('epoch_loss: {}'.format(all_losses))
        self.training_step_outputs.clear()

    def validation_step(self, batch, batch_idx):
        img, mask, boxes, label = batch

        preds = self.forward(img)
        #print(preds)
        preds[0]['masks'] = preds[0]['masks'][preds[0]['scores'] > 0.5]
        print(preds[0]['scores'])
        preds[0]['masks'][preds[0]['masks'] > 0.5] = 1
        preds[0]['masks'][preds[0]['masks'] <= 0.5] = 0
        # self.val_acc = torch.mean(torch.stack([self.accuracy(b,pb["boxes"],iou_threshold=0.5) for b,pb in zip(boxes,pred_boxes)]))

        # self.val_acc_stack[domain[0]].append(torch.stack([self.accuracy(b,pb["boxes"],iou_threshold=0.5) for b,pb in zip(boxes,preds)]))
        self.val_acc_stack.append(self.accuracy(mask[0], preds[0]['masks'].type(torch.uint8)))

        # return val_acc_stack

    def on_validation_epoch_end(self):

        temp = torch.mean(torch.stack(self.val_acc_stack))

        self.log('val_loss', 1 - temp)  # Logging for model checkpoint
        self.log('val_acc', temp)
        if (self.best_val_acc < temp):
            self.best_val_acc = temp
            self.best_val_acc_epoch = self.trainer.current_epoch

        self.val_acc_stack = []

        print('Validation IOU: ', temp)

    def mask_iou(self, src_masks, tgt_masks):

        # src masks are of dimension N X H X W
        # tgt masks (predictions) are of dimension M X 1 X H X W
        maskiou_matrix = torch.zeros(len(src_masks), len(tgt_masks)).cuda()

        for src_index in range(len(src_masks)):
            src_mask = src_masks[src_index]

            for tgt_index in range(len(tgt_masks)):
                tgt_mask = tgt_masks[tgt_index][0]
                #print(tgt_mask)
                # print(torch.sum(torch.bitwise_and(src_mask, tgt_mask)))
                maskiou_matrix[src_index, tgt_index] = float(torch.sum(torch.bitwise_and(src_mask, tgt_mask))) / float(
                    torch.sum(torch.bitwise_or(src_mask, tgt_mask)))

        return maskiou_matrix

    def accuracy(self, src_boxes, pred_boxes, iou_threshold=0.5):
        """
        #The accuracy method is not the one used in the evaluator but very similar
        """
        total_gt = len(src_boxes)
        total_pred = len(pred_boxes)

        if total_gt > 0 and total_pred > 0:

            # Define the matcher and distance matrix based on iou
            matcher = Matcher(iou_threshold, iou_threshold, allow_low_quality_matches=False)
            match_quality_matrix = self.mask_iou(src_boxes, pred_boxes)

            results = matcher(match_quality_matrix)

            true_positive = torch.count_nonzero(results.unique() != -1)
            matched_elements = results[results > -1]

            # in Matcher, a pred element can be matched only twice
            false_positive = torch.count_nonzero(results == -1) + (
                        len(matched_elements) - len(matched_elements.unique()))
            false_negative = total_gt - true_positive

            return true_positive / (true_positive + false_positive)

        elif total_gt == 0:
            if total_pred > 0:
                return torch.tensor(0.).cuda()
            else:
                return torch.tensor(1.).cuda()
        elif total_gt > 0 and total_pred == 0:
            return torch.tensor(0.).cuda()


val_dataloader = torch.utils.data.DataLoader(cityscapes_val, batch_size=1, shuffle=False, collate_fn=collate_fn)

import os
import torchvision.transforms as T
import PIL.Image as I

detector = MASKRCNN(n_classes=9, batchsize=2)

NET_FOLDER = '.'
weights_file = 'best_baseline'
if (os.path.exists(NET_FOLDER + '/' + weights_file + '.ckpt')):
    detector.load_state_dict(torch.load(NET_FOLDER + '/' + weights_file + '.ckpt')['state_dict'])
else:
    if not os.path.exists(NET_FOLDER):
        mode = 0o777
        os.mkdir(NET_FOLDER, mode)
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

early_stop_callback = EarlyStopping(monitor='val_acc', min_delta=0.00, patience=10, verbose=False, mode='max')

checkpoint_callback = ModelCheckpoint(monitor='val_loss', dirpath=NET_FOLDER, filename=weights_file)
trainer = Trainer(accelerator='gpu', devices=1, max_epochs=30, deterministic=False,
                  callbacks=[checkpoint_callback, early_stop_callback], reload_dataloaders_every_n_epochs=1)
trainer.fit(detector, val_dataloaders=val_dataloader)

However, when doing inference (detector.eval()) on a sample image, the model returns almost-zero scores and makes wrong predictions. I know 30 epochs might not be enough, but it should at least get the classes right since I am using pretrained = True.
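
For reference, this is roughly how I run inference (a sketch: sample.png is a placeholder, and on GPU the image tensor is moved with .cuda() first):

import cv2
import torch

detector.eval()
img = cv2.cvtColor(cv2.imread('sample.png'), cv2.COLOR_BGR2RGB)
img_t = torch.from_numpy(img).permute(2, 0, 1).float() / 255.0  # CHW, values in [0, 1]

with torch.no_grad():
    preds = detector([img_t])

print(preds[0]['scores'][:10])  # almost all scores are close to 0
print(preds[0]['labels'][:10])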

Where am I going wrong?
What should I look for when debugging this?
Am I passing the dataset in the wrong way?

This is the output of batch = next(iter(train_dataloader)):

(tensor([[[[0.1294, 0.1294, 0.1294,  ..., 0.6863, 0.6863, 0.6745],
          [0.1294, 0.1294, 0.1294,  ..., 0.6863, 0.6784, 0.6784],
          [0.1373, 0.1294, 0.1216,  ..., 0.6824, 0.6863, 0.6863],
          ...,
          [0.2000, 0.2078, 0.2039,  ..., 0.3294, 0.3255, 0.3255],
          [0.2078, 0.2157, 0.2157,  ..., 0.3137, 0.3216, 0.3216],
          [0.2118, 0.2157, 0.2157,  ..., 0.3137, 0.3216, 0.3216]],

         [[0.1725, 0.1725, 0.1725,  ..., 0.8118, 0.8118, 0.8078],
          [0.1765, 0.1765, 0.1765,  ..., 0.8118, 0.8078, 0.8000],
          [0.1804, 0.1765, 0.1686,  ..., 0.8039, 0.8000, 0.8039],
          ...,
          [0.2627, 0.2706, 0.2706,  ..., 0.3961, 0.3961, 0.3882],
          [0.2667, 0.2745, 0.2784,  ..., 0.3804, 0.3843, 0.3843],
          [0.2745, 0.2784, 0.2784,  ..., 0.3804, 0.3843, 0.3882]],

         [[0.1294, 0.1333, 0.1333,  ..., 0.7961, 0.7961, 0.7843],
          [0.1333, 0.1373, 0.1373,  ..., 0.8000, 0.7922, 0.7804],
          [0.1294, 0.1333, 0.1294,  ..., 0.7961, 0.7882, 0.7843],
          ...,
          [0.2314, 0.2392, 0.2314,  ..., 0.3255, 0.3255, 0.3255],
          [0.2353, 0.2431, 0.2431,  ..., 0.3137, 0.3216, 0.3176],
          [0.2431, 0.2510, 0.2471,  ..., 0.3137, 0.3216, 0.3216]]],


        [[[0.8980, 0.9059, 0.8941,  ..., 0.1020, 0.1255, 0.1843],
          [0.8196, 0.8510, 0.8627,  ..., 0.0980, 0.1137, 0.1137],
          [0.4745, 0.5020, 0.6078,  ..., 0.1529, 0.1412, 0.1176],
          ...,
          [0.1608, 0.1608, 0.1686,  ..., 0.1569, 0.1608, 0.1608],
          [0.1608, 0.1647, 0.1686,  ..., 0.1490, 0.1608, 0.1608],
          [0.1647, 0.1725, 0.1686,  ..., 0.1490, 0.1608, 0.1608]],

         [[0.9765, 0.9922, 0.9922,  ..., 0.1529, 0.1804, 0.2941],
          [0.9059, 0.9412, 0.9686,  ..., 0.1569, 0.1725, 0.2118],
          [0.5804, 0.6863, 0.7882,  ..., 0.2157, 0.1961, 0.1608],
          ...,
          [0.2118, 0.2118, 0.2118,  ..., 0.2039, 0.2078, 0.2039],
          [0.2118, 0.2157, 0.2196,  ..., 0.2000, 0.2078, 0.2078],
          [0.2196, 0.2196, 0.2235,  ..., 0.2000, 0.2078, 0.2078]],

         [[0.9294, 0.9294, 0.9412,  ..., 0.1176, 0.1529, 0.2431],
          [0.8549, 0.8824, 0.9098,  ..., 0.1137, 0.1373, 0.1686],
          [0.5647, 0.6588, 0.7373,  ..., 0.1529, 0.1412, 0.1216],
          ...,
          [0.1882, 0.1882, 0.1843,  ..., 0.1765, 0.1725, 0.1725],
          [0.1843, 0.1843, 0.1922,  ..., 0.1686, 0.1686, 0.1725],
          [0.1882, 0.1882, 0.1961,  ..., 0.1686, 0.1686, 0.1725]]]]), [tensor([[[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        ...,

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]], dtype=torch.uint8), tensor([[[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        ...,

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]], dtype=torch.uint8)], [tensor([[7.2715e+02, 2.4609e+02, 8.4316e+02, 3.1172e+02],
        [1.0219e+03, 2.5137e+02, 1.1150e+03, 2.9414e+02],
        [1.0219e+03, 2.5137e+02, 1.1150e+03, 2.9414e+02],
        [5.8594e-01, 2.8594e+02, 5.0391e+01, 3.0820e+02],
        [2.2734e+02, 2.6133e+02, 3.1406e+02, 2.9238e+02],
        [2.4258e+02, 2.6719e+02, 3.3340e+02, 3.0117e+02],
        [6.5039e+01, 2.8184e+02, 1.0898e+02, 2.9297e+02],
        [5.3906e+01, 2.8477e+02, 1.0312e+02, 3.0059e+02],
        [1.0781e+02, 2.6367e+02, 1.9160e+02, 2.9824e+02],
        [1.0219e+03, 2.5137e+02, 1.1150e+03, 2.9414e+02],
        [7.1484e+02, 2.6133e+02, 7.3887e+02, 2.8711e+02],
        [6.6504e+02, 2.6133e+02, 6.8027e+02, 2.9180e+02],
        [6.5273e+02, 2.6016e+02, 6.7207e+02, 2.9180e+02],
        [7.2715e+02, 2.4609e+02, 8.4316e+02, 3.1172e+02],
        [9.4336e+01, 2.5547e+02, 1.2715e+02, 3.2109e+02],
        [1.0020e+02, 2.7422e+02, 1.2246e+02, 3.2051e+02]], dtype=torch.float64), tensor([[ 506.2500,  249.0234,  520.8984,  263.6719],
        [ 435.9375,  244.9219,  451.1719,  260.1562],
        [ 744.1406,  250.7812,  779.2969,  275.3906],
        [  66.7969,  237.3047,  100.1953,  255.4688],
        [ 192.1875,  250.7812,  244.3359,  272.4609],
        [ 247.2656,  243.7500,  287.1094,  259.5703],
        [ 280.0781,  244.3359,  321.6797,  262.5000],
        [ 312.8906,  246.0938,  351.5625,  270.1172],
        [ 984.9609,  260.1562, 1087.5000,  314.0625],
        [ 789.8438,  256.6406,  826.7578,  283.0078],
        [1160.1562,  261.9141, 1200.0000,  333.3984],
        [ 476.9531,  246.0938,  516.2109,  275.9766],
        [ 527.9297,  247.2656,  554.8828,  266.6016],
        [ 549.0234,  247.2656,  577.1484,  271.2891],
        [ 564.8438,  243.1641,  608.2031,  279.4922]], dtype=torch.float64)], [tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 8, 1, 1, 2, 3, 8]), tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])])