Mask R-CNN training: GT labels don't match proposals

Hi,

I'm trying to train a Mask R-CNN from torchvision on a custom dataset, but I've run into an issue where the ground-truth labels from my dataset don't match the proposals from the network. Here is the code for my dataset (the annotations are stored in separate JSON files):

import glob
import json
import os
from pathlib import Path

import numpy as np
import skimage.draw
import skimage.io
import torch
from PIL import Image
from torch.utils.data import Dataset


class Custom_Data(Dataset):
    def __init__(self, data_dir, transforms):
        self.image_ids = []
        self.data = data_dir
        self.transforms = transforms
        # recover the image id from each annotation file name,
        # e.g. "img01__SHAPES.json" -> "img01"
        for filename in glob.glob(os.path.join(data_dir, '*.json')):
            nr = Path(filename).stem
            name = str(nr).split("_SHAPE")[0]
            name = name[:-1]
            self.image_ids.append(name)
        print("Length of dataset: ", len(self.image_ids))

    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, idx):
        f = self.image_ids[idx]

        # load rgb input image
        img_name = "{}.bmp".format(f)
        image_path = os.path.join(self.data, img_name)
        image = skimage.io.imread(image_path)
        height, width, channels = image.shape
        size = max([width, height])
        input_img = np.zeros([size, size, 3],
                             dtype=np.uint8)
        # compute center offset
        self.x_center = (input_img.shape[1] - image.shape[1]) // 2
        self.y_center = (input_img.shape[0] - image.shape[0]) // 2

        # paste the image into the center of the square canvas
        input_img[self.y_center:self.y_center + image.shape[0],
                  self.x_center:self.x_center + image.shape[1]] = image
        input_img = Image.fromarray(input_img)

        # load annotations from json-file
        annotations = json.load(open(os.path.join(self.data, "{}__SHAPES.json".format(f))))
        # drop the metadata entries so only the shape annotations remain
        del annotations['version']
        del annotations['enabled']
        del annotations['imageDimensions']
        del annotations['imageBackground']

        # create masks from annotations
        raw_mask = np.zeros((height, width, len(annotations.keys())), dtype=np.uint8)
        class_id = []
        self.boxes = []
        i = 0
        for key in annotations.keys():
            self.class_exists = False
            p = None
            if 'shape' in annotations[key]:
                p = annotations[key]['shape']
            elif 'x' in annotations[key] and 'y' in annotations[key]:
                p = annotations[key]
            if isinstance(p, dict):
                y = p['y']
                x = p['x']
                raw_mask = self.get_targets(x, y, raw_mask, i)
            elif isinstance(p, list):
                for n in p:
                    y = n['y']
                    x = n['x']
                    raw_mask = self.get_targets(x, y, raw_mask, i)

            img = np.zeros([size, size, len(annotations.keys())], dtype=np.uint8)
            # paste the instance masks into the center of the square canvas
            img[self.y_center:self.y_center + raw_mask.shape[0],
                self.x_center:self.x_center + raw_mask.shape[1]] = raw_mask
            final_mask = img

            if self.class_exists:
                c = self.get_class_id(str(key))
                if c is not None:
                    class_id.append(c)
                    c = None
            i = i + 1

        # Return masks, bboxes and classes of each instance.
        boxes = torch.as_tensor(self.boxes, dtype=torch.float32)
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        classes = torch.as_tensor(class_id, dtype=torch.int64)
        # torchvision expects instance masks as (N, H, W)
        masks = torch.as_tensor(final_mask, dtype=torch.uint8).permute(2, 0, 1)
        iscrowd = torch.zeros((i,), dtype=torch.int64)

        target = {}
        target["boxes"] = boxes
        target["labels"] = classes
        target["masks"] = masks
        target["image_id"] = torch.tensor([idx])
        target["area"] = area
        target["iscrowd"] = iscrowd

        if self.transforms is not None:
            input_img, target = self.transforms(input_img, target)

        return [input_img, target]

    def get_class_id(self, name: str):
        # map the annotation key to a label id (0 is reserved for background).
        # check 'class10' before 'class1', otherwise the substring test
        # would match 'class1' inside 'class10'
        for c in range(10, 0, -1):
            if 'class{}'.format(c) in name:
                return c
        return None

    def get_targets(self, x, y, raw_mask, i):
        # drop missing vertices before drawing the polygon
        y = [v for v in y if v is not None]
        x = [v for v in x if v is not None]
        if np.max(x) - np.min(x) < 1 or np.max(y) - np.min(y) < 1:
            self.class_exists = False
            return raw_mask
        rr, cc = skimage.draw.polygon(y, x)
        raw_mask[rr - 1, cc - 1, i] = 1
        pos = np.nonzero(raw_mask[:, :, i])
        xmin = np.min(pos[1]) + self.x_center
        xmax = np.max(pos[1]) + self.x_center
        ymin = np.min(pos[0]) + self.y_center
        ymax = np.max(pos[0]) + self.y_center
        if xmin >= xmax:
            xmax = xmax + 1
        if ymin >= ymax:
            ymax = ymax + 1
        self.boxes.append([xmin, ymin, xmax, ymax])
        self.class_exists = True

        return raw_mask

This is adapted from the TorchVision Object Detection Finetuning Tutorial (PyTorch Tutorials 2.0.1+cu117 documentation) and seems to work well.
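
In hindsight, a quick consistency check over the returned targets would have surfaced the problem before training ever started. Something along these lines (a minimal sketch against the Custom_Data class above, using the same DATA_DIR as in the training script below; this was not part of my original script):

check_dataset = Custom_Data(DATA_DIR, transforms=None)
for idx in range(len(check_dataset)):
    _, target = check_dataset[idx]
    n_boxes = target["boxes"].shape[0]
    n_labels = target["labels"].shape[0]
    n_masks = target["masks"].shape[0]  # masks as (N, H, W)
    if not (n_boxes == n_labels == n_masks):
        print("sample {}: {} boxes, {} labels, {} masks".format(
            idx, n_boxes, n_labels, n_masks))

Mask R-CNN expects boxes, labels and masks to describe the same N instances, so all three counts have to agree for every sample.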

Next is my training code (T, utils, train_one_epoch and evaluate come from the torchvision detection reference scripts, as in the tutorial):

def get_transform(train):
    transforms = []
    transforms.append(T.PILToTensor())
    transforms.append(T.ConvertImageDtype(torch.float))
    if train:
        transforms.append(T.RandomHorizontalFlip(0.5))
    return T.Compose(transforms)


def get_model_instance_segmentation(model_num_classes):
    # load an instance segmentation model pre-trained on COCO
    this_model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)

    # get number of input features for the classifier
    in_features = this_model.roi_heads.box_predictor.cls_score.in_features
    # replace the pre-trained head with a new one
    this_model.roi_heads.box_predictor = FastRCNNPredictor(in_features, model_num_classes)

    # now get the number of input features for the mask classifier
    in_features_mask = this_model.roi_heads.mask_predictor.conv5_mask.in_channels
    hidden_layer = 256
    # and replace the mask predictor with a new one
    this_model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask,
                                                            hidden_layer,
                                                            model_num_classes)
    return this_model


############################################################
#  Training
############################################################

print("Current TF-version: ", tf.__version__)
device = torch.device('cpu')  # GPU deliberately disabled while debugging, see note below

num_classes = 11  # 10 object classes + 1 for background
dataset = Custom_Data(DATA_DIR, get_transform(train=False))
torch.manual_seed(1)
indices = torch.randperm(len(dataset)).tolist()
dataset = torch.utils.data.Subset(dataset, indices[:-50])
data_loader = torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=True,
                                          num_workers=4, collate_fn=utils.collate_fn)

model = get_model_instance_segmentation(model_num_classes=num_classes)

model.to(device)
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=1e-3, momentum=0.9, weight_decay=0.0005)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

num_epochs = 10

for epoch in range(num_epochs):
    train_one_epoch(model, optimizer, data_loader, device=device, epoch=epoch, print_freq=10)
    lr_scheduler.step()
    evaluate(model, data_loader, device=device)

model.eval()
x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
predictions = model(x)
torch.onnx.export(model, x, "mask_rcnn.onnx", opset_version=11)

I run the code on the CPU, because running on the GPU with CUDA always ends in a CUDA device-side assertion error (most likely the same out-of-bounds index, just reported asynchronously), so I debug on the CPU.
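
As an aside, when a CUDA run dies with a device-side assert, the Python traceback usually points at an unrelated op because kernels launch asynchronously. Forcing synchronous launches makes the traceback land on the real culprit:

import os

# must be set before the first CUDA call (alternatively, export it in the shell)
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

With that set, the GPU run should report the same out-of-bounds indexing as the CPU run.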

Finally, here is the error message I get:

Traceback (most recent call last):
  File "/home/.../training.py", line 74, in <module>
    train_one_epoch(model, optimizer, data_loader, device=device, epoch=epoch, print_freq=10)
  File "/home/.../engine.py", line 31, in train_one_epoch
    loss_dict = model(images, targets)
                ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/.../venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/.../venv/lib/python3.11/site-packages/torchvision/models/detection/generalized_rcnn.py", line 105, in forward
    detections, detector_losses = self.roi_heads(features, proposals, images.image_sizes, targets)
                                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/.../venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/.../venv/lib/python3.11/site-packages/torchvision/models/detection/roi_heads.py", line 755, in forward
    proposals, matched_idxs, labels, regression_targets = self.select_training_samples(proposals, targets)
                                                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/.../venv/lib/python3.11/site-packages/torchvision/models/detection/roi_heads.py", line 649, in select_training_samples
    matched_idxs, labels = self.assign_targets_to_proposals(proposals, gt_boxes, gt_labels)
                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/.../venv/lib/python3.11/site-packages/torchvision/models/detection/roi_heads.py", line 588, in assign_targets_to_proposals
    labels_in_image = gt_labels_in_image[clamped_matched_idxs_in_image]
                      ~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
IndexError: index 17 is out of bounds for dimension 0 with size 9

Process finished with exit code 1

The gt_labels tensor containing my classes looks like tensor([10, 2, 3, 1, 5, 6, 9, 4, 8]), i.e. 9 entries, but clamped_matched_idxs_in_image is tensor([0, 0, 0, …, 15, 16, 17]). Since the matcher clamps its indices to the range of valid ground-truth boxes, an index of 17 means my targets contained 18 boxes for only 9 labels.
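
Stripped of the model, the failing line boils down to this standalone reproduction (the matched indices here are made up; the real tensor is much longer):

import torch

# 9 labels, but the matcher hands back indices up to 17 because the
# target contained 18 boxes; the lookup then falls off the end
gt_labels_in_image = torch.tensor([10, 2, 3, 1, 5, 6, 9, 4, 8])
clamped_matched_idxs_in_image = torch.tensor([0, 0, 0, 17])
labels_in_image = gt_labels_in_image[clamped_matched_idxs_in_image]
# IndexError: index 17 is out of bounds for dimension 0 with size 9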
Has anyone encountered a similar issue? Please let me know if I missed any information.

I solved it on my own; in case anyone runs into a similar issue in the future:
Check your masks and boxes. I had multiple polygons per instance drawn into one mask channel, and a box was appended for every polygon, so the targets ended up with more boxes than labels and mask channels. There must be exactly one box per instance.
My solution was to move the box generation into a separate function and call it after all masks have been drawn with the polygon function.
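
Roughly, that function looks like this (a sketch of the idea against the final_mask array from __getitem__ above, not my exact code):

def get_boxes_from_masks(final_mask):
    # derive exactly one box per mask channel, AFTER all polygons of
    # each instance have been drawn into that channel
    boxes = []
    for i in range(final_mask.shape[-1]):
        pos = np.nonzero(final_mask[:, :, i])
        # every channel should contain at least one drawn polygon
        assert pos[0].size > 0, "empty mask channel {}".format(i)
        xmin, xmax = np.min(pos[1]), np.max(pos[1])
        ymin, ymax = np.min(pos[0]), np.max(pos[0])
        # guard against degenerate (zero-width/height) boxes, as before
        boxes.append([xmin, ymin, max(xmax, xmin + 1), max(ymax, ymin + 1)])
    return boxes

With one box per mask channel, the counts of boxes, labels and masks match again and the lookup gt_labels_in_image[clamped_matched_idxs_in_image] in roi_heads.py stays in bounds.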