IndexError: Dimension out of range (expected to be in range of [-3, 2], but got 3) - possibly pycocotools related?

Hello!
I am working on an instance segmentation model with Mask R-CNN and a custom Dataset. There is only one class (excluding the background).
When I start the training, this is the error I get:

File ~\AppData\Roaming\Python\Python38\site-packages\torchvision\ops\roi_align.py:56, in roi_align(input, boxes, output_size, spatial_scale, sampling_ratio, aligned)
     54     rois = convert_boxes_to_roi_format(rois)
     55 print(input.shape)
---> 56 return torch.ops.torchvision.roi_align(input, rois, spatial_scale,
     57                                        output_size[0], output_size[1],
     58                                        sampling_ratio, aligned)

IndexError: Dimension out of range (expected to be in range of [-3, 2], but got 3)

Following previous posts about this error, I added a print(input.shape) to that file (visible in the traceback above) and got these outputs:

torch.Size([1, 256, 200, 200])
torch.Size([1, 256, 100, 100])
torch.Size([1, 256, 50, 50])
torch.Size([1, 256, 25, 25])
torch.Size([1, 256, 200, 200])
torch.Size([1, 256, 100, 100])
torch.Size([1, 256, 50, 50])
torch.Size([1, 256, 25, 25])
torch.Size([500, 1, 800])

Here is where I can't seem to catch on: the last input is only 3-dimensional ([500, 1, 800]), while all the others are 4-dimensional [N, C, H, W] feature maps, which would explain why indexing dimension 3 fails. I think the cause is a flaw in my Dataset, specifically either the way I extract the masks from the annotations or something related.
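
For what it's worth, I believe the same IndexError can be reproduced by calling roi_align on a 3D tensor directly (a minimal sketch with made-up values, just to illustrate the shape problem):

import torch
from torchvision.ops import roi_align

feat = torch.rand(500, 1, 800)                      # 3D, like the last shape above
rois = torch.tensor([[0.0, 0.0, 0.0, 10.0, 10.0]])  # [batch_idx, x1, y1, x2, y2]
roi_align(feat, rois, output_size=(7, 7))
# IndexError: Dimension out of range (expected to be in range of [-3, 2], but got 3)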

Here is my code:

#Dataset
class CustomCucumberDataset(torch.utils.data.Dataset):
    def __init__(self, root, transforms):
        self.root = root
        self.transforms = transforms
        self.imgs = list(sorted(os.listdir(os.path.join(root, "Cucupng"))))
        self.anns_file = json_path  # json_path is defined elsewhere in my script
        self.coco = COCO(self.anns_file)
        self.anns = self.coco.loadImgs(self.coco.getImgIds())
        self.paths = [img['path'] for img in self.anns]

    def __getitem__(self, idx):
        coco = self.coco  # reuse the index built in __init__ instead of re-parsing the JSON on every call
        # load the image metadata and its annotations
        image = coco.loadImgs(coco.getImgIds(imgIds=idx))
        anns = coco.loadAnns(coco.getAnnIds(imgIds=idx))
        # load the image as an np.array
        img_name = image[0]['path']
        img_path = os.path.join(self.root, "Cucupng", img_name)
        img = np.array(Image.open(img_path).convert("RGB"))

        # read out the annotations
        # width and height of the image
        width = image[0]['width']
        height = image[0]['height']
        
        # bounding boxes of every cucumber in the image,
        # converted from COCO [x, y, w, h] to [xmin, ymin, xmax, ymax]
        boxes = []
        for ann in anns:
            bbox_x, bbox_y, bbox_w, bbox_h = ann['bbox']
            boxes.append([bbox_x, bbox_y, bbox_x + bbox_w, bbox_y + bbox_h])
        # labels of the image ("Cucumber")
        labels = [ann['category_id'] for ann in anns]
        # convert the individual segmentation masks to binary masks (numpy 2D array)
        masks = coco.annToMask(anns[0])
        for i in range(len(anns)):
            masks += coco.annToMask(anns[i])
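        # note: after this loop "masks" is a single summed 2D [H, W] array
        # (and the mask of anns[0] is counted twice), not one mask per instance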

        num_objs = len(anns)

        # convert everything into torch tensors
        width = torch.as_tensor(width, dtype=torch.float32)
        height = torch.as_tensor(height, dtype=torch.float32)
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        labels = torch.as_tensor(labels, dtype=torch.int64)
        image_id = torch.tensor([idx])
        masks = torch.as_tensor(masks, dtype=torch.uint8)
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)

        target = {}
        target["boxes"] = boxes
        target["labels"] = labels
        target["image_id"] = image_id
        target["masks"] = masks
        target["area"] = area
        target["iscrowd"] = iscrowd 
        target["width"] = width
        target["height"] = height

        if self.transforms is not None: 
            img = self.transforms(img)
        return img, target

    def __len__(self):
        return len(self.imgs)
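
My understanding is that torchvision's Mask R-CNN expects target["masks"] to hold one binary mask per instance, stacked into a [num_objs, H, W] uint8 tensor, while my loop above sums everything into a single [H, W] array. A sketch of what I think the mask extraction should look like instead (untested, reusing the coco and anns variables from __getitem__):

        # one binary [H, W] mask per annotation, stacked along a new first axis
        masks = np.stack([coco.annToMask(ann) for ann in anns], axis=0)
        masks = torch.as_tensor(masks, dtype=torch.uint8)  # shape [num_objs, H, W]

That would also avoid adding the mask of anns[0] twice. Is this the actual cause of the roi_align error, or is the real problem elsewhere?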