IndexError: Dimension out of range (expected to be in range of [-3, 2], but got 3)

Hi everyone!
I am trying to train an instance segmentation model following the PedanFudan tutorial (TorchVision Object Detection Finetuning Tutorial — PyTorch Tutorials 1.7.1 documentation) on a custom Dataset. There are 10 classes that are not part of the categories the models were pretrained on.

When I try to run my code, I get the following error on line 44 of roi_align.py:

  File "D:\miniconda3\envs\pytorch-env\lib\site-packages\torchvision\ops\roi_align.py", line 44, in roi_align
    return torch.ops.torchvision.roi_align(input, rois, spatial_scale,
IndexError: Dimension out of range (expected to be in range of [-3, 2], but got 3)

By printing input.shape and its length before the return statement in line 43 of the same file, I get the following shapes for input:

torch.Size([1, 256, 200, 272]) has length : 4
torch.Size([1, 256, 100, 136]) has length : 4
torch.Size([1, 256, 50, 68]) has length : 4
torch.Size([1, 256, 25, 34]) has length : 4
torch.Size([1, 256, 200, 272]) has length : 4
torch.Size([1, 256, 100, 136]) has length : 4
torch.Size([1, 256, 50, 68]) has length : 4
torch.Size([1, 256, 25, 34]) has length : 4
torch.Size([1200, 1, 1066]) has length : 3

It seems like the last length is my problem, since roi_align’s documentation states that input should have shape:(Tensor[N, C, H, W])

My problem is similar to what GideonsMarch had in this post: Dimension out of range (expected to be in range of [-3, 2], but got 3), so I guess that I have a problem with my Dataset, but I couldn’t find it.

Here is my source code:

import glob
import numpy as np
import torch
import json
import torchvision
import torchvision.transforms
from pathlib import Path
from skimage import io, transform
from PIL import Image

# Especific Imports
from torch.utils.data.dataset import Dataset
from torchvision.models.detection.mask_rcnn import maskrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
from torchvision.models.detection.rpn import AnchorGenerator

from vision.references.detection.engine import train_one_epoch, evaluate
import vision.references.detection.utils as utils
import vision.references.detection.transforms as T


class CupDataset(Dataset):
    """Cup Dataset"""

    def __init__(self, root_dir, transforms = None):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """

        self.root_dir = Path(root_dir)
        self.transforms = transforms
        self.idxs = glob.glob((self.root_dir / '*.bmp').__str__())
        self.labels = {'dominante': 1, 'naodominante': 2,
                       'paretico_pre': 3, 'naoparetico_pre': 4,
                       'naoparetico_3d': 5, 'naoparetico_30d': 6,
                       'paretico_3d': 7, 'paretico_30d': 8,
                       'naoparetico_90d': 9, 'paretico_90d': 10}


    def __len__(self):
        return len(self.idxs)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
             idx = idx.tolist()

        img = Image.open(self.idxs[idx]).convert("RGB")
        mask_path = self.idxs[idx][:-3] + 'jpg'
        json_path = self.idxs[idx][:-3] + 'json'

        with open(json_path) as f:
            j = json.load(f)

        boxes=[]
        label = (self.labels[j['shapes'][0]['label']],)
        xmin, ymin = j['shapes'][1]['points'][0]
        xmax, ymax = j['shapes'][1]['points'][1]
        boxes.append([xmin, ymin, xmax, ymax])
        iscrowd = torch.zeros(1, dtype=torch.int64)

        mask = Image.open(mask_path)
        mask = np.array(mask)[:, :, 0]

        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        # there is only one class
        labels = torch.as_tensor(label, dtype=torch.int64)
        masks = torch.as_tensor(mask, dtype=torch.uint8)
        area = torch.as_tensor((xmax - xmin) * (ymax - ymin), dtype=torch.float32 )

        target ={}
        target['labels'] = labels
        target['masks'] = masks
        target["boxes"] = boxes
        target["image_id"] = torch.tensor([idx])
        target["area"] = area
        target["iscrowd"] = iscrowd

        if self.transforms is not None:
            img, target = self.transforms(img, target)

        return img, target


class ToTensor(object):
    """Convert ndarrays in sample to Tensors."""

    def __call__(self, sample):
        image, mask = sample[0], sample[1]['mask']
        ret = sample[1]
        mask = torch.from_numpy(mask)
        image = image.transpose((2, 0, 1))
        return torch.from_numpy(image), torch.from_numpy(mask)

    def __repr__(self):
        return self.__class__.__name__ + '()'

def get_transform(train):
    transforms = []
    transforms.append(T.ToTensor())
    if train:
        transforms.append(T.RandomHorizontalFlip(0.5))
    return T.Compose(transforms)

class CupModel():
    """
    Model for the Network:
    Deciding if finetuning or feature extraction
    """
    def __init__(self):
        # Top level data directory. Here we assume the format of the directory conforms
        #   to the ImageFolder structure
        self.model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)

        self.num_classes = 10
        self.batch_size = 1
        self.num_epochs = 200
        self.hidden_layer = 256

        in_features = self.model.roi_heads.box_predictor.cls_score.in_features
        in_features_mask = self.model.roi_heads.mask_predictor.conv5_mask.in_channels
        self.model.roi_heads.box_predictor = FastRCNNPredictor(in_features, self.num_classes)
        self.model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask,
                                                                self.hidden_layer,
                                                                self.num_classes)


        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.model.to(self.device)
        params = [p for p in self.model.parameters() if p.requires_grad]
        self.optimizer = torch.optim.SGD(params,
                                         lr=0.005,
                                         momentum=0.9,
                                         weight_decay=0.0005)
        self.lr_scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer,
                                                       step_size=3,
                                                       gamma=0.1)

    def train(self, train_dataset, test_dataset):

        data_loader_train = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=4,
            collate_fn=utils.collate_fn)

        data_loader_test = torch.utils.data.DataLoader(
            test_dataset,
            batch_size=1,
            shuffle=False,
            num_workers=4,
            collate_fn=utils.collate_fn)

        for epoch in range(self.num_epochs):
            # train for one epoch, printing every 10 iterations
            train_one_epoch(self.model,
                            self.optimizer,
                            data_loader_train,
                            self.device,
                            epoch,
                            print_freq=10)

            # update the learning rate
            self.lr_scheduler.step()
            # evaluate on the test dataset
            evaluate(self.model, data_loader_test, device=self.device)

        print("That's it!")


if __name__ == "__main__":
    cup = CupDataset((Path.cwd() / 'Data').__str__())
    train_dataset = CupDataset((Path.cwd() / 'Data' / 'train_new').__str__(),
                                transforms = get_transform(train=True))
    test_dataset = CupDataset((Path.cwd() / 'Data' / 'test').__str__(),
                                transforms = get_transform(train=False))


    cupmodel = CupModel()
    cupmodel.train(train_dataset, test_dataset)

Although I am not allowed to upload the .bmp files (user data is involved), I am uploading the mask of the image and .json file with the polygon (created with labelme).

json file:

{
  "version": "4.5.6",
  "flags": {},
  "shapes": [
    {
      "label": "dominante",
      "points": [
        [
          597.0,
          396.0
        ],
        [
          658.0,
          387.0
        ],
        [
          722.0,
          384.0
        ],
        [
          760.0,
          395.0
        ],
        [
          808.0,
          412.0
        ],
        [
          802.0,
          435.0
        ],
        [
          792.0,
          453.0
        ],
        [
          768.0,
          462.0
        ],
        [
          764.0,
          485.0
        ],
        [
          743.0,
          496.0
        ],
        [
          701.0,
          503.0
        ],
        [
          701.0,
          513.0
        ],
        [
          667.0,
          541.0
        ],
        [
          614.0,
          607.0
        ],
        [
          597.0,
          649.0
        ],
        [
          588.0,
          733.0
        ],
        [
          646.0,
          780.0
        ],
        [
          694.0,
          768.0
        ],
        [
          722.0,
          766.0
        ],
        [
          776.0,
          747.0
        ],
        [
          806.0,
          747.0
        ],
        [
          846.0,
          756.0
        ],
        [
          865.0,
          773.0
        ],
        [
          871.0,
          789.0
        ],
        [
          864.0,
          808.0
        ],
        [
          756.0,
          851.0
        ],
        [
          738.0,
          864.0
        ],
        [
          639.0,
          871.0
        ],
        [
          584.0,
          874.0
        ],
        [
          542.0,
          870.0
        ],
        [
          498.0,
          860.0
        ],
        [
          444.0,
          800.0
        ],
        [
          420.0,
          758.0
        ],
        [
          398.0,
          698.0
        ],
        [
          391.0,
          639.0
        ],
        [
          366.0,
          504.0
        ],
        [
          372.0,
          440.0
        ],
        [
          380.0,
          406.0
        ],
        [
          408.0,
          362.0
        ],
        [
          450.0,
          328.0
        ],
        [
          520.0,
          279.0
        ],
        [
          560.0,
          257.0
        ],
        [
          612.0,
          243.0
        ],
        [
          712.0,
          238.0
        ],
        [
          734.0,
          255.0
        ],
        [
          742.0,
          286.0
        ],
        [
          731.0,
          319.0
        ],
        [
          692.0,
          343.0
        ],
        [
          614.0,
          383.0
        ]
      ],
      "group_id": null,
      "shape_type": "polygon",
      "flags": {}
    },
    {
      "label": "dominante",
      "points": [
        [
          355.51851851851853,
          228.85185185185185
        ],
        [
          885.1481481481482,
          880.7037037037037
        ]
      ],
      "group_id": null,
      "shape_type": "rectangle",
      "flags": {}
    }
  ],
  "imagePath": "C2dominanteD2.bmp",
  "imageData": null,
  "imageHeight": 1200,
  "imageWidth": 1600
}

I would appreciate any advice on how to tackle this error or how to debug it. Thanks in advance!

Your debugging so far seems reasonable and indeed the last shape might fail.
I don’t know what the solution in the other thread was besides a “dataset error”, so maybe you could also ask the author what exactly was wrong.

For the input shapes: are these shapes coming from the same tensor in different iterations? If so, which operation creates these tensors?