Hi everyone!
I am trying to train an instance segmentation model on a custom dataset, following the PennFudan tutorial (TorchVision Object Detection Finetuning Tutorial — PyTorch Tutorials 1.7.1 documentation). There are 10 classes, none of which are part of the categories the model was pretrained on.
When I try to run my code, I get the following error from line 44 of roi_align.py:
File "D:\miniconda3\envs\pytorch-env\lib\site-packages\torchvision\ops\roi_align.py", line 44, in roi_align
return torch.ops.torchvision.roi_align(input, rois, spatial_scale,
IndexError: Dimension out of range (expected to be in range of [-3, 2], but got 3)
By printing input.shape and its length just before the return statement on line 43 of the same file, I get the following shapes for input:
```
torch.Size([1, 256, 200, 272]) has length : 4
torch.Size([1, 256, 100, 136]) has length : 4
torch.Size([1, 256, 50, 68]) has length : 4
torch.Size([1, 256, 25, 34]) has length : 4
torch.Size([1, 256, 200, 272]) has length : 4
torch.Size([1, 256, 100, 136]) has length : 4
torch.Size([1, 256, 50, 68]) has length : 4
torch.Size([1, 256, 25, 34]) has length : 4
torch.Size([1200, 1, 1066]) has length : 3
```
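(The debug line I added was roughly the following; the exact wording is mine, it just dumps the shape and its length:)

```python
print(input.shape, 'has length :', len(input.shape))
```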
It seems like that last 3-D tensor is my problem, since roi_align's documentation states that input should have shape Tensor[N, C, H, W], i.e. be 4-dimensional.
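A minimal, self-contained check of just that shape constraint behaves the same way (the tensors here are dummies shaped like my prints, not my real data):

```python
import torch
from torchvision.ops import roi_align

feats = torch.randn(1, 256, 200, 272)            # 4-D [N, C, H, W]: works
rois = torch.tensor([[0., 10., 10., 50., 50.]])  # [batch_idx, x1, y1, x2, y2]
print(roi_align(feats, rois, output_size=(7, 7)).shape)  # torch.Size([1, 256, 7, 7])

bad = torch.randn(1200, 1, 1066)                 # 3-D, like my last print
roi_align(bad, rois, output_size=(7, 7))         # raises IndexError: Dimension out of range
```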
My problem looks similar to what GideonsMarch had in this post: Dimension out of range (expected to be in range of [-3, 2], but got 3), so I guess there is a problem with my Dataset, but I couldn't find it.
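For comparison, the PennFudan tutorial builds the `masks` entry of the target with one binary channel per object, roughly like this (adapted from the tutorial's `__getitem__`):

```python
mask = np.array(Image.open(mask_path))
obj_ids = np.unique(mask)[1:]           # first id is the background, drop it
masks = mask == obj_ids[:, None, None]  # shape [num_objs, H, W]
```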
Here is my source code:
```python
import glob
import numpy as np
import torch
import json
import torchvision
import torchvision.transforms
from pathlib import Path
from skimage import io, transform
from PIL import Image

# Specific imports
from torch.utils.data.dataset import Dataset
from torchvision.models.detection.mask_rcnn import maskrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
from torchvision.models.detection.rpn import AnchorGenerator
from vision.references.detection.engine import train_one_epoch, evaluate
import vision.references.detection.utils as utils
import vision.references.detection.transforms as T


class CupDataset(Dataset):
    """Cup Dataset"""

    def __init__(self, root_dir, transforms=None):
        """
        Args:
            root_dir (string): Directory with all the images.
            transforms (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.root_dir = Path(root_dir)
        self.transforms = transforms
        self.idxs = glob.glob((self.root_dir / '*.bmp').__str__())
        self.labels = {'dominante': 1, 'naodominante': 2,
                       'paretico_pre': 3, 'naoparetico_pre': 4,
                       'naoparetico_3d': 5, 'naoparetico_30d': 6,
                       'paretico_3d': 7, 'paretico_30d': 8,
                       'naoparetico_90d': 9, 'paretico_90d': 10}

    def __len__(self):
        return len(self.idxs)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        # each .bmp image has a .jpg mask and a labelme .json next to it
        img = Image.open(self.idxs[idx]).convert("RGB")
        mask_path = self.idxs[idx][:-3] + 'jpg'
        json_path = self.idxs[idx][:-3] + 'json'
        with open(json_path) as f:
            j = json.load(f)
        boxes = []
        # shapes[0] is the polygon (used for its label),
        # shapes[1] is the bounding rectangle
        label = (self.labels[j['shapes'][0]['label']],)
        xmin, ymin = j['shapes'][1]['points'][0]
        xmax, ymax = j['shapes'][1]['points'][1]
        boxes.append([xmin, ymin, xmax, ymax])
        iscrowd = torch.zeros(1, dtype=torch.int64)
        mask = Image.open(mask_path)
        mask = np.array(mask)[:, :, 0]
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        # there is only one object per image
        labels = torch.as_tensor(label, dtype=torch.int64)
        masks = torch.as_tensor(mask, dtype=torch.uint8)
        area = torch.as_tensor((xmax - xmin) * (ymax - ymin), dtype=torch.float32)

        target = {}
        target['labels'] = labels
        target['masks'] = masks
        target["boxes"] = boxes
        target["image_id"] = torch.tensor([idx])
        target["area"] = area
        target["iscrowd"] = iscrowd

        if self.transforms is not None:
            img, target = self.transforms(img, target)
        return img, target


class ToTensor(object):
    """Convert ndarrays in sample to Tensors."""

    def __call__(self, sample):
        image, mask = sample[0], sample[1]['mask']
        ret = sample[1]
        mask = torch.from_numpy(mask)
        image = image.transpose((2, 0, 1))
        return torch.from_numpy(image), torch.from_numpy(mask)

    def __repr__(self):
        return self.__class__.__name__ + '()'


def get_transform(train):
    transforms = []
    transforms.append(T.ToTensor())
    if train:
        transforms.append(T.RandomHorizontalFlip(0.5))
    return T.Compose(transforms)


class CupModel():
    """
    Model for the network:
    deciding between finetuning and feature extraction.
    """

    def __init__(self):
        self.model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
        self.num_classes = 10
        self.batch_size = 1
        self.num_epochs = 200
        self.hidden_layer = 256
        # replace the box and mask heads so they predict num_classes outputs
        in_features = self.model.roi_heads.box_predictor.cls_score.in_features
        in_features_mask = self.model.roi_heads.mask_predictor.conv5_mask.in_channels
        self.model.roi_heads.box_predictor = FastRCNNPredictor(in_features, self.num_classes)
        self.model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask,
                                                                self.hidden_layer,
                                                                self.num_classes)
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.model.to(self.device)
        params = [p for p in self.model.parameters() if p.requires_grad]
        self.optimizer = torch.optim.SGD(params,
                                         lr=0.005,
                                         momentum=0.9,
                                         weight_decay=0.0005)
        self.lr_scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer,
                                                            step_size=3,
                                                            gamma=0.1)

    def train(self, train_dataset, test_dataset):
        data_loader_train = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=4,
            collate_fn=utils.collate_fn)
        data_loader_test = torch.utils.data.DataLoader(
            test_dataset,
            batch_size=1,
            shuffle=False,
            num_workers=4,
            collate_fn=utils.collate_fn)
        for epoch in range(self.num_epochs):
            # train for one epoch, printing every 10 iterations
            train_one_epoch(self.model,
                            self.optimizer,
                            data_loader_train,
                            self.device,
                            epoch,
                            print_freq=10)
            # update the learning rate
            self.lr_scheduler.step()
            # evaluate on the test dataset
            evaluate(self.model, data_loader_test, device=self.device)
        print("That's it!")


if __name__ == "__main__":
    cup = CupDataset((Path.cwd() / 'Data').__str__())
    train_dataset = CupDataset((Path.cwd() / 'Data' / 'train_new').__str__(),
                               transforms=get_transform(train=True))
    test_dataset = CupDataset((Path.cwd() / 'Data' / 'test').__str__(),
                              transforms=get_transform(train=False))
    cupmodel = CupModel()
    cupmodel.train(train_dataset, test_dataset)
```
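While debugging, I also tried a quick sanity check on a single sample (a minimal sketch, run in the same session as the script above; the expected shapes in the comments are what the tutorial's targets have). I wonder whether my `masks` entry is missing the leading instance dimension:

```python
ds = CupDataset((Path.cwd() / 'Data' / 'train_new').__str__(),
                transforms=get_transform(train=True))
img, target = ds[0]
print(img.shape)               # tutorial expects [3, H, W]
print(target['boxes'].shape)   # tutorial expects [num_objs, 4]
print(target['labels'].shape)  # tutorial expects [num_objs]
print(target['masks'].shape)   # tutorial expects [num_objs, H, W]
```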
Although I am not allowed to upload the .bmp files (user data is involved), I am uploading the mask image and the .json file with the polygon (created with labelme).
JSON file:
```json
{
  "version": "4.5.6",
  "flags": {},
  "shapes": [
    {
      "label": "dominante",
      "points": [
        [597.0, 396.0], [658.0, 387.0], [722.0, 384.0], [760.0, 395.0],
        [808.0, 412.0], [802.0, 435.0], [792.0, 453.0], [768.0, 462.0],
        [764.0, 485.0], [743.0, 496.0], [701.0, 503.0], [701.0, 513.0],
        [667.0, 541.0], [614.0, 607.0], [597.0, 649.0], [588.0, 733.0],
        [646.0, 780.0], [694.0, 768.0], [722.0, 766.0], [776.0, 747.0],
        [806.0, 747.0], [846.0, 756.0], [865.0, 773.0], [871.0, 789.0],
        [864.0, 808.0], [756.0, 851.0], [738.0, 864.0], [639.0, 871.0],
        [584.0, 874.0], [542.0, 870.0], [498.0, 860.0], [444.0, 800.0],
        [420.0, 758.0], [398.0, 698.0], [391.0, 639.0], [366.0, 504.0],
        [372.0, 440.0], [380.0, 406.0], [408.0, 362.0], [450.0, 328.0],
        [520.0, 279.0], [560.0, 257.0], [612.0, 243.0], [712.0, 238.0],
        [734.0, 255.0], [742.0, 286.0], [731.0, 319.0], [692.0, 343.0],
        [614.0, 383.0]
      ],
      "group_id": null,
      "shape_type": "polygon",
      "flags": {}
    },
    {
      "label": "dominante",
      "points": [
        [355.51851851851853, 228.85185185185185],
        [885.1481481481482, 880.7037037037037]
      ],
      "group_id": null,
      "shape_type": "rectangle",
      "flags": {}
    }
  ],
  "imagePath": "C2dominanteD2.bmp",
  "imageData": null,
  "imageHeight": 1200,
  "imageWidth": 1600
}
```
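For clarity, this is how my `__getitem__` reads that file: `shapes[0]` is the polygon (only its label is used) and `shapes[1]` is the rectangle that becomes the box. A minimal sketch of that parsing, assuming the file above is saved as `C2dominanteD2.json`:

```python
import json

with open('C2dominanteD2.json') as f:
    j = json.load(f)

label = j['shapes'][0]['label']                        # 'dominante'
(xmin, ymin), (xmax, ymax) = j['shapes'][1]['points']  # rectangle corners
print(label, [xmin, ymin, xmax, ymax])
```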
I would appreciate any advice on how to tackle this error or how to debug it. Thanks in advance!