Hi,
I'm trying to train a Mask R-CNN from torchvision on a custom dataset, but I'm running into a mismatch between the ground-truth labels from my dataset and the proposals from the network. Here is the code for my dataset (the annotations are stored in separate JSON files):
import glob
import json
import os
from pathlib import Path

import numpy as np
import skimage.draw
import skimage.io
import torch
from PIL import Image
from torch.utils.data import Dataset


class Custom_Data(Dataset):
    def __init__(self, data_dir, transforms):
        self.image_ids = []
        self.data = data_dir
        self.transforms = transforms
        # derive the image id from each annotation filename
        for filename in glob.glob(os.path.join(data_dir, '*.json')):
            nr = Path(filename).stem
            name = str(nr).split("_SHAPE")[0]
            name = name[:-1]
            self.image_ids.append(name)
        print("Length of dataset: ", len(self.image_ids))

    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, idx):
        f = self.image_ids[idx]
        # load rgb input image
        img_name = "{}.bmp".format(f)
        image_path = os.path.join(self.data, img_name)
        image = skimage.io.imread(image_path)
        height, width, channels = image.shape
        size = max([width, height])
        input_img = np.zeros([size, size, 3], dtype=np.uint8)
        # compute the center offset
        self.x_center = (input_img.shape[1] - image.shape[1]) // 2
        self.y_center = (input_img.shape[0] - image.shape[0]) // 2
        # copy the image into the center of the square result image
        input_img[self.y_center:self.y_center + image.shape[0],
                  self.x_center:self.x_center + image.shape[1]] = image
        input_img = Image.fromarray(input_img)
        # load the annotations from the json file
        annotations = json.load(open(os.path.join(self.data, "{}__SHAPES.json".format(f))))
        del annotations['version']
        del annotations['enabled']
        del annotations['imageDimensions']
        del annotations['imageBackground']
        # create masks from the annotations, one channel per annotation key
        raw_mask = np.zeros((height, width, len(annotations.keys())), dtype=np.uint8)
        class_id = []
        self.boxes = []
        i = 0
        for key in annotations.keys():
            self.class_exists = False
            p = None
            if 'shape' in annotations[key]:
                p = annotations[key]['shape']
            elif 'x' in annotations[key] and 'y' in annotations[key]:
                p = annotations[key]
            if isinstance(p, dict):
                y = p['y']
                x = p['x']
                raw_mask = self.get_targets(x, y, raw_mask, i)
            elif isinstance(p, list):
                # an instance made up of several polygon parts
                for n in p:
                    y = n['y']
                    x = n['x']
                    raw_mask = self.get_targets(x, y, raw_mask, i)
            img = np.zeros([size, size, len(annotations.keys())], dtype=np.uint8)
            # copy the masks into the center of the square result image
            img[self.y_center:self.y_center + raw_mask.shape[0],
                self.x_center:self.x_center + raw_mask.shape[1]] = raw_mask
            final_mask = img
            if self.class_exists:
                c = self.get_class_id(str(key))
                if c is not None:
                    class_id.append(c)
                c = None
            i = i + 1
        # return masks, bboxes and classes of each instance
        boxes = torch.as_tensor(self.boxes, dtype=torch.float32)
        areas = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
        area = torch.as_tensor(areas)
        classes = torch.as_tensor(class_id, dtype=torch.int64)
        masks = torch.as_tensor(final_mask, dtype=torch.uint8)
        iscrowd = torch.zeros((i,), dtype=torch.int64)
        target = {}
        target["boxes"] = boxes
        target["labels"] = classes
        target["masks"] = masks
        target["image_id"] = torch.tensor([idx])
        target["area"] = area
        target["iscrowd"] = iscrowd
        if self.transforms is not None:
            input_img, target = self.transforms(input_img, target)
        return [input_img, target]
    def get_class_id(self, name: str):
        # map the annotation key to a class id; check 'class10' before
        # 'class1', since 'class1' is a substring of 'class10'
        for class_id in range(10, 0, -1):
            if 'class{}'.format(class_id) in name:
                return class_id
        return None
    def get_targets(self, x, y, raw_mask, i):
        # drop missing points from the polygon
        y = [v for v in y if v is not None]
        x = [v for v in x if v is not None]
        # skip degenerate polygons that are thinner than one pixel
        if np.max(x) - np.min(x) < 1 or np.max(y) - np.min(y) < 1:
            self.class_exists = False
            return raw_mask
        rr, cc = skimage.draw.polygon(y, x)
        raw_mask[rr - 1, cc - 1, i] = 1
        # derive the bounding box from the rendered mask, shifted by the
        # padding offset of the square input image
        pos = np.nonzero(raw_mask[:, :, i])
        xmin = np.min(pos[1]) + self.x_center
        xmax = np.max(pos[1]) + self.x_center
        ymin = np.min(pos[0]) + self.y_center
        ymax = np.max(pos[0]) + self.y_center
        # guard against zero-width / zero-height boxes
        if xmin >= xmax:
            xmax = xmax + 1
        if ymin >= ymax:
            ymax = ymax + 1
        self.boxes.append([xmin, ymin, xmax, ymax])
        self.class_exists = True
        return raw_mask
This is adapted from the TorchVision Object Detection Finetuning Tutorial (PyTorch Tutorials 2.0.1+cu117 documentation) and seems to work well.
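For context, the annotation JSON files look roughly like this (key names and values here are only illustrative; the structure is what __getitem__ relies on: four metadata keys that get deleted, then one entry per instance whose key contains the class name and which holds the polygon as 'x'/'y' lists, either directly, under 'shape', or as a list of parts under 'shape'):

{
  "version": "1.0",
  "enabled": true,
  "imageDimensions": [1024, 768],
  "imageBackground": "...",
  "class1_instance_0": {
    "shape": {"x": [10, 50, 50, 10], "y": [20, 20, 60, 60]}
  },
  "class2_instance_0": {
    "shape": [
      {"x": [100, 140, 120], "y": [200, 200, 240]},
      {"x": [150, 170, 160], "y": [210, 210, 230]}
    ]
  },
  "class3_instance_0": {"x": [300, 340, 320], "y": [400, 400, 440]}
}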
Next is my training code:
import torch
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor

# helper modules from torchvision's references/detection scripts
import transforms as T
import utils
from engine import train_one_epoch, evaluate


def get_transform(train):
    transforms = []
    transforms.append(T.PILToTensor())
    transforms.append(T.ConvertImageDtype(torch.float))
    if train:
        # the reference transforms flip boxes and masks together with the image
        transforms.append(T.RandomHorizontalFlip(0.5))
    return T.Compose(transforms)


def get_model_instance_segmentation(model_num_classes):
    # load an instance segmentation model pre-trained on COCO
    this_model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
    # get the number of input features for the classifier
    in_features = this_model.roi_heads.box_predictor.cls_score.in_features
    # replace the pre-trained head with a new one
    this_model.roi_heads.box_predictor = FastRCNNPredictor(in_features, model_num_classes)
    # now get the number of input features for the mask classifier
    in_features_mask = this_model.roi_heads.mask_predictor.conv5_mask.in_channels
    hidden_layer = 256
    # and replace the mask predictor with a new one
    this_model.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask,
                                                            hidden_layer,
                                                            model_num_classes)
    return this_model
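To rule out the model surgery itself, one can push a synthetic, well-formed sample through the new heads. This is just a sketch (not part of my training script) and assumes the target layout torchvision expects, i.e. boxes as [N, 4] in (xmin, ymin, xmax, ymax) order, labels as [N] and masks as [N, H, W]:

# quick sanity check of the replaced heads with one synthetic sample
check_model = get_model_instance_segmentation(model_num_classes=11)
check_model.train()
dummy_img = torch.rand(3, 256, 256)
dummy_target = {
    "boxes": torch.tensor([[30.0, 40.0, 120.0, 140.0]]),
    "labels": torch.tensor([3], dtype=torch.int64),
    "masks": torch.zeros((1, 256, 256), dtype=torch.uint8),
    "image_id": torch.tensor([0]),
    "area": torch.tensor([9000.0]),
    "iscrowd": torch.tensor([0], dtype=torch.int64),
}
# in training mode the model returns a dict of losses
loss_dict = check_model([dummy_img], [dummy_target])
print({k: v.item() for k, v in loss_dict.items()})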
############################################################
# Training
############################################################
print("Current torch version: ", torch.__version__)
device = torch.device('cpu')  # if torch.cuda.is_available() else None
num_classes = 11  # 10 object classes plus the background class
dataset = Custom_Data(DATA_DIR, get_transform(train=False))
torch.manual_seed(1)
indices = torch.randperm(len(dataset)).tolist()
dataset = torch.utils.data.Subset(dataset, indices[:-50])
data_loader = torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=True,
                                          num_workers=4, collate_fn=utils.collate_fn)
model = get_model_instance_segmentation(model_num_classes=num_classes)
model.to(device)
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=1e-3, momentum=0.9, weight_decay=0.0005)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)
num_epochs = 10
for epoch in range(num_epochs):
    train_one_epoch(model, optimizer, data_loader, device=device, epoch=epoch, print_freq=10)
    lr_scheduler.step()
    evaluate(model, data_loader, device=device)
model.eval()
x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
predictions = model(x)
torch.onnx.export(model, x, "mask_rcnn.onnx", opset_version=11)
I run the code on the CPU because running on the GPU with CUDA always ends in a device-side assertion error, so I debug on the CPU.
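I suspect the CUDA assertion is the same indexing error just surfacing asynchronously on the GPU; forcing synchronous kernel launches should make the GPU traceback point at the real line:

import os
# must be set before torch initializes CUDA (e.g. at the very top of the script)
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"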
Finally, here is the error message I get:
Traceback (most recent call last):
  File "/home/.../training.py", line 74, in <module>
    train_one_epoch(model, optimizer, data_loader, device=device, epoch=epoch, print_freq=10)
  File "/home/.../engine.py", line 31, in train_one_epoch
    loss_dict = model(images, targets)
                ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/.../venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/.../venv/lib/python3.11/site-packages/torchvision/models/detection/generalized_rcnn.py", line 105, in forward
    detections, detector_losses = self.roi_heads(features, proposals, images.image_sizes, targets)
                                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/.../venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/.../venv/lib/python3.11/site-packages/torchvision/models/detection/roi_heads.py", line 755, in forward
    proposals, matched_idxs, labels, regression_targets = self.select_training_samples(proposals, targets)
                                                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/.../venv/lib/python3.11/site-packages/torchvision/models/detection/roi_heads.py", line 649, in select_training_samples
    matched_idxs, labels = self.assign_targets_to_proposals(proposals, gt_boxes, gt_labels)
                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/.../venv/lib/python3.11/site-packages/torchvision/models/detection/roi_heads.py", line 588, in assign_targets_to_proposals
    labels_in_image = gt_labels_in_image[clamped_matched_idxs_in_image]
                      ~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
IndexError: index 17 is out of bounds for dimension 0 with size 9

Process finished with exit code 1
The gt_labels tensor containing my classes looks like tensor([10, 2, 3, 1, 5, 6, 9, 4, 8]), but clamped_matched_idxs_in_image is tensor([0, 0, 0, …, 15, 16, 17]), so the matcher apparently produces indices for more ground-truth boxes than there are labels in that image.
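To narrow this down, here is a quick per-sample check of the targets my dataset returns (a sketch, using the dataset class from above; boxes, labels and mask instances must all have the same length N, because roi_heads indexes gt_labels with per-box match indices):

# every sample must satisfy len(boxes) == len(labels) == number of mask
# instances, otherwise roi_heads indexes gt_labels out of bounds
check_ds = Custom_Data(DATA_DIR, get_transform(train=False))
for idx in range(len(check_ds)):
    _, t = check_ds[idx]
    n_boxes = t["boxes"].shape[0]
    n_labels = t["labels"].shape[0]
    n_masks = t["masks"].shape[-1]  # my masks are stored as [H, W, N]
    if not (n_boxes == n_labels == n_masks):
        print("sample", idx, ":", n_boxes, "boxes,", n_labels, "labels,", n_masks, "mask channels")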
Has anyone encountered a similar issue? Please let me know if I missed any information.