I am fine-tuning Mask R-CNN from the PyTorch tutorial, with pretrained set to true, on the Cityscapes dataset. I haven't converted Cityscapes to COCO format; instead, my dataset class builds binary masks from the polygon annotations. I am using PyTorch Lightning for training. I monitored the val loss and val acc, and both looked fine for 30 epochs.
My dataset class:
class Cityscapes(Dataset):
    def __init__(self, img_dir, ann_dir, split, transform=None):
        self.categories = {'person': 1, 'car': 2, 'rider': 3, 'bus': 4, 'train': 5,
                           'truck': 6, 'motorcycle': 7, 'bicycle': 8}
        assert split in ["train", "val", "test"]
        img_dir = os.path.abspath(os.path.join(img_dir, split))
        ann_dir = os.path.abspath(os.path.join(ann_dir, split))
        self.ann_dir = ann_dir
        # image paths, and analogously the annotation (polygons.json) paths
        img_name = os.path.join(img_dir, "*", "*_leftImg8bit.png")
        self.img_paths = sorted(glob.glob(img_name))
        ann_name = os.path.join(ann_dir, "*", "*_polygons.json")
        self.annots_paths = sorted(glob.glob(ann_name))
        assert len(self.img_paths) == len(self.annots_paths)
        self.transform = transform

    def __len__(self):
        return len(self.img_paths)

    def __getitem__(self, index):
        annots_file = self.annots_paths[index]
        image_path = self.img_paths[index]
        img_read = cv2.imread(image_path)
        # OpenCV loads images in BGR order, so convert to RGB
        rgb = cv2.cvtColor(img_read, cv2.COLOR_BGR2RGB)
        with open(annots_file, 'r') as f:
            data = json.load(f)
        height = data['imgHeight']
        width = data['imgWidth']
        bboxes = []
        labels = []
        masks = []
        for item in data['objects']:
            label = item['label']
            if label not in self.categories:
                continue
            label = self.categories[label]
            # extract the bounding box coordinates from the polygon
            poly = np.array(item['polygon'], dtype=np.int32)
            poly[poly < 0] = 0
            x_min = np.min(poly[:, 0])
            y_min = np.min(poly[:, 1])
            x_max = np.max(poly[:, 0])
            y_max = np.max(poly[:, 1])
            bboxes.append([x_min, y_min, x_max, y_max])
            labels.append(label)
            # draw a binary mask for this object with cv2.fillPoly,
            # using only the labels selected in self.categories
            mask = np.zeros((height, width), dtype=np.uint8)
            cv2.fillPoly(mask, pts=[poly], color=(255,))
            masks.append(mask)
        if len(masks) == 0:
            # no objects of interest in this image: keep a single empty mask
            masks.append(np.zeros((height, width), dtype=np.uint8))
        bboxes = np.array(bboxes)
        masks = np.array(masks)
        transformed = self.transform(image=rgb, bboxes=bboxes, class_labels=labels, masks=masks)
        image_tr = transformed["image"] / 255.0
        bboxes = transformed["bboxes"]
        masks = transformed['masks']
        masks = torch.tensor(np.stack(masks, axis=0)) // 255  # transformed masks, binarized to {0, 1}
        if len(bboxes) > 0:
            bboxes = torch.stack([torch.tensor(item) for item in bboxes])
            labels = torch.stack([torch.tensor(item) for item in labels])
        else:
            bboxes = torch.zeros(0, 4)
        return image_tr, masks, bboxes, labels
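For reference, this is roughly how I sanity-check one sample before training (a quick sketch; it assumes the Albumentations pipeline ends with ToTensorV2, so the image comes back as a CHW tensor):

import matplotlib.pyplot as plt
import matplotlib.patches as patches

# grab one sample and overlay the boxes on the image next to the union of masks
image_tr, masks, bboxes, labels = cityscapes_train[0]
fig, ax = plt.subplots(1, 2, figsize=(12, 5))
ax[0].imshow(image_tr.permute(1, 2, 0).numpy())                   # CHW -> HWC for display
ax[1].imshow(masks.sum(dim=0).clamp(max=1).numpy(), cmap='gray')  # union of instance masks
for box in bboxes:
    x_min, y_min, x_max, y_max = box.tolist()
    ax[0].add_patch(patches.Rectangle((x_min, y_min), x_max - x_min, y_max - y_min,
                                      fill=False, edgecolor='red'))
plt.show()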
My model loading and accuracy calculations:
class MASKRCNN(pl.LightningModule):
    def __init__(self, n_classes, batchsize):
        super(MASKRCNN, self).__init__()
        self.n_classes = n_classes
        self.batchsize = batchsize
        self.detector = torchvision.models.detection.maskrcnn_resnet50_fpn(min_size=600, max_size=1200,
                                                                           pretrained=True)
        # replace the box predictor with a new one for n_classes
        in_features = self.detector.roi_heads.box_predictor.cls_score.in_features
        self.detector.roi_heads.box_predictor = FastRCNNPredictor(in_features, n_classes)
        # and replace the mask predictor with a new one
        in_features_mask = self.detector.roi_heads.mask_predictor.conv5_mask.in_channels
        hidden_layer = 256
        self.detector.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask, hidden_layer, n_classes)
        self.best_val_acc = 0
        self.val_acc_stack = []
        self.training_step_outputs = []
        self.log('val_loss', 100000)
        self.log('val_acc', self.best_val_acc)
        self.lr = 1e-5  # original base lr is 1e-4
        self.momentum = 0.9
        self.weight_decay = 0.0001
    def forward(self, imgs, targets=None):
        # torchvision's Mask R-CNN returns the losses during training
        # and the detections during eval
        self.detector.eval()
        return self.detector(imgs)

    def configure_optimizers(self):
        optimizer = torch.optim.SGD(self.parameters(), lr=self.lr)
        lr_scheduler = {
            'scheduler': torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.1, patience=5,
                                                                    threshold=0.0001, min_lr=0, eps=1e-08),
            'monitor': 'val_loss'}
        return [optimizer], [lr_scheduler]

    def on_before_zero_grad(self, optimizer):
        print("I am calling the optimizer now")

    def train_dataloader(self):
        # build a shuffled list of indices covering whole batches only
        num_train_sample_batches = len(cityscapes_train) // self.batchsize
        temp_indices = np.arange(len(cityscapes_train))
        np.random.shuffle(temp_indices)
        sample_indices = []
        for i in range(num_train_sample_batches):
            batch = temp_indices[self.batchsize * i:self.batchsize * (i + 1)]
            sample_indices.extend(batch)
        return torch.utils.data.DataLoader(cityscapes_train, batch_size=self.batchsize, sampler=sample_indices,
                                           shuffle=False, collate_fn=collate_fn)
    def training_step(self, batch, batch_idx):
        imgs = list(image.cuda() for image in batch[0])
        targets = []
        for mask, boxes, labels in zip(batch[1], batch[2], batch[3]):
            target = {}
            target["boxes"] = boxes.float().cuda()
            target["labels"] = torch.as_tensor(labels, dtype=torch.int64).cuda()
            target["masks"] = mask.cuda()
            targets.append(target)
        # in training mode the detector takes both images and targets
        # and returns a dict of losses
        loss_dict = self.detector(imgs, targets)
        loss = sum(loss for loss in loss_dict.values())
        self.training_step_outputs.append(loss)
        print('train_loss: {}'.format(loss))
        return {"loss": loss}

    def on_train_epoch_end(self):
        epoch_loss = torch.mean(torch.stack(self.training_step_outputs))
        print('epoch_loss: {}'.format(epoch_loss))
        self.training_step_outputs.clear()
    def validation_step(self, batch, batch_idx):
        img, mask, boxes, label = batch
        preds = self.forward(img)
        # keep only confident detections and binarize the predicted soft masks
        preds[0]['masks'] = preds[0]['masks'][preds[0]['scores'] > 0.5]
        print(preds[0]['scores'])
        preds[0]['masks'][preds[0]['masks'] > 0.5] = 1
        preds[0]['masks'][preds[0]['masks'] <= 0.5] = 0
        self.val_acc_stack.append(self.accuracy(mask[0], preds[0]['masks'].type(torch.uint8)))

    def on_validation_epoch_end(self):
        temp = torch.mean(torch.stack(self.val_acc_stack))
        self.log('val_loss', 1 - temp)  # logged for the model checkpoint
        self.log('val_acc', temp)
        if self.best_val_acc < temp:
            self.best_val_acc = temp
            self.best_val_acc_epoch = self.trainer.current_epoch
        self.val_acc_stack = []
        print('Validation IOU: ', temp)

    def mask_iou(self, src_masks, tgt_masks):
        # src masks (ground truth) have shape N x H x W
        # tgt masks (predictions) have shape M x 1 x H x W
        maskiou_matrix = torch.zeros(len(src_masks), len(tgt_masks)).cuda()
        for src_index in range(len(src_masks)):
            src_mask = src_masks[src_index]
            for tgt_index in range(len(tgt_masks)):
                tgt_mask = tgt_masks[tgt_index][0]
                maskiou_matrix[src_index, tgt_index] = float(torch.sum(torch.bitwise_and(src_mask, tgt_mask))) / float(
                    torch.sum(torch.bitwise_or(src_mask, tgt_mask)))
        return maskiou_matrix

    def accuracy(self, src_masks, pred_masks, iou_threshold=0.5):
        """
        Similar to (but not identical with) the accuracy used in the evaluator.
        """
        total_gt = len(src_masks)
        total_pred = len(pred_masks)
        if total_gt > 0 and total_pred > 0:
            # define the matcher and the distance matrix based on mask IoU
            matcher = Matcher(iou_threshold, iou_threshold, allow_low_quality_matches=False)
            match_quality_matrix = self.mask_iou(src_masks, pred_masks)
            results = matcher(match_quality_matrix)
            true_positive = torch.count_nonzero(results.unique() != -1)
            matched_elements = results[results > -1]
            # in Matcher, a pred element can be matched only twice
            false_positive = torch.count_nonzero(results == -1) + (
                len(matched_elements) - len(matched_elements.unique()))
            false_negative = total_gt - true_positive
            return true_positive / (true_positive + false_positive)
        elif total_gt == 0:
            if total_pred > 0:
                return torch.tensor(0.).cuda()
            return torch.tensor(1.).cuda()
        else:  # total_gt > 0 and total_pred == 0
            return torch.tensor(0.).cuda()
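Before training I also check that the targets look the way torchvision's Mask R-CNN expects them: labels in 1..8 (0 is reserved for background, hence n_classes=9), boxes with positive width and height, and binary masks. A rough check, assuming the collate_fn above returns per-image lists:

# hypothetical target sanity check over one batch
loader = torch.utils.data.DataLoader(cityscapes_train, batch_size=2, collate_fn=collate_fn)
imgs, masks, boxes, labels = next(iter(loader))
for b, l, m in zip(boxes, labels, masks):
    if len(b) == 0:  # image without any objects of interest
        continue
    assert (b[:, 2] > b[:, 0]).all() and (b[:, 3] > b[:, 1]).all(), "degenerate box"
    assert l.min() >= 1 and l.max() <= 8, "label outside [1, 8]; 0 is background"
    assert set(m.unique().tolist()) <= {0, 1}, "masks must be binary 0/1"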
val_dataloader = torch.utils.data.DataLoader(cityscapes_val, batch_size=1, shuffle=False, collate_fn=collate_fn)

import os
import torchvision.transforms as T
import PIL.Image as I

detector = MASKRCNN(n_classes=9, batchsize=2)
NET_FOLDER = '.'
weights_file = 'best_baseline'
if os.path.exists(NET_FOLDER + '/' + weights_file + '.ckpt'):
    detector.load_state_dict(torch.load(NET_FOLDER + '/' + weights_file + '.ckpt')['state_dict'])
else:
    if not os.path.exists(NET_FOLDER):
        os.mkdir(NET_FOLDER, 0o777)

from pytorch_lightning.callbacks.early_stopping import EarlyStopping

early_stop_callback = EarlyStopping(monitor='val_acc', min_delta=0.00, patience=10, verbose=False, mode='max')
checkpoint_callback = ModelCheckpoint(monitor='val_loss', dirpath=NET_FOLDER, filename=weights_file)
trainer = Trainer(accelerator='gpu', devices=1, max_epochs=30, deterministic=False,
                  callbacks=[checkpoint_callback, early_stop_callback], reload_dataloaders_every_n_epochs=1)
trainer.fit(detector, val_dataloaders=val_dataloader)
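And this is roughly how I run inference afterwards (a minimal sketch; 'sample.png' is a placeholder, and I assume the only preprocessing needed is the same divide-by-255 scaling used in training):

# reload the best checkpoint and score a single image
best = torch.load(NET_FOLDER + '/' + weights_file + '.ckpt')
detector.load_state_dict(best['state_dict'])
detector.eval().cuda()

img = cv2.cvtColor(cv2.imread('sample.png'), cv2.COLOR_BGR2RGB)  # placeholder test image
img_t = torch.from_numpy(img).permute(2, 0, 1).float() / 255.0   # HWC -> CHW, scaled like training
with torch.no_grad():
    pred = detector([img_t.cuda()])[0]
print(pred['scores'], pred['labels'])  # near-zero scores here reproduce the problem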
However, when doing inference (detector.eval()) on a sample image, the model returns almost-zero scores and wrong predictions. I know 30 epochs might not be enough, but it should at least get the classes right since I am starting from pretrained weights.
Where am I going wrong?
What should I look at to debug this?
Am I passing the dataset in the wrong manner?
This is the output of batch = next(iter(train_dataloader)):
(tensor([[[[0.1294, 0.1294, 0.1294, ..., 0.6863, 0.6863, 0.6745],
[0.1294, 0.1294, 0.1294, ..., 0.6863, 0.6784, 0.6784],
[0.1373, 0.1294, 0.1216, ..., 0.6824, 0.6863, 0.6863],
...,
[0.2000, 0.2078, 0.2039, ..., 0.3294, 0.3255, 0.3255],
[0.2078, 0.2157, 0.2157, ..., 0.3137, 0.3216, 0.3216],
[0.2118, 0.2157, 0.2157, ..., 0.3137, 0.3216, 0.3216]],
[[0.1725, 0.1725, 0.1725, ..., 0.8118, 0.8118, 0.8078],
[0.1765, 0.1765, 0.1765, ..., 0.8118, 0.8078, 0.8000],
[0.1804, 0.1765, 0.1686, ..., 0.8039, 0.8000, 0.8039],
...,
[0.2627, 0.2706, 0.2706, ..., 0.3961, 0.3961, 0.3882],
[0.2667, 0.2745, 0.2784, ..., 0.3804, 0.3843, 0.3843],
[0.2745, 0.2784, 0.2784, ..., 0.3804, 0.3843, 0.3882]],
[[0.1294, 0.1333, 0.1333, ..., 0.7961, 0.7961, 0.7843],
[0.1333, 0.1373, 0.1373, ..., 0.8000, 0.7922, 0.7804],
[0.1294, 0.1333, 0.1294, ..., 0.7961, 0.7882, 0.7843],
...,
[0.2314, 0.2392, 0.2314, ..., 0.3255, 0.3255, 0.3255],
[0.2353, 0.2431, 0.2431, ..., 0.3137, 0.3216, 0.3176],
[0.2431, 0.2510, 0.2471, ..., 0.3137, 0.3216, 0.3216]]],
[[[0.8980, 0.9059, 0.8941, ..., 0.1020, 0.1255, 0.1843],
[0.8196, 0.8510, 0.8627, ..., 0.0980, 0.1137, 0.1137],
[0.4745, 0.5020, 0.6078, ..., 0.1529, 0.1412, 0.1176],
...,
[0.1608, 0.1608, 0.1686, ..., 0.1569, 0.1608, 0.1608],
[0.1608, 0.1647, 0.1686, ..., 0.1490, 0.1608, 0.1608],
[0.1647, 0.1725, 0.1686, ..., 0.1490, 0.1608, 0.1608]],
[[0.9765, 0.9922, 0.9922, ..., 0.1529, 0.1804, 0.2941],
[0.9059, 0.9412, 0.9686, ..., 0.1569, 0.1725, 0.2118],
[0.5804, 0.6863, 0.7882, ..., 0.2157, 0.1961, 0.1608],
...,
[0.2118, 0.2118, 0.2118, ..., 0.2039, 0.2078, 0.2039],
[0.2118, 0.2157, 0.2196, ..., 0.2000, 0.2078, 0.2078],
[0.2196, 0.2196, 0.2235, ..., 0.2000, 0.2078, 0.2078]],
[[0.9294, 0.9294, 0.9412, ..., 0.1176, 0.1529, 0.2431],
[0.8549, 0.8824, 0.9098, ..., 0.1137, 0.1373, 0.1686],
[0.5647, 0.6588, 0.7373, ..., 0.1529, 0.1412, 0.1216],
...,
[0.1882, 0.1882, 0.1843, ..., 0.1765, 0.1725, 0.1725],
[0.1843, 0.1843, 0.1922, ..., 0.1686, 0.1686, 0.1725],
[0.1882, 0.1882, 0.1961, ..., 0.1686, 0.1686, 0.1725]]]]), [tensor([[[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
...,
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]],
[[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
...,
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]],
[[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
...,
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]],
...,
[[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
...,
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]],
[[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
...,
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]],
[[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
...,
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]]], dtype=torch.uint8), tensor([[[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
...,
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]],
[[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
...,
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]],
[[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
...,
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]],
...,
[[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
...,
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]],
[[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
...,
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]],
[[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
...,
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]]], dtype=torch.uint8)], [tensor([[7.2715e+02, 2.4609e+02, 8.4316e+02, 3.1172e+02],
[1.0219e+03, 2.5137e+02, 1.1150e+03, 2.9414e+02],
[1.0219e+03, 2.5137e+02, 1.1150e+03, 2.9414e+02],
[5.8594e-01, 2.8594e+02, 5.0391e+01, 3.0820e+02],
[2.2734e+02, 2.6133e+02, 3.1406e+02, 2.9238e+02],
[2.4258e+02, 2.6719e+02, 3.3340e+02, 3.0117e+02],
[6.5039e+01, 2.8184e+02, 1.0898e+02, 2.9297e+02],
[5.3906e+01, 2.8477e+02, 1.0312e+02, 3.0059e+02],
[1.0781e+02, 2.6367e+02, 1.9160e+02, 2.9824e+02],
[1.0219e+03, 2.5137e+02, 1.1150e+03, 2.9414e+02],
[7.1484e+02, 2.6133e+02, 7.3887e+02, 2.8711e+02],
[6.6504e+02, 2.6133e+02, 6.8027e+02, 2.9180e+02],
[6.5273e+02, 2.6016e+02, 6.7207e+02, 2.9180e+02],
[7.2715e+02, 2.4609e+02, 8.4316e+02, 3.1172e+02],
[9.4336e+01, 2.5547e+02, 1.2715e+02, 3.2109e+02],
[1.0020e+02, 2.7422e+02, 1.2246e+02, 3.2051e+02]], dtype=torch.float64), tensor([[ 506.2500, 249.0234, 520.8984, 263.6719],
[ 435.9375, 244.9219, 451.1719, 260.1562],
[ 744.1406, 250.7812, 779.2969, 275.3906],
[ 66.7969, 237.3047, 100.1953, 255.4688],
[ 192.1875, 250.7812, 244.3359, 272.4609],
[ 247.2656, 243.7500, 287.1094, 259.5703],
[ 280.0781, 244.3359, 321.6797, 262.5000],
[ 312.8906, 246.0938, 351.5625, 270.1172],
[ 984.9609, 260.1562, 1087.5000, 314.0625],
[ 789.8438, 256.6406, 826.7578, 283.0078],
[1160.1562, 261.9141, 1200.0000, 333.3984],
[ 476.9531, 246.0938, 516.2109, 275.9766],
[ 527.9297, 247.2656, 554.8828, 266.6016],
[ 549.0234, 247.2656, 577.1484, 271.2891],
[ 564.8438, 243.1641, 608.2031, 279.4922]], dtype=torch.float64)], [tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 8, 1, 1, 2, 3, 8]), tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])])