Hi everyone! I am trying to build an object detection model using the RetinaNet architecture (`torchvision.models.detection.retinanet_resnet50_fpn`), but my model is not learning at all. It raises no errors, but at inference time the model predicts the same bounding boxes with the same labels and the same confidence scores for all images (or sometimes even empty lists).
Main parts of my code:
# Class names listed in label order: the position of a name is its category id.
category_id_to_name = dict(enumerate([
    'car',
    'number plate',
    'blur number plate',
    'two wheeler',
    'auto',
    'bus',
    'truck',
]))
# All known category ids, in ascending order.
category_ids = list(category_id_to_name)
# function for converting bounding boxes from yolo format to pascal voc format
def convert_to_voc(yolo_box, image_height, image_width):
    """Convert one YOLO box to Pascal VOC pixel corners.

    Args:
        yolo_box: Four values ``(x_center, y_center, width, height)``; each is
            multiplied by the matching image dimension, i.e. treated as a
            fraction of the image size.
        image_height: Image height in pixels.
        image_width: Image width in pixels.

    Returns:
        ``[x_min, y_min, x_max, y_max]`` as a list of ints, truncated toward
        zero and clamped to ``[0, image_width]`` / ``[0, image_height]``.
    """
    x_c, y_c, w, h = yolo_box
    x_tl = (x_c - w / 2) * image_width
    y_tl = (y_c - h / 2) * image_height
    x_br = x_tl + w * image_width
    y_br = y_tl + h * image_height

    corners = np.array([x_tl, y_tl, x_br, y_br], dtype=np.float64)
    # A box that overhangs the image border would otherwise produce negative
    # or oversized coordinates, so clamp every corner to the image rectangle.
    upper = np.array(
        [image_width, image_height, image_width, image_height],
        dtype=np.float64,
    )
    corners = np.clip(corners, 0.0, upper)
    # NOTE: the integer cast truncates, so a box narrower than one pixel can
    # still collapse to zero width or height.
    return corners.astype(np.int64).tolist()
# class for building dataset
class Traffic_Vehicles_Dataset(Dataset):
    """Detection dataset pairing image files with per-image annotation files.

    For the image ``<name>.<ext>`` the annotations are read from
    ``<boxes_dir>/<name>.txt``. Each row holds five space-separated numbers:
    the class label followed by the four box values handed to
    ``convert_to_voc``.

    Args:
        images_filenames: Sequence of image file names (relative to
            ``images_dir``).
        images_dir: Directory that contains the images.
        boxes_dir: Directory that contains the annotation ``.txt`` files.
        transform: Optional callable invoked as
            ``transform(image=..., bboxes=..., labels=...)`` and returning a
            mapping with the keys ``'image'``, ``'bboxes'`` and ``'labels'``.
    """

    def __init__(self, images_filenames, images_dir, boxes_dir, transform=None):
        self.images_filenames = images_filenames
        self.images_dir = images_dir
        self.boxes_dir = boxes_dir
        self.transform = transform

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.images_filenames)

    @staticmethod
    def _build_target(boxes, labels):
        """Pack boxes and labels into a ``{'boxes', 'labels'}`` tensor dict."""
        # Reshape in NumPy so that an empty box list becomes a (0, 4) array
        # instead of a flat (0,) one.
        box_array = np.asarray(boxes, dtype=np.float32).reshape(-1, 4)
        label_array = np.asarray(labels, dtype=np.int64).reshape(-1)
        return {'boxes': torch.as_tensor(box_array, dtype=torch.float32),
                'labels': torch.as_tensor(label_array, dtype=torch.int64)}

    def __getitem__(self, index):
        """Load one sample.

        Returns:
            ``(image, target)`` where ``image`` is a float tensor scaled by
            ``1 / 255`` and ``target`` is a dict with ``'boxes'`` (float32,
            shape ``(N, 4)``) and ``'labels'`` (int64, shape ``(N,)``).

        Raises:
            FileNotFoundError: If the image cannot be read by OpenCV.
        """
        image_filename = self.images_filenames[index]
        image_path = os.path.join(self.images_dir, image_filename)
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError('Could not read image: {}'.format(image_path))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image_height, image_width = image.shape[0], image.shape[1]

        # splitext copes with extensions of any length (.jpg, .jpeg, ...).
        stem = os.path.splitext(image_filename)[0]
        annotations = np.loadtxt(os.path.join(self.boxes_dir, stem + '.txt'), delimiter=' ')
        if annotations.size == 0:
            # Image without objects: keep a well-formed, empty (0, 5) table.
            annotations = annotations.reshape(0, 5)
        elif annotations.ndim < 2:
            # A single annotation row is loaded as a 1-D array.
            annotations = annotations[np.newaxis, :]
        labels = annotations[:, 0]
        boxes = [convert_to_voc(box, image_width=image_width, image_height=image_height)
                 for box in annotations[:, 1:]]

        if self.transform:
            transformed = self.transform(image=image, bboxes=boxes, labels=labels)
            image = transformed['image']
            boxes = transformed['bboxes']
            labels = transformed['labels']

        if not torch.is_tensor(image):
            # No tensor conversion happened in the transform (or there is no
            # transform at all): go from HWC ndarray to CHW tensor here.
            image = torch.from_numpy(np.ascontiguousarray(image)).permute(2, 0, 1)
        # Cast first so the division is a float division for integer images.
        image = image.float().div(255)

        return image, self._build_target(boxes, labels)
# Training-time augmentation pipeline. Bounding boxes are supplied in
# 'pascal_voc' format and their class ids travel in the 'labels' field.
# NOTE(review): the HueSaturationValue limits of 0.1 look very small if the
# images are 8-bit - confirm the intended magnitude.
train_transform = A.Compose(
    [
        A.Resize(256, 256),
        A.HueSaturationValue(
            hue_shift_limit=0.1,
            sat_shift_limit=0.1,
            val_shift_limit=0.1,
        ),
        A.RGBShift(r_shift_limit=10, g_shift_limit=10, b_shift_limit=10),
        A.RandomBrightnessContrast(brightness_limit=0.1, contrast_limit=0.1),
        A.HorizontalFlip(p=0.3),
        ToTensorV2(),
    ],
    bbox_params=A.BboxParams(
        format='pascal_voc',
        min_area=0,
        min_visibility=0,
        label_fields=['labels'],
    ),
)

# Training split wrapped in the dataset class, with the pipeline above attached.
train_dataset = Traffic_Vehicles_Dataset(
    train_images_filenames,
    train_images_dir,
    train_boxes_dir,
    transform=train_transform,
)
# custom collate_fn function to allow usage of bounding boxes with different size in a batch
def collate_fn(batch):
    """Regroup a batch of samples field by field, without stacking anything.

    A batch ``[(image_0, target_0), (image_1, target_1), ...]`` becomes
    ``((image_0, image_1, ...), (target_0, target_1, ...))``; an empty batch
    becomes ``()``.
    """
    per_field = zip(*batch)
    return tuple(per_field)
# Shuffled batches of 16 samples, regrouped per field by collate_fn.
train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    collate_fn=collate_fn,
)

# First CUDA device when one is available, CPU otherwise.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# RetinaNet (ResNet-50 FPN) with 7 output classes, built with
# pretrained_backbone=True and pretrained=False, then moved to the device.
model = models.detection.retinanet_resnet50_fpn(
    pretrained=False,
    pretrained_backbone=True,
    num_classes=7,
    trainable_backbone_layers=5,
).to(device)

# AdamW over every parameter that requires a gradient.
# NOTE(review): lr=0.001 may be too high for this detector and could explain
# the stalled loss - try a smaller value and confirm.
params = list(filter(lambda p: p.requires_grad, model.parameters()))
optimizer = torch.optim.AdamW(params, lr=0.001, weight_decay=0.0001)
# train loop
def train_network(model, optimizer, n_epochs):
    """Train ``model`` for ``n_epochs`` passes over the module-level ``train_loader``.

    Batches are moved to the module-level ``device`` before the forward pass.
    In training mode the model is called as ``model(images, targets)`` and is
    expected to return a dict of loss terms, which are summed into the value
    that is back-propagated.

    Args:
        model: Detection model to optimise (updated in place).
        optimizer: Optimizer already bound to the model's parameters.
        n_epochs: Number of full passes over ``train_loader``.

    Returns:
        ``(model, losses_history)`` where ``losses_history`` is a list with
        one Python float per epoch: the sum of the batch losses of that epoch.
    """
    start = time.time()
    losses_history = []
    progress_bar = trange(n_epochs, desc='Epoch:')
    model.train()
    for _ in progress_bar:
        epoch_loss = 0.0
        for images, targets in tqdm(train_loader, leave=False):
            images = [image.to(device) for image in images]
            targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
            loss_dict = model(images, targets)
            losses = sum(loss_dict.values())
            optimizer.zero_grad()
            losses.backward()
            optimizer.step()
            # .item() detaches the value from the autograd graph. Accumulating
            # the tensor itself would keep autograd history alive for the
            # whole epoch and store tensors in the history.
            epoch_loss += losses.item()
        losses_history.append(epoch_loss)
        progress_bar.set_description('Loss: {:.3f}'.format(epoch_loss))
    training_time = time.time() - start
    print('Training complete in {:.0f}m {:.0f}s'.format(training_time // 60, training_time % 60))
    return model, losses_history
# training
# Run 20 epochs; keep the trained model and the per-epoch loss values.
model, losses_history = train_network(
    model=model,
    optimizer=optimizer,
    n_epochs=20,
)
Sometimes it predicts weird outputs, and sometimes it predicts nothing at all (empty lists for boxes, labels and scores). I visualized the images, checked that the dataloader works correctly, and visualized the augmentations - everything looks normal. But when it comes to training and inference, the model is not learning at all (the loss decreases to roughly 70, then stops decreasing, and in the end the model gives really bad results).
What am I doing wrong? Why isn’t the model learning?
P.S. And yes, I know that the num_classes parameter of the RetinaNet class requires a background class to be included. But even when I set num_classes to 8 (7 classes + background) instead of 7, it makes no difference - the model is still not learning.