RetinaNet is not learning

Hi everyone! I am trying to build an object detection model using the RetinaNet architecture (torchvision.models.detection.retinanet_resnet50_fpn), but my model is not learning at all. Training raises no errors, but at inference the model predicts the same bounding boxes with the same labels and the same confidence scores for every image (or sometimes returns empty lists).

Main parts of my code:

# Human-readable class names; a name's position in the list is its integer label id.
category_id_to_name = dict(enumerate([
    'car',
    'number plate',
    'blur number plate',
    'two wheeler',
    'auto',
    'bus',
    'truck',
]))
# All label ids, in ascending order.
category_ids = list(category_id_to_name)

# function for converting bounding boxes from yolo format to pascal voc format
def convert_to_voc(yolo_box, image_height, image_width):
  """Convert one YOLO box to pixel corner coordinates (Pascal VOC order).

  Args:
    yolo_box: four values ``(x_center, y_center, width, height)``, each
      expressed as a fraction of the image size.
    image_height: image height in pixels.
    image_width: image width in pixels.

  Returns:
    A list ``[x_min, y_min, x_max, y_max]`` of ``np.int64`` pixel
    coordinates. Fractional pixels are truncated and every corner is clamped
    to the image rectangle ``[0, image_width] x [0, image_height]``.
  """
  x_c, y_c, w, h = yolo_box
  # Same arithmetic order as before so in-range boxes give identical results.
  x_tl = (x_c - w / 2) * image_width
  y_tl = (y_c - h / 2) * image_height
  x_br = x_tl + w * image_width
  y_br = y_tl + h * image_height
  corners = np.array([x_tl, y_tl, x_br, y_br], dtype=np.float64)
  # Boxes that touch or cross the image border can land slightly outside the
  # image (negative or larger than the image size); clamp them so downstream
  # consumers never see out-of-image coordinates.
  upper = np.array([image_width, image_height, image_width, image_height],
                   dtype=np.float64)
  corners = np.clip(corners, 0, upper)
  return list(corners.astype(np.int64))

# class for building dataset
class Traffic_Vehicles_Dataset(Dataset):
  """Object-detection dataset pairing each image with a YOLO-format label file.

  For an image ``name.ext`` in ``images_dir`` the annotations are read from
  ``name.txt`` in ``boxes_dir``. Every row is parsed as five space-separated
  numbers ``label x_center y_center width height``; the box part is converted
  to pixel corner coordinates with ``convert_to_voc``.

  Args:
    images_filenames: sequence of image file names inside ``images_dir``.
    images_dir: directory containing the images.
    boxes_dir: directory containing the ``.txt`` label files.
    transform: optional callable invoked as
      ``transform(image=..., bboxes=..., labels=...)`` and returning a mapping
      with the keys ``'image'``, ``'bboxes'`` and ``'labels'``.
  """

  def __init__(self, images_filenames, images_dir, boxes_dir, transform=None):
    self.images_filenames = images_filenames
    self.images_dir = images_dir
    self.boxes_dir = boxes_dir
    self.transform = transform

  def __len__(self):
    """Return the number of images in the dataset."""
    return len(self.images_filenames)

  def __getitem__(self, index):
    """Load one sample.

    Returns:
      ``(image, target)`` where ``image`` is a tensor with values divided by
      255 and ``target`` is a dict with ``'boxes'`` (float32, shape ``(N, 4)``)
      and ``'labels'`` (int64, shape ``(N,)``).

    Raises:
      OSError: if the image file cannot be read.
    """
    image_filename = self.images_filenames[index]
    image_path = os.path.join(self.images_dir, image_filename)
    image = cv2.imread(image_path)
    if image is None:
      # cv2.imread signals failure by returning None instead of raising.
      raise OSError('Could not read image: {}'.format(image_path))
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image_height, image_width = image.shape[0], image.shape[1]

    # splitext (rather than slicing off 4 characters) also handles extensions
    # such as '.jpeg'.
    label_filename = os.path.splitext(image_filename)[0] + '.txt'
    annotations = np.loadtxt(os.path.join(self.boxes_dir, label_filename), delimiter=' ')
    # Normalise to shape (N, 5): a single row is loaded as 1-D and an empty
    # file as a 0-length array.
    annotations = annotations.reshape(-1, 5)
    labels = annotations[:, 0]
    boxes = [convert_to_voc(box, image_width=image_width, image_height=image_height)
             for box in annotations[:, 1:]]

    if self.transform:
      transformed = self.transform(image=image, bboxes=boxes, labels=labels)
      image = transformed['image']
      boxes = transformed['bboxes']
      labels = transformed['labels']

    if not torch.is_tensor(image):
      # No tensor-producing transform was applied: convert the HWC array
      # to a CHW tensor so the scaling below works.
      image = torch.from_numpy(image).permute(2, 0, 1)
    image = image.div(255)

    # reshape keeps the (N, 4) layout even when there are no boxes.
    target = {'boxes': torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4),
              'labels': torch.as_tensor(labels, dtype=torch.int64)}
    return image, target

# data augmentation: resize, colour jitter, random horizontal flip, tensor conversion
# Boxes are given as pascal_voc corners; class ids travel in the 'labels' field.
_train_bbox_params = A.BboxParams(
    format='pascal_voc',
    min_area=0,
    min_visibility=0,
    label_fields=['labels'],
)
train_transform = A.Compose(
    [
        A.Resize(256, 256),
        # NOTE(review): limits of 0.1 look very small for this transform -
        # confirm the intended units against the albumentations docs.
        A.HueSaturationValue(hue_shift_limit=0.1, sat_shift_limit=0.1, val_shift_limit=0.1),
        A.RGBShift(r_shift_limit=10, g_shift_limit=10, b_shift_limit=10),
        A.RandomBrightnessContrast(brightness_limit=0.1, contrast_limit=0.1),
        A.HorizontalFlip(p=0.3),
        ToTensorV2(),
    ],
    bbox_params=_train_bbox_params,
)

train_dataset = Traffic_Vehicles_Dataset(
    images_filenames=train_images_filenames,
    images_dir=train_images_dir,
    boxes_dir=train_boxes_dir,
    transform=train_transform,
)

# custom collate_fn function to allow usage of bounding boxes with different size in a batch
def collate_fn(batch):
  """Regroup a batch of samples by field instead of by sample.

  ``[(image_0, target_0), (image_1, target_1), ...]`` becomes
  ``((image_0, image_1, ...), (target_0, target_1, ...))``, so nothing has to
  be stacked into a single tensor.
  """
  per_field = zip(*batch)
  return tuple(per_field)

# dataloader: shuffled batches of 16 samples, grouped by collate_fn
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=16,
    shuffle=True,
    collate_fn=collate_fn,
)

# model: RetinaNet (ResNet-50 FPN), moved to the GPU when one is available
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = models.detection.retinanet_resnet50_fpn(
    pretrained=False,
    pretrained_backbone=True,
    num_classes=7,
    trainable_backbone_layers=5,
).to(device)

# optimizer: AdamW over the parameters that require gradients
params = [param for param in model.parameters() if param.requires_grad]
optimizer = torch.optim.AdamW(params=params, lr=1e-3, weight_decay=1e-4)

# train loop
def train_network(model, optimizer, n_epochs):
  """Train ``model`` for ``n_epochs`` epochs on the module-level ``train_loader``.

  Each batch is moved to the module-level ``device``, the model is called with
  ``(images, targets)`` to obtain a dict of losses, and their sum is
  back-propagated before the optimizer step.

  Args:
    model: detection model that returns a dict of loss tensors when called
      with images and targets.
    optimizer: optimizer that updates ``model``'s parameters.
    n_epochs: number of passes over ``train_loader``.

  Returns:
    ``(model, losses_history)`` where ``losses_history`` holds one Python
    float per epoch: the sum of the per-batch total losses.
  """
  start = time.time()
  losses_history = []
  progress_bar = trange(n_epochs, desc='Epoch:')
  model.train()
  for _ in progress_bar:
    epoch_loss = 0.0
    for images, targets in tqdm(train_loader, leave=False):
      images = [image.to(device) for image in images]
      targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
      loss_dict = model(images, targets)
      losses = sum(loss_dict.values())
      optimizer.zero_grad()
      losses.backward()
      optimizer.step()
      # .item() detaches the value from autograd; adding the tensor itself
      # would accumulate autograd history for the whole epoch and store
      # tensors in losses_history.
      epoch_loss += losses.item()
    losses_history.append(epoch_loss)
    progress_bar.set_description('Loss: {:.3f}'.format(epoch_loss))
  training_time = time.time() - start
  print('Training complete in {:.0f}m {:.0f}s'.format(training_time // 60, training_time % 60))
  return model, losses_history

# training: run 20 epochs and keep the per-epoch loss history
model, losses_history = train_network(model=model, optimizer=optimizer, n_epochs=20)

Sometimes it predicts weird outputs, and sometimes it predicts nothing at all (empty lists for boxes, labels and scores). I visualized the images, checked that the dataloader works correctly, and visualized the augmentations - everything looks normal. But when it comes to training and inference, the model is not learning at all (the loss decreases to roughly 70, then stops decreasing, and in the end the model gives really bad results).

What am I doing wrong? Why isn’t the model learning?

P.S. And yes, I know that the num_classes parameter of the RetinaNet class requires a background class to be included. But even when I set num_classes to 8 (7 classes + background) instead of 7, it makes no difference - the model is still not learning.

1 Like

An interesting observation: the model started learning after I ran the following command:

! pip install torch==1.7.0+cu101 torchvision==0.8.1+cu101 torchaudio==0.7.0 -f https://download.pytorch.org/whl/torch_stable.html

I also switched to SGD instead of Adam, but I do not think that is why the model started learning (at least it would be very strange, because in most cases Adam performs better than SGD).
Maybe there are some bugs in the current version of PyTorch? Or is my code out of date?

I think the Adam → SGD switch could be the culprit. You can run a quick experiment by going back to Adam.

Adam doesn’t always perform better than SGD; it’s jumpier, so many people use it for exploratory purposes but switch to SGD to get top performance.

1 Like

Most probably you are right, thanks for the help. Switching to SGD and increasing the learning rate can really make a significant difference in object detection.

1 Like