Hi, I would like to discuss the post-processing algorithm in the torchvision SSD model. During post-processing in the torchvision implementation, we take the top-k scores (k = 400 by default) for each class before performing non-maximum suppression, as follows.
def postprocess_detections(
    self, head_outputs: Dict[str, Tensor], image_anchors: List[Tensor], image_shapes: List[Tuple[int, int]]
) -> List[Dict[str, Tensor]]:
    """Turn raw head outputs into per-image detections.

    For every image, the regressed boxes are decoded against the anchors and
    clipped to the image. Then, for every class label except label 0, the
    scores above ``self.score_thresh`` are kept and reduced to at most
    ``self.topk_candidates`` entries. The per-class candidates are
    concatenated, passed through class-aware NMS, and truncated to
    ``self.detections_per_img`` results.

    Args:
        head_outputs: Mapping that must contain ``"bbox_regression"`` and
            ``"cls_logits"``; both are iterated along their first dimension,
            one entry per image.
        image_anchors: One anchor tensor per image.
        image_shapes: One shape tuple per image, used to clip the boxes.

    Returns:
        One dict per image with the keys ``"boxes"``, ``"scores"`` and
        ``"labels"``.
    """
    regression_batch = head_outputs["bbox_regression"]
    # Class probabilities: softmax over the last (class) dimension.
    score_batch = F.softmax(head_outputs["cls_logits"], dim=-1)

    class_count = score_batch.size(-1)
    target_device = score_batch.device

    results: List[Dict[str, Tensor]] = []

    per_image = zip(regression_batch, score_batch, image_anchors, image_shapes)
    for deltas, class_scores, anchors, image_shape in per_image:
        decoded = self.box_coder.decode_single(deltas, anchors)
        decoded = box_ops.clip_boxes_to_image(decoded, image_shape)

        box_chunks = []
        score_chunks = []
        label_chunks = []

        # Label 0 is skipped (presumably the background class -- confirm
        # against the model definition).
        for label in range(1, class_count):
            label_scores = class_scores[:, label]

            # Drop low-confidence candidates for this class.
            confident = label_scores > self.score_thresh
            label_scores = label_scores[confident]
            label_boxes = decoded[confident]

            # Keep only the best-scoring candidates; k is computed by
            # det_utils._topk_min from the surviving scores and
            # self.topk_candidates.
            k = det_utils._topk_min(label_scores, self.topk_candidates, 0)
            label_scores, order = label_scores.topk(k)
            label_boxes = label_boxes[order]

            box_chunks.append(label_boxes)
            score_chunks.append(label_scores)
            label_chunks.append(
                torch.full_like(label_scores, fill_value=label, dtype=torch.int64, device=target_device)
            )

        all_boxes = torch.cat(box_chunks, dim=0)
        all_scores = torch.cat(score_chunks, dim=0)
        all_labels = torch.cat(label_chunks, dim=0)

        # Class-aware non-maximum suppression, then cap the number of results.
        selected = box_ops.batched_nms(all_boxes, all_scores, all_labels, self.nms_thresh)
        selected = selected[: self.detections_per_img]

        results.append(
            {
                "boxes": all_boxes[selected],
                "scores": all_scores[selected],
                "labels": all_labels[selected],
            }
        )

    return results
So the topk method is called on every iteration (once per class) with an (8732x1) score tensor, but I believe there will not be many meaningful scores, since we apply softmax beforehand.
I wonder why we don't just extract the max-scoring class for each bbox and then perform topk only once on the resulting (8732x1) score tensor, as shown below.
for boxes, scores, anchors, image_shape in zip(bbox_regression, pred_scores, image_anchors, image_shapes):
boxes = self.box_coder.decode_single(boxes, anchors)
boxes = box_ops.clip_boxes_to_image(boxes, image_shape)
scores_max, label_max = torch.max(scores[:, 1:], dim=1)
label_max = label_max + 1
image_scores, idxs = scores_max.topk(self.topk_candidates)
image_boxes = boxes[idxs]
image_labels = label_max[idxs]
# non-maximum suppression
keep = box_ops.batched_nms(image_boxes, image_scores, image_labels, self.nms_thresh)
keep = keep[: self.detections_per_img]
detections.append(
{
"boxes": image_boxes[keep],
"scores": image_scores[keep],
"labels": image_labels[keep],
}
)
return detections
This way, I got much lower latency (approximately 35 ms -> 11 ms on a V100).
And if it’s for multi-label classification, I believe there is no reason to use softmax at the start of post-processing.