Hi team,

I am using MTCNN in pytorch, and it looks like pure non-max suppression implementation in pytorch(cuda) is waaay slower than numpy implementation on cpu. I ended up running just the nms part on cpu to get decent frame rates. Could someone please correct any obvious mistakes, here is the code,

```
def nms(boxes, _keep, overlap_threshold=0.5, min_mode=False):
x1 = boxes[:, 0]
y1 = boxes[:, 1]
x2 = boxes[:, 2]
y2 = boxes[:, 3]
scores = boxes[:, 4]
areas = (x2 - x1 + 1) * (y2 - y1 + 1)
_, order = scores.sort(dim=0, descending=True)
cnt = 0
while order.size()[0] > 1 and cnt < _keep.shape[0]:
_keep[cnt] = order[0]
cnt += 1
xx1 = torch.max(x1[order[0]], x1[order[1:]])
yy1 = torch.max(y1[order[0]], y1[order[1:]])
xx2 = torch.min(x2[order[0]], x2[order[1:]])
yy2 = torch.min(y2[order[0]], y2[order[1:]])
w = torch.clamp(xx2-xx1, min=0)
h = torch.clamp(yy2-yy1, min=0)
inter = w * h
if min_mode:
ovr = inter / torch.min(areas[order[0]], areas[order[1:]])
else:
ovr = inter / (areas[order[0]] + areas[order[1:]] - inter)
inds = torch.nonzero(ovr <= overlap_threshold).squeeze()
if inds.dim():
order = order[inds + 1]
else:
break
return _keep[:cnt]
```

And here’s the same in numpy

```
def nms_cpu(boxes, overlap_threshold=0.5, min_mode=False):
boxes = boxes.cpu().numpy()
x1 = boxes[:, 0]
y1 = boxes[:, 1]
x2 = boxes[:, 2]
y2 = boxes[:, 3]
scores = boxes[:, 4]
areas = (x2 - x1 + 1) * (y2 - y1 + 1)
order = scores.argsort()[::-1]
keep = []
while order.size > 0:
keep.append(order[0])
xx1 = np.maximum(x1[order[0]], x1[order[1:]])
yy1 = np.maximum(y1[order[0]], y1[order[1:]])
xx2 = np.minimum(x2[order[0]], x2[order[1:]])
yy2 = np.minimum(y2[order[0]], y2[order[1:]])
w = np.maximum(0.0, xx2 - xx1 + 1)
h = np.maximum(0.0, yy2 - yy1 + 1)
inter = w * h
if min_mode:
ovr = inter / np.minimum(areas[order[0]], areas[order[1:]])
else:
ovr = inter / (areas[order[0]] + areas[order[1:]] - inter)
inds = np.where(ovr <= overlap_threshold)[0]
order = order[inds + 1]
return keep
```

Here is the output of profiling for pytorch

```
Total time: 55.6077 s
File: utils/box_utils.py
Function: nms at line 4
Line # Hits Time Per Hit % Time Line Contents
==============================================================
4 @profile
5 def nms(boxes, _keep, overlap_threshold=0.5, min_mode=False):
6 506 3933.0 7.8 0.0 x1 = boxes[:, 0]
7 506 2744.0 5.4 0.0 y1 = boxes[:, 1]
8 506 2433.0 4.8 0.0 x2 = boxes[:, 2]
9 506 2558.0 5.1 0.0 y2 = boxes[:, 3]
10 506 2363.0 4.7 0.0 scores = boxes[:, 4]
11
12 506 38317.0 75.7 0.1 areas = (x2 - x1 + 1) * (y2 - y1 + 1)
13 506 19902.0 39.3 0.0 _, order = scores.sort(dim=0, descending=True)
14 506 327.0 0.6 0.0 cnt = 0
15
16 39420 132472.0 3.4 0.2 while order.size()[0] > 1 and cnt < _keep.shape[0]:
17 39296 984623.0 25.1 1.8 _keep[cnt] = order[0]
18 39296 31360.0 0.8 0.1 cnt += 1
19 39296 8083133.0 205.7 14.5 xx1 = torch.max(x1[order[0]], x1[order[1:]])
20 39296 7976850.0 203.0 14.3 yy1 = torch.max(y1[order[0]], y1[order[1:]])
21 39296 7997772.0 203.5 14.4 xx2 = torch.min(x2[order[0]], x2[order[1:]])
22 39296 7949390.0 202.3 14.3 yy2 = torch.min(y2[order[0]], y2[order[1:]])
23
24 39296 1270632.0 32.3 2.3 w = torch.clamp(xx2-xx1, min=0)
25 39296 1048029.0 26.7 1.9 h = torch.clamp(yy2-yy1, min=0)
26 # print (w, h)
27 39296 662499.0 16.9 1.2 inter = w * h
28 39296 23280.0 0.6 0.0 if min_mode:
29 58 13687.0 236.0 0.0 ovr = inter / torch.min(areas[order[0]], areas[order[1:]])
30 else:
31 39238 9429904.0 240.3 17.0 ovr = inter / (areas[order[0]] + areas[order[1:]] - inter)
32
33 39296 3483842.0 88.7 6.3 inds = torch.nonzero(ovr <= overlap_threshold).squeeze()
34 39296 54212.0 1.4 0.1 if inds.dim():
35 38914 6390010.0 164.2 11.5 order = order[inds + 1]
36 else:
37 382 267.0 0.7 0.0 break
38
39 506 3197.0 6.3 0.0 return _keep[:cnt]
```

same for numpy

```
Total time: 13.3619 s
File: utils/box_utils.py
Function: nms_cpu at line 40
Line # Hits Time Per Hit % Time Line Contents
==============================================================
40 @profile
41 def nms_cpu(boxes, _keep, overlap_threshold=0.5, min_mode=False):
42 # This is run on CPU (numpy) as it runs slower on GPU
43 4895 255245.0 52.1 1.9 boxes = boxes.cpu().numpy()
44 4895 15433.0 3.2 0.1 x1 = boxes[:, 0]
45 4895 4332.0 0.9 0.0 y1 = boxes[:, 1]
46 4895 3376.0 0.7 0.0 x2 = boxes[:, 2]
47 4895 3341.0 0.7 0.0 y2 = boxes[:, 3]
48 4895 3260.0 0.7 0.0 scores = boxes[:, 4]
49
50 4895 112262.0 22.9 0.8 areas = (x2 - x1 + 1) * (y2 - y1 + 1)
51 4895 55061.0 11.2 0.4 order = scores.argsort()[::-1]
52
53 4895 3019.0 0.6 0.0 keep = []
54 375565 226824.0 0.6 1.7 while order.size > 0:
55 370670 272610.0 0.7 2.0 keep.append(order[0])
56 370670 1394733.0 3.8 10.4 xx1 = np.maximum(x1[order[0]], x1[order[1:]])
57 370670 1234165.0 3.3 9.2 yy1 = np.maximum(y1[order[0]], y1[order[1:]])
58 370670 1222226.0 3.3 9.1 xx2 = np.minimum(x2[order[0]], x2[order[1:]])
59 370670 1209276.0 3.3 9.1 yy2 = np.minimum(y2[order[0]], y2[order[1:]])
60
61 370670 1558863.0 4.2 11.7 w = np.maximum(0.0, xx2 - xx1 + 1)
62 370670 1438743.0 3.9 10.8 h = np.maximum(0.0, yy2 - yy1 + 1)
63 370670 442606.0 1.2 3.3 inter = w * h
64
65 370670 169062.0 0.5 1.3 if min_mode:
66 642 3642.0 5.7 0.0 ovr = inter / np.minimum(areas[order[0]], areas[order[1:]])
67 else:
68 370028 1811625.0 4.9 13.6 ovr = inter / (areas[order[0]] + areas[order[1:]] - inter)
69
70 370670 1086337.0 2.9 8.1 inds = np.where(ovr <= overlap_threshold)[0]
71 370670 833359.0 2.2 6.2 order = order[inds + 1]
72 4895 2533.0 0.5 0.0 return keep
```

Both of them were profiled for 60s. Pytorch NMS took 55s as opposed to numpy’s mere 13s.

Thank you so much for your time!