Detection incredibly slow

Hi All,

I created a script that should do a simple task: open a 1280×720 (w×h) video clip and run object detection on every frame.
On a Jetson Orin (6 GB memory) with PyTorch 2.1 and torchvision 0.16, built with all optimizations enabled, it runs very slowly, though.
Any hints?

import hashlib
from pathlib import Path
import numpy as np
import torch
import cv2
from torchvision.models import resnet50

from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2
from torchvision.models.detection.faster_rcnn import FasterRCNN_ResNet50_FPN_V2_Weights

import cv2
import numpy as np

# Load ResNet50 model, ideally for extracting features

# Human-readable names keyed by the integer label ids the detector emits.
# The ids follow the COCO category numbering used by the torchvision COCO_V1
# detection weights (2 is 'bicycle', 3 is 'car', 18 is 'dog'); ids that are
# not listed here are reported as 'Unknown <id>' by the caller.
class_labels = {
    1: 'person',
    2: 'bicycle',
    3: 'car',
    4: 'motorcycle',
    6: 'bus',
    8: 'truck',
    17: 'cat',
    18: 'dog',
    # Add more class labels as needed
}

# Specify the nodes to extract features from

# Maps each ResNet stage name to the key under which its output should be
# exposed; every stage keeps its own name.
return_nodes = {
    'layer1': 'layer1',
    'layer2': 'layer2',
    'layer3': 'layer3',
    'layer4': 'layer4',
}

# Create the feature extractor

# Builds a ResNet-50 at import time; it is called without arguments, so no
# pretrained weights are requested here.
# NOTE(review): resnetModel is not referenced anywhere else in the visible
# script, so on a memory-constrained device this allocation looks like pure
# overhead; confirm whether it is still needed.
resnetModel = resnet50()

# Function to draw bounding box on image

def draw_box(image, box, label):
    """Return a copy of ``image`` with one labelled bounding box drawn on it.

    Args:
        image: Image array accepted by OpenCV drawing functions. It is
            copied first, so the caller's array is left untouched.
        box: Four numbers ``(x1, y1, x2, y2)`` giving the top-left and
            bottom-right corners in pixels; each is truncated to ``int``.
        label: Text written just above the top-left corner of the box.

    Returns:
        The annotated copy of ``image``.
    """
    x1, y1, x2, y2 = (int(coord) for coord in box)

    # Green rectangle, 2 px thick, drawn on a copy of the input image.
    image_with_box = cv2.rectangle(image.copy(), (x1, y1), (x2, y2), (0, 255, 0), 2)

    # White caption placed 10 px above the box's top-left corner.
    cv2.putText(image_with_box, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)
    return image_with_box

# Load a pre-trained Faster R-CNN model

def load_model(device=None):
    """Build a Faster R-CNN (ResNet-50 FPN v2) detector with COCO weights.

    The original version downloaded the state dict but never loaded it into
    the model, so inference ran on randomly initialised weights, and it left
    the model in training mode, in which the detection forward pass expects
    targets. Both are fixed here.

    Args:
        device: Optional device (e.g. ``'cuda'`` or ``'cpu'``) to move the
            model to. When ``None`` the model stays where it was created.

    Returns:
        The detector, with pretrained weights loaded and in eval mode.
    """
    model = fasterrcnn_resnet50_fpn_v2()
    state_dict = torch.hub.load_state_dict_from_url(FasterRCNN_ResNet50_FPN_V2_Weights.COCO_V1.url)

    # Actually apply the downloaded weights to the freshly built model.
    model.load_state_dict(state_dict)

    if device is not None:
        model = model.to(device)

    # Inference mode: returns detections instead of requiring training targets.
    model.eval()
    return model

# Main function to perform object detection and tracking

def detect_and_track_objects(video_path, use_visdom=False, score_threshold=0.5):
    """Run object detection on every frame of a video and save the result.

    Each frame is read with OpenCV, passed through the detector returned by
    ``load_model()``, annotated with ``draw_box`` for every detection whose
    score exceeds ``score_threshold``, and written to ``output.mp4``.

    Args:
        video_path: Path of the input video file.
        use_visdom: Accepted for compatibility; not used by this function.
        score_threshold: Detections with a score not strictly greater than
            this value are ignored.

    Raises:
        OSError: If the video cannot be opened.
    """
    # Load the video file using OpenCV
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise OSError(f'Cannot open video: {video_path}')

    out = None
    try:
        # Get width and height of the video frames
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Keep the source frame rate; fall back to 25 when it is unknown.
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:
            fps = 25

        # Create VideoWriter for output video
        out = cv2.VideoWriter('output.mp4', cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))

        # Load the model
        model = load_model()
        # Idempotent; guarantees the forward pass returns detections.
        model.eval()
        # Inputs must live on the same device as the model's weights.
        device = next(model.parameters()).device

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # OpenCV delivers BGR; the pretrained detector expects RGB.
            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Transfer the compact uint8 frame first, then convert to a
            # float CHW image in [0, 1] on the target device.
            image = torch.from_numpy(rgb).to(device)
            image = image.permute(2, 0, 1).float().div(255.0)

            # Forward pass through the model
            with torch.no_grad():
                detections = model([image])[0]

            # Filter on the device and copy the survivors to the host once,
            # instead of one synchronising .cpu()/.item() call per box.
            keep = detections['scores'] > score_threshold
            boxes = detections['boxes'][keep].cpu().numpy()
            labels = detections['labels'][keep].cpu().tolist()

            frame_with_bbox = frame
            for bbox, label in zip(boxes, labels):
                # Get human-readable label
                label_name = class_labels.get(label, f'Unknown {label}')
                frame_with_bbox = draw_box(frame_with_bbox, bbox, label_name)

            # Write frame to output video
            out.write(frame_with_bbox)
    finally:
        # Release video capture and close output video, even on error.
        cap.release()
        if out is not None:
            out.release()

# Main function call

# Script entry point: only runs when the file is executed directly.
if __name__ == "__main__":
    # Video file path (``~`` is expanded to the user's home directory)
    video_path = str(Path("~/001.avi").expanduser())

    # Detect and track objects
    detect_and_track_objects(video_path, use_visdom=True)

Update: load_model did not have a .to('cuda'/'cpu') call, so I added one (see below).
Still, inference is very slow: about 1.5 frames per second. Any tips are welcome.

def load_model(device=None):
    """Build a Faster R-CNN (ResNet-50 FPN v2) detector and place it on a device.

    Args:
        device: Device to run on (e.g. ``'cuda'`` or ``'cpu'``). When
            ``None``, CUDA is used if available, otherwise the CPU.

    Returns:
        The detector with COCO weights loaded, moved to ``device`` and
        switched to eval mode.
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    model = fasterrcnn_resnet50_fpn_v2()
    state_dict = torch.hub.load_state_dict_from_url(FasterRCNN_ResNet50_FPN_V2_Weights.COCO_V1.url)

    # The downloaded weights must be loaded explicitly; building the model
    # without arguments leaves it randomly initialised.
    model.load_state_dict(state_dict)
    model = model.to(device)

    # Inference mode: returns detections instead of requiring training targets.
    model.eval()
    return model

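I don't believe you move your input tensors to the GPU either: `frame_tensor = torch.tensor(frame, dtype=torch.float32) / 255.0` creates the tensor on the CPU, so it also needs a `.to('cuda')` before it is passed to the model.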