Hi All,
I created a script which should do a simple task: open a 1280x720 video fragment and perform object detection on each frame.
On a Jetson Orin (6 GB memory), with PyTorch 2.1 + torchvision 0.16 (built with all optimizations enabled), it still performs very slowly.
Any hints??
import hashlib
from pathlib import Path
import numpy as np
import torch
import cv2
from torchvision.models import resnet50
from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2
from torchvision.models.detection.faster_rcnn import FasterRCNN_ResNet50_FPN_V2_Weights
import cv2
import numpy as np
# Class-id -> human-readable label mapping (subset of the COCO categories
# used by the detection model below).
class_labels = {
    1: 'person',
    2: 'car',
    3: 'dog',
    # Add more class labels as needed
}

# Specify the nodes to extract features from (intended for a torchvision
# feature extractor; NOTE(review): this dict is never used later in the script).
return_nodes = {
    'layer1': 'layer1',
    'layer2': 'layer2',
    'layer3': 'layer3',
    'layer4': 'layer4',
}

# Load a ResNet50 model, ideally for extracting features.
# NOTE(review): no weights are passed, so this is randomly initialized, and
# it is never used later in the script — consider removing it.
resnetModel = resnet50()
resnetModel.eval()
# Function to draw a bounding box with a label on an image
def draw_box(image, box, label):
    """Return a copy of *image* with *box* outlined and *label* written above it.

    Args:
        image: BGR image (H x W x 3 ndarray) as produced by OpenCV.
        box: (x1, y1, x2, y2) corner coordinates; any numeric type.
        label: text placed just above the box's top-left corner.

    Returns:
        A new image; the input image is not modified.
    """
    # Convert once up front instead of per-call-site.
    x1, y1, x2, y2 = (int(v) for v in box)
    image_with_box = cv2.rectangle(image.copy(), (x1, y1), (x2, y2),
                                   (0, 255, 0), 2)
    cv2.putText(image_with_box, label, (x1, y1 - 10),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)
    return image_with_box
# Load a pre-trained Faster R-CNN model
def load_model():
    """Build a COCO-pretrained Faster R-CNN (ResNet50-FPN v2) in eval mode.

    Passing ``weights=`` lets torchvision download, cache, and apply the
    checkpoint in one step — this replaces the manual
    ``load_state_dict_from_url`` + ``load_state_dict`` pair, which is the
    deprecated pattern for obtaining pretrained weights.
    """
    model = fasterrcnn_resnet50_fpn_v2(
        weights=FasterRCNN_ResNet50_FPN_V2_Weights.COCO_V1
    )
    model.eval()
    return model
# Main function to perform object detection and tracking
def detect_and_track_objects(video_path, use_visdom=False):
    """Run Faster R-CNN on every frame of *video_path* and write 'output.mp4'.

    Detections with score > 0.5 are drawn with their class label.

    Args:
        video_path: path to the input video file.
        use_visdom: currently unused; kept for interface compatibility.
    """
    # Load the video file using OpenCV
    cap = cv2.VideoCapture(video_path)
    # Get width and height of the video frames
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    # Preserve the source frame rate; fall back to 25 fps if it is unknown.
    fps = cap.get(cv2.CAP_PROP_FPS) or 25
    # Create VideoWriter for output video
    out = cv2.VideoWriter('output.mp4', cv2.VideoWriter_fourcc(*'mp4v'),
                          fps, (width, height))

    # Load the model once and move it to the GPU when available — running a
    # 1280x720 Faster R-CNN per frame on the CPU is the dominant cost on a
    # Jetson otherwise.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = load_model().to(device)

    frame_count = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frame_count += 1

        # OpenCV decodes frames as BGR, but torchvision detection models
        # expect RGB input — feeding BGR silently degrades detections.
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame_tensor = torch.from_numpy(rgb).to(device, dtype=torch.float32) / 255.0
        frame_tensor = frame_tensor.permute(2, 0, 1).unsqueeze(0)

        # inference_mode() skips autograd bookkeeping entirely; it is
        # strictly cheaper than no_grad() for pure inference.
        with torch.inference_mode():
            outputs = model(frame_tensor)

        # Draw on the original BGR frame so the written video has correct colors.
        frame_with_bbox = frame
        tracker_id = 0
        for bbox, score, label in zip(outputs[0]['boxes'],
                                      outputs[0]['scores'],
                                      outputs[0]['labels']):
            # Filter first so the device->host transfer is only paid for
            # boxes we actually keep.
            if score.item() > 0.5:
                tracker_id += 1
                # Get human-readable label
                label_name = class_labels.get(label.item(),
                                              f'Unknown {label.item()}')
                frame_with_bbox = draw_box(frame_with_bbox,
                                           bbox.cpu().numpy(), label_name)

        # Write frame to output video
        out.write(frame_with_bbox)

    # Release video capture and close output video
    cap.release()
    out.release()
# Script entry point — guard so importing this module has no side effects.
if __name__ == "__main__":
    # Video file path (expand ~ to the user's home directory)
    video_path = str(Path("~/001.avi").expanduser())
    # Detect and track objects
    detect_and_track_objects(video_path, use_visdom=True)