I am just a beginner trying to implement a transformer model for object detection on a custom dataset.
I've been fine-tuning a Hugging Face RT-DETR model on my custom dataset. I visualised the dataset, checked the data being fed into the model, and looked at every other possibility I could think of; none of them show any issue, and the data loading part is correct. During training, though, the loss starts very high (roughly in the 10-100 range), while the validation loss is lower than the training loss (roughly 10-50). However, when I compute mAP with an IoU threshold of 0.3 or above, it comes back as zero for around 30 epochs. Why is this happening?
I followed this tutorial to implement the model: https://youtu.be/sBARif4DnL4?si=bYdAugeLz2eIS2Dn
Since the mAP is so low, when I run predictions on my test data I get nothing with confidence scores above 0.1. Also, my dataset consists of images of small objects in night/darker environments, so is the model struggling with this kind of dataset, or is my code wrong?
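For reference, this is roughly the prediction check I described above (a minimal sketch, not my exact script; the image path is a placeholder, and processor/model are the ones loaded further down):

import torch
from PIL import Image

image = Image.open("test_image.jpg").convert("RGB")  # placeholder path
inputs = processor(images=image, return_tensors="pt").to(DEVICE)

with torch.no_grad():
    outputs = model(**inputs)

# threshold kept very low just to see whether anything comes out at all
results = processor.post_process_object_detection(
    outputs,
    threshold=0.1,
    target_sizes=torch.tensor([image.size[::-1]]),  # (height, width)
)
print(results[0]["scores"], results[0]["labels"], results[0]["boxes"])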
!pip install -q git+https://github.com/huggingface/transformers.git
!pip install -q git+https://github.com/roboflow/supervision.git
!pip install -q accelerate
!pip install -q roboflow
!pip install -q torchmetrics
!pip install -q "albumentations>=1.4.5"
import torch
import requests
import numpy as np
import supervision as sv
import albumentations as A
from PIL import Image
from pprint import pprint
from roboflow import Roboflow
from dataclasses import dataclass, replace
from google.colab import userdata
from torch.utils.data import Dataset
from transformers import (
AutoImageProcessor,
AutoModelForObjectDetection,
TrainingArguments,
Trainer
)
from torchvision import transforms
!pip install -q torchinfo
from torchinfo import summary
#loading the model
CHECKPOINT = "PekingU/rtdetr_v2_r101vd"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NUM_LABELS = 2
model = AutoModelForObjectDetection.from_pretrained(CHECKPOINT).to(DEVICE)
processor = AutoImageProcessor.from_pretrained(CHECKPOINT)
model
#loading images and json file via detectiondataset
ds_train = sv.DetectionDataset.from_coco(
images_directory_path="dataset link",
annotations_path="annotations.coco.json",
)
ds_val = sv.DetectionDataset.from_coco(
images_directory_path="dataset",
annotations_path="_annotations.coco.json",
)
ds_test = sv.DetectionDataset.from_coco(
images_directory_path="dataset",
annotations_path="_annotations.coco.json"
)
print(len(ds_train))
len(ds_val),len(ds_test)
#visualizing loaded dataset
GRID_SIZE = 3
def annotate(image, annotations, classes):
labels = [
classes[class_id]
for class_id
in annotations.class_id
]
bounding_box_annotator = sv.BoxAnnotator()
label_annotator = sv.LabelAnnotator(text_scale=1, text_thickness=2)
annotated_image = image.copy()
annotated_image = bounding_box_annotator.annotate(annotated_image, annotations)
annotated_image = label_annotator.annotate(annotated_image, annotations, labels=labels)
return annotated_image
annotated_images = []
for i in range(GRID_SIZE * GRID_SIZE):
_, image, annotations = ds_train[i]
annotated_image = annotate(image, annotations, ds_train.classes)
print(annotations)
annotated_images.append(annotated_image)
sv.plot_images_grid(
annotated_images,
grid_size=(GRID_SIZE, GRID_SIZE),
size=(20, 20)
)
#initialising the image processor to convert the dataset into the format required by the model
IMAGE_SIZE = 640
processor = AutoImageProcessor.from_pretrained(
"PekingU/rtdetr_v2_r101vd",
do_convert_annotations=False,
do_resize=True,
do_rescale=False,
do_normalize=True,
size={"width": IMAGE_SIZE, "height": IMAGE_SIZE}
)
#since DetectionDataset converts COCO boxes to pascal_voc, the transforms run on pascal_voc format before the processor converts them back to COCO (a short format illustration follows the transforms below)
train_augmentation_and_transform = A.Compose(
[A.NoOp()],
bbox_params = A.BboxParams(
format="pascal_voc",
label_fields=["category"],
clip=True
),
)
valid_transform = A.Compose(
[A.NoOp()],
bbox_params = A.BboxParams(
format="pascal_voc",
label_fields=["category"],
clip=True
),
)
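# Just for clarity (an illustration, not part of my pipeline): pascal_voc boxes
# are [x_min, y_min, x_max, y_max] in absolute pixels, while COCO boxes are
# [x_min, y_min, width, height] in absolute pixels.
example_voc = [100.0, 150.0, 220.0, 300.0]                    # pascal_voc
example_coco = [100.0, 150.0, 220.0 - 100.0, 300.0 - 150.0]   # the same box in COCO [x, y, w, h]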
#wrapping the loaded dataset in a PyTorch Dataset object to feed it to the model
import torch
from torch.utils.data import Dataset
import albumentations as A
import numpy as np
class PyTorchDetectionDataset(Dataset):
def __init__(self, dataset, processor, transform=None):
self.dataset = dataset
self.processor = processor
self.transform = transform
@staticmethod
def annotations_as_coco(image_id, categories, boxes, img_shape):
"""
Converts boxes to COCO format ([x, y, w, h] in pixels)
img_shape = (H, W, C)
"""
h, w = img_shape[:2]
annotations = []
box=[]
box.append(boxes)
for category, bbox in zip(categories, box):
x_min,y_min,x_max,y_max=bbox
cx = (x_min + x_max)/2
cy = (y_min + y_max)/2
w = x_max - x_min
h = y_max - y_min
bbox =[cx,cy,w,h]
# print("manual",bbox)
cx_norm = cx / 640.0
cy_norm = cy / 640.0
w_norm = w / 640.0
h_norm = h / 640.0
bbox = [cx_norm, cy_norm, w_norm, h_norm]
# print("normalised",bbox)
# bbox = np.array(bbox)
bbox = torch.tensor(bbox, dtype=torch.float32)
annotations.append({
"image_id": image_id,
"category_id": int(category),
"bbox": bbox,
"area": bbox[2] * bbox[3],
"iscrowd": 0
})
# print("anno",annotations)
return {"image_id": image_id, "annotations": annotations}
def __len__(self):
return len(self.dataset)
def __getitem__(self, idx):
_, image, annotations = self.dataset[idx]
boxes = annotations.xyxy
categories = annotations.class_id
if len(boxes) == 0:
print(f"[Warning] No boxes for image {idx} — skipping")
print("Categories:", categories)
print("Annotations:", annotations)
# Apply Albumentations transform
if self.transform:
# Albumentations expects [x_min, y_min, x_max, y_max] for 'pascal_voc'
transformed = self.transform(
image=image,
bboxes=boxes,
category=categories
)
image = transformed["image"]
boxes = transformed["bboxes"]
# print(boxes)
categories = transformed["category"]
# Convert to COCO-style dict for processor
formatted_annotations = self.annotations_as_coco(
image_id=idx, categories=categories, boxes=boxes[0], img_shape=image.shape
)
# print("formatted",formatted_annotations["annotations"][0]["bbox"])
# Feed processor
result = self.processor(images=image, annotations=formatted_annotations, return_tensors="pt")
# Remove batch dim
result = {k: v[0] for k, v in result.items()}
# print("results",result)
# print("boxes old",result["labels"]["boxes"])
boxe=formatted_annotations["annotations"][0]["bbox"]
boxe=boxe.unsqueeze(0)
result["labels"]["boxes"]=boxe
# print("new results",result)
# print("boxes new",result["labels"]["boxes"])
return result
from torchvision.utils import draw_bounding_boxes
import torch
from torch import Tensor
from typing import Optional
from torchvision.ops import box_convert
#creating the pytorch dataset objects and visualising one sample (de-normalising the image only for display)
import matplotlib.pyplot as plt
pytorch_dataset_train = PyTorchDetectionDataset(
ds_train, processor, transform=train_augmentation_and_transform)
pytorch_dataset_valid = PyTorchDetectionDataset(
ds_val, processor, transform=train_augmentation_and_transform)
pytorch_dataset_test = PyTorchDetectionDataset(
ds_test, processor, transform=train_augmentation_and_transform)
print(pytorch_dataset_train[2])
sample = pytorch_dataset_train[2]
img = sample["pixel_values"].squeeze(0)
boxes = sample["labels"]["boxes"]
labels = sample["labels"]["class_labels"]
_, H, W = img.shape # get image size
print(img.shape)
boxes[:, [0, 2]] *= W # scale x, width
boxes[:, [1, 3]] *= H # scale y, height
print(H,W)
print(boxes)
boxes = box_convert(boxes, in_fmt="cxcywh", out_fmt="xyxy")
id2label = {0: "debris", 1: "satellite"}
label_names = [id2label[int(l)] for l in labels]
print("bi", boxes)
mean = torch.tensor([0.485, 0.456, 0.406]).view(3,1,1)
std = torch.tensor([0.229, 0.224, 0.225]).view(3,1,1)
img = img * std + mean
img = img.clamp(0,1)
img_with_boxes = draw_bounding_boxes(
image=img,
boxes=boxes,
labels=label_names,
colors="red",
width=2,
font_size=16
)
plt.figure(figsize=(8, 8))
plt.imshow(img_with_boxes.permute(1, 2, 0))
plt.axis("off")
plt.show()
from torchmetrics.detection.mean_ap import MeanAveragePrecision
from torchmetrics.classification import MulticlassPrecision,MulticlassRecall
def collate_fn(batch):
data = {}
data["pixel_values"] = torch.stack([x["pixel_values"] for x in batch])
data["labels"] = [x["labels"] for x in batch]
return data
import numpy as np
from sklearn.metrics import precision_score
from torch import tensor
#map evaluation
id2label = {id: label for id, label in enumerate(ds_train.classes)}
label2id = {label: id for id, label in enumerate(ds_train.classes)}
@dataclass
class ModelOutput:
logits: torch.Tensor
pred_boxes: torch.Tensor
class MAPEvaluator:
def __init__(self, image_processor, threshold=0.00, id2label=None):
self.image_processor = image_processor
self.threshold = threshold
self.id2label = id2label
def collect_image_sizes(self, targets):
"""Collect image sizes across the dataset as list of tensors with shape [batch_size, 2]."""
image_sizes = []
for batch in targets:
batch_image_sizes = torch.tensor(np.array([x["size"] for x in batch]))
image_sizes.append(batch_image_sizes)
return image_sizes
def collect_targets(self, targets, image_sizes):
post_processed_targets = []
for target_batch, image_size_batch in zip(targets, image_sizes):
for target, (height, width) in zip(target_batch, image_size_batch):
boxes = target["boxes"]
boxes = sv.xcycwh_to_xyxy(boxes)
boxes = boxes * np.array([width, height, width, height])
boxes = torch.tensor(boxes)
labels = torch.tensor(target["class_labels"])
post_processed_targets.append({"boxes": boxes, "labels": labels})
return post_processed_targets
def collect_predictions(self, predictions, image_sizes):
post_processed_predictions = []
for batch, target_sizes in zip(predictions, image_sizes):
batch_logits, batch_boxes = batch[1], batch[2]
output = ModelOutput(logits=torch.tensor(batch_logits), pred_boxes=torch.tensor(batch_boxes))
post_processed_output = self.image_processor.post_process_object_detection(
output, threshold=self.threshold, target_sizes=target_sizes
)
post_processed_predictions.extend(post_processed_output)
return post_processed_predictions
@torch.no_grad()
def __call__(self, evaluation_results):
predictions, targets = evaluation_results.predictions, evaluation_results.label_ids
image_sizes = self.collect_image_sizes(targets)
post_processed_targets = self.collect_targets(targets, image_sizes)
post_processed_predictions = self.collect_predictions(predictions, image_sizes)
# print("preds",post_processed_predictions)
# print("targets",post_processed_targets)
evaluator = MeanAveragePrecision(iou_type="bbox",box_format="xyxy", class_metrics=True)
evaluator.warn_on_many_detections = False
evaluator.update(post_processed_predictions, post_processed_targets)
metrics = evaluator.compute()
# Replace list of per class metrics with separate metric for each class
classes = metrics.pop("classes")
map_per_class = metrics.pop("map_per_class")
mar_100_per_class = metrics.pop("mar_100_per_class")
for class_id, class_map, class_mar in zip(classes, map_per_class, mar_100_per_class):
class_name = id2label[class_id.item()] if id2label is not None else class_id.item()
metrics[f"map_{class_name}"] = class_map
metrics[f"mar_100_{class_name}"] = class_mar
metrics = {k: round(v.item(), 4) for k, v in metrics.items()}
# metrico = MulticlassPrecision(num_classes=2,multidim_average='samplewise')
# print("precision at epoch 1: ",metrico(tensor(post_processed_predictions), tensor(post_processed_targets)))
return metrics
eval_compute_metrics_fn = MAPEvaluator(image_processor=processor, threshold=0.3, id2label=id2label)
#loading the model again, redefining the classes and out_features
model = AutoModelForObjectDetection.from_pretrained(
"PekingU/rtdetr_v2_r101vd",
id2label=id2label,
label2id=label2id,
anchor_image_size=None,
ignore_mismatched_sizes=True,
)
summary(model=model, input_size=(1, 3, 64, 64), col_names=["trainable"])
print(model.config)
#freezing the backbone of the model
for p in model.model.backbone.parameters():
p.requires_grad=False
summary(model=model, input_size=(1, 3, 64, 64), col_names=["trainable"])
model.config.num_labels
backbone_params = []
transformer_params = []
for name, param in model.named_parameters():
if "backbone" in name:
backbone_params.append(param)
else:
transformer_params.append(param)
#setting lr and optimiser with regularisation
optimizer = torch.optim.AdamW([
{'params': backbone_params, 'lr': 1e-5},  # Lower learning rate for the backbone
{'params': transformer_params, 'lr': 5e-5}  # Higher learning rate for the transformer
])
#setting train args to pass to trainer api
training_args = TrainingArguments(
output_dir=f"{dataset.name.replace(' ', '-')}-finetune",
num_train_epochs=10,
max_grad_norm=0.1,
per_device_train_batch_size=2,
gradient_accumulation_steps=4,
dataloader_num_workers=2,
metric_for_best_model="eval_map",
greater_is_better=True,
load_best_model_at_end=True,
eval_strategy="epoch",
save_strategy="epoch",
save_total_limit=2,
remove_unused_columns=False,
eval_do_concat_batches=False,
)
from torch import seed
torch.manual_seed(42)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=pytorch_dataset_train,
eval_dataset=pytorch_dataset_valid,
processing_class=processor,
data_collator=collate_fn,
optimizers=(optimizer, None),
compute_metrics=eval_compute_metrics_fn
)
trainer.train()
Please check this code and let me know what mistakes I've made.