I am just a beginner trying to implement a transformer model for object detection on a custom dataset.
I've been fine-tuning a Hugging Face RT-DETR model on my custom dataset. I visualised the dataset, checked the data being fed into the model, and looked at every other possibility I could think of; none of them show any issue, and the data loading part is correct. During training, though, the loss starts very high (roughly in the 10-100 range), while the validation loss is lower than the training loss (roughly 10-50). However, when I compute mAP with an IoU threshold of 0.3 or above, it comes back as zero for around 30 epochs. Why is this happening?
I followed this tutorial to implement the model: https://youtu.be/sBARif4DnL4?si=bYdAugeLz2eIS2Dn
Since the mAP is so low, when I run predictions on my test data I get nothing with confidence scores above 0.1. Also, my dataset consists of images of small objects in night/darker environments, so is the model struggling with this kind of dataset, or is my code wrong?
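For reference, this is roughly the prediction check I described above (a minimal sketch, not my exact script; the image path is a placeholder, and processor/model are the ones loaded further down):

import torch
from PIL import Image

image = Image.open("test_image.jpg").convert("RGB")  # placeholder path
inputs = processor(images=image, return_tensors="pt").to(DEVICE)

with torch.no_grad():
    outputs = model(**inputs)

# threshold kept very low just to see whether anything comes out at all
results = processor.post_process_object_detection(
    outputs,
    threshold=0.1,
    target_sizes=torch.tensor([image.size[::-1]]),  # (height, width)
)
print(results[0]["scores"], results[0]["labels"], results[0]["boxes"])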
!pip install -q git+https://github.com/huggingface/transformers.git
!pip install -q git+https://github.com/roboflow/supervision.git
!pip install -q accelerate
!pip install -q roboflow
!pip install -q torchmetrics
!pip install -q "albumentations>=1.4.5"
import torch
import requests
import numpy as np
import supervision as sv
import albumentations as A
from PIL import Image
from pprint import pprint
from roboflow import Roboflow
from dataclasses import dataclass, replace
from google.colab import userdata
from torch.utils.data import Dataset
from transformers import (
AutoImageProcessor,
AutoModelForObjectDetection,
TrainingArguments,
Trainer
)
from torchvision import transforms
!pip install -q torchinfo
from torchinfo import summary
#loading the model
CHECKPOINT = "PekingU/rtdetr_v2_r101vd"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NUM_LABELS = 2
model = AutoModelForObjectDetection.from_pretrained(CHECKPOINT).to(DEVICE)
processor = AutoImageProcessor.from_pretrained(CHECKPOINT)
model
#loading images and json file via detectiondataset
ds_train = sv.DetectionDataset.from_coco(
images_directory_path="dataset link",
annotations_path="annotations.coco.json",
)
ds_val = sv.DetectionDataset.from_coco(
images_directory_path="dataset",
annotations_path="_annotations.coco.json",
)
ds_test = sv.DetectionDataset.from_coco(
images_directory_path="dataset",
annotations_path="_annotations.coco.json"
)
print(len(ds_train))
len(ds_val),len(ds_test)
#visualizing loaded dataset
GRID_SIZE = 3
def annotate(image, annotations, classes):
labels = [
classes[class_id]
for class_id
in annotations.class_id
]
bounding_box_annotator = sv.BoxAnnotator()
label_annotator = sv.LabelAnnotator(text_scale=1, text_thickness=2)
annotated_image = image.copy()
annotated_image = bounding_box_annotator.annotate(annotated_image, annotations)
annotated_image = label_annotator.annotate(annotated_image, annotations, labels=labels)
return annotated_image
annotated_images = []
for i in range(GRID_SIZE * GRID_SIZE):
_, image, annotations = ds_train[i]
annotated_image = annotate(image, annotations, ds_train.classes)
print(annotations)
annotated_images.append(annotated_image)
sv.plot_images_grid(
annotated_images,
grid_size=(GRID_SIZE, GRID_SIZE),
size=(20, 20)
)
#initialising the image processor to convert the dataset into the format required by the model
IMAGE_SIZE = 640
processor = AutoImageProcessor.from_pretrained(
"PekingU/rtdetr_v2_r101vd",
do_convert_annotations=False,
do_resize=True,
do_rescale=False,
do_normalize=True,
size={"width": IMAGE_SIZE, "height": IMAGE_SIZE}
)
#since DetectionDataset converts COCO boxes to pascal_voc, the transforms run on pascal_voc format before the processor converts them back to COCO (a short format illustration follows the transforms below)
train_augmentation_and_transform = A.Compose(
[A.NoOp()],
bbox_params = A.BboxParams(
format="pascal_voc",
label_fields=["category"],
clip=True
),
)
valid_transform = A.Compose(
[A.NoOp()],
bbox_params = A.BboxParams(
format="pascal_voc",
label_fields=["category"],
clip=True
),
)
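# Just for clarity (an illustration, not part of my pipeline): pascal_voc boxes
# are [x_min, y_min, x_max, y_max] in absolute pixels, while COCO boxes are
# [x_min, y_min, width, height] in absolute pixels.
example_voc = [100.0, 150.0, 220.0, 300.0]                    # pascal_voc
example_coco = [100.0, 150.0, 220.0 - 100.0, 300.0 - 150.0]   # the same box in COCO [x, y, w, h]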
#wrapping the loaded dataset in a PyTorch Dataset object to feed it to the model
import torch
from torch.utils.data import Dataset
import albumentations as A
import numpy as np
class PyTorchDetectionDataset(Dataset):
def __init__(self, dataset, processor, transform=None):
self.dataset = dataset
self.processor = processor
self.transform = transform
@staticmethod
def annotations_as_coco(image_id, categories, boxes, img_shape):
"""
Converts boxes to COCO format ([x, y, w, h] in pixels)
img_shape = (H, W, C)
"""
h, w = img_shape[:2]
annotations = []
box=[]
box.append(boxes)
for category, bbox in zip(categories, box):
x_min,y_min,x_max,y_max=bbox
cx = (x_min + x_max)/2
cy = (y_min + y_max)/2
w = x_max - x_min
h = y_max - y_min
bbox =[cx,cy,w,h]
# print("manual",bbox)
cx_norm = cx / 640.0
cy_norm = cy / 640.0
w_norm = w / 640.0
h_norm = h / 640.0
bbox = [cx_norm, cy_norm, w_norm, h_norm]
# print("normalised",bbox)
# bbox = np.array(bbox)
bbox = torch.tensor(bbox, dtype=torch.float32)
annotations.append({
"image_id": image_id,
"category_id": int(category),
"bbox": bbox,
"area": bbox[2] * bbox[3],
"iscrowd": 0
})
# print("anno",annotations)
return {"image_id": image_id, "annotations": annotations}
def __len__(self):
return len(self.dataset)
def __getitem__(self, idx):
_, image, annotations = self.dataset[idx]
boxes = annotations.xyxy
categories = annotations.class_id
if len(boxes) == 0:
print(f"[Warning] No boxes for image {idx} — skipping")
print("Categories:", categories)
print("Annotations:", annotations)
# Apply Albumentations transform
if self.transform:
# Albumentations expects [x_min, y_min, x_max, y_max] for 'pascal_voc'
transformed = self.transform(
image=image,
bboxes=boxes,
category=categories
)
image = transformed["image"]
boxes = transformed["bboxes"]
# print(boxes)
categories = transformed["category"]
# Convert to COCO-style dict for processor
formatted_annotations = self.annotations_as_coco(
image_id=idx, categories=categories, boxes=boxes[0], img_shape=image.shape
)
# print("formatted",formatted_annotations["annotations"][0]["bbox"])
# Feed processor
result = self.processor(images=image, annotations=formatted_annotations, return_tensors="pt")
# Remove batch dim
result = {k: v[0] for k, v in result.items()}
# print("results",result)
# print("boxes old",result["labels"]["boxes"])
boxe=formatted_annotations["annotations"][0]["bbox"]
boxe=boxe.unsqueeze(0)
result["labels"]["boxes"]=boxe
# print("new results",result)
# print("boxes new",result["labels"]["boxes"])
return result
from torchvision.utils import draw_bounding_boxes
import torch
from torch import Tensor
from typing import Optional
from torchvision.ops import box_convert
#creating the pytorch dataset objects and visualising one sample (de-normalising the image only for display)
import matplotlib.pyplot as plt
pytorch_dataset_train = PyTorchDetectionDataset(
ds_train, processor, transform=train_augmentation_and_transform)
pytorch_dataset_valid = PyTorchDetectionDataset(
ds_val, processor, transform=train_augmentation_and_transform)
pytorch_dataset_test = PyTorchDetectionDataset(
ds_test, processor, transform=train_augmentation_and_transform)
print(pytorch_dataset_train[2])
sample = pytorch_dataset_train[2]
img = sample["pixel_values"].squeeze(0)
boxes = sample["labels"]["boxes"]
labels = sample["labels"]["class_labels"]
_, H, W = img.shape # get image size
print(img.shape)
boxes[:, [0, 2]] *= W # scale x, width
boxes[:, [1, 3]] *= H # scale y, height
print(H,W)
print(boxes)
boxes = box_convert(boxes, in_fmt="cxcywh", out_fmt="xyxy")
id2label = {0: "debris", 1: "satellite"}
label_names = [id2label[int(l)] for l in labels]
print("bi", boxes)
mean = torch.tensor([0.485, 0.456, 0.406]).view(3,1,1)
std = torch.tensor([0.229, 0.224, 0.225]).view(3,1,1)
img = img * std + mean
img = img.clamp(0,1)
img_with_boxes = draw_bounding_boxes(
image=img,
boxes=boxes,
labels=label_names,
colors="red",
width=2,
font_size=16
)
plt.figure(figsize=(8, 8))
plt.imshow(img_with_boxes.permute(1, 2, 0))
plt.axis("off")
plt.show()
from torchmetrics.detection.mean_ap import MeanAveragePrecision
from torchmetrics.classification import MulticlassPrecision,MulticlassRecall
def collate_fn(batch):
data = {}
data["pixel_values"] = torch.stack([x["pixel_values"] for x in batch])
data["labels"] = [x["labels"] for x in batch]
return data
import numpy as np
from sklearn.metrics import precision_score
from torch import tensor
#map evaluation
id2label = {id: label for id, label in enumerate(ds_train.classes)}
label2id = {label: id for id, label in enumerate(ds_train.classes)}
@dataclass
class ModelOutput:
logits: torch.Tensor
pred_boxes: torch.Tensor
class MAPEvaluator:
def __init__(self, image_processor, threshold=0.00, id2label=None):
self.image_processor = image_processor
self.threshold = threshold
self.id2label = id2label
def collect_image_sizes(self, targets):
"""Collect image sizes across the dataset as list of tensors with shape [batch_size, 2]."""
image_sizes = []
for batch in targets:
batch_image_sizes = torch.tensor(np.array([x["size"] for x in batch]))
image_sizes.append(batch_image_sizes)
return image_sizes
def collect_targets(self, targets, image_sizes):
post_processed_targets = []
for target_batch, image_size_batch in zip(targets, image_sizes):
for target, (height, width) in zip(target_batch, image_size_batch):
boxes = target["boxes"]
boxes = sv.xcycwh_to_xyxy(boxes)
boxes = boxes * np.array([width, height, width, height])
boxes = torch.tensor(boxes)
labels = torch.tensor(target["class_labels"])
post_processed_targets.append({"boxes": boxes, "labels": labels})
return post_processed_targets
def collect_predictions(self, predictions, image_sizes):
post_processed_predictions = []
for batch, target_sizes in zip(predictions, image_sizes):
batch_logits, batch_boxes = batch[1], batch[2]
output = ModelOutput(logits=torch.tensor(batch_logits), pred_boxes=torch.tensor(batch_boxes))
post_processed_output = self.image_processor.post_process_object_detection(
output, threshold=self.threshold, target_sizes=target_sizes
)
post_processed_predictions.extend(post_processed_output)
return post_processed_predictions
@torch.no_grad()
def __call__(self, evaluation_results):
predictions, targets = evaluation_results.predictions, evaluation_results.label_ids
image_sizes = self.collect_image_sizes(targets)
post_processed_targets = self.collect_targets(targets, image_sizes)
post_processed_predictions = self.collect_predictions(predictions, image_sizes)
# print("preds",post_processed_predictions)
# print("targets",post_processed_targets)
evaluator = MeanAveragePrecision(iou_type="bbox",box_format="xyxy", class_metrics=True)
evaluator.warn_on_many_detections = False
evaluator.update(post_processed_predictions, post_processed_targets)
metrics = evaluator.compute()
# Replace list of per class metrics with separate metric for each class
classes = metrics.pop("classes")
map_per_class = metrics.pop("map_per_class")
mar_100_per_class = metrics.pop("mar_100_per_class")
for class_id, class_map, class_mar in zip(classes, map_per_class, mar_100_per_class):
class_name = id2label[class_id.item()] if id2label is not None else class_id.item()
metrics[f"map_{class_name}"] = class_map
metrics[f"mar_100_{class_name}"] = class_mar
metrics = {k: round(v.item(), 4) for k, v in metrics.items()}
# metrico = MulticlassPrecision(num_classes=2,multidim_average='samplewise')
# print("precision at epoch 1: ",metrico(tensor(post_processed_predictions), tensor(post_processed_targets)))
return metrics
eval_compute_metrics_fn = MAPEvaluator(image_processor=processor, threshold=0.3, id2label=id2label)
#loading the model again, redefining the classes and out_features
model = AutoModelForObjectDetection.from_pretrained(
"PekingU/rtdetr_v2_r101vd",
id2label=id2label,
label2id=label2id,
anchor_image_size=None,
ignore_mismatched_sizes=True,
)
summary(model=model, input_size=(1, 3, 64, 64), col_names=["trainable"])
print(model.config)
#freezing the backbone of the model
for p in model.model.backbone.parameters():
p.requires_grad=False
summary(model=model, input_size=(1, 3, 64, 64), col_names=["trainable"])
model.config.num_labels
backbone_params = []
transformer_params = []
for name, param in model.named_parameters():
if "backbone" in name:
backbone_params.append(param)
else:
transformer_params.append(param)
#setting lr and optimiser with regularisation
optimizer = torch.optim.AdamW([
{'params': backbone_params, 'lr': 1e-5},  # Lower learning rate for the backbone
{'params': transformer_params, 'lr': 5e-5}  # Higher learning rate for the transformer
])
#setting train args to pass to trainer api
training_args = TrainingArguments(
output_dir=f"{dataset.name.replace(' ', '-')}-finetune",
num_train_epochs=10,
max_grad_norm=0.1,
per_device_train_batch_size=2,
gradient_accumulation_steps=4,
dataloader_num_workers=2,
metric_for_best_model="eval_map",
greater_is_better=True,
load_best_model_at_end=True,
eval_strategy="epoch",
save_strategy="epoch",
save_total_limit=2,
remove_unused_columns=False,
eval_do_concat_batches=False,
)
from torch import seed
torch.manual_seed(42)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=pytorch_dataset_train,
eval_dataset=pytorch_dataset_valid,
processing_class=processor,
data_collator=collate_fn,
optimizers=(optimizer, None),
compute_metrics=eval_compute_metrics_fn
)
trainer.train()
Please check this code and let me know what mistakes I've made.