Wrongly transformed bounding box coordinates

I am trying to transform training images and bounding boxes for object detection. However, when resizing (or applying any other operation that requires the bounding box coordinates to change), I observe a slight shift in the bounding box coordinates. How can I solve this?

My training dataset is built as follows (using the BoundingBoxes format for the coordinates):

import glob
import os

import PIL
import torch
import torchvision.transforms.v2 as transforms
from torchvision import tv_tensors
from torchvision.datasets import VisionDataset
from torchvision.transforms.functional import to_tensor


# %%  Prepare datasets
class Dataset(VisionDataset):
    def __init__(self, root, data, classes, transforms=None, transform=None, target_transform=None):
        super().__init__(root, transforms, transform, target_transform)
        self.classes = classes
        self.images = glob.glob(os.path.join(self.root, "*.jpg"))
        self.data = data
        images = []
        for obj in data:
            filename = obj["filename"]
            if filename not in images:
                images.append(filename)
        self.images = images

    def __getitem__(self, i):
        image_path = os.path.join(self.root, self.images[i])
        print(image_path)
        image = PIL.Image.open(image_path).convert("RGB")
        width = self.data[i]["width"]
        height = self.data[i]["height"]
        objects, labels = [], []
        records = [x for x in self.data if x["filename"] == self.images[i]]
        for record in records:
            objects.append([int(record["xmin"]), int(record["ymin"]), int(record["xmax"]), int(record["ymax"])])
            labels.append(self.classes.index(record["name"]))
        boxes = tv_tensors.BoundingBoxes(objects, format="XYXY", canvas_size=(height, width))
        print(f"Non transformed bounding objects :{boxes} / Classes : {labels}")
        if self.transforms is not None:
            image, boxes, labels = self.transforms(image, boxes, labels)
            print(f"Transformed bounding boxes :{boxes}")
        tensor = to_tensor(image)
        target = {
            "labels": torch.as_tensor(labels),
            "image_id": i,
            "boxes": boxes,
            "area": (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:,0]) if len(boxes) > 0 else torch.zeros(1),
            "iscrowd": torch.zeros((len(boxes),), dtype=torch.int64) # if many objects in the same bounding box
        }
        return tensor, target
    def __len__(self):
        return len(self.images)

train_transforms = transforms.Compose([
        transforms.Resize(size=100)
])

train_dataset = Dataset(input_path, train_data, classes, transforms=train_transforms)

# Visualizing results
from torchvision.transforms.v2 import functional as F
from torchvision import utils

img_tensor, target = train_dataset[9] 

viz = utils.draw_bounding_boxes(F.to_image(img_tensor), boxes=target["boxes"])
F.to_pil_image(viz).show()

Here's an example of an image before transformation (left) and after transformation (right):

If you are changing your image, you have to apply the corresponding change to the box coordinates as well. This is not automatic in torchvision.
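
For example, a minimal sketch of what I mean, assuming the resize scales the shorter edge to a fixed length (the helper name and the example numbers are only illustrative):

def rescale_boxes(boxes_xyxy, old_size, new_size):
    # Scale XYXY boxes from the original image size (old_w, old_h)
    # to the size of the resized image (new_w, new_h).
    old_w, old_h = old_size
    new_w, new_h = new_size
    sx, sy = new_w / old_w, new_h / old_h
    return [[x1 * sx, y1 * sy, x2 * sx, y2 * sy] for x1, y1, x2, y2 in boxes_xyxy]

# A 200x150 image resized so its shorter edge becomes 100 px ends up 133x100:
print(rescale_boxes([[70, 45, 130, 105]], old_size=(200, 150), new_size=(133, 100)))
# -> approximately [[46.55, 30.0, 86.45, 70.0]]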

Maybe an interesting read: Documentation


After adding the correction myself, I observe that bounding boxes are still off.

What I find odd is that torchvision.transforms.v2.Resize() is supposed to transform BoundingBoxes coordinates according to this documentation. And indeed I do observe that the bounding box coordinates change after the transformation (see the picture above: the coordinates no longer sit precisely on the object, but they are different from the initial coordinates). So the problem seems to come from the Resize() transform in torchvision doing something other than simply scaling the image down.
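
For reference, here is how I understand the v2 API is supposed to behave when the image and the BoundingBoxes are passed through the transform together (a minimal sketch with made-up sizes and box values):

import torch
import torchvision.transforms.v2 as transforms
from torchvision import tv_tensors

img = torch.zeros(3, 150, 200, dtype=torch.uint8)        # dummy 200x150 image (C, H, W)
boxes = tv_tensors.BoundingBoxes([[70, 45, 130, 105]],
                                 format="XYXY",
                                 canvas_size=(150, 200))  # (height, width) of that image

resize = transforms.Resize(size=100)                      # shorter edge -> 100 px
img_r, boxes_r = resize(img, boxes)
print(img_r.shape, boxes_r, boxes_r.canvas_size)          # image and boxes are rescaled together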

Here’s the new code with added bounding box correction (that still doesn’t work):

resizing_width = 100

# %%  Prepare datasets
class Dataset(VisionDataset):
    def __init__(self, root, data, classes, transforms=None, transform=None, target_transform=None):
        super().__init__(root, transforms, transform, target_transform)
        self.classes = classes
        self.images = glob.glob(os.path.join(self.root, "*.jpg"))
        self.data = data
        images = []
        for obj in data:
            filename = obj["filename"]
            if filename not in images:
                images.append(filename)
        self.images = images

    def __getitem__(self, i):
        image_path = os.path.join(self.root, self.images[i])
        image = PIL.Image.open(image_path).convert("RGB")
        width = self.data[i]["width"]
        height = self.data[i]["height"]
        resizing_height = height*(resizing_width/width)
        objects, labels = [], []
        records = [x for x in self.data if x["filename"] == self.images[i]]
        for record in records:
            coordinates = [int(record["xmin"]), int(record["ymin"]), int(record["xmax"]), int(record["ymax"])]
            coordinates = [int(coordinates[0]*(resizing_width/width)), 
                           int(coordinates[1]*(resizing_height/height)), 
                           int(coordinates[2]*(resizing_width/width)), 
                           int(coordinates[3]*(resizing_height/height))]
            objects.append(coordinates)
            labels.append(self.classes.index(record["name"]))
        boxes = tv_tensors.BoundingBoxes(objects, format="XYXY", canvas_size=(height, width))
        if self.transforms is not None:
            image = self.transforms(image)
        tensor = to_tensor(image)
        target = {
            "labels": torch.as_tensor(labels),
            "image_id": i,
            "boxes": boxes,
            "area": (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:,0]) if len(boxes) > 0 else torch.zeros(1),
            "iscrowd": torch.zeros((len(boxes),), dtype=torch.int64) # if many objects in the same bounding box
        }
        return tensor, target
    def __len__(self):
        return len(self.images)

# Transforms applied to the training data (other augmentation techniques could be added);
# uses the v2 transforms: https://docs.pytorch.org/vision/stable/transforms.html
train_transforms = transforms.Compose([
        transforms.Resize(size=resizing_width),  # resize so the shortest edge is 100 px
        #transforms.RandomAdjustSharpness(sharpness_factor=4),
        #transforms.RandomAutocontrast()
])

train_dataset = Dataset(input_path, train_data, classes, transforms=train_transforms)

and this code gives the same results:

Did not notice the update in the libs. My bad. Thanks for letting me know.

Below is some “vibe coded” code after noticing the issue.

import torch
import torchvision.transforms as transforms
from torchvision.datasets import VisionDataset
from torchvision import tv_tensors
from torchvision.transforms.functional import to_tensor
import PIL.Image
import PIL.ImageDraw
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import os
import glob
import shutil

resizing_width = 100

# %%  Prepare datasets
class Dataset(VisionDataset):
    def __init__(self, root, data, classes, transforms=None, transform=None, target_transform=None):
        super().__init__(root, transforms, transform, target_transform)
        self.classes = classes
        self.images = glob.glob(os.path.join(self.root, "*.jpg"))
        self.data = data
        images = []
        for obj in data:
            filename = obj["filename"]
            if filename not in images:
                images.append(filename)
        self.images = images

    def __getitem__(self, i):
        image_path = os.path.join(self.root, self.images[i])
        image = PIL.Image.open(image_path).convert("RGB")
        width = self.data[i]["width"]
        height = self.data[i]["height"]
        resizing_height = height*(resizing_width/width)
        objects, labels = [], []
        records = [x for x in self.data if x["filename"] == self.images[i]]
        for record in records:
            coordinates = [int(record["xmin"]), int(record["ymin"]), int(record["xmax"]), int(record["ymax"])]
            coordinates = [int(coordinates[0]*(resizing_width/width)), 
                           int(coordinates[1]*(resizing_height/height)), 
                           int(coordinates[2]*(resizing_width/width)), 
                           int(coordinates[3]*(resizing_height/height))]
            objects.append(coordinates)
            labels.append(self.classes.index(record["name"]))
        boxes = tv_tensors.BoundingBoxes(objects, format="XYXY", canvas_size=(height, width))
        if self.transforms is not None:
            image = self.transforms(image)
        tensor = to_tensor(image)
        target = {
            "labels": torch.as_tensor(labels),
            "image_id": i,
            "boxes": boxes,
            "area": (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:,0]) if len(boxes) > 0 else torch.zeros(1),
            "iscrowd": torch.zeros((len(boxes),), dtype=torch.int64) # if many objects in the same bounding box
        }
        return tensor, target
    def __len__(self):
        return len(self.images)

# Transforms applied to the training data (other augmentation techniques could be added)
# https://docs.pytorch.org/vision/stable/transforms.html
train_transforms = transforms.Compose([
        transforms.Resize(size=resizing_width),  # resize so the shortest edge is 100 px
        #transforms.RandomAdjustSharpness(sharpness_factor=4),
        #transforms.RandomAutocontrast()
])

# train_dataset = Dataset(input_path, train_data, classes, transforms=train_transforms)

# Fixed Dataset class
class FixedDataset(VisionDataset):
    def __init__(self, root, data, classes, transforms=None, transform=None, target_transform=None):
        super().__init__(root, transforms, transform, target_transform)
        self.classes = classes
        self.data = data
        # Fixed: Get unique filenames from data
        images = []
        for obj in data:
            filename = obj["filename"]
            if filename not in images:
                images.append(filename)
        self.images = images
    
    def __getitem__(self, i):
        image_path = os.path.join(self.root, self.images[i])
        image = PIL.Image.open(image_path).convert("RGB")
        original_width, original_height = image.size
        
        # Fixed: Apply transforms first, then get new dimensions
        if self.transforms is not None:
            image = self.transforms(image)
        
        new_width, new_height = image.size
        
        objects, labels = [], []
        records = [x for x in self.data if x["filename"] == self.images[i]]
        for record in records:
            # Fixed: Scale coordinates based on actual resize ratios
            scale_x = new_width / original_width
            scale_y = new_height / original_height
            
            coordinates = [
                int(record["xmin"] * scale_x),
                int(record["ymin"] * scale_y), 
                int(record["xmax"] * scale_x),
                int(record["ymax"] * scale_y)
            ]
            objects.append(coordinates)
            labels.append(self.classes.index(record["name"]))
        
        # Fixed: Canvas size matches transformed image dimensions
        boxes = tv_tensors.BoundingBoxes(objects, format="XYXY", canvas_size=(new_height, new_width))
        tensor = to_tensor(image)
        
        target = {
            "labels": torch.as_tensor(labels),
            "image_id": i,
            "boxes": boxes,
            "area": (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0]) if len(boxes) > 0 else torch.zeros(1),
            "iscrowd": torch.zeros((len(boxes),), dtype=torch.int64)
        }
        return tensor, target
    
    def __len__(self):
        return len(self.images)

def create_dummy_image_with_circle(width=200, height=150, save_dir="test_images", filename="test_image.jpg"):
    """Create a black image with a white circle and return bounding box data"""
    image = PIL.Image.new('RGB', (width, height), 'black')
    draw = PIL.ImageDraw.Draw(image)
    
    # Draw white circle in center
    circle_radius = 30
    center_x, center_y = width // 2, height // 2
    draw.ellipse([
        center_x - circle_radius, center_y - circle_radius,
        center_x + circle_radius, center_y + circle_radius
    ], fill='white')
    
    filepath = os.path.join(save_dir, filename)
    image.save(filepath)
    
    return {
        "filename": filename,
        "width": width,
        "height": height,
        "xmin": center_x - circle_radius,
        "ymin": center_y - circle_radius, 
        "xmax": center_x + circle_radius,
        "ymax": center_y + circle_radius,
        "name": "circle"
    }

def plot_comparison(original_img, broken_result, fixed_result, original_bbox, case_name):
    """Plot original, broken, and fixed results side by side"""
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    
    # Original image
    axes[0].imshow(original_img)
    axes[0].set_title(f'Original ({original_img.size[0]}x{original_img.size[1]})')
    rect = patches.Rectangle((original_bbox['xmin'], original_bbox['ymin']), 
                           original_bbox['xmax'] - original_bbox['xmin'],
                           original_bbox['ymax'] - original_bbox['ymin'],
                           linewidth=2, edgecolor='red', facecolor='none')
    axes[0].add_patch(rect)
    axes[0].axis('off')
    
    # Broken result
    broken_tensor, broken_target = broken_result
    broken_img = broken_tensor.permute(1, 2, 0).numpy()
    axes[1].imshow(broken_img)
    axes[1].set_title(f'Broken ({broken_img.shape[1]}x{broken_img.shape[0]})')
    if len(broken_target['boxes']) > 0:
        bbox = broken_target['boxes'][0].tolist()
        rect = patches.Rectangle((bbox[0], bbox[1]), bbox[2] - bbox[0], bbox[3] - bbox[1],
                               linewidth=2, edgecolor='red', facecolor='none')
        axes[1].add_patch(rect)
        axes[1].text(0.02, 0.98, f'Canvas: {broken_target["boxes"].canvas_size}', 
                    transform=axes[1].transAxes, va='top', color='yellow', fontsize=8)
    axes[1].axis('off')
    
    # Fixed result
    fixed_tensor, fixed_target = fixed_result
    fixed_img = fixed_tensor.permute(1, 2, 0).numpy()
    axes[2].imshow(fixed_img)
    axes[2].set_title(f'Fixed ({fixed_img.shape[1]}x{fixed_img.shape[0]})')
    if len(fixed_target['boxes']) > 0:
        bbox = fixed_target['boxes'][0].tolist()
        rect = patches.Rectangle((bbox[0], bbox[1]), bbox[2] - bbox[0], bbox[3] - bbox[1],
                               linewidth=2, edgecolor='red', facecolor='none')
        axes[2].add_patch(rect)
        axes[2].text(0.02, 0.98, f'Canvas: {fixed_target["boxes"].canvas_size}', 
                    transform=axes[2].transAxes, va='top', color='yellow', fontsize=8)
    axes[2].axis('off')
    
    plt.suptitle(f'{case_name}', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

# Test cases
os.makedirs("test_images", exist_ok=True)

test_cases = [
    {"shape": (200, 150), "resize": 100, "name": "landscape_100"},
    {"shape": (200, 150), "resize": 80, "name": "landscape_80"},
    {"shape": (150, 200), "resize": 100, "name": "portrait_100"}, 
    {"shape": (150, 200), "resize": 80, "name": "portrait_80"}
]

# Transforms applied to the training data (not used below: each test case builds its own transforms)
train_transforms = transforms.Compose([
    transforms.Resize(size=resizing_width),  # resize so the shortest edge is 100 px
    #transforms.RandomAdjustSharpness(sharpness_factor=4),
    #transforms.RandomAutocontrast()
])

for case in test_cases:
    width, height = case["shape"]
    resize_target = case["resize"]
    
    # Create test image and data
    filename = f"test_{case['name']}.jpg"
    data_record = create_dummy_image_with_circle(width, height, "test_images", filename)
    
    # Load original image for display
    original_img = PIL.Image.open(os.path.join("test_images", filename))
    
    # Update transforms for this test case
    current_transforms = transforms.Compose([
        transforms.Resize(size=resize_target)
    ])
    
    # Test broken version (your original Dataset class)
    broken_dataset = Dataset("test_images", [data_record], ["circle"], transforms=current_transforms)
    broken_result = broken_dataset[0]
    
    # Test fixed version
    fixed_dataset = FixedDataset("test_images", [data_record], ["circle"], transforms=current_transforms)
    fixed_result = fixed_dataset[0]
    
    # Plot comparison
    plot_comparison(original_img, broken_result, fixed_result, data_record, 
                   f"{case['name']}: {width}x{height} → resize({resize_target})")
    
    print(f"\n=== {case['name']} ===")
    print(f"Original bbox: [{data_record['xmin']}, {data_record['ymin']}, {data_record['xmax']}, {data_record['ymax']}]")
    print(f"Broken bbox: {broken_result[1]['boxes'][0].tolist()}")
    print(f"Fixed bbox: {fixed_result[1]['boxes'][0].tolist()}")
    print(f"Broken canvas: {broken_result[1]['boxes'].canvas_size}")
    print(f"Fixed canvas: {fixed_result[1]['boxes'].canvas_size}")

# Clean up
shutil.rmtree("test_images")

It essentially applies the transforms to the image first, then computes the scaling ratios from the actual before/after image sizes.
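
For completeness, here is a sketch of the other route discussed above, where transforms.v2 rescales the boxes itself because the image and the BoundingBoxes go through the transform together (the function load_resized is only illustrative, and it assumes canvas_size matches the original image size):

import PIL.Image
import torch
import torchvision.transforms.v2 as T
from torchvision import tv_tensors

def load_resized(image_path, records, classes, size=100):
    # Alternative to FixedDataset.__getitem__: let the v2 transforms rescale the boxes.
    image = PIL.Image.open(image_path).convert("RGB")
    boxes = tv_tensors.BoundingBoxes(
        [[r["xmin"], r["ymin"], r["xmax"], r["ymax"]] for r in records],
        format="XYXY",
        canvas_size=(image.height, image.width),  # must match the *original* image
    )
    transform = T.Compose([
        T.Resize(size=size),                      # resizes the image and the boxes consistently
        T.ToImage(),
        T.ToDtype(torch.float32, scale=True),
    ])
    image, boxes = transform(image, boxes)
    labels = torch.as_tensor([classes.index(r["name"]) for r in records])
    return image, {"boxes": boxes, "labels": labels}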

Does this help?

It works! Thank you so much!