Did not notice the update in the libs. My bad. Thanks for letting me know.
Below is some "vibe coded" code I put together after looking into the issue.
import torch
import torchvision.transforms as transforms
from torchvision.datasets import VisionDataset
from torchvision import tv_tensors
from torchvision.transforms.functional import to_tensor
import PIL.Image
import PIL.ImageDraw
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import os
import glob
import shutil
resizing_width = 100
# %% Prepare datasets
class Dataset(VisionDataset):
    def __init__(self, root, data, classes, transforms=None, transform=None, target_transform=None):
        super().__init__(root, transforms, transform, target_transform)
        self.classes = classes
        self.images = glob.glob(os.path.join(self.root, "*.jpg"))
        self.data = data
        images = []
        for obj in data:
            filename = obj["filename"]
            if filename not in images:
                images.append(filename)
        self.images = images

    def __getitem__(self, i):
        image_path = os.path.join(self.root, self.images[i])
        image = PIL.Image.open(image_path).convert("RGB")
        width = self.data[i]["width"]
        height = self.data[i]["height"]
        resizing_height = height * (resizing_width / width)
        objects, labels = [], []
        records = [x for x in self.data if x["filename"] == self.images[i]]
        for record in records:
            coordinates = [int(record["xmin"]), int(record["ymin"]), int(record["xmax"]), int(record["ymax"])]
            coordinates = [int(coordinates[0] * (resizing_width / width)),
                           int(coordinates[1] * (resizing_height / height)),
                           int(coordinates[2] * (resizing_width / width)),
                           int(coordinates[3] * (resizing_height / height))]
            objects.append(coordinates)
            labels.append(self.classes.index(record["name"]))
        boxes = tv_tensors.BoundingBoxes(objects, format="XYXY", canvas_size=(height, width))
        if self.transforms is not None:
            image = self.transforms(image)
        tensor = to_tensor(image)
        target = {
            "labels": torch.as_tensor(labels),
            "image_id": i,
            "boxes": boxes,
            "area": (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0]) if len(boxes) > 0 else torch.zeros(1),
            "iscrowd": torch.zeros((len(boxes),), dtype=torch.int64)  # if many objects in the same bounding box
        }
        return tensor, target

    def __len__(self):
        return len(self.images)
# Transformations used on the training data (other augmentation techniques can be added);
# consider using the v2 transforms: https://docs.pytorch.org/vision/stable/transforms.html
train_transforms = transforms.Compose([
    transforms.Resize(size=resizing_width),  # resizing the training data so the shortest edge is 100 px
    # transforms.RandomAdjustSharpness(sharpness_factor=4),
    # transforms.RandomAutocontrast(),
])
# train_dataset = Dataset(input_path, train_data, classes, transforms=train_transforms)
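
# Aside on the v2 transforms linked above: a minimal sketch (assuming
# torchvision >= 0.16, where tv_tensors live). The v2 API transforms an image
# and its BoundingBoxes together, so no manual coordinate rescaling is needed.
# The demo_* names are hypothetical and not used elsewhere in this script.
from torchvision.transforms import v2

demo_image = PIL.Image.new("RGB", (200, 150), "black")  # hypothetical 200x150 input
demo_boxes = tv_tensors.BoundingBoxes([[70, 45, 130, 105]], format="XYXY", canvas_size=(150, 200))
v2_resize = v2.Compose([
    v2.ToImage(),                    # PIL image -> tv_tensors.Image; boxes pass through
    v2.Resize(size=resizing_width),  # resizes the image AND the boxes consistently
])
demo_image_t, demo_boxes_t = v2_resize(demo_image, demo_boxes)
print(demo_boxes_t.canvas_size)  # canvas size now matches the resized image
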
# Fixed Dataset class
class FixedDataset(VisionDataset):
    def __init__(self, root, data, classes, transforms=None, transform=None, target_transform=None):
        super().__init__(root, transforms, transform, target_transform)
        self.classes = classes
        self.data = data
        # Fixed: get unique filenames from data
        images = []
        for obj in data:
            filename = obj["filename"]
            if filename not in images:
                images.append(filename)
        self.images = images

    def __getitem__(self, i):
        image_path = os.path.join(self.root, self.images[i])
        image = PIL.Image.open(image_path).convert("RGB")
        original_width, original_height = image.size
        # Fixed: apply transforms first, then read the new dimensions
        if self.transforms is not None:
            image = self.transforms(image)
        new_width, new_height = image.size
        objects, labels = [], []
        records = [x for x in self.data if x["filename"] == self.images[i]]
        for record in records:
            # Fixed: scale coordinates based on the actual resize ratios
            scale_x = new_width / original_width
            scale_y = new_height / original_height
            coordinates = [
                int(record["xmin"] * scale_x),
                int(record["ymin"] * scale_y),
                int(record["xmax"] * scale_x),
                int(record["ymax"] * scale_y)
            ]
            objects.append(coordinates)
            labels.append(self.classes.index(record["name"]))
        # Fixed: canvas size matches the transformed image dimensions
        boxes = tv_tensors.BoundingBoxes(objects, format="XYXY", canvas_size=(new_height, new_width))
        tensor = to_tensor(image)
        target = {
            "labels": torch.as_tensor(labels),
            "image_id": i,
            "boxes": boxes,
            "area": (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0]) if len(boxes) > 0 else torch.zeros(1),
            "iscrowd": torch.zeros((len(boxes),), dtype=torch.int64)
        }
        return tensor, target

    def __len__(self):
        return len(self.images)
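
# Sketch of how the dataset would typically be consumed (my assumption: feeding
# a torchvision detection model such as Faster R-CNN). Detection targets vary
# in size per image, so the default collate_fn cannot stack them:
#
#   from torch.utils.data import DataLoader
#   loader = DataLoader(fixed_dataset, batch_size=2,
#                       collate_fn=lambda batch: tuple(zip(*batch)))
#   images, targets = next(iter(loader))  # tuple of tensors, tuple of target dicts
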
def create_dummy_image_with_circle(width=200, height=150, save_dir="test_images", filename="test_image.jpg"):
    """Create a black image with a white circle and return its bounding box data"""
    image = PIL.Image.new('RGB', (width, height), 'black')
    draw = PIL.ImageDraw.Draw(image)
    # Draw a white circle in the center
    circle_radius = 30
    center_x, center_y = width // 2, height // 2
    draw.ellipse([
        center_x - circle_radius, center_y - circle_radius,
        center_x + circle_radius, center_y + circle_radius
    ], fill='white')
    filepath = os.path.join(save_dir, filename)
    image.save(filepath)
    return {
        "filename": filename,
        "width": width,
        "height": height,
        "xmin": center_x - circle_radius,
        "ymin": center_y - circle_radius,
        "xmax": center_x + circle_radius,
        "ymax": center_y + circle_radius,
        "name": "circle"
    }
def plot_comparison(original_img, broken_result, fixed_result, original_bbox, case_name):
    """Plot original, broken, and fixed results side by side"""
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    # Original image
    axes[0].imshow(original_img)
    axes[0].set_title(f'Original ({original_img.size[0]}x{original_img.size[1]})')
    rect = patches.Rectangle((original_bbox['xmin'], original_bbox['ymin']),
                             original_bbox['xmax'] - original_bbox['xmin'],
                             original_bbox['ymax'] - original_bbox['ymin'],
                             linewidth=2, edgecolor='red', facecolor='none')
    axes[0].add_patch(rect)
    axes[0].axis('off')
    # Broken result
    broken_tensor, broken_target = broken_result
    broken_img = broken_tensor.permute(1, 2, 0).numpy()
    axes[1].imshow(broken_img)
    axes[1].set_title(f'Broken ({broken_img.shape[1]}x{broken_img.shape[0]})')
    if len(broken_target['boxes']) > 0:
        bbox = broken_target['boxes'][0].tolist()
        rect = patches.Rectangle((bbox[0], bbox[1]), bbox[2] - bbox[0], bbox[3] - bbox[1],
                                 linewidth=2, edgecolor='red', facecolor='none')
        axes[1].add_patch(rect)
    axes[1].text(0.02, 0.98, f'Canvas: {broken_target["boxes"].canvas_size}',
                 transform=axes[1].transAxes, va='top', color='yellow', fontsize=8)
    axes[1].axis('off')
    # Fixed result
    fixed_tensor, fixed_target = fixed_result
    fixed_img = fixed_tensor.permute(1, 2, 0).numpy()
    axes[2].imshow(fixed_img)
    axes[2].set_title(f'Fixed ({fixed_img.shape[1]}x{fixed_img.shape[0]})')
    if len(fixed_target['boxes']) > 0:
        bbox = fixed_target['boxes'][0].tolist()
        rect = patches.Rectangle((bbox[0], bbox[1]), bbox[2] - bbox[0], bbox[3] - bbox[1],
                                 linewidth=2, edgecolor='red', facecolor='none')
        axes[2].add_patch(rect)
    axes[2].text(0.02, 0.98, f'Canvas: {fixed_target["boxes"].canvas_size}',
                 transform=axes[2].transAxes, va='top', color='yellow', fontsize=8)
    axes[2].axis('off')
    plt.suptitle(f'{case_name}', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()
# Test cases
os.makedirs("test_images", exist_ok=True)
test_cases = [
    {"shape": (200, 150), "resize": 100, "name": "landscape_100"},
    {"shape": (200, 150), "resize": 80, "name": "landscape_80"},
    {"shape": (150, 200), "resize": 100, "name": "portrait_100"},
    {"shape": (150, 200), "resize": 80, "name": "portrait_80"},
]
for case in test_cases:
    width, height = case["shape"]
    resize_target = case["resize"]
    # Create test image and data
    filename = f"test_{case['name']}.jpg"
    data_record = create_dummy_image_with_circle(width, height, "test_images", filename)
    # Load original image for display
    original_img = PIL.Image.open(os.path.join("test_images", filename))
    # Update transforms for this test case
    current_transforms = transforms.Compose([
        transforms.Resize(size=resize_target)
    ])
    # Test broken version (the original Dataset class above)
    broken_dataset = Dataset("test_images", [data_record], ["circle"], transforms=current_transforms)
    broken_result = broken_dataset[0]
    # Test fixed version
    fixed_dataset = FixedDataset("test_images", [data_record], ["circle"], transforms=current_transforms)
    fixed_result = fixed_dataset[0]
    # Plot comparison
    plot_comparison(original_img, broken_result, fixed_result, data_record,
                    f"{case['name']}: {width}x{height} → resize({resize_target})")
    print(f"\n=== {case['name']} ===")
    print(f"Original bbox: [{data_record['xmin']}, {data_record['ymin']}, {data_record['xmax']}, {data_record['ymax']}]")
    print(f"Broken bbox:   {broken_result[1]['boxes'][0].tolist()}")
    print(f"Fixed bbox:    {fixed_result[1]['boxes'][0].tolist()}")
    print(f"Broken canvas: {broken_result[1]['boxes'].canvas_size}")
    print(f"Fixed canvas:  {fixed_result[1]['boxes'].canvas_size}")
# Clean up
shutil.rmtree("test_images")
It essentially applies the transforms to the image first, then computes the scaling ratios from the actual output size, so the boxes and the canvas_size always match the transformed image.
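For a concrete check, take the landscape_100 case: a 200x150 image comes out of Resize(size=100) at 133x100, so scale_x = 133/200 = 0.665 and scale_y = 100/150 ≈ 0.667, and the original box [70, 45, 130, 105] lands at roughly [46, 30, 86, 70] on a (100, 133) canvas.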
Does this help?