Training Faster R-CNN: target boxes have wrong dimensions

I’m trying to train a model that should predict a bounding box around a hand. When I run the code I get the error message: Expected target boxes to be a tensor of shape [N, 4], got torch.Size([8, 1, 4]). This issue has already been discussed here, but I could not tell whether a new issue was actually opened as stated.
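For reference, torchvision's detection models expect targets as a list of per-image dicts, where boxes has shape [N, 4], so something roughly like this (a minimal sketch, not my actual code):

import torch

# One dict per image; boxes are [N, 4] in (xmin, ymin, xmax, ymax) order.
example_targets = [
    {"boxes": torch.tensor([[10.0, 20.0, 110.0, 220.0]]), "labels": torch.tensor([1])},
    {"boxes": torch.tensor([[30.0, 40.0, 90.0, 180.0]]), "labels": torch.tensor([1])},
]
# loss_dict = model(images, example_targets)  # images: list of [C, H, W] tensors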

import glob
import os
import xml.etree.ElementTree as ET

import PIL.Image
import torch
from torch.utils.data import Dataset
from torchvision import transforms


def resize_image_and_bbox(img, bboxes, new_height, new_width):
    original_width, original_height = img.size
    
    height_factor = original_height / new_height
    width_factor = original_width / new_width

    transform = transforms.Compose([
        transforms.Resize((new_height, new_width)),
        transforms.ToTensor()
    ]) 
    img_as_tensor = transform(img)

    # Scale x coordinates
    bboxes[:, [0, 2]] = bboxes[:, [0, 2]] / width_factor
    # Scale y coordinates
    bboxes[:, [1, 3]] = bboxes[:, [1, 3]] / height_factor

    return img_as_tensor, bboxes

class CustomDataset(Dataset):
    def __init__(self, root_dir):
        self.root_dir = root_dir
        self.class_lbl = "hand"
        self.label_paths = []
        self.label_paths += glob.glob(os.path.join(root_dir, "labels", "VOC", "*.xml"))

    def __len__(self):
        return len(self.label_paths)

    def __getitem__(self, index):
        tree = ET.parse(self.label_paths[index])
        root = tree.getroot()
        img_path = os.path.join(self.root_dir, "images", root.find("path").text.split("\\")[-1])
        img = PIL.Image.open(img_path)
        # img = decode_image(img_path, ImageReadMode.RGB).numpy(force=True)
        xmin = float(root.find("object/bndbox/xmin").text)
        ymin = float(root.find("object/bndbox/ymin").text)
        xmax = float(root.find("object/bndbox/xmax").text)
        ymax = float(root.find("object/bndbox/ymax").text)

        labels = torch.tensor([1])
        bboxes = torch.tensor([[xmin, ymin, xmax, ymax]], dtype=torch.float32)

        img_as_tensor, bboxes = resize_image_and_bbox(img, bboxes, IMAGE_HEIGHT, IMAGE_WIDTH)

        print(bboxes)

        target = {}
        target["boxes"] = bboxes
        target["labels"] = labels

        return img_as_tensor, target
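
# (Not shown above: how the dataloaders dict is built. It is roughly like this, with
#  the default collate function; the path and batch size here are placeholders.
#  The default collate stacks each sample's [1, 4] boxes into [8, 1, 4], which is
#  exactly the shape the error message complains about.)
from torch.utils.data import DataLoader

dataset = CustomDataset(root_dir="data")
dataloaders = {"train": DataLoader(dataset, batch_size=8, shuffle=True)}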

# Set up the optimizer
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)
# Learning rate scheduler
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)
# Train the model
for epoch in range(EPOCHS):
    model.train()
    train_loss = 0.0

    # Training loop
    for images, targets in dataloaders["train"]:
        images = list(image.to(device) for image in images)
        targets = {k: v.to(device) for k, v in targets.items()}

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        loss_dict = model(images, [targets])
        losses = sum(loss for loss in loss_dict.values())

        # Backward pass
        losses.backward()
        optimizer.step()
        train_loss += losses.item()

    # Update the learning rate
    lr_scheduler.step()
    print(f'Epoch: {epoch + 1}, Loss: {train_loss / len(dataloaders["train"])}')
print("Training complete!")

I actually managed to solve this by writing a custom collate function; how it plugs into the DataLoader and training loop is sketched below the function.

def custom_collate_fn(batch):
    # Separate images and targets from the batch
    images, targets = zip(*batch)
    
    # Stack the images into a tensor (shape: [batch_size, C, H, W])
    images = torch.stack(images, dim=0)
    
    # Now, we need to maintain the structure: List of dictionaries with "boxes" and "labels"
    batched_targets = []
    
    for target in targets:
        target_dict = {}
        target_dict["boxes"] = target["boxes"]
        target_dict["labels"] = target["labels"]
        batched_targets.append(target_dict)
    
    return images, batched_targets
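
To use it, the collate function is passed to the DataLoader, and the target handling in the training loop changes so that each per-image dict is moved to the device and the list is passed to the model directly. Roughly like this (train_dataset and train_loader are placeholder names):

train_dataset = CustomDataset(root_dir="data")  # placeholder path
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=custom_collate_fn,
)

for images, targets in train_loader:
    images = list(image.to(device) for image in images)
    # targets is now a list of dicts, one per image
    targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
    loss_dict = model(images, targets)  # no extra [targets] wrapping needed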