[ Object detection error ] AssertionError: The boxes tensor shape is not correct as Tensor[K, 5]

Hello everyone,

I am struggle to detect the object but facing the issue [AssertionError: The boxes tensor shape is not correct as Tensor[K, 5]]

Below is my code? Can you help me to fix? I am not understanding.

import torch
import torchvision
from torchvision import transforms
from torch.utils.data import DataLoader
from torch.optim import SGD
from torch.optim.lr_scheduler import MultiStepLR
from torch.nn import functional as F
from tqdm import tqdm
import numpy as np
import xml.etree.ElementTree as ET
from sklearn.metrics import average_precision_score
from torch.nn.utils.rnn import pad_sequence
class_to_idx = {
“background”: 0,
“aeroplane”: 1,
“bicycle”: 2,
“bird”: 3,
“boat”: 4,
“bottle”: 5,
“bus”: 6,
“car”: 7,
“cat”: 8,
“chair”: 9,
“cow”: 10,
“diningtable”: 11,
“dog”: 12,
“horse”: 13,
“motorbike”: 14,
“person”: 15,
“pottedplant”: 16,
“sheep”: 17,
“sofa”: 18,
“train”: 19,
“tvmonitor”: 20
}
def collate_fn_transform(batch):
images = []
targets = []
for img, target in batch:
images.append(img)

    boxes_list = []
    labels_list = []
    for obj in target:
        boxes = obj['box']
        labels = obj['label']
        print(type(labels))
        boxes_list.append(torch.tensor(boxes, dtype=torch.float32))
        labels_list.append(torch.tensor(labels, dtype=torch.int64))
    
    boxes = torch.stack(boxes_list)
    labels = torch.stack(labels_list)

    targets.append({
        'boxes': boxes,
        'labels': labels
    })

return images, targets

def collate_fn_transform(batch):

class_to_idx = {

“background”: 0,

“aeroplane”: 1,

“bicycle”: 2,

“bird”: 3,

“boat”: 4,

“bottle”: 5,

“bus”: 6,

“car”: 7,

“cat”: 8,

“chair”: 9,

“cow”: 10,

“diningtable”: 11,

“dog”: 12,

“horse”: 13,

“motorbike”: 14,

“person”: 15,

“pottedplant”: 16,

“sheep”: 17,

“sofa”: 18,

“train”: 19,

“tvmonitor”: 20

}

images = []

targets = []

for img, target in batch:

images.append(img)

targets.append({‘boxes’: torch.tensor(target[‘boxes’], dtype=torch.float32),

‘labels’: torch.tensor(target[‘labels’], dtype=torch.int64)})

return images, targets

class_to_idx = {
“background”: 0,
“aeroplane”: 1,
“bicycle”: 2,
“bird”: 3,
“boat”: 4,
“bottle”: 5,
“bus”: 6,
“car”: 7,
“cat”: 8,
“chair”: 9,
“cow”: 10,
“diningtable”: 11,
“dog”: 12,
“horse”: 13,
“motorbike”: 14,
“person”: 15,
“pottedplant”: 16,
“sheep”: 17,
“sofa”: 18,
“train”: 19,
“tvmonitor”: 20
}

Define dataset class for VOC2007 detection

class VOC2007Detection(torchvision.datasets.VOCDetection):
“”“VOC 2007 detection dataset.”“”

def __init__(self, root, transform=None, target_transform=None, transforms=None):
    super(VOC2007Detection, self).__init__(root, year='2007', image_set='trainval',
                                           download=True, transform=transform,
                                           target_transform=target_transform, transforms=transforms)
    
def __getitem__(self, index):
    img, target = super(VOC2007Detection, self).__getitem__(index)

    # Convert target to tensor
    objects = []
    for obj in target['annotation']['object']:
        if obj['difficult'] == '0':
            bbox = obj['bndbox']
            objects.append({'box': [float(bbox['xmin']), float(bbox['ymin']),
                                    float(bbox['xmax']), float(bbox['ymax'])],
                             'label': int(class_to_idx[obj['name']])})

    target = objects

    return img, target

Define data augmentation transforms

train_transforms = transforms.Compose([
transforms.Resize((224, 224)),
transforms.RandomHorizontalFlip(),
transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2),
transforms.ToTensor(),
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

Create dataset and data loader

train_dataset = VOC2007Detection(root=‘./data/VOCdevkit’, transform=train_transforms)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn_transform)

Define Fast R-CNN model

class FastRCNN(torch.nn.Module):
def init(self, num_classes):
super(FastRCNN, self).init()

    # Load pre-trained VGG16 backbone
    self.backbone = torchvision.models.vgg16(pretrained=True).features

    # Replace VGG16 classifier with a ROI pooling layer
    self.classifier = torch.nn.Sequential(torch.nn.Linear(25088, 4096),
                                           torch.nn.ReLU(inplace=True),
                                           torch.nn.Dropout(),
                                           torch.nn.Linear(4096, 4096),
                                           torch.nn.ReLU(inplace=True),
                                           torch.nn.Dropout(),
                                           torch.nn.Linear(4096, num_classes))

    self.roi_pooling = torchvision.ops.RoIPool(output_size=(7, 7), spatial_scale=0.0625)

    self.bbox_reg = torch.nn.Linear(4096, num_classes * 4)

def forward(self, x, proposals):
    x = self.backbone(x)
    x = x.view(x.size(0), -1)
    x = self.classifier(x)

    # ROI pooling
    roi_feats = []
    for proposal in proposals:
        roi_feats.append(self.roi_pooling(x.unsqueeze(0), proposal))

    roi_feats = torch.cat(roi_feats, dim=0)

    # Bounding box regression
    bbox_reg = self.bbox_reg(roi_feats)
    bbox_reg = bbox_reg.view(-1, num_classes, 4)

    return x, roi_feats, bbox_reg

Define device

device = torch.device(‘cuda’ if torch.cuda.is_available() else ‘cpu’)

Define Fast R-CNN model

Extract class names from dataset

class_names = train_dataset.class_names

Create class-to-index dictionary

class_to_idx = {
“background”: 0,
“aeroplane”: 1,
“bicycle”: 2,
“bird”: 3,
“boat”: 4,
“bottle”: 5,
“bus”: 6,
“car”: 7,
“cat”: 8,
“chair”: 9,
“cow”: 10,
“diningtable”: 11,
“dog”: 12,
“horse”: 13,
“motorbike”: 14,
“person”: 15,
“pottedplant”: 16,
“sheep”: 17,
“sofa”: 18,
“train”: 19,
“tvmonitor”: 20
}

Define Fast R-CNN model

num_classes = 21
model = FastRCNN(num_classes=num_classes).to(device)

Define loss function and optimizer

criterion_cls = torch.nn.CrossEntropyLoss()
criterion_reg = torch.nn.SmoothL1Loss()
optimizer = SGD(model.parameters(), lr=0.001, momentum=0.9, weight_decay=0.0005)
scheduler = MultiStepLR(optimizer, milestones=[8, 10], gamma=0.1)

Train the model

num_epochs = 12

for epoch in range(num_epochs):
# Train
model.train()
train_loss_cls = 0
train_loss_reg = 0
num_batches = 0
for images, targets in tqdm(train_loader):
# images = images.to(device)
images = [image.to(device) for image in images]
targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
images = torch.stack(images)
# Generate region proposals
proposals = []
for target in targets:
# proposal = torchvision.ops.box.convert_boxes_to_roi(target[‘boxes’])
proposal = torchvision.ops.boxes.box_convert(target[‘boxes’], ‘xyxy’, ‘xywh’)
proposals.append(proposal)
# proposals = torch.stack(proposals)
proposals = pad_sequence(proposals, batch_first=True, padding_value=0)

    # Forward pass
    optimizer.zero_grad()
    _, roi_feats, bbox_reg = model(images, proposals)

    # Compute classification loss
    labels = torch.cat([t['labels'] for t in targets], dim=0)
    cls_scores = torch.cat([model.classifier(roi_feat.unsqueeze(0)) for roi_feat in roi_feats])
    loss_cls = criterion_cls(cls_scores, labels)

    # Compute bounding box regression loss
    gt_bbox_reg = torch.cat([t['boxes'] for t in targets], dim=0)
    gt_roi = torchvision.ops.box.convert_boxes_to_roi(gt_bbox_reg)
    mask = (labels > 0).float()
    pred_bbox_reg = model.bbox_regressor(bbox_reg, proposals)
    loss_reg = criterion_reg(pred_bbox_reg * mask.unsqueeze(-1), gt_roi * mask.unsqueeze(-1))

    # Compute total loss
    loss = loss_cls + loss_reg
    train_loss_cls += loss_cls.item()
    train_loss_reg += loss_reg.item()
    num_batches += 1

    # Backward pass and optimization
    loss.backward()
    optimizer.step()
print("Done ", epoch)