Hello everyone,
I am struggle to detect the object but facing the issue [AssertionError: The boxes tensor shape is not correct as Tensor[K, 5]]
Below is my code? Can you help me to fix? I am not understanding.
import torch
import torchvision
from torchvision import transforms
from torch.utils.data import DataLoader
from torch.optim import SGD
from torch.optim.lr_scheduler import MultiStepLR
from torch.nn import functional as F
from tqdm import tqdm
import numpy as np
import xml.etree.ElementTree as ET
from sklearn.metrics import average_precision_score
from torch.nn.utils.rnn import pad_sequence
class_to_idx = {
“background”: 0,
“aeroplane”: 1,
“bicycle”: 2,
“bird”: 3,
“boat”: 4,
“bottle”: 5,
“bus”: 6,
“car”: 7,
“cat”: 8,
“chair”: 9,
“cow”: 10,
“diningtable”: 11,
“dog”: 12,
“horse”: 13,
“motorbike”: 14,
“person”: 15,
“pottedplant”: 16,
“sheep”: 17,
“sofa”: 18,
“train”: 19,
“tvmonitor”: 20
}
def collate_fn_transform(batch):
images = []
targets = []
for img, target in batch:
images.append(img)
boxes_list = []
labels_list = []
for obj in target:
boxes = obj['box']
labels = obj['label']
print(type(labels))
boxes_list.append(torch.tensor(boxes, dtype=torch.float32))
labels_list.append(torch.tensor(labels, dtype=torch.int64))
boxes = torch.stack(boxes_list)
labels = torch.stack(labels_list)
targets.append({
'boxes': boxes,
'labels': labels
})
return images, targets
def collate_fn_transform(batch):
class_to_idx = {
“background”: 0,
“aeroplane”: 1,
“bicycle”: 2,
“bird”: 3,
“boat”: 4,
“bottle”: 5,
“bus”: 6,
“car”: 7,
“cat”: 8,
“chair”: 9,
“cow”: 10,
“diningtable”: 11,
“dog”: 12,
“horse”: 13,
“motorbike”: 14,
“person”: 15,
“pottedplant”: 16,
“sheep”: 17,
“sofa”: 18,
“train”: 19,
“tvmonitor”: 20
}
images = []
targets = []
for img, target in batch:
images.append(img)
targets.append({‘boxes’: torch.tensor(target[‘boxes’], dtype=torch.float32),
‘labels’: torch.tensor(target[‘labels’], dtype=torch.int64)})
return images, targets
class_to_idx = {
“background”: 0,
“aeroplane”: 1,
“bicycle”: 2,
“bird”: 3,
“boat”: 4,
“bottle”: 5,
“bus”: 6,
“car”: 7,
“cat”: 8,
“chair”: 9,
“cow”: 10,
“diningtable”: 11,
“dog”: 12,
“horse”: 13,
“motorbike”: 14,
“person”: 15,
“pottedplant”: 16,
“sheep”: 17,
“sofa”: 18,
“train”: 19,
“tvmonitor”: 20
}
Define dataset class for VOC2007 detection
class VOC2007Detection(torchvision.datasets.VOCDetection):
“”“VOC 2007 detection dataset.”“”
def __init__(self, root, transform=None, target_transform=None, transforms=None):
super(VOC2007Detection, self).__init__(root, year='2007', image_set='trainval',
download=True, transform=transform,
target_transform=target_transform, transforms=transforms)
def __getitem__(self, index):
img, target = super(VOC2007Detection, self).__getitem__(index)
# Convert target to tensor
objects = []
for obj in target['annotation']['object']:
if obj['difficult'] == '0':
bbox = obj['bndbox']
objects.append({'box': [float(bbox['xmin']), float(bbox['ymin']),
float(bbox['xmax']), float(bbox['ymax'])],
'label': int(class_to_idx[obj['name']])})
target = objects
return img, target
Define data augmentation transforms
train_transforms = transforms.Compose([
transforms.Resize((224, 224)),
transforms.RandomHorizontalFlip(),
transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2),
transforms.ToTensor(),
transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
Create dataset and data loader
train_dataset = VOC2007Detection(root=‘./data/VOCdevkit’, transform=train_transforms)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn_transform)
Define Fast R-CNN model
class FastRCNN(torch.nn.Module):
def init(self, num_classes):
super(FastRCNN, self).init()
# Load pre-trained VGG16 backbone
self.backbone = torchvision.models.vgg16(pretrained=True).features
# Replace VGG16 classifier with a ROI pooling layer
self.classifier = torch.nn.Sequential(torch.nn.Linear(25088, 4096),
torch.nn.ReLU(inplace=True),
torch.nn.Dropout(),
torch.nn.Linear(4096, 4096),
torch.nn.ReLU(inplace=True),
torch.nn.Dropout(),
torch.nn.Linear(4096, num_classes))
self.roi_pooling = torchvision.ops.RoIPool(output_size=(7, 7), spatial_scale=0.0625)
self.bbox_reg = torch.nn.Linear(4096, num_classes * 4)
def forward(self, x, proposals):
x = self.backbone(x)
x = x.view(x.size(0), -1)
x = self.classifier(x)
# ROI pooling
roi_feats = []
for proposal in proposals:
roi_feats.append(self.roi_pooling(x.unsqueeze(0), proposal))
roi_feats = torch.cat(roi_feats, dim=0)
# Bounding box regression
bbox_reg = self.bbox_reg(roi_feats)
bbox_reg = bbox_reg.view(-1, num_classes, 4)
return x, roi_feats, bbox_reg
Define device
device = torch.device(‘cuda’ if torch.cuda.is_available() else ‘cpu’)
Define Fast R-CNN model
Extract class names from dataset
class_names = train_dataset.class_names
Create class-to-index dictionary
class_to_idx = {
“background”: 0,
“aeroplane”: 1,
“bicycle”: 2,
“bird”: 3,
“boat”: 4,
“bottle”: 5,
“bus”: 6,
“car”: 7,
“cat”: 8,
“chair”: 9,
“cow”: 10,
“diningtable”: 11,
“dog”: 12,
“horse”: 13,
“motorbike”: 14,
“person”: 15,
“pottedplant”: 16,
“sheep”: 17,
“sofa”: 18,
“train”: 19,
“tvmonitor”: 20
}
Define Fast R-CNN model
num_classes = 21
model = FastRCNN(num_classes=num_classes).to(device)
Define loss function and optimizer
criterion_cls = torch.nn.CrossEntropyLoss()
criterion_reg = torch.nn.SmoothL1Loss()
optimizer = SGD(model.parameters(), lr=0.001, momentum=0.9, weight_decay=0.0005)
scheduler = MultiStepLR(optimizer, milestones=[8, 10], gamma=0.1)
Train the model
num_epochs = 12
for epoch in range(num_epochs):
# Train
model.train()
train_loss_cls = 0
train_loss_reg = 0
num_batches = 0
for images, targets in tqdm(train_loader):
# images = images.to(device)
images = [image.to(device) for image in images]
targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
images = torch.stack(images)
# Generate region proposals
proposals = []
for target in targets:
# proposal = torchvision.ops.box.convert_boxes_to_roi(target[‘boxes’])
proposal = torchvision.ops.boxes.box_convert(target[‘boxes’], ‘xyxy’, ‘xywh’)
proposals.append(proposal)
# proposals = torch.stack(proposals)
proposals = pad_sequence(proposals, batch_first=True, padding_value=0)
# Forward pass
optimizer.zero_grad()
_, roi_feats, bbox_reg = model(images, proposals)
# Compute classification loss
labels = torch.cat([t['labels'] for t in targets], dim=0)
cls_scores = torch.cat([model.classifier(roi_feat.unsqueeze(0)) for roi_feat in roi_feats])
loss_cls = criterion_cls(cls_scores, labels)
# Compute bounding box regression loss
gt_bbox_reg = torch.cat([t['boxes'] for t in targets], dim=0)
gt_roi = torchvision.ops.box.convert_boxes_to_roi(gt_bbox_reg)
mask = (labels > 0).float()
pred_bbox_reg = model.bbox_regressor(bbox_reg, proposals)
loss_reg = criterion_reg(pred_bbox_reg * mask.unsqueeze(-1), gt_roi * mask.unsqueeze(-1))
# Compute total loss
loss = loss_cls + loss_reg
train_loss_cls += loss_cls.item()
train_loss_reg += loss_reg.item()
num_batches += 1
# Backward pass and optimization
loss.backward()
optimizer.step()
print("Done ", epoch)