Hello,
I’m trying to create a neural network from scratch for object detection in images. The problem is I don’t have the same number of bounding boxes for each image. An image can have 0, 1, 2 or even more bounding boxes. So I tried to add padded_boxes.
But I don’t know how to mask them out.
Here’s my code :
class CustomDataset(Dataset):
def __init__(self, data_dir, class_mapping, transform=None):
self.data_dir = data_dir
self.transform = transform
self.image_dir = os.path.join(data_dir, 'images')
self.label_dir = os.path.join(data_dir, 'labels')
self.image_paths = [os.path.join(self.image_dir, filename) for filename in os.listdir(self.image_dir) if filename.endswith('.jpg')]
self.csv_paths = [os.path.join(self.label_dir, filename) for filename in os.listdir(self.label_dir) if filename.endswith('.csv')]
self.class_mapping = class_mapping
def __len__(self):
return len(self.image_paths)
def __getitem__(self, idx):
image_path = self.image_paths[idx]
csv_path = self.csv_paths[idx]
image = Image.open(image_path).convert('RGB')
try:
csv_data = pd.read_csv(csv_path, header=None) # Read CSV without header
if csv_data.empty:
raise ValueError("Empty CSV file")
bounding_boxes = []
class_labels = []
# Extract bounding box coordinates and class labels
for i in range(len(csv_data)):
label = csv_data.iloc[i, 4]
if label in self.class_mapping:
bounding_box = csv_data.iloc[i, :4].values
bounding_boxes.append(bounding_box)
class_labels.append(self.class_mapping[label])
# Convert lists to tensors
bounding_boxes = torch.tensor(bounding_boxes, dtype=torch.float32)
class_labels = torch.tensor(class_labels, dtype=torch.long)
except pd.errors.EmptyDataError:
# Handle empty CSV file
bounding_boxes = torch.zeros((1, 4), dtype=torch.float32)
class_labels = torch.tensor([-1], dtype=torch.long)
if self.transform:
image = self.transform(image)
targets = {'boxes': bounding_boxes, 'labels': class_labels}
return image, targets
def pad_collate(batch):
images, targets = zip(*batch)
images = torch.stack(images, 0)
# Pad bounding box tensors to have the same number of bounding boxes within each batch
bounding_boxes = [target['boxes'] for target in targets]
padded_boxes = pad_sequence(bounding_boxes, batch_first=True, padding_value=-1)
# Pad labels similarly
labels = [target['labels'] for target in targets]
padded_labels = pad_sequence(labels, batch_first=True, padding_value=-1)
return images, {'boxes': padded_boxes, 'labels': padded_labels}
# Initialize the dataset and dataloader
transform = transforms.Compose([
transforms.Resize((224, 224)),
transforms.ToTensor()
])
dataset = CustomDataset(data_dir=train_path, class_mapping=class_mapping, transform=transform)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=pad_collate)
# Initialize the model, loss function, and optimizer
model = ConvNet(num_classes=num_classes)
criterion = nn.CrossEntropyLoss()
bbox_criterion = nn.SmoothL1Loss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Training loop
for epoch in range(num_epochs):
running_loss = 0.0
for images, targets in dataloader:
optimizer.zero_grad()
# Forward pass
class_output, bbox_output = model(images)
# Unpack targets
bounding_boxes = targets['boxes']
class_labels = targets['labels']
# Mask out padded values
mask = (bounding_boxes != -1).any(dim=-1) # Create mask indicating valid (non-padded) values
print(mask.shape)
print(bounding_boxes.shape)
# Apply mask to bounding box output and class labels for each image in the batch
masked_boxes = torch.masked_select(bounding_boxes, mask).view(-1, 4)
masked_bbox_output = torch.masked_select(bbox_output, mask).view(-1, 4)
masked_labels = torch.masked_select(class_labels, mask[:, :, :, :, 0])
# Compute classification loss
class_loss = criterion(class_output, masked_labels)
# Compute bounding box regression loss
bbox_loss = bbox_criterion(masked_bbox_output, masked_boxes)
# Total loss
loss = class_loss + bbox_loss
# Backward pass and optimization
loss.backward()
optimizer.step()
running_loss += loss.item()
print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(dataloader)}")
The error comes from the shape of the mask. I already tried to reshape the mask with code such as mask = mask.unsqueeze(-1).expand_as(bbox_output)
. But it didn’t work.