Create a neural network from scratch for object detection with a variable number of labels

Hello,
I’m trying to create a neural network from scratch for object detection in images. The problem is that I don’t have the same number of bounding boxes for each image: an image can have 0, 1, 2, or even more. So I pad the boxes in my collate function (padded_boxes below), but I don’t know how to mask the padding out when computing the loss.
Here’s my code:

import os

import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from PIL import Image
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms


class CustomDataset(Dataset):
    def __init__(self, data_dir, class_mapping, transform=None):
        self.data_dir = data_dir
        self.transform = transform
        self.image_dir = os.path.join(data_dir, 'images')
        self.label_dir = os.path.join(data_dir, 'labels')
        # Sort both lists so that image i and CSV i refer to the same file
        self.image_paths = sorted(os.path.join(self.image_dir, f) for f in os.listdir(self.image_dir) if f.endswith('.jpg'))
        self.csv_paths = sorted(os.path.join(self.label_dir, f) for f in os.listdir(self.label_dir) if f.endswith('.csv'))
        self.class_mapping = class_mapping

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image_path = self.image_paths[idx]
        csv_path = self.csv_paths[idx]
        
        image = Image.open(image_path).convert('RGB')
        try:
            csv_data = pd.read_csv(csv_path, header=None)  # one box per row: x1, y1, x2, y2, label
        except pd.errors.EmptyDataError:
            csv_data = pd.DataFrame()  # no annotations for this image

        bounding_boxes = []
        class_labels = []

        # Extract bounding box coordinates and class labels
        for i in range(len(csv_data)):
            label = csv_data.iloc[i, 4]
            if label in self.class_mapping:
                bounding_boxes.append(csv_data.iloc[i, :4].tolist())
                class_labels.append(self.class_mapping[label])

        if bounding_boxes:
            # Convert lists to tensors
            bounding_boxes = torch.tensor(bounding_boxes, dtype=torch.float32)
            class_labels = torch.tensor(class_labels, dtype=torch.long)
        else:
            # No valid boxes: return empty tensors so pad_collate pads them with -1
            bounding_boxes = torch.zeros((0, 4), dtype=torch.float32)
            class_labels = torch.zeros((0,), dtype=torch.long)
        if self.transform:
            image = self.transform(image)
        
        targets = {'boxes': bounding_boxes, 'labels': class_labels}
        return image, targets
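
# For context: __getitem__ returns a variable number of boxes per image.
# A hypothetical image with two annotated boxes would yield (made-up values):
#   targets['boxes']  -> tensor([[34., 12., 90., 77.],
#                                [ 5., 40., 60., 95.]])  # shape (num_boxes, 4)
#   targets['labels'] -> tensor([0, 2])                  # shape (num_boxes,)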

def pad_collate(batch):
    images, targets = zip(*batch)
    images = torch.stack(images, 0)
    
    # Pad bounding box tensors to have the same number of bounding boxes within each batch
    bounding_boxes = [target['boxes'] for target in targets]
    padded_boxes = pad_sequence(bounding_boxes, batch_first=True, padding_value=-1)
    
    # Pad labels similarly
    labels = [target['labels'] for target in targets]
    padded_labels = pad_sequence(labels, batch_first=True, padding_value=-1)
    
    return images, {'boxes': padded_boxes, 'labels': padded_labels}
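
# Sanity check of the padding on toy tensors (standalone; made-up coordinates).
# pad_sequence stacks per-image (num_boxes, 4) tensors into (batch, max_boxes, 4),
# filling the missing rows with the padding value -1:
_boxes_a = torch.tensor([[10., 20., 50., 60.]])  # image with 1 box
_boxes_b = torch.tensor([[0., 0., 5., 5.],
                         [1., 1., 9., 9.],
                         [2., 2., 8., 8.]])      # image with 3 boxes
_padded = pad_sequence([_boxes_a, _boxes_b], batch_first=True, padding_value=-1)
print(_padded.shape)  # torch.Size([2, 3, 4]); _boxes_a gets two rows of -1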

# Initialize the dataset and dataloader
transform = transforms.Compose([
    transforms.Resize((224, 224)), 
    transforms.ToTensor()
])

dataset = CustomDataset(data_dir=train_path, class_mapping=class_mapping, transform=transform)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=pad_collate)
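
# One batch from this loader then looks like this (shapes are examples;
# max_boxes is whatever the largest sample in that batch has):
# images, targets = next(iter(dataloader))
# images.shape            -> torch.Size([batch_size, 3, 224, 224])
# targets['boxes'].shape  -> torch.Size([batch_size, max_boxes, 4])
# targets['labels'].shape -> torch.Size([batch_size, max_boxes])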
# Initialize the model, loss function, and optimizer
model = ConvNet(num_classes=num_classes)
criterion = nn.CrossEntropyLoss()
bbox_criterion = nn.SmoothL1Loss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Training loop
for epoch in range(num_epochs):
    running_loss = 0.0
    for images, targets in dataloader:
        optimizer.zero_grad()
        
        # Forward pass
        class_output, bbox_output = model(images)
        
        # Unpack targets
        bounding_boxes = targets['boxes']
        class_labels = targets['labels']
        
        # Mask out padded values
        mask = (bounding_boxes != -1).any(dim=-1)  # True where a box is real, False where padded
        print(mask.shape)            # debugging: (batch, max_boxes)
        print(bounding_boxes.shape)  # debugging: (batch, max_boxes, 4)

        # Apply mask to bounding box output and class labels for each image in
        # the batch. This is where it fails: masked_select needs a mask that is
        # broadcastable to its input, and (batch, max_boxes) does not broadcast
        # to (batch, max_boxes, 4)
        masked_boxes = torch.masked_select(bounding_boxes, mask).view(-1, 4)
        masked_bbox_output = torch.masked_select(bbox_output, mask).view(-1, 4)
        masked_labels = torch.masked_select(class_labels, mask[:, :, :, :, 0])
        
        # Compute classification loss
        class_loss = criterion(class_output, masked_labels)
        
        # Compute bounding box regression loss
        bbox_loss = bbox_criterion(masked_bbox_output, masked_boxes)
        
        # Total loss
        loss = class_loss + bbox_loss
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(dataloader)}")

The error comes from the shape of the mask. I already tried reshaping it, for example with mask = mask.unsqueeze(-1).expand_as(bbox_output), but that didn’t work either. How can I mask out the padded boxes and labels before computing the losses?
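
For reference, this is the masking behaviour I’m trying to get, shown on toy tensors (a standalone sketch; I’m assuming here that bbox_output has shape (batch, max_boxes, 4), which may not match my actual ConvNet):

import torch

padded_boxes = torch.tensor([[[10., 20., 50., 60.],
                              [-1., -1., -1., -1.]],
                             [[ 0.,  0.,  5.,  5.],
                              [ 1.,  1.,  9.,  9.]]])  # (2, 2, 4)
padded_labels = torch.tensor([[0, -1],
                              [1,  2]])                # (2, 2)
bbox_output = torch.rand(2, 2, 4)                      # assumed model output shape

mask = padded_labels != -1      # (2, 2) boolean, True where a box is real
print(padded_boxes[mask])       # shape (3, 4): boolean indexing drops the padded row
print(bbox_output[mask].shape)  # torch.Size([3, 4])
print(padded_labels[mask])      # tensor([0, 1, 2])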