RuntimeError: stack expects each tensor to be equal size, but got [1, 4] at entry 0 and [1, 8] at entry 1

I obtained the images by crawling Google, and I have the (x_min, y_min, x_max, y_max) coordinates of the bounding boxes, which I drew myself, saved as txt files.

I'm currently trying to create a custom Dataset, but I'm getting an error.

My code:
import os
import re

import torch
from PIL import Image
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, img_dir, label_dir, transform=None):
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.transform = transform
        self.img_file_list, self.txt_file_list = self.load_file_lists()

    def __len__(self):
        return len(self.img_file_list)

    def __getitem__(self, idx):
        img_name = os.path.join(self.img_dir, self.img_file_list[idx])
        label_name = os.path.join(self.label_dir, self.txt_file_list[idx])

        # Load image and convert to RGB
        image = Image.open(img_name).convert("RGB")

        # Load label
        with open(label_name, 'r') as file:
            label_content = file.read()

        # Convert labels
        transformed_label = self.transform_label(label_content, image.size, (300, 300))
        print("trans : ", transformed_label.shape, transformed_label)

        # Apply transformations if specified
        if self.transform:
            image = self.transform(image)
        print(image.shape)

        return image, transformed_label

    def load_file_lists(self):
        # Pair every image with its label file, sorted by the number in the filename
        img_file_list = []
        txt_file_list = []

        for filename in sorted(os.listdir(self.img_dir), key=lambda x: int(re.search(r'\d+', x).group())):
            img_file_path = os.path.join(self.img_dir, filename)
            txt_file_path = os.path.join(self.label_dir, filename.replace('.jpg', '.txt'))

            if os.path.isfile(img_file_path) and os.path.isfile(txt_file_path):
                img_file_list.append(os.path.basename(img_file_path))
                txt_file_list.append(os.path.basename(txt_file_path))

        return img_file_list, txt_file_list

    def transform_label(self, label_content, original_img_size, transformed_img_size):
        values = [list(map(int, match.groups())) for match in re.finditer(r'(\d+)\s+(\d+)\s+(\d+)\s+(\d+)', label_content)]

        transformed_labels = []

        for xmin, ymin, xmax, ymax in values:
            # Scale coordinates to the resized image, then convert (xmin, ymin, xmax, ymax) -> (xc, yc, w, h)
            xmin = int(xmin * transformed_img_size[0] / original_img_size[0])
            ymin = int(ymin * transformed_img_size[1] / original_img_size[1])
            xmax = int(xmax * transformed_img_size[0] / original_img_size[0])
            ymax = int(ymax * transformed_img_size[1] / original_img_size[1])

            xc = (xmin + xmax) / 2
            yc = (ymin + ymax) / 2
            w = xmax - xmin
            h = ymax - ymin

            # Normalize coordinates to [0, 1]
            xc /= transformed_img_size[0]
            yc /= transformed_img_size[1]
            w /= transformed_img_size[0]
            h /= transformed_img_size[1]

            transformed_labels.append([xc, yc, w, h])

        transformed_labels = torch.tensor(transformed_labels)

        if len(transformed_labels) > 1:
            # NOTE: this flattens N boxes into shape (1, 4*N), so samples with a
            # different number of boxes produce tensors of different widths
            transformed_labels = transformed_labels.view(1, -1)
        print(len(transformed_labels))

        return transformed_labels

An image may have one or multiple bounding boxes. All images are resized to (300, 300).
My guess is that the error occurs when batching the coordinates of multiple bounding boxes for a single image.
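
For reference, the default collate function torch.stack-s the per-sample label tensors, which fails as soon as two samples carry a different number of boxes. A minimal repro of the error above:

import torch

one_box = torch.randn(1, 4)    # label for an image with one box
two_boxes = torch.randn(1, 8)  # two boxes, flattened by view(1, -1)

torch.stack([one_box, two_boxes])
# RuntimeError: stack expects each tensor to be equal size,
# but got [1, 4] at entry 0 and [1, 8] at entry 1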

You are right. I think the solution would be to follow the standard used by YOLOv5:
1) Have a fixed number of detection slots for each image.
2) Use this format for each detection label (see the sketch below):
present_or_not (1 or 0), x_center, y_center, width, height, probability_of_class1, probability_of_class2, …
3) Whether a frame has 2 detections or 0, the size of the target label and the model output tensor does not change; only the presence column does.
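
For illustration, here is a minimal sketch of such a fixed-size target, assuming a hypothetical pad_labels helper with max_boxes=10 and two classes (these names and numbers are my assumptions, not YOLOv5's actual code):

import torch

def pad_labels(boxes, max_boxes=10, num_classes=2):
    # Pad an (N, 4) tensor of (xc, yc, w, h) boxes to a fixed-size
    # (max_boxes, 5 + num_classes) target: [present, xc, yc, w, h, class probs...]
    target = torch.zeros(max_boxes, 5 + num_classes)
    n = min(len(boxes), max_boxes)
    if n > 0:
        target[:n, 0] = 1.0          # presence flag
        target[:n, 1:5] = boxes[:n]  # normalized box coordinates
    return target

# Every sample now has the same shape, so the default collate can stack them:
a = pad_labels(torch.rand(1, 4))  # one box
b = pad_labels(torch.rand(2, 4))  # two boxes
print(torch.stack([a, b]).shape)  # torch.Size([2, 10, 7])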

Problem solved. I just applied the following change to the batch format.

import torchvision.transforms as tr
from torch.utils.data import DataLoader

def collate_fn(batch):
    # Keep per-sample tensors as-is instead of stacking them into one tensor
    return tuple(zip(*batch))

transform = tr.Compose([tr.Resize((300, 300)),
                        tr.ToTensor(),  # ToTensor also normalizes pixel values to [0, 1]
                        ])

dataset = CustomDataset(img_dir="./Dataset/drones", label_dir="./Dataset/labels", transform=transform)

Create the DataLoader:

dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)

Check a batch:

for batch_idx, (images, labels) in enumerate(dataloader):
    pass
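
With this collate_fn, images and labels come back as tuples of per-sample tensors rather than stacked batch tensors, so every sample can keep its own number of boxes. If the model needs a single batched image tensor, the images can be stacked manually, since they all share the (300, 300) size after Resize. A minimal sketch of the loop body:

for batch_idx, (images, labels) in enumerate(dataloader):
    image_batch = torch.stack(images)  # shape (4, 3, 300, 300); safe because all images are resized alike
    for label in labels:
        # each label keeps its own shape, e.g. (1, 4) for one box, (1, 8) for two
        print(label.shape)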