Object Detection - RuntimeError: stack expects each tensor to be equal size

Hi,

I created a custom dataset class for object detection, named ReceiptDataset, as shown below:

import os

import cv2
import numpy as np
import torch


class ReceiptDataset(torch.utils.data.Dataset):
  def __init__(self, train_dir, width, height, labels, transforms=None):
    self.images = os.listdir(train_dir)
    self.width = width
    self.height = height
    self.train_dir = train_dir
    self.labels = labels
    self.transforms = transforms

  def __getitem__(self,idx):
    img_name = self.images[idx]
    img_path = os.path.join(self.train_dir,img_name)
    img = cv2.imread(img_path)
    img_res = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32)
    # interpolation has to be passed as a keyword argument; the third
    # positional argument of cv2.resize is dst, not the interpolation flag
    img_res = cv2.resize(img_res, (self.width, self.height), interpolation=cv2.INTER_AREA)

    annot = self.labels[str(img_name)]

    lbls = []
    boxes = []
    target = {}

    # img_res.shape is (height, width, channels)
    box_ht, box_wt, _ = img_res.shape
    print(f"img_res shape: {img_res.shape}")

    for item in annot:
      # each annotation is (x_center, y_center, width, height, label),
      # with coordinates normalized to [0, 1]
      x, y, wt, ht, lbl = item

      x_min = x - wt/2
      x_max = x + wt/2
      y_min = y - ht/2
      y_max = y + ht/2

      x_min_new = int(x_min * box_wt)
      x_max_new = int(x_max * box_wt)
      y_min_new = int(y_min * box_ht)
      y_max_new = int(y_max * box_ht)

      # torchvision detection models expect boxes as [x_min, y_min, x_max, y_max]
      boxes.append([x_min_new, y_min_new, x_max_new, y_max_new])

      # classes is the global list of the 17 class names
      lbls.append(classes.index(str(lbl)))

    print(f"dls_lbls: {lbls}")

    # boxes should be float32 for the detection model; labels stay int64
    boxes = torch.as_tensor(boxes, dtype=torch.float32)
    lbls = torch.as_tensor(lbls, dtype=torch.int64)

    target["boxes"]  = boxes
    target["labels"] = lbls
    target["image_id"] = torch.as_tensor(idx)

    if self.transforms:
      # assumes self.transforms is an Albumentations Compose whose
      # bbox_params declare label_fields=["labels"]
      trans = self.transforms(image=img_res,
                              bboxes=target["boxes"],
                              labels=lbls)
      img_res = trans["image"]
      target["boxes"] = torch.Tensor(trans["bboxes"])
      # keep labels in sync in case the transform drops any boxes
      target["labels"] = torch.as_tensor(trans["labels"], dtype=torch.int64)


    return img_res, target


  def __len__(self):
    return len(self.images)

and I created an instance with:

train_dataset = ReceiptDataset("label-detector/images",width,height,plabels)
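
For completeness, the data loaders are built roughly like this (the batch size is just a placeholder; so far I haven't passed a custom collate_fn, so the default collation is used, and test_loader is created the same way from the test split):

train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=4,   # placeholder value
    shuffle=True,
)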

My training snippet is:

from engine import train_one_epoch, evaluate

for epoch in range(num_epochs):
  train_one_epoch(model,optim,train_loader,device,epoch,print_freq=2)

  lr_scheduler.step()

  evaluate(model,test_loader,device)

but whenever I run the training loop, I get a runtime error:

RuntimeError: stack expects each tensor to be equal size, but got [11,4] at entry 0 and [9,4] at entry 1

There are 17 classes in total, and each image has at least 4 annotations.
I noticed the problem seems to come from the labels/boxes tensors built in the dataset class: their first dimension depends on the number of annotated items in an image, so it varies from sample to sample, but I can't figure out a way to fix this.
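
If I'm reading the error right, the default collate_fn tries to torch.stack the per-image "boxes" tensors, which only works when every image in the batch has the same number of boxes. A minimal sketch of what I think is going on (the shapes are taken from the error message above):

import torch

boxes_img0 = torch.zeros(11, 4)  # image with 11 annotations
boxes_img1 = torch.zeros(9, 4)   # image with 9 annotations

# roughly what the default collation does with the "boxes" field of each target
torch.stack([boxes_img0, boxes_img1])
# RuntimeError: stack expects each tensor to be equal size ...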

Thanks!