RuntimeError: The size of tensor a (2) must match the size of tensor b (3) at non-singleton dimension 0

I did some conversion between PIL and OpenCV images in order to use the Albumentations library. However, I got this error:
return (image - mean[:, None, None]) / std[:, None, None]
RuntimeError: The size of tensor a (2) must match the size of tensor b (3) at non-singleton dimension 0

Traceback (most recent call last):
  File "train_detector.py", line 357, in <module>
    main(args)
  File "train_detector.py", line 335, in main
    engine(args.num_epochs)
  File "/home/students/s121md105_02/package-to-linux/pocket/pocket/core/engines.py", line 168, in __call__
    self._on_each_iteration()
  File "train_detector.py", line 44, in _on_each_iteration
    self._state.output = self._state.net(*self._state.inputs, targets=self._state.targets)
  File "/home/students/s121md105_02/anaconda3/envs/pocket/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/students/s121md105_02/anaconda3/envs/pocket/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 168, in forward
    outputs = self.parallel_apply(replicas, inputs, kwargs)
  File "/home/students/s121md105_02/anaconda3/envs/pocket/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 178, in parallel_apply
    return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
  File "/home/students/s121md105_02/anaconda3/envs/pocket/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 86, in parallel_apply
    output.reraise()
  File "/home/students/s121md105_02/anaconda3/envs/pocket/lib/python3.8/site-packages/torch/_utils.py", line 425, in reraise
    raise self.exc_type(msg)
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/students/s121md105_02/anaconda3/envs/pocket/lib/python3.8/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
    output = module(*input, **kwargs)
  File "/home/students/s121md105_02/anaconda3/envs/pocket/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/students/s121md105_02/anaconda3/envs/pocket/lib/python3.8/site-packages/torchvision/models/detection/generalized_rcnn.py", line 77, in forward
    images, targets = self.transform(images, targets)
  File "/home/students/s121md105_02/anaconda3/envs/pocket/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/students/s121md105_02/anaconda3/envs/pocket/lib/python3.8/site-packages/torchvision/models/detection/transform.py", line 114, in forward
    image = self.normalize(image)
  File "/home/students/s121md105_02/anaconda3/envs/pocket/lib/python3.8/site-packages/torchvision/models/detection/transform.py", line 140, in normalize
    return (image - mean[:, None, None]) / std[:, None, None]
RuntimeError: The size of tensor a (2) must match the size of tensor b (3) at non-singleton dimension 0
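
From the traceback, the failing line is torchvision's per-channel normalization inside GeneralizedRCNNTransform, so dimension 0 here is the channel dimension of the image. A toy sketch (made-up shapes, not my actual data) that triggers the same error:

import torch

# GeneralizedRCNNTransform.normalize effectively computes
#     (image - mean[:, None, None]) / std[:, None, None]
# with a per-channel mean/std of length 3 (the ImageNet statistics).
mean = torch.tensor([0.485, 0.456, 0.406])
std = torch.tensor([0.229, 0.224, 0.225])

image = torch.rand(2, 480, 640)  # an image tensor with 2 channels instead of 3

# Broadcasting the [3, 1, 1] mean against the [2, H, W] image raises:
# RuntimeError: The size of tensor a (2) must match the size of tensor b (3)
# at non-singleton dimension 0
out = (image - mean[:, None, None]) / std[:, None, None]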

I printed the image data passed to normalize, and it looked something like this:

tensor([[[0.5804, 0.5725, 0.5686,  ..., 0.8431, 0.8471, 0.8431],
         [0.5686, 0.5686, 0.5686,  ..., 0.8392, 0.8392, 0.8392],
         [0.5686, 0.5686, 0.5686,  ..., 0.8353, 0.8353, 0.8314],
         ...,
         [0.4275, 0.4275, 0.4314,  ..., 0.2471, 0.2431, 0.2431],
         [0.4314, 0.4275, 0.4196,  ..., 0.3137, 0.2745, 0.2549],
         [0.4392, 0.4392, 0.4275,  ..., 0.3882, 0.3451, 0.3176]],

        [[0.4941, 0.4863, 0.4824,  ..., 0.2980, 0.2902, 0.2863],
         [0.4824, 0.4824, 0.4824,  ..., 0.2980, 0.2941, 0.2941],
         [0.4824, 0.4824, 0.4824,  ..., 0.3020, 0.3020, 0.2980],
         ...,
         [0.4000, 0.4000, 0.4039,  ..., 0.0118, 0.0118, 0.0235],
         [0.4039, 0.4000, 0.3922,  ..., 0.0353, 0.0157, 0.0118],
         [0.4118, 0.4118, 0.4000,  ..., 0.0745, 0.0510, 0.0392]]],
       device='cuda:0')
tensor([[[0.9216, 0.9098, 0.9020,  ..., 0.8275, 0.8353, 0.8588],
         [0.9098, 0.9098, 0.9176,  ..., 0.8549, 0.8510, 0.8588],
         [0.9059, 0.9059, 0.9098,  ..., 0.8588, 0.8510, 0.8392],
         ...,
         [0.7176, 0.7137, 0.7137,  ..., 0.5804, 0.5843, 0.5922],
         [0.7255, 0.7255, 0.7255,  ..., 0.5686, 0.5765, 0.5804],
         [0.7137, 0.7137, 0.7137,  ..., 0.5804, 0.5843, 0.5804]]],
       device='cuda:1')

Does this mean that my image input has 2 channels while the mean/std used in normalize expect 3? I don't understand why my image input would only have 2 channels.
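
If that reading is correct, some images must already have fewer than three channels before they reach the detector. A quick check I could run over the raw files (illustrative only; image_paths is a placeholder for however the image files are listed):

from PIL import Image

# Report any image whose PIL mode is not RGB. A mode of 'L' gives a
# 1-channel tensor and 'LA' (grayscale + alpha) gives 2 channels, which
# would match the tensors printed above.
for path in image_paths:  # placeholder: list of image file paths
    mode = Image.open(path).mode
    if mode != 'RGB':
        print(path, mode)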

Hello, please provide sample code to reproduce the error so we can see what's going wrong.

class DetectorEngine(pocket.core.LearningEngine):
    def __init__(self, net, train_loader, val_loader, **kwargs):
        super().__init__(net, None, train_loader, **kwargs)
        self._val_loader = val_loader
        self.timer = pocket.utils.HandyTimer(1)

    def _on_each_iteration(self):
        self._state.output = self._state.net(*self._state.inputs, targets=self._state.targets)
        self._state.loss = sum(loss for loss in self._state.output.values())
        self._state.optimizer.zero_grad()
        self._state.loss.backward()
        self._state.optimizer.step()

    def _on_end_epoch(self):
        with self.timer:
            ap, max_rec = self.validate()
            print(ap)
            ap = ap[80:]
            max_rec = max_rec[80:]
        print("\n=> Validation (+{:.2f})\n"
            "Epoch: {} | mAP: {:.4f}, mRec: {:.4f} | Time: {:.2f}s\n".format(
                time.time() - self._dawn, self._state.epoch,
                ap.mean().item(), max_rec.mean().item(), self.timer[0]
            ))
        print(ap)
        super()._on_end_epoch()

    @torch.no_grad()
    def validate(self, min_iou=0.5, nms_thresh=0.5):
        num_gt = torch.zeros(88)
        associate = pocket.utils.BoxAssociation(min_iou=min_iou)
        # update detection class from 80 to x, where x is the number of new classes
        meter = pocket.utils.DetectionAPMeter(
            88, algorithm='INT', nproc=10
        )
        self._state.net.eval()
        for batch in tqdm(self._val_loader):
            inputs = pocket.ops.relocate_to_cuda(batch[0])
            output = self._state.net(inputs)
            assert len(output) == 1, "The batch size should be one"
            # Relocate back to cpu
            output = pocket.ops.relocate_to_cpu(output[0])
            target = batch[1][0]
            # Do NMS on ground truth boxes
            # NOTE This is because certain objects appear multiple times in
            # different pairs and different interactions
            keep_gt_idx = torchvision.ops.boxes.batched_nms(
                target['boxes'], torch.ones_like(target['labels']).float(),
                target['labels'], nms_thresh
            )

            gt_boxes = target['boxes'][keep_gt_idx].view(-1, 4)
            gt_classes = target['labels'][keep_gt_idx].view(-1)
            # Update the number of ground truth instances
            # Convert the object index to zero based
            for c in gt_classes:
                num_gt[c - 1] += 1
            # Associate detections with ground truth
            binary_labels = torch.zeros_like(output['scores'])
            unique_obj = output['labels'].unique()
            for obj_idx in unique_obj:
                det_idx = torch.nonzero(output['labels'] == obj_idx).squeeze(1)
                gt_idx = torch.nonzero(gt_classes == obj_idx).squeeze(1)
                if len(gt_idx) == 0:
                    continue
                binary_labels[det_idx] = associate(
                    gt_boxes[gt_idx].view(-1, 4),
                    output['boxes'][det_idx].view(-1, 4),
                    output['scores'][det_idx].view(-1)
                )
            meter.append(output['scores'], output['labels'] - 1, binary_labels)

        meter.num_gt = num_gt.tolist()
        ap = meter.eval()
        return ap, meter.max_rec
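
For completeness, this is roughly how the engine is driven (reconstructed from the traceback above; detector, train_loader, val_loader and args are placeholders, not my exact code):

# Rough reconstruction with placeholder names: the detector is wrapped in
# nn.DataParallel (hence "Caught RuntimeError in replica 0 on device 0" in
# the traceback) and the engine is called with the number of epochs.
net = torch.nn.DataParallel(detector).cuda()
engine = DetectorEngine(net, train_loader, val_loader)
engine(args.num_epochs)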

class HICODetObject(Dataset):
    def __init__(self, dataset, data_root, nms_thresh=0.5, random_flip=False):
        self.dataset = dataset
        self.nms_thresh = nms_thresh
        with open(os.path.join(data_root, 'coco80tohico80.json'), 'r') as f:
            corr = json.load(f)
        self.hico2coco91 = dict(zip(corr.values(), corr.keys()))
        # self.transform = RandomHorizontalFlip() if random_flip else None

        # use Albumentations to transform both the image and the bboxes
        # note that 1. Albumentations takes openCV image inputs, 2. the annotation used
        # here is [x1, y1, x2, y2], which is the pascal_voc format
        # now input openCV data for the transform
        self.transform = A.Compose([
            # A.RandomCrop(width=128, height=128),
            A.HorizontalFlip(p=0.5),
            ],
            bbox_params=A.BboxParams(format='pascal_voc', label_fields=['category_ids'])) if random_flip else None

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        image, target = self.dataset[idx]
        boxes = torch.cat([
            target['boxes_h'],
            target['boxes_o']
        ])
        # print(boxes)
        # Convert ground truth boxes to zero-based index and the
        # representation from pixel indices to coordinates
        labels = torch.cat([
            49 * torch.ones_like(target['object']),
            target['object']
        ])
        # Convert HICODet object indices to COCO indices
        converted_labels = torch.tensor([int(self.hico2coco91[i.item()]) for i in labels])

        # convert PIL image to openCV (RGB)
        openCV_img = np.array(image)
        # Apply transform
        if self.transform is not None:
            # pocket default
            # image, boxes = self.transform(image, boxes)
            # Albumentations
            transformed = self.transform(image=openCV_img, bboxes=boxes, category_ids=labels)
            # bboxes from numpy array to a Tensor
            openCV_img = transformed['image']
            boxes = transformed['bboxes']
            boxes = torch.Tensor(boxes)
            boxes[:, :2] -= 1
            # print(boxes)
            # openCV to PIL
            image = Image.fromarray(openCV_img)

        image = to_tensor(image, input_format='pil')
        return [image], [dict(boxes=boxes, labels=converted_labels)]
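
One thing I might try (my own assumption, not something confirmed yet): some images could be grayscale ('L') or grayscale plus alpha ('LA'), and neither ends up with 3 channels by the time to_tensor runs, which would explain the 1- and 2-channel tensors printed above. Forcing RGB before the conversion would be a minimal change at the start of the PIL-to-OpenCV step in __getitem__:

# Sketch only: Image.convert('RGB') is a no-op for images that are already
# RGB, but expands 'L'/'LA' images to 3 channels before they reach
# Albumentations and, later, the detector's normalize step.
image = image.convert('RGB')
openCV_img = np.array(image)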