PyTorch RuntimeError - The size of tensor a (5) must match the size of tensor b (3) at non-singleton dimension 0

I am trying to train a Faster R-CNN network on a custom image dataset for object detection. However, I don't want to give an RGB image directly as input; instead, I pass the RGB image together with the corresponding thermal image through another network (a feature extractor) and feed the extracted features to the Faster R-CNN network. The feature extractor stacks the two images into a 4-channel tensor and outputs a 5-channel tensor, and it is this 5-channel tensor that I want to use as the input to Faster R-CNN.
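
For context, here is roughly how the feature extractor is used (a minimal sketch with a made-up layer; FeatureExtractor is just a stand-in for my own network, which maps the 4 stacked input channels to 5 output channels of the same spatial size):

import torch
import torch.nn as nn

class FeatureExtractor(nn.Module):
    # Stand-in for my custom feature extractor:
    # 4-channel input (RGB + thermal) -> 5-channel output, same spatial size.
    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(4, 5, kernel_size=3, padding=1)

    def forward(self, x):
        return self.conv(x)

rgb = torch.rand(1, 3, 640, 512)       # RGB image
thermal = torch.rand(1, 1, 640, 512)   # thermal image
stacked = torch.cat((rgb, thermal), dim=1)   # [1, 4, 640, 512]
features = FeatureExtractor()(stacked)       # [1, 5, 640, 512]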

I followed the PyTorch docs for Object Detection Finetuning (link here) and came up with the following code to suit my dataset.

class CustomDataset(torch.utils.data.Dataset):
    
    def __getitem__(self, idx):
        self.num_classes = 5
        img_rgb_path = os.path.join(self.root, "rgb/", self.rgb_imgs[idx])
        img_thermal_path = os.path.join(self.root, "thermal/", self.thermal_imgs[idx])
        

        img_rgb = Image.open(img_rgb_path)
        img_rgb = np.array(img_rgb)
        x_rgb = TF.to_tensor(img_rgb)
        x_rgb.unsqueeze_(0)

        img_thermal = Image.open(img_thermal_path)
        img_thermal = np.array(img_thermal)
        img_thermal = np.expand_dims(img_thermal,-1)
        x_th = TF.to_tensor(img_thermal)
        x_th.unsqueeze_(0)       

        print(x_rgb.shape)  # shape of [3,640,512]
        print(x_th.shape) # shape of [1,640,512]

        input = torch.cat((x_rgb,x_th),dim=1) # shape of [4,640,512]

        
        img = self.feature_extractor(input) # my custom feature extractor, which returns a 5-channel tensor
        
        print(img.shape) # shape of [5,640,512]
    
        filename = os.path.join(self.root,'annotations',self.annotations[idx])
        tree = ET.parse(filename)
        objs = tree.findall('object')

        num_objs = len(objs)
        labels = np.zeros((num_objs), dtype=np.float32)
        seg_areas = np.zeros((num_objs), dtype=np.float32)
        
        boxes = []
        for ix, obj in enumerate(objs):
            bbox = obj.find('bndbox')
            x1 = float(bbox.find('xmin').text)
            y1 = float(bbox.find('ymin').text)
            x2 = float(bbox.find('xmax').text)
            y2 = float(bbox.find('ymax').text)

            cls = self._class_to_ind[obj.find('name').text.lower().strip()]
            boxes.append([x1, y1, x2, y2])
            labels[ix] = cls
            seg_areas[ix] = (x2 - x1 + 1) * (y2 - y1 + 1)
        
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        seg_areas = torch.as_tensor(seg_areas, dtype=torch.float32)
        labels = torch.as_tensor(labels, dtype=torch.float32)
    
        target =  {'boxes': boxes,
                'labels': labels,
                'seg_areas': seg_areas,
                }
        
        return img,target

The code of my main function is as follows:

import utils

backbone = torchvision.models.mobilenet_v2(pretrained=True).features
backbone.out_channels = 1280

anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512),),
                                   aspect_ratios=((0.5, 1.0, 2.0),))

roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=[0],
                                                output_size=7,
                                                sampling_ratio=2)
num_classes = 5

model = FasterRCNN(backbone=backbone,
                   num_classes=num_classes,
                   rpn_anchor_generator=anchor_generator,
                   box_roi_pool=roi_pooler)

dataset = CustomDataset('train_folder/')
data_loader_train = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True,collate_fn=utils.collate_fn)

train_model(model, criterion, data_loader_train, num_epochs=10)

The collate_fn defined in utils.py is the following:

def collate_fn(batch):
    return tuple(zip(*batch))
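
It simply transposes the batch: a list of (image, target) pairs becomes a tuple of images and a tuple of targets. For example (img1, target1, etc. are just placeholders):

batch = [(img1, target1), (img2, target2)]
images, targets = collate_fn(batch)
# images  == (img1, img2)
# targets == (target1, target2)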

However, I get the following error while training:

Traceback (most recent call last):
  File "train.py", line 147, in <module>
    train_model(model, criterion, data_loader_train, num_epochs)
  File "train.py", line 58, in train_model
    outputs = model(inputs, labels)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/torchvision/models/detection/generalized_rcnn.py", line 66, in forward
    images, targets = self.transform(images, targets)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/torchvision/models/detection/transform.py", line 46, in forward
    image = self.normalize(image)
  File "/usr/local/lib/python3.6/dist-packages/torchvision/models/detection/transform.py", line 66, in normalize
    return (image - mean[:, None, None]) / std[:, None, None]
RuntimeError: The size of tensor a (5) must match the size of tensor b (3) at non-singleton dimension 0
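
As far as I can tell, the error comes from torchvision's internal GeneralizedRCNNTransform, which normalizes each input image with a per-channel mean and std that by default have only three entries (the usual ImageNet RGB statistics), so they cannot broadcast against my 5-channel tensor. The following minimal snippet (my own illustration, not the library code) reproduces the same failure:

import torch

image = torch.rand(5, 640, 512)             # my 5-channel input
mean = torch.tensor([0.485, 0.456, 0.406])  # default 3-channel mean
std = torch.tensor([0.229, 0.224, 0.225])   # default 3-channel std
(image - mean[:, None, None]) / std[:, None, None]
# RuntimeError: The size of tensor a (5) must match the size of tensor b (3)
# at non-singleton dimension 0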

Can someone please help me out? I am a newbie in PyTorch.