Pytorch MaskRCNN - KeyError:0 in target

I am trying to use the pretrained Mask R-CNN model from torchvision (PyTorch). Since my input has 6 channels, I have changed the first Conv2d layer in the backbone to reflect this. My mask is a grayscale OpenCV image with just 2 classes.
I get the following error message:
Traceback (most recent call last):
File "c:/Users/ritvi/", line 65, in <module>
outputs = net(image, target)
File "C:\Users\ritvi\python\lib\site-packages\torch\nn\modules\module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "C:\Users\ritvi\python\lib\site-packages\torchvision\models\detection\generalized_rcnn.py", line 66, in forward
images, targets = self.transform(images, targets)
File "C:\Users\ritvi\python\lib\site-packages\torch\nn\modules\module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "C:\Users\ritvi\python\lib\site-packages\torchvision\models\detection\transform.py", line 39, in forward
target_index = targets[i] if targets is not None else None
KeyError: 0

I presume the issue is that one of the fields in the target is improperly formatted, but I am not sure which one or how it should be corrected. Can someone help me out?
I am including a minimal example that reproduces the issue:

import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
import torch
import numpy as np

def get_maskrcnn_model(n_channels=6, n_classes=2):
    """Build a COCO-pretrained Mask R-CNN adapted to custom inputs and classes.

    Args:
        n_channels: Number of channels of the input images; used as the
            ``in_channels`` of the replaced first backbone convolution.
        n_classes: Number of classes predicted by the new box and mask heads
            (presumably including the background class -- confirm).

    Returns:
        The torchvision Mask R-CNN model with its first convolution, box
        predictor and mask predictor replaced.
    """
    # Load an instance segmentation model pre-trained on COCO.
    model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)

    # Replace the first layer so that it accepts ``n_channels`` input channels.
    # The new convolution is freshly constructed, so it does not carry over
    # the pre-trained weights of the layer it replaces.
    # NOTE(review): the model's built-in input normalisation presumably still
    # uses 3-channel mean/std values -- confirm it works for n_channels != 3.
    model.backbone.body.conv1 = torch.nn.Conv2d(
        in_channels=n_channels,
        out_channels=64,
        kernel_size=(7, 7),
        stride=(2, 2),
        padding=(3, 3),
        bias=False,
    )

    # Get the number of input features for the box classifier ...
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    # ... and replace the pre-trained head with a new one.
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, n_classes)

    # Now get the number of input channels for the mask classifier ...
    in_features_mask = model.roi_heads.mask_predictor.conv5_mask.in_channels
    hidden_layer = 256
    # ... and replace the mask predictor with a new one.
    model.roi_heads.mask_predictor = MaskRCNNPredictor(
        in_features_mask, hidden_layer, n_classes
    )
    return model

# My input data has 6 channels, hence I have changed the first Conv2d in the backbone
net = get_maskrcnn_model(n_channels=6, n_classes=2)

# Imagine a dummy 6 channel image
image = np.ones((6, 512, 512), dtype=np.float64)
# Imagine a mask which is an opencv grayscale image
mask = np.zeros((512, 512), dtype=np.float64)

# Adding the target required by Mask_RCNN
# Assume that there is at most only one instance of the object in the image
num_objs = 1
boxes = []
if np.count_nonzero(mask) > 0:
    # If mask has any foreground object, define the box as the tight
    # bounding rectangle of the non-zero pixels (rows -> y, columns -> x)
    pos = np.where(mask)
    xmin = np.min(pos[1])
    xmax = np.max(pos[1])
    ymin = np.min(pos[0])
    ymax = np.max(pos[0])
    boxes.append([xmin, ymin, xmax, ymax])
else:
    # If mask does not have any foreground object, define a dummy box so that
    # exactly one box is always present, matching num_objs
    boxes.append([-1, -1, -1, -1])
# convert everything into a torch.Tensor
boxes = torch.as_tensor(boxes, dtype=torch.float32)
labels = torch.ones((num_objs,), dtype=torch.int64)
masks = torch.as_tensor([mask], dtype=torch.uint8) # This will cause a deprecation warning, please ignore
# Create the target dictionary
target = {}
target["boxes"] = boxes
target["labels"] = labels
target["masks"] = masks

# Forward pass
# NOTE: this is the call that raises "KeyError: 0" -- `target` is a plain dict,
# and the traceback shows the model's transform evaluating `targets[i]`,
# which here amounts to `target[0]`.
outputs = net(image, target)

I think you should pass the images and the targets as lists, like so:

# Wrap both arguments in lists: the traceback shows the model's transform
# indexing `targets[i]` once per image, which fails on a single dict.
# NOTE(review): torchvision detection models presumably also expect each
# image to be a torch.Tensor rather than a numpy array -- confirm.
outputs = net([image], [target])