Faster R-CNN - ResNet.forward() takes 2 positional arguments but 3 were given

Hello everyone,

I followed this tutorial (Custom Object Detection using PyTorch Faster RCNN) on the author's dataset, keeping everything as described, including the dataset structure. I only changed the model.py file, as I want to use a ResNet-50 model. For the model, I used a ResNet-50 implemented from scratch, as I want to customise it later. When trying to train the model, I get the following error.

Traceback (most recent call last):
  File "/lustre/alice3/scratch/aiadapt/va95/od/2/20211025_Custom_Object_Detection_using_PyTorch_Faster_RCNN/src/engine.py", line 128, in <module>
    train_loss = train(train_loader, model)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/lustre/alice3/scratch/aiadapt/va95/od/2/20211025_Custom_Object_Detection_using_PyTorch_Faster_RCNN/src/engine.py", line 36, in train
    loss_dict = model(images, targets)
                ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/v/va95/miniconda3/envs/odenv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/v/va95/miniconda3/envs/odenv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: ResNet.forward() takes 2 positional arguments but 3 were given

The custom ResNet-50 model I am using is this one: https://github.com/JayPatwardhan/ResNet-PyTorch/blob/master/ResNet/ResNet.py

I am not sure how to resolve this error.

Any help is appreciated. Thank you.

Based on the error message, the original model accepted the image and target tensors in its forward method, while a standard ResNet only accepts the input argument. Check the definition of the original model and adapt your ResNet to also accept the target tensor, or calculate the loss outside of the model's forward method.
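
Here is a minimal, self-contained sketch of the second option (keeping forward(x) with a single argument and computing the loss in the training loop). TinyClassifier, criterion, and the tensors are placeholders made up for illustration, not the tutorial's code:

import torch
import torch.nn as nn

# Placeholder classifier standing in for the custom ResNet-50;
# its forward() takes a single positional argument like a standard ResNet.
class TinyClassifier(nn.Module):
    def __init__(self, num_classes=5):
        super().__init__()
        self.backbone = nn.Sequential(
            nn.Conv2d(3, 8, kernel_size=3, padding=1),
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
        )
        self.fc = nn.Linear(8, num_classes)

    def forward(self, x):
        return self.fc(self.backbone(x))

model = TinyClassifier()
criterion = nn.CrossEntropyLoss()

images = torch.randn(2, 3, 64, 64)   # batch of 2 RGB images
labels = torch.tensor([0, 3])        # one class label per image

logits = model(images)               # forward() only sees the images
loss = criterion(logits, labels)     # loss is computed outside the model
loss.backward()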

Thanks a lot for your help. I have changed my forward function to accept one argument, and I calculate the loss outside of it. A part of my training script now looks like this:

def train(train_data_loader, model):
    print('Training')
    global train_itr
    global train_loss_list
    
    # initialize tqdm progress bar
    prog_bar = tqdm(train_data_loader, total=len(train_data_loader))
    
    for i, data in enumerate(prog_bar):
        optimizer.zero_grad()
        images, targets = data
        
        #images = list(image.to(DEVICE) for image in images)
        images = torch.stack(images).to(DEVICE)
        targets = [{k: v.to(DEVICE) for k, v in t.items()} for t in targets]
        
        print('Images shape ' + str(images[0].shape))
        print('Targets ' + str(targets[0]))

        #loss_dict = model(images, targets)
        predictions = model(images)
        
        print('Predictions shape ' + str(predictions[0].shape))
        loss = model.compute_loss(predictions, targets)
        
        losses = loss
        loss_value = losses.item()
        train_loss_list.append(loss_value)

        train_loss_hist.send(loss_value)

        losses.backward()
        optimizer.step()

        train_itr += 1
    
        # update the loss value beside the progress bar for each iteration
        prog_bar.set_description(desc=f"Loss: {loss_value:.4f}")
    return train_loss_list

and a part of my model script is:

class ResNet(nn.Module):
    def __init__(self, ResBlock, layer_list, num_classes, num_channels=3):
        super(ResNet, self).__init__()
        self.in_channels = 64
        
        self.conv1 = nn.Conv2d(num_channels, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.batch_norm1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU()
        self.max_pool = nn.MaxPool2d(kernel_size = 3, stride=2, padding=1)
        
        self.layer1 = self._make_layer(ResBlock, layer_list[0], planes=64)
        self.layer2 = self._make_layer(ResBlock, layer_list[1], planes=128, stride=2)
        self.layer3 = self._make_layer(ResBlock, layer_list[2], planes=256, stride=2)
        self.layer4 = self._make_layer(ResBlock, layer_list[3], planes=512, stride=2)
        
        self.avgpool = nn.AdaptiveAvgPool2d((1,1))
        self.fc = nn.Linear(512*ResBlock.expansion, num_classes)
        
    def forward(self, x):
        x = self.relu(self.batch_norm1(self.conv1(x)))
        print('FIRST: ' + str(x.shape))
        x = self.max_pool(x)
        print('MAXPOOL: ' + str(x.shape))
        x = self.layer1(x)
        print('LAYER1: ' + str(x.shape))
        x = self.layer2(x)
        print('LAYER2: ' + str(x.shape))
        x = self.layer3(x)
        print('LAYER3: ' + str(x.shape))
        x = self.layer4(x)
        print('LAYER4: ' + str(x.shape))
        x = self.avgpool(x)
        print('AVG: ' + str(x.shape))
        x = x.view(x.size(0), -1)
        print('FC: ' + str(x.shape))
        x = self.fc(x)
               
        return x
        
    def _make_layer(self, ResBlock, blocks, planes, stride=1):
        ii_downsample = None
        layers = []
        
        if stride != 1 or self.in_channels != planes*ResBlock.expansion:
            ii_downsample = nn.Sequential(
                nn.Conv2d(self.in_channels, planes*ResBlock.expansion, kernel_size=1, stride=stride),
                nn.BatchNorm2d(planes*ResBlock.expansion)
            )
            
        layers.append(ResBlock(self.in_channels, planes, i_downsample=ii_downsample, stride=stride))
        self.in_channels = planes*ResBlock.expansion
        
        for i in range(blocks-1):
            layers.append(ResBlock(self.in_channels, planes))
            
        return nn.Sequential(*layers)
        
    def compute_loss(self, predictions, targets):
        target_labels = torch.cat([t['labels'] for t in targets])

        loss = F.cross_entropy(predictions, target_labels)
        return loss

However, I am getting this error now:

EPOCH 1 of 10
Training
  0%|                                                                                   | 0/48 [00:00<?, ?it/s]Images shape torch.Size([3, 416, 416])
Targets {'boxes': tensor([[210.0800, 188.7600, 314.0800, 252.2000]]), 'labels': tensor([1]), 'area': tensor([6597.7603]), 'iscrowd': tensor([0]), 'image_id': tensor([64])}
FIRST: torch.Size([3, 64, 208, 208])
MAXPOOL: torch.Size([3, 64, 104, 104])
LAYER1: torch.Size([3, 256, 104, 104])
LAYER2: torch.Size([3, 512, 52, 52])
LAYER3: torch.Size([3, 1024, 26, 26])
LAYER4: torch.Size([3, 2048, 13, 13])
AVG: torch.Size([3, 2048, 1, 1])
FC: torch.Size([3, 2048])
Predictions shape torch.Size([5])
Loss: 1.3513:   2%|█▎                                                           | 1/48 [00:01<01:04,  1.38s/it]Images shape torch.Size([3, 416, 416])
Targets {'boxes': tensor([[165.8800, 133.1200, 323.4400, 278.0267]]), 'labels': tensor([4]), 'area': tensor([22831.4961]), 'iscrowd': tensor([0]), 'image_id': tensor([100])}
FIRST: torch.Size([3, 64, 208, 208])
MAXPOOL: torch.Size([3, 64, 104, 104])
LAYER1: torch.Size([3, 256, 104, 104])
LAYER2: torch.Size([3, 512, 52, 52])
LAYER3: torch.Size([3, 1024, 26, 26])
LAYER4: torch.Size([3, 2048, 13, 13])
AVG: torch.Size([3, 2048, 1, 1])
FC: torch.Size([3, 2048])
Predictions shape torch.Size([5])
Loss: 2.5029:   4%|██▌                                                          | 2/48 [00:02<01:01,  1.33s/it]Images shape torch.Size([3, 416, 416])
Targets {'boxes': tensor([[232.9600, 199.1600, 307.8400, 307.3200]]), 'labels': tensor([1]), 'area': tensor([8099.0200]), 'iscrowd': tensor([0]), 'image_id': tensor([4])}
FIRST: torch.Size([3, 64, 208, 208])
MAXPOOL: torch.Size([3, 64, 104, 104])
LAYER1: torch.Size([3, 256, 104, 104])
LAYER2: torch.Size([3, 512, 52, 52])
LAYER3: torch.Size([3, 1024, 26, 26])
LAYER4: torch.Size([3, 2048, 13, 13])
AVG: torch.Size([3, 2048, 1, 1])
FC: torch.Size([3, 2048])
Predictions shape torch.Size([5])
Loss: 0.8967:   6%|███▊                                                         | 3/48 [00:03<00:57,  1.27s/it]Images shape torch.Size([3, 416, 416])
Targets {'boxes': tensor([[136.5867, 164.8400, 288.4267, 328.1200]]), 'labels': tensor([2]), 'area': tensor([24792.4336]), 'iscrowd': tensor([0]), 'image_id': tensor([122])}
FIRST: torch.Size([3, 64, 208, 208])
MAXPOOL: torch.Size([3, 64, 104, 104])
LAYER1: torch.Size([3, 256, 104, 104])
LAYER2: torch.Size([3, 512, 52, 52])
LAYER3: torch.Size([3, 1024, 26, 26])
LAYER4: torch.Size([3, 2048, 13, 13])
AVG: torch.Size([3, 2048, 1, 1])
FC: torch.Size([3, 2048])
Predictions shape torch.Size([5])
Loss: 0.8967:   6%|███▊                                                         | 3/48 [00:04<01:03,  1.42s/it]
Traceback (most recent call last):
  File "/lustre/alice3/scratch/aiadapt/va95/od/2/20211025_Custom_Object_Detection_using_PyTorch_Faster_RCNN/src/engine.py", line 134, in <module>
    train_loss = train(train_loader, model)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/lustre/alice3/scratch/aiadapt/va95/od/2/20211025_Custom_Object_Detection_using_PyTorch_Faster_RCNN/src/engine.py", line 43, in train
    loss = model.compute_loss(predictions, targets)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/lustre/alice3/scratch/aiadapt/va95/od/2/20211025_Custom_Object_Detection_using_PyTorch_Faster_RCNN/src/model.py", line 139, in compute_loss
    loss = F.cross_entropy(predictions, target_labels)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/v/va95/miniconda3/envs/odenv/lib/python3.11/site-packages/torch/nn/functional.py", line 3053, in cross_entropy
    return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: Expected input batch_size (3) to match target batch_size (5).

I have checked another thread that covers this error (ValueError: Expected input batch_size (324) to match target batch_size (4) - #26 by william_hero), but unfortunately I couldn't figure out the problem yet. Could you please help me with that?

I guess the last batch contains a single sample and might cause a shape mismatch error if the batch size is not explicitly set. Could you check the shape and length of the model inputs and outputs, as well as the targets, directly instead of only the first element?
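
Something like this hypothetical helper (the names just mirror the training loop you posted) would print the full batch shapes rather than only the first element:

def debug_batch(model, images, targets):
    # Print the whole batch shapes/lengths instead of only the first element.
    print('Images shape   ', images.shape)
    print('Images length  ', len(images))
    print('Targets length ', len(targets))
    predictions = model(images)
    print('Predictions shape ', predictions.shape)
    print('Predictions length', len(predictions))
    return predictions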

I managed to print the requested shapes and lengths, except for the target's shape. I also changed the input image size from 416 to 512, in case it was the cause of this problem, but I still received the error after some more training iterations. Here is the output:

Number of training samples: 142
Number of validation samples: 7

23544773
cpu

EPOCH 1 of 10
Training
  0%|                                                                           | 0/48 [00:00<?, ?it/s]
Images shape torch.Size([3, 3, 512, 512])
Images length  3
Targets length 3
FIRST: torch.Size([3, 64, 256, 256])
MAXPOOL: torch.Size([3, 64, 128, 128])
LAYER1: torch.Size([3, 256, 128, 128])
LAYER2: torch.Size([3, 512, 64, 64])
LAYER3: torch.Size([3, 1024, 32, 32])
LAYER4: torch.Size([3, 2048, 16, 16])
AVG: torch.Size([3, 2048, 1, 1])
FC: torch.Size([3, 2048])
Predictions shape torch.Size([3, 5])
Predictions length  3
Loss: 2.0571:   2%|█                                                    | 1/48 [00:02<01:38,  2.10s/it]
Images shape torch.Size([3, 3, 512, 512])
Images length  3
Targets length 3
FIRST: torch.Size([3, 64, 256, 256])
MAXPOOL: torch.Size([3, 64, 128, 128])
LAYER1: torch.Size([3, 256, 128, 128])
LAYER2: torch.Size([3, 512, 64, 64])
LAYER3: torch.Size([3, 1024, 32, 32])
LAYER4: torch.Size([3, 2048, 16, 16])
AVG: torch.Size([3, 2048, 1, 1])
FC: torch.Size([3, 2048])
Predictions shape torch.Size([3, 5])
Predictions length  3
Loss: 1.3844:   4%|██▏                                                  | 2/48 [00:03<01:25,  1.85s/it]
Images shape torch.Size([3, 3, 512, 512])
Images length  3
Targets length 3
FIRST: torch.Size([3, 64, 256, 256])
MAXPOOL: torch.Size([3, 64, 128, 128])
LAYER1: torch.Size([3, 256, 128, 128])
LAYER2: torch.Size([3, 512, 64, 64])
LAYER3: torch.Size([3, 1024, 32, 32])
LAYER4: torch.Size([3, 2048, 16, 16])
AVG: torch.Size([3, 2048, 1, 1])
FC: torch.Size([3, 2048])
Predictions shape torch.Size([3, 5])
Predictions length  3
Loss: 2.8780:   6%|███▎                                                 | 3/48 [00:05<01:21,  1.82s/it]
Images shape torch.Size([3, 3, 512, 512])
Images length  3
Targets length 3
FIRST: torch.Size([3, 64, 256, 256])
MAXPOOL: torch.Size([3, 64, 128, 128])
LAYER1: torch.Size([3, 256, 128, 128])
LAYER2: torch.Size([3, 512, 64, 64])
LAYER3: torch.Size([3, 1024, 32, 32])
LAYER4: torch.Size([3, 2048, 16, 16])
AVG: torch.Size([3, 2048, 1, 1])
FC: torch.Size([3, 2048])
Predictions shape torch.Size([3, 5])
Predictions length  3
Loss: 1.8169:   8%|████▍                                                | 4/48 [00:07<01:20,  1.82s/it]
Images shape torch.Size([3, 3, 512, 512])
Images length  3
Targets length 3
FIRST: torch.Size([3, 64, 256, 256])
MAXPOOL: torch.Size([3, 64, 128, 128])
LAYER1: torch.Size([3, 256, 128, 128])
LAYER2: torch.Size([3, 512, 64, 64])
LAYER3: torch.Size([3, 1024, 32, 32])
LAYER4: torch.Size([3, 2048, 16, 16])
AVG: torch.Size([3, 2048, 1, 1])
FC: torch.Size([3, 2048])
Predictions shape torch.Size([3, 5])
Predictions length  3
Loss: 1.9297:  10%|█████▌                                               | 5/48 [00:09<01:17,  1.80s/it]
Images shape torch.Size([3, 3, 512, 512])
Images length  3
Targets length 3
FIRST: torch.Size([3, 64, 256, 256])
MAXPOOL: torch.Size([3, 64, 128, 128])
LAYER1: torch.Size([3, 256, 128, 128])
LAYER2: torch.Size([3, 512, 64, 64])
LAYER3: torch.Size([3, 1024, 32, 32])
LAYER4: torch.Size([3, 2048, 16, 16])
AVG: torch.Size([3, 2048, 1, 1])
FC: torch.Size([3, 2048])
Predictions shape torch.Size([3, 5])
Predictions length  3
Loss: 1.6944:  12%|██████▋                                              | 6/48 [00:10<01:13,  1.76s/it]
Images shape torch.Size([3, 3, 512, 512])
Images length  3
Targets length 3
FIRST: torch.Size([3, 64, 256, 256])
MAXPOOL: torch.Size([3, 64, 128, 128])
LAYER1: torch.Size([3, 256, 128, 128])
LAYER2: torch.Size([3, 512, 64, 64])
LAYER3: torch.Size([3, 1024, 32, 32])
LAYER4: torch.Size([3, 2048, 16, 16])
AVG: torch.Size([3, 2048, 1, 1])
FC: torch.Size([3, 2048])
Predictions shape torch.Size([3, 5])
Predictions length  3
Loss: 1.5850:  15%|███████▋                                             | 7/48 [00:12<01:09,  1.70s/it]
Images shape torch.Size([3, 3, 512, 512])
Images length  3
Targets length 3
FIRST: torch.Size([3, 64, 256, 256])
MAXPOOL: torch.Size([3, 64, 128, 128])
LAYER1: torch.Size([3, 256, 128, 128])
LAYER2: torch.Size([3, 512, 64, 64])
LAYER3: torch.Size([3, 1024, 32, 32])
LAYER4: torch.Size([3, 2048, 16, 16])
AVG: torch.Size([3, 2048, 1, 1])
FC: torch.Size([3, 2048])
Predictions shape torch.Size([3, 5])
Predictions length  3
Loss: 0.6640:  17%|████████▊                                            | 8/48 [00:14<01:07,  1.69s/it]
Images shape torch.Size([3, 3, 512, 512])
Images length  3
Targets length 3
FIRST: torch.Size([3, 64, 256, 256])
MAXPOOL: torch.Size([3, 64, 128, 128])
LAYER1: torch.Size([3, 256, 128, 128])
LAYER2: torch.Size([3, 512, 64, 64])
LAYER3: torch.Size([3, 1024, 32, 32])
LAYER4: torch.Size([3, 2048, 16, 16])
AVG: torch.Size([3, 2048, 1, 1])
FC: torch.Size([3, 2048])
Predictions shape torch.Size([3, 5])
Predictions length  3
Loss: 0.9549:  19%|█████████▉                                           | 9/48 [00:15<01:06,  1.71s/it]
Images shape torch.Size([3, 3, 512, 512])
Images length  3
Targets length 3
FIRST: torch.Size([3, 64, 256, 256])
MAXPOOL: torch.Size([3, 64, 128, 128])
LAYER1: torch.Size([3, 256, 128, 128])
LAYER2: torch.Size([3, 512, 64, 64])
LAYER3: torch.Size([3, 1024, 32, 32])
LAYER4: torch.Size([3, 2048, 16, 16])
AVG: torch.Size([3, 2048, 1, 1])
FC: torch.Size([3, 2048])
Predictions shape torch.Size([3, 5])
Predictions length  3
Loss: 3.3928:  21%|██████████▊                                         | 10/48 [00:17<01:08,  1.80s/it]
Images shape torch.Size([3, 3, 512, 512])
Images length  3
Targets length 3
FIRST: torch.Size([3, 64, 256, 256])
MAXPOOL: torch.Size([3, 64, 128, 128])
LAYER1: torch.Size([3, 256, 128, 128])
LAYER2: torch.Size([3, 512, 64, 64])
LAYER3: torch.Size([3, 1024, 32, 32])
LAYER4: torch.Size([3, 2048, 16, 16])
AVG: torch.Size([3, 2048, 1, 1])
FC: torch.Size([3, 2048])
Predictions shape torch.Size([3, 5])
Predictions length  3
Loss: 3.3928:  21%|██████████▊                                         | 10/48 [00:18<01:10,  1.85s/it]
Traceback (most recent call last):
  File "/lustre/alice3/scratch/aiadapt/va95/od/2/20211025_Custom_Object_Detection_using_PyTorch_Faster_RCNN/src/engine.py", line 138, in <module>
    train_loss = train(train_loader, model)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/lustre/alice3/scratch/aiadapt/va95/od/2/20211025_Custom_Object_Detection_using_PyTorch_Faster_RCNN/src/engine.py", line 47, in train
    loss = model.compute_loss(predictions, targets)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/lustre/alice3/scratch/aiadapt/va95/od/2/20211025_Custom_Object_Detection_using_PyTorch_Faster_RCNN/src/model.py", line 139, in compute_loss
    loss = F.cross_entropy(predictions, target_labels)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/v/va95/miniconda3/envs/odenv/lib/python3.11/site-packages/torch/nn/functional.py", line 3053, in cross_entropy
    return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: Expected input batch_size (3) to match target batch_size (4).


Also,

def compute_loss(self, predictions, targets):
    # Extract class labels from the list of dictionaries
    target_labels = torch.cat([t['labels'] for t in targets])
    print('Target_labels: ', target_labels)
    print('Target_labels shape: ', target_labels.shape)
    print('Targets ' + str(target_labels[0]))
    print('Targets length', len(target_labels))

    # Calculate the loss using predictions and the extracted labels
    loss = F.cross_entropy(predictions, target_labels)
    return loss

Produces:

Number of training samples: 142
Number of validation samples: 7

23544773
cpu

EPOCH 1 of 10
Training
  0%|                                                                           | 0/48 [00:00<?, ?it/s]IMG_20190104_164604
IMG_20190104_163849
IMG_20190104_164635

Images shape torch.Size([3, 3, 512, 512])
Images length  3
Targets {'boxes': tensor([[116.4800, 238.9333, 305.9200, 400.2133]]), 'labels': tensor([3]), 'area': tensor([30552.8848]), 'iscrowd': tensor([0]), 'image_id': tensor([128])}
Targets length 3
FIRST: torch.Size([3, 64, 256, 256])
MAXPOOL: torch.Size([3, 64, 128, 128])
LAYER1: torch.Size([3, 256, 128, 128])
LAYER2: torch.Size([3, 512, 64, 64])
LAYER3: torch.Size([3, 1024, 32, 32])
LAYER4: torch.Size([3, 2048, 16, 16])
AVG: torch.Size([3, 2048, 1, 1])
FC: torch.Size([3, 2048])
Predictions shape torch.Size([3, 5])
Predictions length  3
Target_labels:  tensor([3, 4, 2, 3, 4, 1])
Target_labels shape:  torch.Size([6])
Targets tensor(3)
Targets length 6
  0%|                                                                           | 0/48 [00:00<?, ?it/s]
Traceback (most recent call last):
  File "/lustre/alice3/scratch/aiadapt/va95/od/2/20211025_Custom_Object_Detection_using_PyTorch_Faster_RCNN/src/engine.py", line 138, in <module>
    train_loss = train(train_loader, model)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/lustre/alice3/scratch/aiadapt/va95/od/2/20211025_Custom_Object_Detection_using_PyTorch_Faster_RCNN/src/engine.py", line 47, in train
    loss = model.compute_loss(predictions, targets)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/lustre/alice3/scratch/aiadapt/va95/od/2/20211025_Custom_Object_Detection_using_PyTorch_Faster_RCNN/src/model.py", line 142, in compute_loss
    loss = F.cross_entropy(predictions, target_labels)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/v/va95/miniconda3/envs/odenv/lib/python3.11/site-packages/torch/nn/functional.py", line 3053, in cross_entropy
    return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: Expected input batch_size (3) to match target batch_size (6).

This is happening because one of the images has more than one label, hence the error. I think there is something wrong with my loss function, but I am not sure what yet.
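
For reference, here is a minimal sketch that reproduces the mismatch with made-up numbers, plus a possible stop-gap that assumes a single label per image is acceptable for a plain image-level cross-entropy loss (handling multiple boxes per image properly would need a detection-style loss, e.g. torchvision's Faster R-CNN heads, rather than one cross_entropy over concatenated labels):

import torch
import torch.nn.functional as F

# Three images, but one target dict carries several box labels, so
# torch.cat yields 6 labels while the classifier outputs only 3 rows.
targets = [
    {'labels': torch.tensor([3])},
    {'labels': torch.tensor([4, 2, 3, 4])},   # image with multiple boxes
    {'labels': torch.tensor([1])},
]
predictions = torch.randn(3, 5)               # [batch_size, num_classes]

labels_cat = torch.cat([t['labels'] for t in targets])
print(labels_cat.shape)                       # torch.Size([6]) -> mismatch

# Stop-gap (an assumption, not a real detection fix): keep one label per image.
labels_one_per_image = torch.stack([t['labels'][0] for t in targets])
loss = F.cross_entropy(predictions, labels_one_per_image)
print(loss)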