RuntimeError: Expected 3D (unbatched) or 4D (batched) input to conv2d, but got input of size: [2, 12]

Hello everyone,

I’m working on a university project in computer vision. The topic is very new to me and I have no one to help me (a particularity of my university). I’m running into this specific error: “RuntimeError: Expected 3D (unbatched) or 4D (batched) input to conv2d, but got input of size: [2, 12]”.
I have tried asking ChatGPT, but I keep getting the same suggestions over and over, and I still don’t understand why my model receives an input of size [2, 12].

I am using the SSD (Single Shot Detector) model with a resnet18 backbone. I’m also using a specific dataset of space images containing different objects, such as spacecraft and debris. The goal is to train a model that can detect these objects (11 classes) in the images with good accuracy.

There is not much information about the SSD model, I think, and my case seems a bit particular compared to what I have found on the internet. For now I need to run the model locally on my PC with a data sample (1,100 images, 100 per class); afterwards I will train it on a cloud platform provided by my university.
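For context, the annotations for each split are stored in a semicolon-separated CSV (train.csv, val.csv, …) in the dataset directory, containing at least the image filename, the class name, and the bounding box as a string. This is roughly how I inspect it, following the way process_labels in utilsSample.py (shown further down) reads the file. A minimal sketch:

# Minimal sketch: inspect the annotation CSV the same way process_labels() reads it
import os
import pandas as pd

data_root = 'C:\\Users\\alibe\\Documents\\UPSSITECH3\\ERASMUS cours\\Computer Vision and Image Analysis\\stream-1\\data samples\\'
labels = pd.read_csv(os.path.join(data_root, 'train.csv'), sep=';')
print(labels.columns)           # expected to include 'filename', 'class' and 'bbox'
print(labels.iloc[0]['bbox'])   # the bbox is stored as a string, e.g. "[row_min, col_min, row_max, col_max]"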

Main code:

# Main code
import torch
from utilsSample import SPARKDataset  # SPARKDataset is defined in utilsSample.py
from utilsSample import PyTorchSparkDataset  # PyTorchSparkDataset is defined in utilsSample.py
from torchvision import transforms
from torch.utils.data import DataLoader
from torchvision.models.detection import ssd
from torchvision.models.mobilenet import mobilenet_v2
from torchvision.models.detection.anchor_utils import AnchorGenerator
from torchvision.models import resnet18
import matplotlib.pyplot as plt
from random import randint

# The SPARKDataset and PyTorchSparkDataset classes are defined in utilsSample.py

# # Define transforms (adjust these based on your requirements)
# transform = transforms.Compose([
#     transforms.Resize((300, 300)),  # Resizing
#     transforms.ToTensor()
# ])

# Path to your dataset directory
data_root = 'C:\\Users\\alibe\\Documents\\UPSSITECH3\\ERASMUS cours\\Computer Vision and Image Analysis\\stream-1\\data samples\\'

# Define the class map, you can use the same as previously defined
class_map = {
    'proba_2': 0, 'cheops': 1, 'debris': 2, 'double_star': 3, 'earth_observation_sat_1': 4, 'lisa_pathfinder': 5,
    'proba_3_csc': 6, 'proba_3_ocs': 7, 'smart_1': 8, 'soho': 9, 'xmm_newton': 10
}

# Dataset initialization
visualize_train_dataset = SPARKDataset(class_map, root_dir=data_root, split='train', transform=None)
visualize_val_dataset = SPARKDataset(class_map, root_dir=data_root, split='val', transform=None)

train_dataset = PyTorchSparkDataset(class_map, root_dir=data_root, split='train', transform=None)
val_dataset = PyTorchSparkDataset(class_map, root_dir=data_root, split='val', transform=None)

# Define the model
backbone = resnet18(pretrained=True)

num_classes = 12  # 11 object classes + 1 background class
num_channels = 3  # Number of channels per image (RGB)

# Replace the final fully connected layer of the resnet18 backbone
backbone.fc = torch.nn.Linear(backbone.fc.in_features, num_classes)

# SSD needs an anchor generator - define it
anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512),),
                                   aspect_ratios=((0.5, 1.0, 2.0),))

# Input size used by the SSD transform (the original dataset images are 1024x1024)
image_size = (300, 300)

# Create the SSD model with the specified components
model = ssd.SSD(backbone, anchor_generator, image_size, num_classes)

#######
# Define the optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

print(type(train_dataset))

# Define the custom collate function
# def custom_collate(batch):
#     images, targets = zip(*batch)
#     images = torch.stack(images)
#     return images, targets

# # Define the data loaders
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)
# Define the data loaders with custom collate function
# train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, collate_fn=custom_collate)
# val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False, collate_fn=custom_collate)


print(type(train_loader))
print(train_loader)

# setting device to model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
torch.cuda.empty_cache()

# Training Loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for batch_idx, (images, targets) in enumerate(train_loader):
        optimizer.zero_grad()
        
        # No need to stack, as images is already a batched tensor
        # images = torch.stack(images)
        # Convert images to the appropriate format
        # images = images.to(device)
    
        print(targets["labels"])
        
        print(batch_idx)
        print(images)
        print(targets)
        print("images (input) variable type : ", type(images))
        print("targets (input) variable type : ", type(targets))
        print(len(images))
        print(images.size)
        print(images.shape)
        
        print("")     
        print("")
        # Debug print
        # print("Mean of Image:", torch.mean(images))
        # print("Std of Image:", torch.std(images))
        print("")
        print(type(images))
        # print("Input Size:", images[0].size())
        # print("Images Shape:", images[0].shape)
        #print(model)
        print("")

        print('TARGETS PART')
        print(type(targets))
        print(targets)
        print("")
        print(targets['boxes'])
        print(targets['boxes'][0])
        print("")
        print(targets['labels'])
        print("")
        for key, value in targets.items():
             print(value[0])
        
        
        
        # Rebuild the targets as a list with one dict per image
        targets_list = []
        for i in range(len(targets["labels"])):
            boxes = targets["boxes"][i].float()   # [1, 4] box tensor for image i
            # boxes = targets["boxes"][i].squeeze(0).float()  # (also tried removing the singleton dimension)
            # labels = targets["labels"][i].squeeze(0)
            labels = targets["labels"][i].int()   # [1] label tensor for image i

            # Construct a dictionary with the required structure
            targets_dict = {"boxes": boxes, "labels": labels}
            # Append the dictionary to the list
            targets_list.append(targets_dict)
        
        
        # print('TARGETS PART 2')
        # print(targets)
        # print("")
        # print(targets["boxes"])
        # print(targets["boxes"].shape)

        
        
        print(type(model))
        print(model.transform)
        #print(model)
        
        images = (images.float() / 255.0).to(device)
        images = images.permute(0, 3, 1, 2).float() / 255.0  # Assuming the last dimension is channels
        images = images.to(device)
        
        
        print("Images SHAPE : ", images.shape)
        print("Targets_list SHAPE : ", targets_dict["boxes"].shape)
        print("Targets_list SHAPE : ", targets_dict["labels"].shape)
        print("Targets_list TYPE : ", type(targets_dict["boxes"]))
        print("Targets_list TYPE : ", type(targets_dict["labels"]))
        print(targets_list)
        
        #print(model)
        print(images.shape)
        print(images)
        
        loss_dict = model(images, targets_list)
        
        
        total_loss = sum(loss for loss in loss_dict.values())
        total_loss.backward()
        optimizer.step()
        
        
        # # Print or log the loss if needed
        # if batch_idx % log_interval == 0:
        #     print(f'Epoch {epoch}, Batch {batch_idx}, Loss: {total_loss.item()}')


utilsSample.py

from ast import literal_eval
import os
import matplotlib.pyplot as plt
from skimage import io , img_as_uint
import pandas as pd
import matplotlib.patches as mpatches
from PIL import Image
import torch
import torchvision
from torchvision import transforms

class YourCustomTransformation(transforms.Compose):
    def __call__(self, img):
        for t in self.transforms:
            img = t(img)
        return img


class ResizeWithBbox(transforms.Resize):
    def __init__(self, size, apply_to_bbox=True):
        super(ResizeWithBbox, self).__init__(size)
        self.apply_to_bbox = apply_to_bbox

    def __call__(self, img):
        img = super(ResizeWithBbox, self).__call__(img)
        
        if self.apply_to_bbox:
            # Bounding box adjustments can be done in SPARKDataset.__getitem__
            return img
        
            
        return img



def process_labels(labels_dir, split):
    #path = os.path.join(labels_dir, file_name)
    labels_filename = os.path.join(labels_dir, split + '.csv')  # Construct the file path correctly
    labels = pd.read_csv(labels_filename, sep=';')
    return labels


class SPARKDataset:

    """ Class for dataset inspection: easily accessing single images, and corresponding ground truth pose data. """

    def __init__(self, class_map, root_dir ,split, transform=None, detection=True):
        # self.root_dir = os.path.join(data_dir, split)
        self.root_dir = root_dir
        self.labels = process_labels(root_dir, split)
        self.class_map =  class_map
        self.transform = transform #Added the transform attribute
        self.detection = detection #Add the detection attribute


    def __getitem__(self, idx):
        sat_name = self.labels.iloc[idx]['class']
        img_name = self.labels.iloc[idx]['filename']
        img_name_ok = 'train\\' + img_name
        image_name = os.path.join(self.root_dir, img_name_ok)

        # Load image as PIL image
        image = Image.open(image_name)
        
        # Convert PIL image to tensor
        image = transforms.ToTensor()(image)

        # Initialize bbox
        bbox = None

        # Apply transformation if specified
        if self.transform is not None:
            image = self.transform(image)

        if self.detection:
            # Adjust bounding box accordingly
            bbox = self.labels.iloc[idx]['bbox']
            bbox = literal_eval(bbox)
            bbox = [bbox[1], bbox[0], bbox[3], bbox[2]]  # Convert to [x_min, y_min, x_max, y_max]

            targets = {
                'boxes': torch.tensor([bbox], dtype=torch.float32),
                'labels': torch.tensor([self.class_map[sat_name]], dtype=torch.int64),
                #'image_id': img_name,  # Assuming 'filename' uniquely identifies each image
            }

        else:
            # Include labeling information for non-detection case
            targets = {
                'labels': torch.tensor([self.class_map[sat_name]], dtype=torch.int64),
                'image_id': img_name,  # Assuming 'filename' uniquely identifies each image
            }

        return image, targets

    def __len__(self):
        # Hard-coded to the size of the local data sample; replace with the actual dataset length
        return 1101
    
    def get_image(self, i=0):

        """ Loading image as PIL image. """
        sat_name = self.labels.iloc[i]['class']
        img_name = self.labels.iloc[i]['filename']
        
        img_name_ok = 'train\\' + img_name  # To modify depending on the chosen split (train, val, test)
        image_name = os.path.join(self.root_dir, img_name_ok) #Ensure image path is correctly created
        
        print('img_name_ok is : ' + img_name_ok)
        print("Image Path:", image_name) #Add this line for debugging
        
        image = io.imread(image_name)
        return image , self.class_map[sat_name]

    def get_bbox(self, i=0):

        """ Getting bounding box for image. """
        bbox = self.labels.iloc[i]['bbox']
        bbox    = literal_eval(bbox)
        min_x, min_y, max_x, max_y = bbox

        return min_x, min_y, max_x, max_y 



    def visualize(self,i, size=(15,15),  ax=None):

        """ Visualizing image, with ground truth pose with axes projected to training image. """

        if ax is None:
            ax = plt.gca()
            
        image, img_class = self.get_image(i)
        min_x, min_y, max_x, max_y   = self.get_bbox(i)

        ax.imshow(image,vmin=0, vmax=255)


        rect = mpatches.Rectangle((min_y, min_x), max_y - min_y, max_x - min_x,
                                        fill=False, edgecolor='red', linewidth=2)
        ax.add_patch(rect)
        
        label = f"{list(self.class_map.keys())[list(self.class_map.values()).index(img_class)]}"
        
        ax.text(min_y, min_x-20, label,color='white',fontsize=15)
        ax.set_axis_off()

        return 

    
try:
    import torch
    from torch.utils.data import Dataset
    from torchvision import transforms
    has_pytorch = True
    print('Found Pytorch')
except ImportError:
    has_pytorch = False

    
if has_pytorch:
    class PyTorchSparkDataset(Dataset):

        """ SPARK dataset that can be used with DataLoader for PyTorch training. """

        def __init__(self, class_map, split, root_dir, transform=None,detection = True):

            if not has_pytorch:
                raise ImportError('Pytorch was not imported successfully!')

            if split not in {'train', 'val', 'test'}:
                raise ValueError('Invalid split, has to be either \'train\', \'val\' or \'test\'')


            self.class_map =  class_map
            
            self.detection = detection
            self.split = split 
            self.root_dir = os.path.join(root_dir, self.split)
            
            self.labels = process_labels(root_dir,split)
                
            self.transform = transform

        def __len__(self):
            return len(self.labels)

        def __getitem__(self, idx):
            
            sat_name = self.labels.iloc[idx]['class']
            img_name = self.labels.iloc[idx]['filename']
            image_name = f'{self.root_dir}/{img_name}'
            
            image = io.imread(image_name)


            if self.transform is not None:
                torch_image = self.transform(image)
            
            else:
                torch_image = torch.from_numpy(image).permute(2,1,0)
                
            if self.detection:
                
                bbox = self.labels.iloc[idx]['bbox']
                bbox = literal_eval(bbox)
                bbox = [bbox[1], bbox[0], bbox[3], bbox[2]]  # Convert to [x_min, y_min, x_max, y_max]

                targets = {
                    'boxes': torch.tensor([bbox], dtype=torch.float32),
                    'labels': torch.tensor([self.class_map[sat_name]], dtype=torch.int64),  # Use class_map here
                    #'image_id': idx,  # Change to a single integer
                }

                return image, targets

            return image, targets


else:
    class PyTorchSparkDataset:
        def __init__(self, *args, **kwargs):
            raise ImportError('Pytorch is not available!')

My code is a bit messy, with a lot of prints, because I tried several things to get the input into the format the model expects.
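For reference, this is the input format I think torchvision detection models expect during training, and which the targets_list loop above is trying to build. A minimal sketch (the boxes and labels are just example values taken from my printed output):

# A minimal sketch of the training inputs as I understand them from the torchvision detection docs
import torch

images = [torch.rand(3, 300, 300), torch.rand(3, 300, 300)]  # list of float images, CHW, values in [0, 1]
# (in my training loop I pass a single batched [N, 3, H, W] tensor instead)
targets_list = [
    {"boxes": torch.tensor([[799.0, 0.0, 1024.0, 75.0]]),    # FloatTensor [num_objects, 4], (x_min, y_min, x_max, y_max)
     "labels": torch.tensor([4], dtype=torch.int64)},        # Int64Tensor [num_objects]
    {"boxes": torch.tensor([[41.0, 578.0, 259.0, 920.0]]),
     "labels": torch.tensor([9], dtype=torch.int64)},
]
# loss_dict = model(images, targets_list)   # this is the call that fails in my real code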

When I run the training loop, I get the following output and error:

Found Pytorch
C:\Users\alibe\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\torchvision\models\_utils.py:208: UserWarning: The parameter 'pretrained' is deprecated since 0.13 and may be removed in the future, please use 'weights' instead.
  warnings.warn(
C:\Users\alibe\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\torchvision\models\_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and may be removed in the future. The current behavior is equivalent to passing `weights=ResNet18_Weights.IMAGENET1K_V1`. You can also use `weights=ResNet18_Weights.DEFAULT` to get the most up-to-date weights.
  warnings.warn(msg)
<class 'utilsSample.PyTorchSparkDataset'>
<class 'torch.utils.data.dataloader.DataLoader'>
<torch.utils.data.dataloader.DataLoader object at 0x0000028DDDD64E50>
img_name_ok is : train\img084467.jpg
Image Path: C:\Users\alibe\Documents\UPSSITECH3\ERASMUS cours\Computer Vision and Image Analysis\stream-1\data samples\train\img084467.jpg
img_name_ok is : train\img068659.jpg
Image Path: C:\Users\alibe\Documents\UPSSITECH3\ERASMUS cours\Computer Vision and Image Analysis\stream-1\data samples\train\img068659.jpg
img_name_ok is : train\img087785.jpg
Image Path: C:\Users\alibe\Documents\UPSSITECH3\ERASMUS cours\Computer Vision and Image Analysis\stream-1\data samples\train\img087785.jpg
img_name_ok is : train\img040469.jpg
Image Path: C:\Users\alibe\Documents\UPSSITECH3\ERASMUS cours\Computer Vision and Image Analysis\stream-1\data samples\train\img040469.jpg
img_name_ok is : train\img043975.jpg
Image Path: C:\Users\alibe\Documents\UPSSITECH3\ERASMUS cours\Computer Vision and Image Analysis\stream-1\data samples\train\img043975.jpg
img_name_ok is : train\img020490.jpg
Image Path: C:\Users\alibe\Documents\UPSSITECH3\ERASMUS cours\Computer Vision and Image Analysis\stream-1\data samples\train\img020490.jpg
img_name_ok is : train\img080816.jpg
Image Path: C:\Users\alibe\Documents\UPSSITECH3\ERASMUS cours\Computer Vision and Image Analysis\stream-1\data samples\train\img080816.jpg
img_name_ok is : train\img073756.jpg
Image Path: C:\Users\alibe\Documents\UPSSITECH3\ERASMUS cours\Computer Vision and Image Analysis\stream-1\data samples\train\img073756.jpg
img_name_ok is : train\img025365.jpg
Image Path: C:\Users\alibe\Documents\UPSSITECH3\ERASMUS cours\Computer Vision and Image Analysis\stream-1\data samples\train\img025365.jpg
img_name_ok is : train\img031010.jpg
Image Path: C:\Users\alibe\Documents\UPSSITECH3\ERASMUS cours\Computer Vision and Image Analysis\stream-1\data samples\train\img031010.jpg
img_name_ok is : train\img061777.jpg
Image Path: C:\Users\alibe\Documents\UPSSITECH3\ERASMUS cours\Computer Vision and Image Analysis\stream-1\data samples\train\img061777.jpg
img_name_ok is : train\img068321.jpg
Image Path: C:\Users\alibe\Documents\UPSSITECH3\ERASMUS cours\Computer Vision and Image Analysis\stream-1\data samples\train\img068321.jpg

tensor([[4],
        [9]])
0
tensor([[[[ 32,  32,  32],
          [ 57,  57,  57],
          [ 44,  44,  44],
          ...,
          [ 90,  89,  87],
          [ 80,  79,  77],
          [ 70,  69,  67]],

         [[ 46,  46,  46],
          [ 52,  52,  52],
          [ 48,  48,  48],
          ...,
          [ 77,  76,  74],
          [ 88,  87,  85],
          [116, 115, 113]],

         [[ 47,  47,  47],
          [ 42,  42,  42],
          [ 55,  55,  55],
          ...,
          [ 59,  58,  56],
          [142, 141, 139],
          [135, 134, 132]],

         ...,

         [[ 48,  48,  48],
          [ 48,  48,  48],
          [ 39,  39,  39],
          ...,
          [ 54,  54,  54],
          [ 36,  36,  36],
          [ 52,  52,  52]],

         [[ 53,  53,  53],
          [ 51,  51,  51],
          [ 35,  35,  35],
          ...,
          [ 47,  47,  47],
          [ 40,  40,  40],
          [ 46,  46,  46]],

         [[ 40,  40,  40],
          [ 53,  53,  53],
          [ 35,  35,  35],
          ...,
          [ 47,  47,  47],
          [ 53,  53,  53],
          [ 48,  48,  48]]],


        [[[ 42,  42,  42],
          [ 31,  31,  31],
          [ 24,  24,  24],
          ...,
          [ 31,  31,  31],
          [ 48,  48,  48],
          [ 42,  42,  42]],

         [[ 30,  30,  30],
          [ 25,  25,  25],
          [ 44,  44,  44],
          ...,
          [ 49,  49,  49],
          [ 34,  34,  34],
          [ 40,  40,  40]],

         [[ 10,  10,  10],
          [ 23,  23,  23],
          [ 19,  19,  19],
          ...,
          [ 33,  33,  33],
          [ 27,  27,  27],
          [ 29,  29,  29]],

         ...,

         [[ 30,  30,  30],
          [ 26,  26,  26],
          [ 34,  34,  34],
          ...,
          [ 31,  31,  31],
          [ 34,  34,  34],
          [ 31,  31,  31]],

         [[ 43,  43,  43],
          [ 33,  33,  33],
          [ 29,  29,  29],
          ...,
          [ 32,  32,  32],
          [ 24,  24,  24],
          [ 18,  18,  18]],

         [[ 25,  25,  25],
          [ 17,  17,  17],
          [ 43,  43,  43],
          ...,
          [ 40,  40,  40],
          [ 29,  29,  29],
          [ 23,  23,  23]]]], dtype=torch.uint8)
{'boxes': tensor([[[ 799.,    0., 1024.,   75.]],

        [[  41.,  578.,  259.,  920.]]]), 'labels': tensor([[4],
        [9]])}
images (input) variable type :  <class 'torch.Tensor'>
targets (input) variable type :  <class 'dict'>
2
<built-in method size of Tensor object at 0x0000028DE60FDF90>
torch.Size([2, 1024, 1024, 3])



<class 'torch.Tensor'>

TARGETS PART
<class 'dict'>
{'boxes': tensor([[[ 799.,    0., 1024.,   75.]],

        [[  41.,  578.,  259.,  920.]]]), 'labels': tensor([[4],
        [9]])}

tensor([[[ 799.,    0., 1024.,   75.]],

        [[  41.,  578.,  259.,  920.]]])
tensor([[ 799.,    0., 1024.,   75.]])

tensor([[4],
        [9]])

tensor([[ 799.,    0., 1024.,   75.]])
tensor([4])
<class 'torchvision.models.detection.ssd.SSD'>
GeneralizedRCNNTransform(
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    Resize(min_size=(300,), max_size=300, mode='bilinear')
)
Images SHAPE :  torch.Size([2, 3, 1024, 1024])
Targets_list SHAPE :  torch.Size([1, 4])
Targets_list SHAPE :  torch.Size([1])
Targets_list TYPE :  <class 'torch.Tensor'>
Targets_list TYPE :  <class 'torch.Tensor'>
[{'boxes': tensor([[ 799.,    0., 1024.,   75.]]), 'labels': tensor([4], dtype=torch.int32)}, {'boxes': tensor([[ 41., 578., 259., 920.]]), 'labels': tensor([9], dtype=torch.int32)}]
torch.Size([2, 3, 1024, 1024])
tensor([[[[0.0005, 0.0009, 0.0007,  ..., 0.0014, 0.0012, 0.0011],
          [0.0007, 0.0008, 0.0007,  ..., 0.0012, 0.0014, 0.0018],
          [0.0007, 0.0006, 0.0008,  ..., 0.0009, 0.0022, 0.0021],
          ...,
          [0.0007, 0.0007, 0.0006,  ..., 0.0008, 0.0006, 0.0008],
          [0.0008, 0.0008, 0.0005,  ..., 0.0007, 0.0006, 0.0007],
          [0.0006, 0.0008, 0.0005,  ..., 0.0007, 0.0008, 0.0007]],

         [[0.0005, 0.0009, 0.0007,  ..., 0.0014, 0.0012, 0.0011],
          [0.0007, 0.0008, 0.0007,  ..., 0.0012, 0.0013, 0.0018],
          [0.0007, 0.0006, 0.0008,  ..., 0.0009, 0.0022, 0.0021],
          ...,
          [0.0007, 0.0007, 0.0006,  ..., 0.0008, 0.0006, 0.0008],
          [0.0008, 0.0008, 0.0005,  ..., 0.0007, 0.0006, 0.0007],
          [0.0006, 0.0008, 0.0005,  ..., 0.0007, 0.0008, 0.0007]],

         [[0.0005, 0.0009, 0.0007,  ..., 0.0013, 0.0012, 0.0010],
          [0.0007, 0.0008, 0.0007,  ..., 0.0011, 0.0013, 0.0017],
          [0.0007, 0.0006, 0.0008,  ..., 0.0009, 0.0021, 0.0020],
          ...,
          [0.0007, 0.0007, 0.0006,  ..., 0.0008, 0.0006, 0.0008],
          [0.0008, 0.0008, 0.0005,  ..., 0.0007, 0.0006, 0.0007],
          [0.0006, 0.0008, 0.0005,  ..., 0.0007, 0.0008, 0.0007]]],


        [[[0.0006, 0.0005, 0.0004,  ..., 0.0005, 0.0007, 0.0006],
          [0.0005, 0.0004, 0.0007,  ..., 0.0008, 0.0005, 0.0006],
          [0.0002, 0.0004, 0.0003,  ..., 0.0005, 0.0004, 0.0004],
          ...,
          [0.0005, 0.0004, 0.0005,  ..., 0.0005, 0.0005, 0.0005],
          [0.0007, 0.0005, 0.0004,  ..., 0.0005, 0.0004, 0.0003],
          [0.0004, 0.0003, 0.0007,  ..., 0.0006, 0.0004, 0.0004]],

         [[0.0006, 0.0005, 0.0004,  ..., 0.0005, 0.0007, 0.0006],
          [0.0005, 0.0004, 0.0007,  ..., 0.0008, 0.0005, 0.0006],
          [0.0002, 0.0004, 0.0003,  ..., 0.0005, 0.0004, 0.0004],
          ...,
          [0.0005, 0.0004, 0.0005,  ..., 0.0005, 0.0005, 0.0005],
          [0.0007, 0.0005, 0.0004,  ..., 0.0005, 0.0004, 0.0003],
          [0.0004, 0.0003, 0.0007,  ..., 0.0006, 0.0004, 0.0004]],

         [[0.0006, 0.0005, 0.0004,  ..., 0.0005, 0.0007, 0.0006],
          [0.0005, 0.0004, 0.0007,  ..., 0.0008, 0.0005, 0.0006],
          [0.0002, 0.0004, 0.0003,  ..., 0.0005, 0.0004, 0.0004],
          ...,
          [0.0005, 0.0004, 0.0005,  ..., 0.0005, 0.0005, 0.0005],
          [0.0007, 0.0005, 0.0004,  ..., 0.0005, 0.0004, 0.0003],
          [0.0004, 0.0003, 0.0007,  ..., 0.0006, 0.0004, 0.0004]]]])
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[7], line 96
     93 print(images.shape)
     94 print(images)
---> 96 loss_dict = model(images, targets_list)
     99 total_loss = sum(loss for loss in loss_dict.values())
    100 total_loss.backward()

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\torch\nn\modules\module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
   1516     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1517 else:
-> 1518     return self._call_impl(*args, **kwargs)

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\torch\nn\modules\module.py:1527, in Module._call_impl(self, *args, **kwargs)
   1522 # If we don't have any hooks, we want to skip the rest of the logic in
   1523 # this function, and just call forward.
   1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1525         or _global_backward_pre_hooks or _global_backward_hooks
   1526         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527     return forward_call(*args, **kwargs)
   1529 try:
   1530     result = None

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\torchvision\models\detection\ssd.py:378, in SSD.forward(self, images, targets)
    375 features = list(features.values())
    377 # compute the ssd heads outputs using the features
--> 378 head_outputs = self.head(features)
    380 # create the set of anchors
    381 anchors = self.anchor_generator(images, features)

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\torch\nn\modules\module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
   1516     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1517 else:
-> 1518     return self._call_impl(*args, **kwargs)

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\torch\nn\modules\module.py:1527, in Module._call_impl(self, *args, **kwargs)
   1522 # If we don't have any hooks, we want to skip the rest of the logic in
   1523 # this function, and just call forward.
   1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1525         or _global_backward_pre_hooks or _global_backward_hooks
   1526         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527     return forward_call(*args, **kwargs)
   1529 try:
   1530     result = None

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\torchvision\models\detection\ssd.py:66, in SSDHead.forward(self, x)
     64 def forward(self, x: List[Tensor]) -> Dict[str, Tensor]:
     65     return {
---> 66         "bbox_regression": self.regression_head(x),
     67         "cls_logits": self.classification_head(x),
     68     }

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\torch\nn\modules\module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
   1516     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1517 else:
-> 1518     return self._call_impl(*args, **kwargs)

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\torch\nn\modules\module.py:1527, in Module._call_impl(self, *args, **kwargs)
   1522 # If we don't have any hooks, we want to skip the rest of the logic in
   1523 # this function, and just call forward.
   1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1525         or _global_backward_pre_hooks or _global_backward_hooks
   1526         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527     return forward_call(*args, **kwargs)
   1529 try:
   1530     result = None

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\torchvision\models\detection\ssd.py:95, in SSDScoringHead.forward(self, x)
     92 all_results = []
     94 for i, features in enumerate(x):
---> 95     results = self._get_result_from_module_list(features, i)
     97     # Permute output from (N, A * K, H, W) to (N, HWA, K).
     98     N, _, H, W = results.shape

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\torchvision\models\detection\ssd.py:88, in SSDScoringHead._get_result_from_module_list(self, x, idx)
     86 for i, module in enumerate(self.module_list):
     87     if i == idx:
---> 88         out = module(x)
     89 return out

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\torch\nn\modules\module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
   1516     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1517 else:
-> 1518     return self._call_impl(*args, **kwargs)

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\torch\nn\modules\module.py:1527, in Module._call_impl(self, *args, **kwargs)
   1522 # If we don't have any hooks, we want to skip the rest of the logic in
   1523 # this function, and just call forward.
   1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1525         or _global_backward_pre_hooks or _global_backward_hooks
   1526         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527     return forward_call(*args, **kwargs)
   1529 try:
   1530     result = None

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\torch\nn\modules\conv.py:460, in Conv2d.forward(self, input)
    459 def forward(self, input: Tensor) -> Tensor:
--> 460     return self._conv_forward(input, self.weight, self.bias)

File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\torch\nn\modules\conv.py:456, in Conv2d._conv_forward(self, input, weight, bias)
    452 if self.padding_mode != 'zeros':
    453     return F.conv2d(F.pad(input, self._reversed_padding_repeated_twice, mode=self.padding_mode),
    454                     weight, bias, self.stride,
    455                     _pair(0), self.dilation, self.groups)
--> 456 return F.conv2d(input, weight, bias, self.stride,
    457                 self.padding, self.dilation, self.groups)

RuntimeError: Expected 3D (unbatched) or 4D (batched) input to conv2d, but got input of size: [2, 12]

I know there is a lot here and that it doesn’t look great, but I’m a complete beginner and I want to learn, so thank you very much for your help.