Error conv2d(): argument 'input' (position 1) must be Tensor, not list with CNN fine tuning


I try to fine tune a VGG16 model to predict if images are insects or background. However I always ge a “conv2d(): argument ‘input’ (position 1) must be Tensor, not list” error when I try to pass my batch to the model. I really don’t know why I get this error.

Here is my code :

import torch
from torch import nn
from import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor, Lambda, Compose
from import read_image
import matplotlib.pyplot as plt

# hyperparameters

num_classes = 2
batch_size = 4
num_epochs = 20
test_split = .2
learning_rate = 1e-3
random_seed= 8
shuffle_dataset = True
feature_extract = True
# num_workers to 0 on windows
workers = 0

# create panda with annotations

PATH_INSECTS = '/content/gdrive/Shareddrives/lotus-insect-pro-2/deepl_model/fromVideo_extraction/training_exemples/all'
lst_images = os.listdir(PATH_INSECTS)
print('nb images : ',len(lst_images))
pd = pandas.DataFrame()
labels = []
nb_insects = 0
nb_background= 0 
for image in lst_images:
  if image[0:1] == 'i':
  else :
pd['image_name'] = lst_images
pd['labels'] = labels
print('nb insects examples : ',nb_insects)
print('nb background examples : ',nb_background)

# create transform ____________________________________________________________________

#def transform(image):
  #return transforms.Compose([transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) 

#def target_transform(label):
  #return Lambda(lambda y: torch.zeros(3, dtype=torch.float).scatter_(dim=0, index=torch.tensor(y), value=1))

# create dataset _______________________________________________________________

class InsectsDataset(

    def __init__(self, img_dir, annotations, transform=None, target_transform=None):
        self.img_dir = img_dir
        self.annotations = annotations
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, index):
        # get image id by index
        image_id = self.annotations.iloc[index,0]
        # open image by external id
        image_path = os.path.join(self.img_dir, image_id)
        image = read_image(image_path)
        # get label
        label = torch.tensor(int(self.annotations.iloc[index,1]))
        # transform
        if self.transform:
            image = self.transform(image)
        if self.target_transform:
            label = self.target_transform(label)
        return image, label

dataset = InsectsDataset(PATH_INSECTS,pd,None)

# create data loader ___________________________________________________________

dataset_size = len(dataset)
print('dataset size : ', dataset_size)
indices = list(range(dataset_size))

split = int(np.floor(test_split * dataset_size))
print('split : ', split)

if shuffle_dataset :
print('indices : ', indices)

train_indices, test_indices = indices[split:], indices[:split]
train_sampler = SubsetRandomSampler(train_indices)
test_sampler = SubsetRandomSampler(test_indices)
print('train indices : ',len(train_indices))
print('test indices : ',len(test_indices))

# custom collate_fn for different size samples in batches
def collate_fn(batch):
  # collate_fn needs for batch
    return tuple(zip(*batch))

train_loader =, batch_size=batch_size, 

test_loader =, batch_size=batch_size, 

# test dataloader ______________________________________________________________

train_features, train_labels = next(iter(train_loader))

# import sys
# sys.exit("Error message")

# import model _________________________________________________________________

model = torchvision.models.vgg19(pretrained = True)
model.classifier[6] = nn.Linear(4096,num_classes)

# train model __________________________________________________________________

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

def train(dataloader, model, loss_fn, optimizer):

    size = len(dataloader.dataset)

    for batch, (X, y) in enumerate(dataloader):

        X = list( for image in X)
        y = list( for label in  y)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

for t in range(num_epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_loader, model, loss_fn, optimizer)
    test(test_loader, model)


Thank you for your help

The input to the vgg16 model is expected to be a tensor in the shape [batch_size, 3, 224, 224], so you would have to recreate this batch via e.g. torch.stack.
Usually the DataLoader would already return these batches, but I assume you are seeing a list in the DataLoader loop due to your custom collate_fn.

Thank you ! So impossible to use different size input images ?

It wouldn’t be possible to execute a batch of image tensors in different shapes in a single execution.
The nested tensors utility could solve it, but I’m unsure what the current status of it is.
For now, you could either pad the tensors to a common shape.