Mismatch of input dimension during training

My model implementation is:

import torch
import torch.nn as nn

class CNNRegressionModel(nn.Module):
  def __init__(self, image_size):
    super(CNNRegressionModel, self).__init__()
    self.image_size = tuple(image_size)
    self.conv1 = nn.Conv2d(in_channels=self.image_size[0], out_channels=24, kernel_size=3, stride=1, padding=1)
    self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
 
    self.conv2 = nn.Conv2d(in_channels=24, out_channels=48, kernel_size=3, stride=1, padding=1)
    self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
    
    self.conv3 = nn.Conv2d(in_channels=48, out_channels=96, kernel_size=3, stride=1, padding=1)
    self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)

    
    self.flat = nn.Flatten()
    
    self.fc4 = nn.Linear(in_features=86400, out_features=96)
    self.drop4 = nn.Dropout(0.5)
    self.output = nn.Linear(in_features=96, out_features=2)

  def forward(self, x):
    x = self.conv1(x)
    print(f'conv1 {x.size()}')
    x = nn.functional.relu(x)
    print(f'relu1 {x.size()}')
    x = self.pool1(x)
    print(f'pool1 {x.size()}')
    
    x = nn.functional.relu(self.conv2(x))
    print(f'conv2 {x.size()}')
    x = self.pool2(x)
    print(f'pool2 {x.size()}')
    
    x = nn.functional.relu(self.conv3(x))
    print(f'conv3 {x.size()}')
    x = self.pool3(x)
    
    x = self.flat(x)
    print(f'flat {x.size()}')
    
    x = nn.functional.relu(self.fc4(x))
    print(f'fc4 {x.size()}')
    x = self.drop4(x)
    x = self.output(x)
    return x  
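
As a sanity check for the hard-coded in_features=86400, a dummy forward pass can confirm the flattened size; a minimal sketch, assuming a 3-channel 240x240 input (86400 = 96 channels x 30 x 30 after three 2x2 poolings):

# image_size[0] is used as the number of input channels, so (3, 240, 240) here.
model = CNNRegressionModel(image_size=(3, 240, 240))
dummy = torch.randn(1, 3, 240, 240)  # batch of one RGB 240x240 image
with torch.no_grad():
  out = model(dummy)                 # forward() prints each intermediate shape
print(out.shape)                     # expected: torch.Size([1, 2])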

And this is how I created my training dataset:

from typing import Any, Dict, List

import torch
from PIL import Image
from torchvision import transforms

class CustomImageDataset(torch.utils.data.Dataset):
  def __init__(self, img_paths: List, img_targets: Dict[str, Any]):
    self.img_paths = img_paths
    self.img_targets = img_targets
    self.transform = transforms.Compose([transforms.Resize((240,240)), transforms.ToTensor()])

  def __len__(self):
    return len(self.img_paths)

  def __getitem__(self, idx):
    img_path = self.img_paths[idx]
    image = Image.open(img_path)
    image = self.transform(image)
    label = torch.tensor(int(img_path.split('/')[-1].split('.')[0]))
    outputs = torch.tensor(self.img_targets[img_path])
    return image, label, outputs

I wrapped the dataset in a DataLoader, and the shapes of each batch are:

Feature batch shape: torch.Size([1, 3, 240, 240])
Labels batch shape: torch.Size([1])
Targets batch shape: torch.Size([1, 2])
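
For reference, a minimal sketch of how the dataset is wrapped and these shapes are printed (loader, img_paths, and img_targets are placeholder names):

from torch.utils.data import DataLoader

dataset = CustomImageDataset(img_paths, img_targets)
loader = DataLoader(dataset, batch_size=1, shuffle=True)

images, labels, targets = next(iter(loader))
print(f'Feature batch shape: {images.size()}')
print(f'Labels batch shape: {labels.size()}')
print(f'Targets batch shape: {targets.size()}')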

I have checked the size of every input image and they are all the same, but I keep running into the following error:
RuntimeError: Given groups=1, weight of size [24, 3, 3, 3], expected input[1, 4, 240, 240] to have 3 channels, but got 4 channels instead

Any help would be greatly appreciated!

These shapes do not match: the weight of size [24, 3, 3, 3] belongs to conv1, which expects 3 input channels, while the failing batch has 4. I would guess some of your input images have an additional alpha channel, which you could remove e.g. by indexing the tensor via image = image[:3] in the __getitem__.
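
A minimal sketch of that change inside CustomImageDataset.__getitem__ (only the indexing line is new; the rest is your code):

  def __getitem__(self, idx):
    img_path = self.img_paths[idx]
    image = Image.open(img_path)
    image = self.transform(image)
    image = image[:3]  # keep the first 3 channels (RGB), drop a possible alpha channel
    label = torch.tensor(int(img_path.split('/')[-1].split('.')[0]))
    outputs = torch.tensor(self.img_targets[img_path])
    return image, label, outputs

Alternatively, calling Image.open(img_path).convert('RGB') before the transform forces every image to 3 channels regardless of its original mode, and a quick loop over your image paths printing Image.open(p).mode should reveal which files are RGBA.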
