Input channel mismatch

I am working on a self-supervised ML task. The colourization pretext architecture is designed to accept grayscale images to predict the grayscale version. However, the error occurs when I use the encoder part of the network for my downstream task which is a binary classification that takes RGB images.

RuntimeError: Given groups=1, weight of size [64, 1, 3, 3], expected input[32, 3, 280, 280] to have 1 channels, but got 3 channels instead

PRETEXT -COLOURIZATION MODEL

class ColorizationModel(nn.Module):
    def __init__(self):
        super(ColorizationModel, self).__init__()
        # Define the architecture for the colorization model
        self.encoder = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(128, 64, kernel_size=4, stride=2, padding=1),
            nn.ReLU(),
            nn.ConvTranspose2d(64, 3, kernel_size=4, stride=2, padding=1),
            nn.Sigmoid()  # Use sigmoid activation to ensure outputs are in [0, 1] range
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        # Resize the output to match the size of RGB images (224x224)
        x = F.interpolate(x, size=(224, 224), mode='bilinear', align_corners=False)
        return x

# Instantiate the colorization model
colorization_model = ColorizationModel()

BINARY CLASSIFICATION

# Define your binary classification model
class BinaryClassifier(nn.Module):
    def __init__(self, colorization_model, in_features):
        super(BinaryClassifier, self).__init__()
        # Use the encoder of the colorization model as the feature extractor
        self.feature_extractor = colorization_model

        # Add a binary classification layer
        self.classification_layer = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, padding=1),  # Input is RGB, not grayscale
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Flatten(),
            nn.Linear(in_features, 1),  # Adjust 'in_features' based on the feature dimension
            nn.Sigmoid()
        )

    def forward(self, x):
        # Extract features using the encoder of the colorization model
        features = self.feature_extractor(x)

        # Flatten or pool the features as needed
        # Pass through the binary classification layer
        output = self.classification_layer(features)
        return output


# Create an instance of the binary classification model
binary_classifier_model = BinaryClassifier(colorization_model.encoder, in_features)

Welcome to the forums!

This layer requires 1 channel inputs. Can you print the size of the input to the model before it goes into the model?

For example:

...
print(input_image.size())
output = binary_classification_model(input image)
...

I’m getting a different error when running your code.

import torch
import torch.nn as nn
import torch.nn.functional as F

class ColorizationModel(nn.Module):
    def __init__(self):
        super(ColorizationModel, self).__init__()
        # Define the architecture for the colorization model
        self.encoder = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(128, 64, kernel_size=4, stride=2, padding=1),
            nn.ReLU(),
            nn.ConvTranspose2d(64, 3, kernel_size=4, stride=2, padding=1),
            nn.Sigmoid()  # Use sigmoid activation to ensure outputs are in [0, 1] range
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        # Resize the output to match the size of RGB images (224x224)
        print(x.size())
        x = F.interpolate(x, size=(224, 224), mode='bilinear', align_corners=False)
        return x

# Instantiate the colorization model
colorization_model = ColorizationModel()

# Define your binary classification model
class BinaryClassifier(nn.Module):
    def __init__(self, colorization_model, in_features):
        super(BinaryClassifier, self).__init__()
        # Use the encoder of the colorization model as the feature extractor
        self.feature_extractor = colorization_model

        # Add a binary classification layer
        self.classification_layer = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, padding=1),  # Input is RGB, not grayscale
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Flatten(),
            nn.Linear(in_features, 1),  # Adjust 'in_features' based on the feature dimension
            nn.Sigmoid()
        )

    def forward(self, x):
        # Extract features using the encoder of the colorization model
        features = self.feature_extractor(x)
        print(features.size())
        # Flatten or pool the features as needed
        # Pass through the binary classification layer
        output = self.classification_layer(features)
        return output

in_features = 16
# Create an instance of the binary classification model
binary_classifier_model = BinaryClassifier(colorization_model.encoder, in_features)

dummy_data = torch.rand((32, 1, 224, 224))

print(binary_classifier_model(dummy_data).size())
  File "scratches\scratch_168.py", line 57, in forward
    output = self.classification_layer(features)
RuntimeError: Given groups=1, weight of size [64, 3, 3, 3], expected input[32, 128, 112, 112] to have 3 channels, but got 128 channels instead

The issue with this error is that you are passing in the colorization_model.encoder as the features_extractor and the output of the encoder is (batch_size, 128, 112, 112), based on the input size you specified.

But the classification_layer you have set to only take 3 channels.
See the forward pass:

def forward(self, x):
        # Extract features using the encoder of the colorization model
        features = self.feature_extractor(x)   #< -------- output is 128 channels
        print(features.size())
        # Flatten or pool the features as needed
        # Pass through the binary classification layer
        output = self.classification_layer(features) #< ------- expects 3 channels
        return output