Autoencoder output size is different from input size

H1dd3n_Squ1d · December 1, 2023, 10:17pm

I am trying to implement a CNN autoencoder that will take in Mel spectrogram as inputs but am currently running into an issue with the output size being different from the input size. The input has a shape of [1, 64, 302] (channels, n_mels, num_feats). The output has a shape of [1, 64, 304].

class CNN_AE(nn.Module):
    def __init__(self):
        super(CNN_AE, self).__init__()
        self.encoder = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=5, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(32, 16, kernel_size=5, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(16, 8, kernel_size=5, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(8, 16, kernel_size=5, stride=2, padding=1, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose2d(16, 32, kernel_size=5, stride=2),
            nn.ReLU(),
            nn.ConvTranspose2d(32, 1, kernel_size=5, stride=2, padding=1, output_padding=1),
            nn.Sigmoid()
        )
         
    def forward(self, x):
        print(x.shape)
        x = self.encoder(x)
        x = self.decoder(x)
        print(x.shape)
        return x

Am I using the padding and output_padding wrong?