I am trying to implement a CNN autoencoder that takes Mel spectrograms as input, but the output size differs from the input size. The input has a shape of [1, 64, 302] (channels, n_mels, num_feats), while the output has a shape of [1, 64, 304].
class CNN_AE(nn.Module):
    """Convolutional autoencoder whose output has the same shape as its input.

    Each encoder stage is a 5x5 convolution with ``padding=1`` (which shrinks
    every spatial dimension by 2) followed by 2x2 max pooling (which halves
    it, rounding down).  The three transposed convolutions of the decoder map
    a bottleneck of size ``d`` to ``8 * d + 16``, which is not an exact
    inverse of the encoder: a 64 x 302 input comes back as 64 x 304, and other
    sizes can come back larger or smaller than they went in.  ``forward``
    therefore crops or pads the decoder output to the input's spatial size.
    """

    def __init__(self):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=5, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(32, 16, kernel_size=5, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(16, 8, kernel_size=5, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(
                8, 16, kernel_size=5, stride=2, padding=1, output_padding=1
            ),
            nn.ReLU(),
            nn.ConvTranspose2d(16, 32, kernel_size=5, stride=2),
            nn.ReLU(),
            nn.ConvTranspose2d(
                32, 1, kernel_size=5, stride=2, padding=1, output_padding=1
            ),
            nn.Sigmoid(),
        )

    @staticmethod
    def _match_spatial_size(x, height, width):
        """Crop and/or pad the last two dimensions of *x* to (height, width)."""
        # Slicing past the end is a no-op, so this only trims dimensions that
        # are too large.
        x = x[..., :height, :width]
        pad_h = height - x.shape[-2]
        pad_w = width - x.shape[-1]
        if pad_h > 0 or pad_w > 0:
            # Repeat the edge values rather than inserting zeros so that the
            # padded border stays consistent with the neighbouring output.
            x = nn.functional.pad(x, (0, pad_w, 0, pad_h), mode="replicate")
        return x

    def forward(self, x):
        """Encode and decode *x*, returning a tensor with the same shape as *x*."""
        # Remember the input size: pooling rounds down, so the decoder alone
        # cannot know how large the original spectrogram was.
        height, width = x.shape[-2:]
        x = self.decoder(self.encoder(x))
        return self._match_spatial_size(x, height, width)
Am I using the padding and output_padding wrong?