I am trying to learn an autoencoder on CIFAR10. I have written the code with sequential in the code you can see below and it worked fine:
class Autoencoder(nn.Module):
def __init__(self):
super(Autoencoder, self).__init__()
self.Flatten = Flatten()
# Input size: [batch, 3, 32, 32]
# Output size: [batch, 3, 32, 32]
self.encoder = nn.Sequential(
nn.Conv2d(3, 12, 4, stride=2, padding=1), # [batch, 12, 16, 16]
nn.ReLU(),
nn.Conv2d(12, 24, 4, stride=2, padding=1), # [batch, 24, 8, 8]
nn.ReLU(),
nn.Conv2d(24, 48, 4, stride=2, padding=1), # [batch, 48, 4, 4]
ReLU(),
)
self.decoder = nn.Sequential(
nn.ConvTranspose2d(48, 24, 4, stride=2, padding=1), # [batch, 24, 8, 8]
nn.ReLU(),
nn.ConvTranspose2d(24, 12, 4, stride=2, padding=1), # [batch, 12, 16, 16]
nn.ReLU(),
nn.ConvTranspose2d(12, 3, 4, stride=2, padding=1), # [batch, 3, 32, 32]
nn.Sigmoid(),
)
def forward(self, x):
encoded = self.encoder(x)
decoded = self.decoder(encoded)
return encoded, decoded
I decided to add a linear layer to reduce the output dimensionality. For this, I rewrote the above code as you can see below (this time I avoided using sequential):
class Autoencoder(nn.Module):
def __init__(self):
super(Autoencoder, self).__init__()
# Encoder architecture
self.conv_1 = nn.Conv2d(3, 12, kernel_size=3, stride=2, padding=1)
self.conv_2 = nn.Conv2d(12, 24, kernel_size=3, stride=2, padding=1)
self.conv_3 = nn.Conv2d(24, 48, kernel_size=3, stride=2, padding=1)
#Decodere architecture: convolutional transpose layers or deconvolutional layers + upsampling pooling layers
self.de_conv_3 = nn.ConvTranspose2d(48, 24, kernel_size=3, padding=1, stride=2)
self.de_conv_2 = nn.ConvTranspose2d(24, 12, kernel_size=3, padding=1, stride=2)
self.de_conv_1 = nn.ConvTranspose2d(12, 3, kernel_size=3, padding=1, stride=2)
self.linear1 = nn.Linear(48*4*4, 10)
self.linear2 = nn.Linear(10, 48*4*4)
self.sigmoid = nn.Sigmoid()
def forward(self, images):
code = self.encode(images)
out = self.decode(code)
return code, out
def encode(self, images):
#print('input shape', images.shape)
code = self.conv_1(images)
code = F.relu(code)
print("conv1 output", code.shape)
code = self.conv_2(code)
code = F.relu(code)
#print('batch batch_norm shape', code.shape)
print("conv2 output", code.shape)
code = self.conv_3(code)
code = F.relu(code)
print("conv3 output", code.shape)
code = code.view(code.size(0), 48 * 4 * 4)
print('code after view', code.shape)
#add a linear layer that flattens and also gives a 10
code = F.relu(self.linear1(code))
print('code after linear1', code.shape)
return code#[100,10]
def decode(self, code):
code = self.linear2(code)
print('code after linear2', code.shape)
#reshape to conv
code = code.view(100, 48, 4, 4)
print('code after view', code.shape)
#upsample, followed by a conv layer, with relu activation function
#this function is called `upsample` in some PyTorch versions
code = F.relu(self.de_conv_3(code))
print('deconv3 output', code.shape)
code = F.relu(self.de_conv_2(code))
print('deconv2 output', code.shape)
code = self.de_conv_1(code)
print("deconv1 output", code.shape)
out = self.sigmoid(code)
#print('final output', out.shape)
return out
But I get an error and when I print the output of each layer, I see that the decoder does not construct the same shape as the encoder outputs. I am confused why I am not getting the same results as the first model written with sequential. Is it because maybe stride=2 in convtranspose layer works differently? but why not when written in sequential.
These are the results of printing the shape of the decoder layers:
conv1 output torch.Size([100, 12, 16, 16])
conv2 output torch.Size([100, 24, 8, 8])
conv3 output torch.Size([100, 48, 4, 4])
code after view torch.Size([100, 768])
code after linear1 torch.Size([100, 10])
code after linear2 torch.Size([100, 768])
code after view torch.Size([100, 48, 4, 4])
deconv3 output torch.Size([100, 24, 7, 7])
deconv2 output torch.Size([100, 12, 13, 13])
deconv1 output torch.Size([100, 3, 25, 25])