Stride in Conv2d vs. ConvTranspose2d

I am trying to train an autoencoder on CIFAR10. I first wrote the model with nn.Sequential, as you can see below, and it worked fine:

import torch.nn as nn

class Autoencoder(nn.Module):
    def __init__(self):
        super(Autoencoder, self).__init__()
        self.Flatten = nn.Flatten()

        # Input size:  [batch, 3, 32, 32]
        # Output size: [batch, 3, 32, 32]
        self.encoder = nn.Sequential(
            nn.Conv2d(3, 12, 4, stride=2, padding=1),            # [batch, 12, 16, 16]
            nn.ReLU(),
            nn.Conv2d(12, 24, 4, stride=2, padding=1),           # [batch, 24, 8, 8]
            nn.ReLU(),
            nn.Conv2d(24, 48, 4, stride=2, padding=1),           # [batch, 48, 4, 4]
            nn.ReLU(),
        )
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(48, 24, 4, stride=2, padding=1),  # [batch, 24, 8, 8]
            nn.ReLU(),
            nn.ConvTranspose2d(24, 12, 4, stride=2, padding=1),  # [batch, 12, 16, 16]
            nn.ReLU(),
            nn.ConvTranspose2d(12, 3, 4, stride=2, padding=1),   # [batch, 3, 32, 32]
            nn.Sigmoid(),
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded

I then decided to add a linear layer to reduce the dimensionality of the latent code. For this, I rewrote the model as you can see below (this time without nn.Sequential):

import torch.nn as nn
import torch.nn.functional as F

class Autoencoder(nn.Module):

    def __init__(self):
        super(Autoencoder, self).__init__()

        # Encoder architecture
        self.conv_1 = nn.Conv2d(3, 12, kernel_size=3, stride=2, padding=1)
        self.conv_2 = nn.Conv2d(12, 24, kernel_size=3, stride=2, padding=1)
        self.conv_3 = nn.Conv2d(24, 48, kernel_size=3, stride=2, padding=1)
        # Decoder architecture: transposed convolution (deconvolution) layers
        self.de_conv_3 = nn.ConvTranspose2d(48, 24, kernel_size=3, padding=1, stride=2)
        self.de_conv_2 = nn.ConvTranspose2d(24, 12, kernel_size=3, padding=1, stride=2)
        self.de_conv_1 = nn.ConvTranspose2d(12, 3, kernel_size=3, padding=1, stride=2)

        # Linear bottleneck: flattened features -> 10-dimensional code and back
        self.linear1 = nn.Linear(48 * 4 * 4, 10)
        self.linear2 = nn.Linear(10, 48 * 4 * 4)
        self.sigmoid = nn.Sigmoid()

    def forward(self, images):
        code = self.encode(images)
        out = self.decode(code)
        return code, out

    def encode(self, images):
        code = self.conv_1(images)
        code = F.relu(code)
        print("conv1 output", code.shape)

        code = self.conv_2(code)
        code = F.relu(code)
        print("conv2 output", code.shape)

        code = self.conv_3(code)
        code = F.relu(code)
        print("conv3 output", code.shape)

        # flatten, then reduce to the 10-dimensional code with a linear layer
        code = code.view(code.size(0), 48 * 4 * 4)
        print('code after view', code.shape)
        code = F.relu(self.linear1(code))
        print('code after linear1', code.shape)
        return code  # [100, 10]

    def decode(self, code):
        code = self.linear2(code)
        print('code after linear2', code.shape)
        # reshape back to the convolutional feature map
        code = code.view(code.size(0), 48, 4, 4)
        print('code after view', code.shape)

        # transposed convolutions with ReLU activations
        code = F.relu(self.de_conv_3(code))
        print('deconv3 output', code.shape)
        code = F.relu(self.de_conv_2(code))
        print('deconv2 output', code.shape)

        code = self.de_conv_1(code)
        print("deconv1 output", code.shape)
        out = self.sigmoid(code)

        return out

But I get an error, and when I print the shape of each layer's output, I see that the decoder does not reconstruct the shapes that the encoder produces. I am confused about why I am not getting the same results as the first model written with nn.Sequential. Does stride=2 behave differently in a ConvTranspose2d layer? And if so, why didn't it cause a problem in the Sequential version?
These are the printed shapes of the encoder and decoder outputs (a minimal single-layer check follows the log):

conv1 output torch.Size([100, 12, 16, 16])
conv2 output torch.Size([100, 24, 8, 8])
conv3 output torch.Size([100, 48, 4, 4])
code after view torch.Size([100, 768])
code after linear1 torch.Size([100, 10])
code after linear2 torch.Size([100, 768])
code after view torch.Size([100, 48, 4, 4])
deconv3 output torch.Size([100, 24, 7, 7])
deconv2 output torch.Size([100, 12, 13, 13])
deconv1 output torch.Size([100, 3, 25, 25])
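
The mismatch already shows up with a single transposed-convolution layer in isolation. Here is a minimal check, with a dummy input and the layer parameters copied from my second model:

import torch
import torch.nn as nn

x = torch.randn(100, 48, 4, 4)   # same shape as the encoder output
deconv = nn.ConvTranspose2d(48, 24, kernel_size=3, stride=2, padding=1)
print(deconv(x).shape)           # torch.Size([100, 24, 7, 7]), not [100, 24, 8, 8]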

The two models do not use the same kernel size: the Sequential model uses kernel_size=4, while the rewritten one uses kernel_size=3, and that is what explains the difference in the activation shapes, not the move away from nn.Sequential. For ConvTranspose2d (with dilation=1) the output size is (H_in - 1) * stride - 2 * padding + kernel_size + output_padding, so with stride=2 and padding=1 a kernel of 4 exactly doubles the spatial size (4 -> 8), while a kernel of 3 gives 2 * H_in - 1 (4 -> 7), which is exactly what your shape log shows. You can either switch back to kernel_size=4 or keep kernel_size=3 and pass output_padding=1 to each ConvTranspose2d.
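
As a sketch of the fix, either change below should restore the 4 -> 8 -> 16 -> 32 upsampling path; output_padding is the standard ConvTranspose2d argument for resolving this ambiguity:

import torch
import torch.nn as nn

x = torch.randn(100, 48, 4, 4)

# Option 1: use kernel_size=4, as in the Sequential model
deconv_k4 = nn.ConvTranspose2d(48, 24, kernel_size=4, stride=2, padding=1)
print(deconv_k4(x).shape)   # torch.Size([100, 24, 8, 8])

# Option 2: keep kernel_size=3 and add output_padding=1
deconv_k3 = nn.ConvTranspose2d(48, 24, kernel_size=3, stride=2, padding=1, output_padding=1)
print(deconv_k3(x).shape)   # torch.Size([100, 24, 8, 8])

Apply whichever option you prefer to all three de_conv_* layers, and the decoder will mirror the encoder shapes again.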