Understanding output_padding: CNN autoencoder input and output sizes do not match

As the title says, I am working on a text autoencoder built with CNNs. The encoder and decoder parameters look like mirror images of each other, but the decoder output ends up a different size from the encoder input. I know it has something to do with padding and how it interacts with the stride, but I don't really understand why this happens or how to fix it. Thanks a lot!
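For reference, these are the size formulas from the PyTorch docs that I have been using to check the shapes (dilation left out since I don't use it; the helper names here are just mine):

def convOut(hIn, kernel, stride, padding=0):
    # nn.Conv2d: floor((hIn + 2*padding - kernel) / stride) + 1
    return (hIn + 2 * padding - kernel) // stride + 1

def deconvOut(hIn, kernel, stride, padding=0, outputPadding=0):
    # nn.ConvTranspose2d: (hIn - 1)*stride - 2*padding + kernel + output_padding
    return (hIn - 1) * stride - 2 * padding + kernel + outputPadding

As I understand it, convOut floors away any remainder, so several input heights can map to the same output height, and output_padding is how ConvTranspose2d chooses which of them to reproduce.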

import math
import torch
import torch.nn as nn
import torch.nn.functional as F


class ConvEncoder(nn.Module):
    def __init__(self, embedDim, maxLength, filterSize, filterShape, latentSize):
        super(ConvEncoder, self).__init__()
        self.embedDim = embedDim
        self.maxLength = maxLength
        self.filterSize = filterSize
        self.filterShape = filterShape
        self.latentSize = latentSize
        
        t1 = maxLength + 2 * (filterShape - 1)
        t2 = int(math.floor((t1 - filterShape) / 2) + 1)  # the 2 is the stride
        t3 = int(math.floor((t2 - filterShape) / 2) + 1) - 2

        #self.embed = embedding
        # conv1 spans the whole embedding dimension, so the width collapses from embedDim to 1
        self.conv1 = nn.Conv2d(1, filterSize, kernel_size=(filterShape, embedDim), stride=2)
        self.batchNorm1 = nn.BatchNorm2d(filterSize)
        self.conv2 = nn.Conv2d(filterSize, filterSize*2, kernel_size=(filterShape, 1), stride=2)
        self.batchNorm2 = nn.BatchNorm2d(filterSize*2)
        # conv3 uses kernel height t3 so the remaining height collapses to 1
        self.conv3 = nn.Conv2d(filterSize*2, latentSize, kernel_size=(t3, 1), stride=2)
        # He-style weight init for the conv layers (std = sqrt(2/n))
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
        
    def forward(self, x):
        # x.size() is (L, emb_dim) if batch_size is 1,
        # so add the missing batch dimension.
        if len(x.size()) < 3:
            x = x.view(1, *x.size())
        # reshape for convolution layer
        if len(x.size()) < 4:
            x = x.view(x.size()[0], 1, x.size()[1], x.size()[2])
        print("input: " + str(x.size()))
        conv1Output = self.conv1(x)
        h1 = F.relu(self.batchNorm1(conv1Output))
        conv2Output = self.conv2(h1)
        h2 = F.relu(self.batchNorm2(conv2Output))
        h3 = F.relu(self.conv3(h2))
    
        print("conv1: " + str(conv1Output.size()))
        print("conv2: " + str(conv2Output.size()))
        print("conv3: " + str(h3.size()))
        return h3

class ConvDecoder(nn.Module):
    def __init__(self, tau, embedDim, maxLength, filterSize, filterShape, latentSize):
        super(ConvDecoder, self).__init__()
        self.tau = tau
        self.maxLength = maxLength
        self.embedDim = embedDim
        #self.embed = embedding
        
        """
        embedWeightSize = embedWeights.size()
        self.vocabSize = embedWeightSize[0]
        self.embeddingDim = embedWeightSize[1]
        print("Vocab size: " + str(self.vocabSize))
        print("embeddingDim: " + str(self.embeddingDim))
        self.emb = nn.Embedding(self.vocabSize, self.embeddingDim)
        self.emb.weight.data.copy_(embedWeights)
        #Freeze embedding weights
        self.emb.weight.requires_grad = False
        """
        
        t1 = maxLength + 2 * (filterShape - 1)
        t2 = int(math.floor((t1 - filterShape) / 2) + 1)  # the 2 is the stride
        t3 = int(math.floor((t2 - filterShape) / 2) + 1) - 2
        
        self.deconv1 = nn.ConvTranspose2d(latentSize, filterSize * 2, kernel_size=(t3, 1), stride=2)
        self.batchNorm1 = nn.BatchNorm2d(filterSize * 2)
        self.deconv2 = nn.ConvTranspose2d(filterSize * 2, filterSize, kernel_size=(filterShape, 1), stride=2)
        self.batchNorm2 = nn.BatchNorm2d(filterSize)
        self.deconv3 = nn.ConvTranspose2d(filterSize, 1, kernel_size=(filterShape, embedDim), stride=2)
        #output_padding=(1,0)
        # He-style weight init for the conv_transpose layers (std = sqrt(2/n))
        for m in self.modules():
            if isinstance(m, nn.ConvTranspose2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
        
    def forward(self, h3):
        deconvOutput1 = self.deconv1(h3)
        h2 = F.relu(self.batchNorm1(deconvOutput1))
        deconvOutput2 = self.deconv2(h2)
        h1 = F.relu(self.batchNorm2(deconvOutput2))
        deconvOutput3 = self.deconv3(h1)
        xHat = F.relu(deconvOutput3)
        xHat = xHat.squeeze()
        
        print("Deconv1: " + str(deconvOutput1.size()))
        print("Deconv2: " + str(deconvOutput2.size()))
        print("Deconv3: " + str(deconvOutput3.size()))
        exit()  # stop here while debugging the sizes; nothing below runs
        # xHat.size() is (L, emb_dim) if batch_size is 1,
        # so add the missing batch dimension back.
        if len(xHat.size()) < 3:
            xHat = xHat.view(1, *xHat.size())
        # normalize
        normXHat = torch.norm(xHat, 2, dim=2, keepdim=True)
        recXHat = xHat / normXHat
        
        return recXHat
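
For completeness, this is roughly how I am running it; the hyperparameters match the printed sizes below (filterShape=3 is implied by them), and the tau value is just a placeholder since it is not used in the forward pass shown:

encoder = ConvEncoder(embedDim=100, maxLength=20, filterSize=300,
                      filterShape=3, latentSize=500)
decoder = ConvDecoder(tau=0.01, embedDim=100, maxLength=20, filterSize=300,
                      filterShape=3, latentSize=500)

x = torch.randn(12, 20, 100)  # (batch, maxLength, embedDim)
h3 = encoder(x)
xHat = decoder(h3)  # prints the deconv sizes and then hits the exit() above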

Output:

input: torch.Size([12, 1, 20, 100])
conv1: torch.Size([12, 300, 9, 1])
conv2: torch.Size([12, 600, 4, 1])
conv3: torch.Size([12, 500, 1, 1])
Deconv1: torch.Size([12, 600, 3, 1])
Deconv2: torch.Size([12, 300, 7, 1])
Deconv3: torch.Size([12, 1, 15, 100])
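
Plugging the numbers into the formulas above, I think I can see where the rows get lost. With maxLength=20, filterShape=3 and stride 2, the encoder heights go 20 -> 9 -> 4 -> 1, and two of those steps floor away a remainder (heights 19 and 20 both map to 9, and heights 3 and 4 both map to 1). Reversing naively gives (1-1)*2+3 = 3, then (3-1)*2+3 = 7, then (7-1)*2+3 = 15, which is exactly the Deconv1/2/3 heights printed above. So my current guess, and please correct me if this is the wrong way to think about it, is that the two lossy steps each need one extra row of output_padding:

self.deconv1 = nn.ConvTranspose2d(latentSize, filterSize * 2, kernel_size=(t3, 1),
                                  stride=2, output_padding=(1, 0))  # 1 -> 4 instead of 3
self.deconv2 = nn.ConvTranspose2d(filterSize * 2, filterSize, kernel_size=(filterShape, 1),
                                  stride=2)                          # 4 -> 9, exact
self.deconv3 = nn.ConvTranspose2d(filterSize, 1, kernel_size=(filterShape, embedDim),
                                  stride=2, output_padding=(1, 0))  # 9 -> 20 instead of 19

Is output_padding the right tool here, or should I be padding in the encoder instead?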

I am really new to this, so sorry for my ignorance, and thank you for your help!