As titled, I am working on a text autoencoder with CNN The parameters of the encoder and decoder seem to be the same but the output is of different size. I know it has something to do with the padding and the confusion with stride. But I don’t really understand why is it this case and how to fix it. Thanks a lot!
class ConvEncoder(nn.Module):
def __init__(self, embedDim, maxLength, filterSize, filterShape, latentSize):
super(ConvEncoder, self).__init__()
self.embedDim = embedDim
self.maxLength = maxLength
self.filterSize = filterSize
self. filterShape = filterShape
self.latentSize = latentSize
t1 = maxLength + 2 * (filterShape - 1)
t2 = int(math.floor((t1 - filterShape) / 2) + 1) # "2" means stride size
t3 = int(math.floor((t2 - filterShape) / 2) + 1) - 2
#self.embed = embedding
self.conv1 = nn.Conv2d(1, filterSize, kernel_size=(filterShape, embedDim), stride=2)
self.batchNorm1 = nn.BatchNorm2d(filterSize)
self.conv2 = nn.Conv2d(filterSize, filterSize*2, kernel_size=(filterShape, 1), stride=2)
self.batchNorm2 = nn.BatchNorm2d(filterSize*2)
self.conv3 = nn.Conv2d(filterSize*2, latentSize, kernel_size=(t3, 1), stride=2)
# weight initialize for conv layer
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
def forward(self, x):
# x.size() is (L, emb_dim) if batch_size is 1.
# So interpolate x's dimension if batch_size is 1.
if len(x.size()) < 3:
x = x.view(1, *x.size())
# reshape for convolution layer
if len(x.size()) < 4:
x = x.view(x.size()[0], 1, x.size()[1], x.size()[2])
print("input: " + str(x.size()))
conv1Output = self.conv1(x)
h1 = F.relu(self.batchNorm1(conv1Output))
conv2Output = self.conv2(h1)
h2 = F.relu(self.batchNorm2(conv2Output))
h3 = F.relu(self.conv3(h2))
print("conv1: " + str(conv1Output.size()))
print("conv2: " + str(conv2Output.size()))
print("conv3: " + str(h3.size()))
return h3
class ConvDecoder(nn.Module):
def __init__(self, tau, embedDim, maxLength, filterSize, filterShape, latentSize):
super(ConvDecoder, self).__init__()
self.tau = tau
self.maxLength = maxLength
self.embedDim = embedDim
#self.embed = embedding
"""
embedWeightSize = embedWeights.size()
self.vocabSize = embedWeightSize[0]
self.embeddingDim = embedWeightSize[1]
print("Vocab size: " + str(self.vocabSize))
print("embeddingDim: " + str(self.embeddingDim))
self.emb = nn.Embedding(self.vocabSize, self.embeddingDim)
self.emb.weight.data.copy_(embedWeights)
#Freeze embedding weights
self.emb.weight.requires_grad = False
"""
t1 = maxLength + 2 * (filterShape - 1)
t2 = int(math.floor((t1 - filterShape) / 2) + 1) # "2" means stride size
t3 = int(math.floor((t2 - filterShape) / 2) + 1) - 2
self.deconv1 = nn.ConvTranspose2d(latentSize, filterSize * 2, kernel_size=(t3, 1), stride=2)
self.batchNorm1 = nn.BatchNorm2d(filterSize * 2)
self.deconv2 = nn.ConvTranspose2d(filterSize * 2, filterSize, kernel_size=(filterShape, 1), stride=2)
self.batchNorm2 = nn.BatchNorm2d(filterSize)
self.deconv3 = nn.ConvTranspose2d(filterSize, 1, kernel_size=(filterShape, embedDim), stride=2)
#output_padding=(1,0)
# weight initialize for conv_transpose layer
for m in self.modules():
if isinstance(m, nn.ConvTranspose2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
def forward(self, h3):
deconvOutput1 = self.deconv1(h3)
h2 = F.relu(self.batchNorm1(deconvOutput1))
deconvOutput2 = self.deconv2(h2)
h1 = F.relu(self.batchNorm2(deconvOutput2))
deconvOutput3 = self.deconv3(h1)
xHat = F.relu(deconvOutput3)
xHat = xHat.squeeze()
print("Deconv1: " + str(deconvOutput1.size()))
print("Deconv2: " + str(deconvOutput2.size()))
print("Deconv3: " + str(deconvOutput3.size()))
exit()
# x.size() is (L, emb_dim) if batch_size is 1.
# So interpolate x's dimension if batch_size is 1.
if len(xHat.size()) < 3:
xHat = xHat.view(1, *xHat.size())
# normalize
normXHat = torch.norm(xHat, 2, dim=2, keepdim=True)
recXHat = xHat / normXHat
return recXHat
Output:
input: torch.Size([12, 1, 20, 100])
conv1: torch.Size([12, 300, 9, 1])
conv2: torch.Size([12, 600, 4, 1])
conv3: torch.Size([12, 500, 1, 1])
Deconv1: torch.Size([12, 600, 3, 1])
Deconv2: torch.Size([12, 300, 7, 1])
Deconv3: torch.Size([12, 1, 15, 100])
I am really new to this. Sorry for my ignorance and thank you for your help