Discrepancy in the shape of the tensor from an autoencoder

I’m relatively new to PyTorch and I’m trying to build an encoder-decoder architecture that encodes a sequence of images of shape (batch, channels, seq_len, height, width) = (2, 3, 25, 120, 160) using 3D convolutions (torch.nn.Conv3d). The output of the encoder is supposed to be (2, 4, 25, 1, 1), but what I’m getting is (2, 4, 19, 1, 1). What might be the reason, and where am I going wrong? Please find my code below:

class Encoder_Decoder(nn.Module):

    def __init__(self):
        super(Encoder_Decoder, self).__init__()

        # Encoder
        self.encoder = nn.Sequential(
            nn.Conv3d(in_channels=3, out_channels=16, kernel_size=3, stride=(1, 2, 2)), nn.ReLU(), nn.BatchNorm3d(16),
            nn.Conv3d(in_channels=16, out_channels=32, kernel_size=3, stride=(1, 2, 2)), nn.ReLU(), nn.BatchNorm3d(32),
            nn.Conv3d(in_channels=32, out_channels=16, kernel_size=3, stride=(1, 2, 2)), nn.ReLU(), nn.BatchNorm3d(16),
            nn.Conv3d(in_channels=16, out_channels=8, kernel_size=1, stride=(1, 3, 4)), nn.ReLU(), nn.BatchNorm3d(8),
            nn.Conv3d(in_channels=8, out_channels=4, kernel_size=1, stride=(1, 5, 5)), nn.ReLU(), nn.BatchNorm3d(4))

        # Decoder
        self.decoder = nn.Sequential(
            nn.ConvTranspose3d(in_channels=4, out_channels=8, kernel_size=3), nn.ReLU(), nn.Upsample((25, 120, 160)),
            nn.ConvTranspose3d(in_channels=8, out_channels=16, kernel_size=3), nn.ReLU(), nn.Upsample((25, 120, 160)),
            nn.ConvTranspose3d(in_channels=16, out_channels=32, kernel_size=3), nn.ReLU(), nn.Upsample((25, 120, 160)),
            nn.ConvTranspose3d(in_channels=32, out_channels=16, kernel_size=3), nn.ReLU(), nn.Upsample((25, 120, 160)),
            nn.ConvTranspose3d(in_channels=16, out_channels=3, kernel_size=3), nn.ReLU(), nn.Upsample((25, 120, 160)))

    def forward(self, x):
        x = x.reshape(2, 3, 25, 120, 160)
        print(x.shape)
        x1 = self.encoder(x)
        print(x1.shape)  # expected (2, 4, 25, 1, 1), but prints (2, 4, 19, 1, 1)
        x2 = self.decoder(x1)
        print(x2.shape)
        return x1, x2.reshape(2, 25, 3, 120, 160)
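You can see where the depth dimension shrinks by passing the input through the encoder one layer at a time. A minimal sketch (reusing your Encoder_Decoder module from above):

import torch
import torch.nn as nn

model = Encoder_Decoder()
x = torch.randn(2, 3, 25, 120, 160)

# Run the input through the encoder layer by layer and print the shape
# after every convolution to see exactly where the depth dimension shrinks.
out = x
for layer in model.encoder:
    out = layer(out)
    if isinstance(layer, nn.Conv3d):
        print(layer.kernel_size, out.shape)
# The depth goes 25 -> 23 -> 21 -> 19 through the three kernel_size=3 convs,
# then stays at 19 through the two kernel_size=1 convs.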

The depth shrinks because each of the first three nn.Conv3d layers uses kernel_size=3 with stride 1 and no padding in that dimension, so each one reduces the depth by 2 (via floor((D + 2*padding - kernel) / stride) + 1 = D - 2 per layer): 25 -> 23 -> 21 -> 19. If you don’t want to reduce the depth of your input volume, you could use e.g. a kernel size of 1 for this dimension (or another kernel size with the appropriate padding):

encoder = nn.Sequential(
    nn.Conv3d(in_channels=3, out_channels=16, kernel_size=(1, 3, 3), stride=(1,2,2)), nn.ReLU(), nn.BatchNorm3d(16),
    nn.Conv3d(in_channels=16, out_channels=32, kernel_size=(1, 3, 3), stride=(1,2,2)), nn.ReLU(), nn.BatchNorm3d(32),
    nn.Conv3d(in_channels=32, out_channels=16, kernel_size=(1, 3, 3), stride=(1,2,2)), nn.ReLU(), nn.BatchNorm3d(16),
    nn.Conv3d(in_channels=16, out_channels=8, kernel_size=1, stride=(1,3,4)), nn.ReLU(), nn.BatchNorm3d(8),
    nn.Conv3d(in_channels=8, out_channels=4, kernel_size=1, stride=(1,5,5)), nn.ReLU(), nn.BatchNorm3d(4)
)

x = torch.randn(1,3,25,120,160)
out = encoder(x)
print(out.shape)
> torch.Size([1, 4, 25, 1, 1])
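
And to sketch the padding alternative mentioned above (assuming you want to keep a temporal receptive field of 3): with kernel_size=3 in the depth dimension and a padding of 1 there, the output depth is (25 + 2*1 - 3) / 1 + 1 = 25, so the sequence length is also preserved:

conv = nn.Conv3d(in_channels=3, out_channels=16, kernel_size=3,
                 stride=(1, 2, 2), padding=(1, 0, 0))  # pad only the depth dim

x = torch.randn(1, 3, 25, 120, 160)
out = conv(x)
print(out.shape)
> torch.Size([1, 16, 25, 59, 79])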