Output size does not match input size in a 3D convolutional network

Hello everyone,
I am trying to run a 3D convolutional encoder-decoder, but I am having trouble getting the output size to match the input size.
The code is as follows:

import torch
import torch.nn as nn

class EncoderDecoder3D(nn.Module):
    def __init__(self):
        super(EncoderDecoder3D, self).__init__()

        # Encoder
        self.encoder = nn.Sequential(
            nn.Conv3d(2, 16, kernel_size=4, stride=1, padding=0),
            nn.Conv3d(16, 32, kernel_size=(5, 5, 3), stride=(2, 2, 1), padding=0),
            nn.Conv3d(32, 64, kernel_size=3, stride=(2, 2, 1), padding=0),
            nn.Conv3d(64, 128, kernel_size=(5, 5, 3), stride=(2, 2, 1), padding=0)
        )

        # Bottleneck (Linear layers)
        self.bottleneck = nn.Sequential(
            nn.Linear(128 * 3 * 3 * 17, 100),
            nn.Linear(100, 128 * 3 * 3 * 17)
        )

        # Decoder
        self.decoder = nn.Sequential(
            nn.ConvTranspose3d(128, 64, kernel_size=(5, 5, 3), stride=(2, 2, 1), padding=0),
            nn.ConvTranspose3d(64, 32, kernel_size=3, stride=(2, 2, 1), padding=0),
            nn.ConvTranspose3d(32, 16, kernel_size=(5, 5, 3), stride=(2, 2, 1), padding=0),
            nn.ConvTranspose3d(16, 1, kernel_size=4, stride=1, padding=0)
        )

    def forward(self, x):
        # Encoder
        x1 = self.encoder(x)

        # Bottleneck
        x2 = self.bottleneck(x1.view(x.size(0), -1))

        # Decoder
        x2 = x2.view(x.size(0), 128, 3, 3, 17)
        output = self.decoder(x2)

        return output

model = EncoderDecoder3D()


For an input of size (1, 2, 40, 40, 20) I would like to get an output of size (1, 1, 40, 40, 20), but I cannot seem to achieve that.
I know the issue is related to the kernel sizes and strides, but I have not been able to solve it.
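If it helps, a quick check on just the encoder (if I am applying floor((size - kernel) / stride) + 1 correctly for padding=0) gives me a feature map that does not match the 128 * 3 * 3 * 17 the bottleneck expects:

dummy = torch.rand(1, 2, 40, 40, 20)
feats = model.encoder(dummy)
print(feats.shape)  # torch.Size([1, 128, 2, 2, 11]), i.e. 5632 features instead of 19584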
Thank you

Apologies for the formatting issues, I am still new here

Try this:

import torch
import torch.nn as nn

class EncoderDecoder3D(nn.Module):
    def __init__(self):
        super(EncoderDecoder3D, self).__init__()

        # Encoder
        self.encoder = nn.Sequential(
            nn.Conv3d(2, 16, kernel_size=(4, 4, 2), stride=1, padding=0),
            nn.Conv3d(16, 32, kernel_size=(5, 5, 1), stride=(2, 2, 1), padding=0),
            nn.Conv3d(32, 64, kernel_size=3, stride=(2, 2, 1), padding=0),
            nn.Conv3d(64, 128, kernel_size=(4, 4, 1), stride=(2, 2, 1), padding=0)
        )

        # Bottleneck (Linear layers)
        self.bottleneck = nn.Sequential(
            nn.Linear(128 * 3 * 3 * 17, 100),
            nn.Linear(100, 128 * 3 * 3 * 17)
        )

        # Decoder
        self.decoder = nn.Sequential(
            nn.ConvTranspose3d(128, 64, kernel_size=(4, 4, 1), stride=(2, 2, 1), padding=0),
            nn.ConvTranspose3d(64, 32, kernel_size=3, stride=(2, 2, 1), padding=0),
            nn.ConvTranspose3d(32, 16, kernel_size=(5, 5, 1), stride=(2, 2, 1), padding=0),
            nn.ConvTranspose3d(16, 1, kernel_size=(4, 4, 2), stride=1, padding=0)
        )

    def forward(self, x):
        # Encoder
        x1 = self.encoder(x)

        # Bottleneck
        x2 = self.bottleneck(x1.view(x.size(0), -1))

        # Decoder
        x2 = x2.view(x.size(0), 128, 3, 3, 17)
        output = self.decoder(x2)

        return output

model = EncoderDecoder3D()
dummy_input = torch.rand(1, 2, 40, 40, 20)
outputs = model(dummy_input)

print(outputs.size())

That’s assuming you want to keep the bottleneck unchanged.
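
In case it is useful, here is roughly why those kernel sizes line up, plus a quick way to check: with padding=0, each Conv3d maps a dimension to floor((size - kernel) / stride) + 1 and each ConvTranspose3d maps it back to (size - 1) * stride + kernel, so the spatial shape goes 40, 40, 20 -> 37, 37, 19 -> 17, 17, 19 -> 8, 8, 17 -> 3, 3, 17 through the encoder and retraces those steps through the decoder. A short trace over the layers (skipping the bottleneck, which maps back to the same flattened size) shows it:

x = torch.rand(1, 2, 40, 40, 20)
for layer in model.encoder:
    x = layer(x)
    print("encoder", tuple(x.shape))
for layer in model.decoder:
    x = layer(x)
    print("decoder", tuple(x.shape))

The last line should print (1, 1, 40, 40, 20).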