Adapting the latent vector size of a convolutional autoencoder

Hello, I have been trying to find an out-of-the-box autoencoder that lets me choose the size of the latent vector.
I’m working on a problem where I want to encode 128x128 grayscale images into a 256-dimensional latent vector using a convolutional autoencoder.

I’ve been trying to adapt some existing examples, but without success.
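
For reference, these are the imports and settings the code below assumes. The kernel size and base channel count are just the values from the example I started from, and the learning rate is a placeholder:

import torch
import torch.nn as nn
import torch.nn.functional as F

image_channels = 1    # grayscale input
init_channels = 8     # base number of feature maps in the encoder
kernel_size = 4       # taken from the example I started from
latent_dim = 256      # the latent size I would like to end up with
lr = 1e-3             # placeholder learning rate
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')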

class Encoder(nn.Module):
    
    def __init__(self):
        super().__init__()
        
        ### Convolutional section: five stride-2 convs shrink 128x128 down to a small feature map
        
        self.enc1 = nn.Conv2d(
            in_channels=image_channels, out_channels=init_channels, kernel_size=kernel_size, 
            stride=2, padding=1
        )
        self.enc2 = nn.Conv2d(
            in_channels=init_channels, out_channels=init_channels*2, kernel_size=kernel_size, 
            stride=2, padding=1
        )
        self.enc3 = nn.Conv2d(
            in_channels=init_channels*2, out_channels=init_channels*4, kernel_size=kernel_size, 
            stride=2, padding=1
        )
        self.enc4 = nn.Conv2d(
            in_channels=init_channels*4, out_channels=init_channels*8,  # = 64 with init_channels = 8
            kernel_size=kernel_size, stride=2, padding=1
        )
        self.enc5 = nn.Conv2d(
            in_channels=init_channels*8, out_channels=128, kernel_size=kernel_size, 
            stride=2, padding=0
        )
        
        # fully connected layers for learning representations
        self.fc1 = nn.Linear(128, 128)
        self.fc_mu = nn.Linear(128, latent_dim)       # mean of the latent distribution
        self.fc_log_var = nn.Linear(128, latent_dim)  # log variance of the latent distribution
        self.fc2 = nn.Linear(latent_dim, 128)         # projects the sampled latent vector back to 128 features
        
    def reparameterize(self, mu, log_var):
        """
        :param mu: mean from the encoder's latent space
        :param log_var: log variance from the encoder's latent space
        """
        std = torch.exp(0.5*log_var) # standard deviation
        eps = torch.randn_like(std) # `randn_like` as we need the same size
        sample = mu + (eps * std) # sampling
        return sample
        
    def forward(self, x):
        # encoding
        x = F.relu(self.enc1(x))
        x = F.relu(self.enc2(x))
        x = F.relu(self.enc3(x))
        x = F.relu(self.enc4(x))
        x = F.relu(self.enc5(x))
        
        # global average pool the feature maps to a single 128-dim vector per image
        batch, _, _, _ = x.shape
        x = F.adaptive_avg_pool2d(x, 1).reshape(batch, -1)
        hidden = self.fc1(x)
        
        # get `mu` and `log_var`
        mu = self.fc_mu(hidden)
        log_var = self.fc_log_var(hidden)
        
        # get the latent vector through reparameterization
        z = self.reparameterize(mu, log_var)
        z = self.fc2(z)                   # fc2 outputs 128 features
        z = z.view(-1, latent_dim, 1, 1)  # reshape for the decoder's first conv
        
        return mu, log_var, z
    
encoder = Encoder()
encoder.to(device)

class Decoder(nn.Module):
    
    def __init__(self):
        super().__init__()
        # seven x2 upsamplings take the 1x1 latent map back up to 128x128
        self.dec1 = nn.Upsample(scale_factor=2)
        self.dec2 = nn.Upsample(scale_factor=2)
        self.dec3 = nn.Upsample(scale_factor=2)
        self.dec4 = nn.Upsample(scale_factor=2)
        self.dec5 = nn.Upsample(scale_factor=2)
        self.dec6 = nn.Upsample(scale_factor=2)
        self.dec7 = nn.Upsample(scale_factor=2)
        
        # the convs bring the channel count from latent_dim down to a single output channel
        self.conv1 = nn.Conv2d(latent_dim, 128, 3, padding=1)
        self.conv2 = nn.Conv2d(128, 64, 3, padding=1)
        self.conv3 = nn.Conv2d(64, 32, 3, padding=1)
        self.conv4 = nn.Conv2d(32, 16, 3, padding=1)
        self.conv5 = nn.Conv2d(16, 8, 3, padding=1)
        self.conv6 = nn.Conv2d(8, 4, 3, padding=1)
        self.conv7 = nn.Conv2d(4, 1, 3, padding=1)
        
    def forward(self, x):
        x = self.dec1(x)
        x = F.relu(self.conv1(x))
        
        x = self.dec2(x)
        x = F.relu(self.conv2(x))
        
        x = self.dec3(x)
        x = F.relu(self.conv3(x))
        
        x = self.dec4(x)
        x = F.relu(self.conv4(x))
        
        x = self.dec5(x)
        x = F.relu(self.conv5(x))
        
        x = self.dec6(x)
        x = F.relu(self.conv6(x))
        
        x = self.dec7(x)
        x = F.relu(self.conv7(x))
        
        reconstruction = torch.sigmoid(x)
        
        return reconstruction, x
    
decoder = Decoder()
decoder.to(device)
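
To make the shapes concrete, this is the quick sanity check I run once both modules are built (using the settings above). The comments show the shapes I was expecting, not necessarily what I actually get:

with torch.no_grad():
    dummy = torch.randn(8, image_channels, 128, 128, device=device)  # dummy batch of grayscale images
    mu, log_var, z = encoder(dummy)
    print(mu.shape, log_var.shape, z.shape)  # expecting (8, 256), (8, 256), (8, 256, 1, 1)
    reconstruction, _ = decoder(z)
    print(reconstruction.shape)              # expecting (8, 1, 128, 128)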

params_to_optimize = [
    {'params': encoder.parameters()},
    {'params': decoder.parameters()}
]

optim = torch.optim.Adam(params_to_optimize, lr=lr, weight_decay=1e-05)
criterion = nn.BCELoss(reduction='sum')

I adapted a standard VAE implementation into the code above, but I can’t even get it to converge; the reconstructions look essentially random.
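
For completeness, my training step roughly follows the example I adapted (simplified here; the KL term is the standard closed form for a diagonal Gaussian against a unit Gaussian):

def final_loss(bce_loss, mu, log_var):
    # reconstruction term plus KL divergence between the encoder's Gaussian and N(0, I)
    kld = -0.5 * torch.sum(1 + log_var - mu.pow(2) - log_var.exp())
    return bce_loss + kld

# inside the training loop (sketch):
# mu, log_var, z = encoder(images)
# reconstruction, _ = decoder(z)
# bce = criterion(reconstruction, images)
# loss = final_loss(bce, mu, log_var)
# optim.zero_grad()
# loss.backward()
# optim.step()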

I would like to know if you can point me to some examples where the size of the latent vector can be customized, or tell me why my solution is failing so badly.

Kind regards