Hi,
I’m trying to adapt the architecture from here to run on 3D volumes of size 182 x 218 x 182, with the first dimension treated as channels (i.e., many more channels than the standard 3 of RGB, and a non-square height-to-width ratio).
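For reference, here is a quick back-of-the-envelope check (a sketch of my own, using the standard floor formula for convolution output sizes) of how the 218 x 182 spatial dimensions shrink through the encoder's three stride-2 convolutions:

def conv_out(n, k=3, s=2, p=1):
    # floor((n + 2*padding - kernel_size) / stride) + 1
    return (n + 2 * p - k) // s + 1

h, w = 218, 182
for i in range(3):
    h, w = conv_out(h), conv_out(w)
    print(f"after stride-2 conv {i + 1}: {h} x {w}")
# after stride-2 conv 1: 109 x 91
# after stride-2 conv 2: 55 x 46
# after stride-2 conv 3: 28 x 23

So the flattened vector entering the bottleneck should have 64 * 28 * 23 = 41216 elements, which matches the torchinfo output below.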
Here are my Encoder, Decoder and Autoencoder:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pytorch_lightning as pl

class Encoder(nn.Module):
    def __init__(self,
                 num_input_channels : int,
                 base_channel_size : int,
                 latent_dim : int,
                 act_fn : object = nn.GELU):
        """
        Inputs:
            - num_input_channels : Number of input channels of the image. For CIFAR, this parameter is 3
            - base_channel_size : Number of channels we use in the first convolutional layers. Deeper layers might use a duplicate of it.
            - latent_dim : Dimensionality of latent representation z
            - act_fn : Activation function used throughout the encoder network
        """
        super().__init__()
        c_hid = base_channel_size
        self.net = nn.Sequential(
            nn.Conv2d(num_input_channels, c_hid, kernel_size=3, padding=1, stride=2),  # 218x182 => 109x91
            act_fn(),
            nn.Conv2d(c_hid, c_hid, kernel_size=3, padding=1),
            act_fn(),
            nn.Conv2d(c_hid, 2*c_hid, kernel_size=3, padding=1, stride=2),  # 109x91 => 55x46
            act_fn(),
            nn.Conv2d(2*c_hid, 2*c_hid, kernel_size=3, padding=1),
            act_fn(),
            nn.Conv2d(2*c_hid, 2*c_hid, kernel_size=3, padding=1, stride=2),  # 55x46 => 28x23
            act_fn(),
            nn.Flatten(),  # Image grid to single feature vector
            nn.Linear(2*16*c_hid, latent_dim)
        )

    def forward(self, x):
        return self.net(x)
class Decoder(nn.Module):
    def __init__(self,
                 num_input_channels : int,
                 base_channel_size : int,
                 latent_dim : int,
                 act_fn : object = nn.GELU):
        """
        Inputs:
            - num_input_channels : Number of channels of the image to reconstruct. For CIFAR, this parameter is 3
            - base_channel_size : Number of channels we use in the last convolutional layers. Early layers might use a duplicate of it.
            - latent_dim : Dimensionality of latent representation z
            - act_fn : Activation function used throughout the decoder network
        """
        super().__init__()
        c_hid = base_channel_size
        self.linear = nn.Sequential(
            nn.Linear(latent_dim, 2*16*c_hid),
            # nn.Linear(latent_dim, 2*32*c_hid),
            act_fn()
        )
        self.net = nn.Sequential(
            nn.ConvTranspose2d(2*c_hid, 2*c_hid, kernel_size=3, output_padding=1, padding=1, stride=(2,2)),  # 4x4 => 8x8
            act_fn(),
            nn.Conv2d(2*c_hid, 2*c_hid, kernel_size=3, padding=1),
            act_fn(),
            nn.ConvTranspose2d(2*c_hid, c_hid, kernel_size=3, output_padding=1, padding=1, stride=(5,5)),  # 8x8 => 37x37
            act_fn(),
            nn.Conv2d(c_hid, c_hid, kernel_size=3, padding=1),
            act_fn(),
            nn.ConvTranspose2d(c_hid, num_input_channels, kernel_size=3, output_padding=1, padding=1, stride=(6,5)),  # 37x37 => 218x182
            nn.ReLU()
        )

    def forward(self, x):
        x = self.linear(x)
        x = x.reshape(x.shape[0], -1, 4, 4)
        x = self.net(x)
        return x
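The unusual strides in the last two transposed convolutions, (5,5) and (6,5), are there to blow the 4 x 4 grid back up to 218 x 182. A quick sketch with the ConvTranspose2d output-size formula confirms the arithmetic:

def convT_out(n, s, k=3, p=1, op=1):
    # (n - 1) * stride - 2 * padding + kernel_size + output_padding
    return (n - 1) * s - 2 * p + k + op

h, w = 4, 4
h, w = convT_out(h, 2), convT_out(w, 2)  # 8 x 8
h, w = convT_out(h, 5), convT_out(w, 5)  # 37 x 37
h, w = convT_out(h, 6), convT_out(w, 5)  # 218 x 182
print(h, w)  # 218 182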
class Autoencoder(pl.LightningModule):
    def __init__(self,
                 base_channel_size: int,
                 latent_dim: int,
                 encoder_class : object = Encoder,
                 decoder_class : object = Decoder,
                 # num_input_channels: int = 3,
                 num_input_channels: int = 182,
                 # width: int = 32,
                 width: int = 218,
                 # height: int = 32):
                 height: int = 182):
        super().__init__()
        # Saving hyperparameters of autoencoder
        self.save_hyperparameters()
        # Creating encoder and decoder
        self.encoder = encoder_class(num_input_channels, base_channel_size, latent_dim)
        self.decoder = decoder_class(num_input_channels, base_channel_size, latent_dim)
        # Example input array needed for visualizing the graph of the network
        self.example_input_array = torch.zeros(2, num_input_channels, width, height)
        # self.automatic_optimization = False

    def forward(self, x):
        """
        The forward function takes in an image and returns the reconstructed image
        """
        z = self.encoder(x)
        print('done encoder')
        x_hat = self.decoder(z)
        print('done decoder')
        return x_hat

    def _get_reconstruction_loss(self, batch):
        """
        Given a batch of images, this function returns the reconstruction loss (MSE in our case)
        """
        x, _ = batch  # We do not need the labels
        x_hat = self.forward(x)
        loss = F.mse_loss(x, x_hat, reduction="none")
        loss = loss.sum(dim=[1,2,3]).mean(dim=[0])
        print(loss)
        return loss

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=1e-3)
        # Using a scheduler is optional but can be helpful.
        # The scheduler reduces the LR if the validation performance hasn't improved for the last N epochs
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                         mode='min',
                                                         factor=0.2,
                                                         patience=20,
                                                         min_lr=5e-5)
        return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "val_loss"}

    def training_step(self, batch, batch_idx):
        loss = self._get_reconstruction_loss(batch)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        loss = self._get_reconstruction_loss(batch)
        self.log('val_loss', loss)

    def test_step(self, batch, batch_idx):
        loss = self._get_reconstruction_loss(batch)
        self.log('test_loss', loss)
The output of torchinfo’s summary seems right:
mo = Autoencoder(base_channel_size=32, latent_dim=64, num_input_channels=182) #, width=218, height=182)
summary(mo, (10, 182, 218, 182))
==========================================================================================
Layer (type:depth-idx) Output Shape Param #
==========================================================================================
Autoencoder -- --
├─Encoder: 1-1 [10, 64] --
│ └─Sequential: 2-1 [10, 64] --
│ │ └─Conv2d: 3-1 [10, 32, 109, 91] 52,448
│ │ └─GELU: 3-2 [10, 32, 109, 91] --
│ │ └─Conv2d: 3-3 [10, 32, 109, 91] 9,248
│ │ └─GELU: 3-4 [10, 32, 109, 91] --
│ │ └─Conv2d: 3-5 [10, 64, 55, 46] 18,496
│ │ └─GELU: 3-6 [10, 64, 55, 46] --
│ │ └─Conv2d: 3-7 [10, 64, 55, 46] 36,928
│ │ └─GELU: 3-8 [10, 64, 55, 46] --
│ │ └─Conv2d: 3-9 [10, 64, 28, 23] 36,928
│ │ └─GELU: 3-10 [10, 64, 28, 23] --
│ │ └─Flatten: 3-11 [10, 41216] --
│ │ └─Linear: 3-12 [10, 64] 65,600
├─Decoder: 1-2 [10, 182, 218, 182] --
│ └─Sequential: 2-2 [10, 1024] --
│ │ └─Linear: 3-13 [10, 1024] 66,560
│ │ └─GELU: 3-14 [10, 1024] --
│ └─Sequential: 2-3 [10, 182, 218, 182] --
│ │ └─ConvTranspose2d: 3-15 [10, 64, 8, 8] 36,928
│ │ └─GELU: 3-16 [10, 64, 8, 8] --
│ │ └─Conv2d: 3-17 [10, 64, 8, 8] 36,928
│ │ └─GELU: 3-18 [10, 64, 8, 8] --
│ │ └─ConvTranspose2d: 3-19 [10, 32, 37, 37] 18,464
│ │ └─GELU: 3-20 [10, 32, 37, 37] --
│ │ └─Conv2d: 3-21 [10, 32, 37, 37] 9,248
│ │ └─GELU: 3-22 [10, 32, 37, 37] --
│ │ └─ConvTranspose2d: 3-23 [10, 182, 218, 182] 52,598
│ │ └─ReLU: 3-24 [10, 182, 218, 182] --
==========================================================================================
Total params: 440,374
Trainable params: 440,374
Non-trainable params: 0
Total mult-adds (G): 29.06
==========================================================================================
Input size (MB): 288.84
Forward/backward pass size (MB): 665.42
Params size (MB): 1.76
Estimated Total Size (MB): 956.03
==========================================================================================
…Yet, there is an error when running the model through PyTorch (Lightning), which seems to suggest that backpropagation is skipping the bottleneck layer: AddmmBackward expects a gradient matching the flattened output of the Encoder’s penultimate layer (41216 features), whereas it is getting one matching the output of the Decoder’s first linear layer (1024 features):
RuntimeError: Function AddmmBackward returned an invalid gradient at index 1 - got [50, 1024] but expected shape compatible with [50, 41216]
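To narrow it down, here is a minimal shape check (a sketch; the [:-1] slice assumes the Sequential layout above, i.e., everything before the final Linear):

enc = Encoder(num_input_channels=182, base_channel_size=32, latent_dim=64)
with torch.no_grad():
    feats = enc.net[:-1](torch.zeros(1, 182, 218, 182))  # run up to, but not including, the bottleneck Linear
print(feats.shape)              # torch.Size([1, 41216])
print(enc.net[-1].in_features)  # 1024 (= 2*16*c_hid)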
Any ideas on what I could be missing here? Thanks!