Hello, I am trying to build a simple autoencoder for images like these
The image size is 128x128.
I want my latent vectors to have 256 data points and I am struggling to build my NN for this purpose.
class ConvVAE(nn.Module):
def __init__(self):
super(ConvVAE, self).__init__()
# encoder
self.enc1 = nn.Conv2d(
in_channels=image_channels, out_channels=init_channels, kernel_size=kernel_size,
stride=2, padding=1
)
self.enc2 = nn.Conv2d(
in_channels=init_channels, out_channels=init_channels*2, kernel_size=kernel_size,
stride=2, padding=1
)
self.enc3 = nn.Conv2d(
in_channels=init_channels*2, out_channels=init_channels*4, kernel_size=kernel_size,
stride=2, padding=1
)
self.enc4 = nn.Conv2d(
in_channels=init_channels*4, out_channels=64, kernel_size=kernel_size,
stride=2, padding=0
)
# fully connected layers for learning representations
self.fc1 = nn.Linear(64, 128)
self.fc_mu = nn.Linear(128, latent_dim)
self.fc_log_var = nn.Linear(128, latent_dim)
self.fc2 = nn.Linear(latent_dim, 64)
# decoder
self.dec1 = nn.ConvTranspose2d(
in_channels=64, out_channels=init_channels*8, kernel_size=kernel_size,
stride=1, padding=0
)
self.dec2 = nn.ConvTranspose2d(
in_channels=init_channels*8, out_channels=init_channels*4, kernel_size=kernel_size,
stride=2, padding=1
)
self.dec3 = nn.ConvTranspose2d(
in_channels=init_channels*4, out_channels=init_channels*2, kernel_size=kernel_size,
stride=2, padding=1
)
self.dec4 = nn.ConvTranspose2d(
in_channels=init_channels*2, out_channels=image_channels, kernel_size=kernel_size,
stride=2, padding=1
)
def reparameterize(self, mu, log_var):
"""
:param mu: mean from the encoder's latent space
:param log_var: log variance from the encoder's latent space
"""
std = torch.exp(0.5*log_var) # standard deviation
eps = torch.randn_like(std) # `randn_like` as we need the same size
sample = mu + (eps * std) # sampling
return sample
def forward(self, x):
# encoding
x = F.relu(self.enc1(x))
x = F.relu(self.enc2(x))
x = F.relu(self.enc3(x))
x = F.relu(self.enc4(x))
batch, _, _, _ = x.shape
x = F.adaptive_avg_pool2d(x, 1).reshape(batch, -1)
hidden = self.fc1(x)
# get `mu` and `log_var`
mu = self.fc_mu(hidden)
log_var = self.fc_log_var(hidden)
# get the latent vector through reparameterization
z = self.reparameterize(mu, log_var)
z = self.fc2(z)
z = z.view(-1, 64, 1, 1)
# decoding
x = F.relu(self.dec1(z))
x = F.relu(self.dec2(x))
x = F.relu(self.dec3(x))
reconstruction = torch.sigmoid(self.dec4(x))
return reconstruction, mu, log_var
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# initialize the model
model = ConvVAE().to(device)
# set the learning parameters
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.BCELoss(reduction='sum')
def final_loss(bce_loss, mu, logvar):
"""
This function will add the reconstruction loss (BCELoss) and the
KL-Divergence.
KL-Divergence = 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
:param bce_loss: recontruction loss
:param mu: the mean from the latent vector
:param logvar: log variance from the latent vector
"""
BCE = bce_loss
KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
return BCE + KLD
def train(model, dataloader, dataset, device, optimizer, criterion):
model.train()
running_loss = 0.0
counter = 0
for i, data in tqdm(enumerate(dataloader), total=int(len(dataset)/dataloader.batch_size)):
counter += 1
data = data[0]
data = data.to(device)
optimizer.zero_grad()
reconstruction, mu, logvar = model(data)
bce_loss = criterion(reconstruction, data)
loss = final_loss(bce_loss, mu, logvar)
loss.backward()
running_loss += loss.item()
optimizer.step()
train_loss = running_loss / counter
return train_loss
This is my code and I am trying do change it so my latent vector has 256 data points, but no success. Ive been trying to play with latent_dim
parameter but I cannot make it work due to a dimensions problem.
Using a target size (torch.Size([64, 1, 128, 128])) that is different to the input size (torch.Size([64, 1, 32, 32])) is deprecated. Please ensure they have the same size.
Any idea what may be causing this? Can someone clear up how do I change my latent vector dimensions, which changes do I need to make to my NN architecture?
Kind regards