Hi Team,
I have designed a VAE to run on some data I have of 50k+ stellar spectra.
I have been trying to adjust the model and train it for the past week but have noticed that while KL loss improves the reconstruction loss hovers around the same value, and never improves.
When using just the decoder part of the VAE no matter what value you give it from the latent space it always produces the same output, and all training data also has the same output from the VAE.
I have noticed that the gradients and weights are being updated for everything in the encoder section of the VAE but all gradients are None in the decoder and all weights do not change.
I have not found any other posts about this happening to others so was wondering if I could get some help, I am unsure why the gradients are not being updated for the decoder and I believe this is why the VAE is not learning.
My VAE model is listed below,
class VAE(nn.Module):
def __init__(self):
super(VAE, self).__init__()
input_dim = 400
z_dim = 5
#encoder defs
self.conv1_e = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=8, stride=2, padding=1)
self.conv2_e = nn.Conv1d(in_channels=1, out_channels=1, kernel_size=4, stride=2, padding=1)
self.linear1_e = nn.Linear(in_features=99, out_features=50)
self.linear2_e = nn.Linear(in_features=50, out_features=25)
self.z_mu_e = nn.Linear(in_features=25, out_features=z_dim)
self.z_var_e = nn.Linear(in_features=25, out_features=z_dim)
#decoder defs
self.fc_d = nn.Linear(in_features=z_dim, out_features=25)
self.linear1_d = nn.Linear(in_features=25, out_features=50)
self.linear2_d = nn.Linear(in_features=50, out_features=99)
self.conv1_d = nn.ConvTranspose1d(in_channels=1, out_channels=1, kernel_size=4, stride=2, padding=1)
self.conv2_d = nn.ConvTranspose1d(in_channels=1, out_channels=1, kernel_size=8, stride=2, padding=1)
def encode(self, x):
x = F.relu(self.conv1_e(x))
x = F.relu(self.conv2_e(x))
x = F.relu(self.linear1_e(x))
x = F.relu(self.linear2_e(x))
z_mu = self.z_mu_e(x)
z_var = self.z_var_e(x)
return z_mu, z_var
def reparameterize(self, mu, logvar):
std = torch.exp(0.5*logvar)
eps = torch.randn_like(std)
return mu + eps*std
def decode(self, z):
x = self.fc_d(z)
x = F.relu(self.linear1_d(x))
x = F.relu(self.linear2_d(x))
x = x.view(x.size()[0], 1, 99)
x = F.relu(self.conv1_d(x))
x = F.relu(self.conv2_d(x))
return x
def forward(self, x):
mu, logvar = self.encode(x)
z = self.reparameterize(mu, logvar)
return self.decode(z), mu, logvar
and the training is listed here…
INPUT_DIM = 400
lr = 1e-6
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# define vae
model = ml.VAE().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-6)
def objective_loss(y1, y2, ivar, log_z_var, z_mu):
diff = (y1 - y2)**2 * ivar.numpy()
recon_loss = np.mean(np.sum(diff, axis=1) / INPUT_DIM)
kl_loss = 0.5 * torch.sum(torch.exp(log_z_var) + z_mu**2 - 1.0 - log_z_var)
return recon_loss + kl_loss
def train():
model.train()
train_loss = 0
for i, (wl, fl, iv) in enumerate(train_iterator):
fl = fl.to(device)
fl = fl.reshape(fl.size()[0], 1, 400)
fl = fl.float()
optimizer.zero_grad()
fl_sample, z_mu, log_z_var = model(fl)
y1 = fl_sample.view(fl.size()[0], 400).detach().numpy()
y2 = fl.view(fl.size()[0], 400).detach().numpy()
loss = objective_loss(y1, y2, iv, log_z_var, z_mu)
loss.backward()
g = model.linear2_d.weight
w = model.conv1_d.weight.data
print(g)
train_loss += loss.item()
optimizer.step()
return train_loss
Does anyone have any suggestion as to what I could check or try to fix this problem? Or can anyone see straight away in my code why this would be a problem,
Thank you very much