Hello fellow torchers. I wanted to experiment a bit with autoencoders, so I created two separate models: an encoder and a decoder. I keep them separate because I want to attach different decoders later in my experiments. I compute the output, loss, and gradients as follows:
# Forward pass: encode the inputs, then reconstruct them from the code.
encoded = encoder(inputs)
decoded = decoder(encoded)
# Reconstruction loss between the decoder output and the original inputs.
train_loss = loss_function(decoded, inputs)
# Backpropagate. `encoded` is passed to the decoder without being detached,
# so the same backward call reaches the parameters of both modules.
train_loss.backward()
However, when I run the code, the printed gradients of the encoder appear to be zero, while the gradients of the decoder are not, so it looks as if the gradients are not being propagated properly. Does anybody have an explanation for why the gradients are not propagated? I thought it was possible to connect models like this. Full code below:
import torch, torchvision
from torch import nn, optim
class Encoder(nn.Module):
    """Two-layer fully connected encoder mapping a flat input to a latent code.

    The network is ``Linear -> ReLU -> Linear -> ReLU`` and is stored in
    ``self.model`` so individual layers stay addressable by index
    (e.g. ``encoder.model[0]`` is the first linear layer).

    Args:
        input_dim: Size of each flattened input vector (default 784).
        hidden_dim: Width of the hidden layer (default 128).
        latent_dim: Size of the produced latent code (default 32).
        **kwargs: Accepted and ignored, so existing calls that pass extra
            keyword arguments keep working.
    """

    def __init__(self, *, input_dim=784, hidden_dim=128, latent_dim=32, **kwargs):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim),
            # The trailing ReLU makes every latent activation non-negative.
            nn.ReLU(),
        )

    def forward(self, features):
        """Return the latent code for ``features`` of shape ``[N, input_dim]``."""
        return self.model(features)
class Decoder(nn.Module):
    """Two-layer fully connected decoder mapping a latent code back to a flat output.

    The network is ``Linear -> ReLU -> Linear -> ReLU`` and is stored in
    ``self.model``.

    Args:
        latent_dim: Size of each incoming latent code (default 32).
        hidden_dim: Width of the hidden layer (default 128).
        output_dim: Size of each reconstructed vector (default 784).
        **kwargs: Accepted and ignored, so existing calls that pass extra
            keyword arguments keep working.
    """

    def __init__(self, *, latent_dim=32, hidden_dim=128, output_dim=784, **kwargs):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
            # The trailing ReLU clamps reconstructions to be non-negative.
            nn.ReLU(),
        )

    def forward(self, features):
        """Return the reconstruction for ``features`` of shape ``[N, latent_dim]``."""
        return self.model(features)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the two halves of the autoencoder as separate modules and move each
# one to the selected device.
decoder = Decoder().to(device)
encoder = Encoder().to(device)

# One Adam optimizer per module, both with learning rate 1e-3, so either half
# can be updated (or swapped out) independently.
encoder_optimizer = optim.Adam(encoder.parameters(), lr=1e-3)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=1e-3)

# Mean-squared-error reconstruction loss.
loss_function = nn.MSELoss()

# Convert each dataset image to a tensor.
transform = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])

# MNIST training split; downloaded into ~/torch_datasets if not already there.
train_dataset = torchvision.datasets.MNIST(
    root="~/torch_datasets", train=True, download=True, transform=transform
)

# NOTE: num_workers > 0 starts worker processes. On platforms that spawn
# rather than fork, the script body should sit under
# `if __name__ == "__main__":`.
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=128,
    shuffle=True,
    num_workers=4,
    # Pinned host memory only helps host-to-GPU copies; skip it on CPU runs.
    pin_memory=device.type == "cuda",
)

# Number of full passes over the training set.
epochs = 1
for epoch in range(epochs):
    # Running sum of the per-batch losses for this epoch.
    loss = 0
    for batch_features, _ in train_loader:
        # Flatten the mini-batch to an [N, 784] matrix and move it to the
        # active device.
        inputs = batch_features.view(-1, 784).to(device)

        # Reset gradients: PyTorch accumulates them across backward passes.
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        # Forward pass through both modules; `encoded` is not detached, so
        # the backward pass below reaches the encoder as well.
        encoded = encoder(inputs)
        decoded = decoder(encoded)

        # Reconstruction loss against the original inputs.
        train_loss = loss_function(decoded, inputs)

        # Populate .grad on the parameters of both modules.
        train_loss.backward()

        # Report a scalar summary instead of printing the raw gradient tensor.
        # A large tensor is printed elided (only its first/last rows and
        # columns), and a column of the first layer's weight gradient is
        # exactly zero whenever the matching input feature is zero for the
        # whole batch. Presumably the outermost pixels of these images are
        # always zero, which makes the elided printout look like an all-zero
        # gradient even though gradients do reach the encoder.
        print(
            "encoder first-layer grad norm = {:.6e}".format(
                encoder.model[0].weight.grad.norm().item()
            )
        )

        # Apply the parameter updates from the current gradients.
        decoder_optimizer.step()
        encoder_optimizer.step()

        # Accumulate the mini-batch loss as a Python float.
        loss += train_loss.item()

    # Average loss per mini-batch for this epoch.
    loss = loss / len(train_loader)

    # Display the epoch training loss.
    print("epoch : {}/{}, loss = {:.6f}".format(epoch + 1, epochs, loss))