Concatenating two models

Hello fellow torchers. I wanted to experiment a bit with autoencoders, so I created two separate models, an encoder and a decoder. The reason I keep them separate is that I want to attach different decoders to the same encoder in later experiments. I compute the output, loss, and gradients by calling the following:

# compute reconstructions
encoded = encoder(inputs)
decoded = decoder(encoded)

# compute training reconstruction loss
train_loss = loss_function(decoded, inputs)

# compute accumulated gradients
train_loss.backward()
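
For context, the later experiments would look roughly the same, just with a different decoder attached to the same encoder. A minimal sketch of what I mean (decoder_b is only a placeholder name for whichever alternative decoder I end up using):

# reuse the same encoder with an alternative decoder (decoder_b is hypothetical)
encoded = encoder(inputs)
decoded_b = decoder_b(encoded)

# the loss and backward pass stay exactly the same
loss_b = loss_function(decoded_b, inputs)
loss_b.backward()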

However, I noticed that the gradients of the encoder are zero while the gradients of the decoder are not, so the gradients do not seem to propagate through both models. Does anybody have an explanation for why the gradients are not propagated? I thought it was possible to connect models like this. Full code below:

import torch, torchvision
from torch import nn, optim

class Encoder(nn.Module):
    def __init__(self, **kwargs):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(784, 128),
            nn.ReLU(),
            nn.Linear(128, 32),
            nn.ReLU(),
        )

    def forward(self, features):
        return self.model(features)
    
class Decoder(nn.Module):
    def __init__(self, **kwargs):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(32, 128),
            nn.ReLU(),
            nn.Linear(128, 784),
            nn.ReLU(),
        )
        
    def forward(self, features):
        return self.model(features)

#  use gpu if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# create the encoder and decoder models
# load them to the specified device, either gpu or cpu
decoder = Decoder().to(device)
encoder = Encoder().to(device)


# create an optimizer object
# Adam optimizer with learning rate 1e-3
encoder_optimizer = optim.Adam(encoder.parameters(), lr=1e-3)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=1e-3)

loss_function = nn.MSELoss()

transform = torchvision.transforms.Compose([torchvision.transforms.ToTensor()])

train_dataset = torchvision.datasets.MNIST(
    root="~/torch_datasets", train=True, download=True, transform=transform
)
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=128, shuffle=True, num_workers=4, pin_memory=True
)

epochs = 1
for epoch in range(epochs):
    loss = 0
    for batch_features, _ in train_loader:
        # reshape mini-batch data to [N, 784] matrix
        # load it to the active device
        inputs = batch_features.view(-1, 784).to(device)
        
        # reset the gradients back to zero
        # PyTorch accumulates gradients on subsequent backward passes
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        
        # compute reconstructions
        encoded = encoder(inputs)
        decoded = decoder(encoded)
        
        # compute training reconstruction loss
        train_loss = loss_function(decoded, inputs)

        # compute accumulated gradients
        train_loss.backward()
        # inspect the gradients of the encoder's first linear layer
        print(encoder.model[0].weight.grad)
        
        # perform parameter update based on current gradients
        decoder_optimizer.step()
        encoder_optimizer.step()
        
        # add the mini-batch training loss to epoch loss
        loss += train_loss.item()
    
    # compute the epoch training loss
    loss = loss / len(train_loader)
    
    # display the epoch training loss
    print("epoch : {}/{}, loss = {:.6f}".format(epoch + 1, epochs, loss))
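
For completeness, I believe a single optimizer over both parameter sets should behave the same as the two separate optimizers above; a rough sketch of what I mean (not what I actually ran):

# one optimizer that updates encoder and decoder jointly
params = list(encoder.parameters()) + list(decoder.parameters())
optimizer = optim.Adam(params, lr=1e-3)

optimizer.zero_grad()
train_loss = loss_function(decoder(encoder(inputs)), inputs)
train_loss.backward()
optimizer.step()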

Yes, your use case is possible and the gradients are also valid using your architecture:

class Encoder(nn.Module):
    def __init__(self, **kwargs):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(784, 128),
            nn.ReLU(),
            nn.Linear(128, 32),
            nn.ReLU(),
        )

    def forward(self, features):
        return self.model(features)
    
class Decoder(nn.Module):
    def __init__(self, **kwargs):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(32, 128),
            nn.ReLU(),
            nn.Linear(128, 784),
            nn.ReLU(),
        )
        
    def forward(self, features):
        return self.model(features)

# run this gradient check on the cpu for simplicity
device = 'cpu'

# create the encoder and decoder models
# load them to the specified device
decoder = Decoder().to(device)
encoder = Encoder().to(device)

inputs = torch.randn(1, 28*28)
# compute reconstructions
encoded = encoder(inputs)
decoded = decoder(encoded)
# call backward on a dummy scalar to check that gradients flow back through both modules
decoded.mean().backward()

for name, param in encoder.named_parameters():
    print(name, param.grad.abs().sum())

> 
model.0.weight tensor(29.0105)
model.0.bias tensor(0.0458)
model.2.weight tensor(1.8911)
model.2.bias tensor(0.0586)
    
for name, param in decoder.named_parameters():
    print(name, param.grad.abs().sum())
    
>
model.0.weight tensor(0.5203)
model.0.bias tensor(0.1350)
model.2.weight tensor(4.4251)
model.2.bias tensor(0.5115)

Ok, I just didn’t realise the gradients were non-zero (I only eyeballed the tensor and assumed they were all zero). Thanks a lot!
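
For anyone who lands here with the same problem: instead of eye-balling a large gradient tensor, a quick programmatic check is less error-prone. A small sketch, assuming the encoder/decoder modules from above:

# flag parameters whose gradient is missing or exactly zero
for name, param in encoder.named_parameters():
    if param.grad is None:
        print(name, "has no gradient (not connected to the loss)")
    elif torch.all(param.grad == 0):
        print(name, "gradient is exactly zero")
    else:
        print(name, "gradient abs sum:", param.grad.abs().sum().item())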