Two networks and gradients of parameters of one of those models is none

I have trained an autoencoder and saved this trained mode. In another notebook I have uploaded this network:

def load_checkpoint(filepath, model):
checkpoint = torch.load(filepath)
model.load_state_dict(checkpoint[‘state_dict’])
for parameter in model.parameters():
parameter.requires_grad = True
model.train()
return model

Here I set

parameter.requires_grad=True and also model.train()

Then I define a new network called logits_net and this network is trained with the output images of the pre-trained encoder which has been loaded. Loss is calculated by policy gradient method and then I propagate this loss in both networks. this way:

params = list(logits_net.parameters()) + list(model.parameters())
opt = Adam(params, lr=lr)

then

opt.zero_grad()
batch_loss = compute_loss(logp = torch.as_tensor(batch_logp, dtype=torch.float32), weights = torch.as_tensor(batch_weights, dtype=torch.float32))
batch_loss.backward()
opt.step()

After this I check the gradient of autoencoder parameters and they are all none. The other network’s parameters have gradients.

I am wondering why they do not get trained despite the fact that I have set

parameter.requires_grad=True and also model.train()
and also included the parameters into the optimizer function.

The loss computation looks a bit concerning, as it seems you might be recreating tensors and thus detaching them from the computation graph.
What is the type of batch_logp and batch_weights? Is one of these tensors the output of both models (called sequentially)?
Also, could you post the code of compute_loss? If you are detaching the tensors, the batch_loss.backward() call should raise an error, so I’m not sure where exactly the issue comes from.

I will add more code so you can take a look at them:
this is my first network: an autoencoder:

class DEC_AE(nn.Module):
def init(self, num_classes, num_features):
super(DEC_AE, self).init()
self.dropout = nn.Dropout(p=0.1)
self.fc1 = nn.Linear(28 * 28, 500)
self.fc2 = nn.Linear(500, 500)
self.fc3 = nn.Linear(500, 2000)
self.fc4 = nn.Linear(2000, num_features)
self.relu = nn.ReLU()
self.fc_d1 = nn.Linear(500, 28 * 28)
self.fc_d2 = nn.Linear(500, 500)
self.fc_d3 = nn.Linear(2000, 500)
self.fc_d4 = nn.Linear(num_features, 2000)
self.alpha = 1.0

    self.pretrainMode = True
    #here we initialize all the weights
    for m in self.modules():
        if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
            torch.nn.init.xavier_uniform_(m.weight)

def setPretrain(self, mode):
    """To set training mode to pretrain or not,
    so that it can control to run only the Encoder or Encoder+Decoder"""
    self.pretrainMode = mode


def forward(self, x):#a batch of images
    x = x.view(-1, 1 * 28 * 28)
    x = self.dropout(x)
    x = self.fc1(x)
    x = self.relu(x)
    x = self.fc2(x)
    x = self.relu(x)
    x = self.fc3(x)
    x = self.relu(x)
    x = self.fc4(x)
    x_e = x 

    # if not in pre_train mode, we need encoder and t distribution output
    if self.pretrainMode is False:
        return x
        

    # encoder is done, followed by decoder
    x = self.fc_d4(x)
    x = self.relu(x)
    x = self.fc_d3(x)
    x = self.relu(x)
    x = self.fc_d2(x)
    x = self.relu(x)
    x = self.fc_d1(x)
    x_de = x.view(-1, 1, 28, 28)
    
    return x_e, x_de

loading the AE:

def load_checkpoint(filepath, model):
checkpoint = torch.load(filepath)
#model = checkpoint[‘model’]
model.load_state_dict(checkpoint[‘state_dict’])
for parameter in model.parameters():
parameter.requires_grad = True

model.train()
return model

and this is the architecture od the second network (I am testing with both sequential and also a model inherited from nn.module ):

def mlp(sizes, activation=nn.Tanh, output_activation=nn.Identity):
# Build a feedforward neural network. outputs are the logits
layers = []
for j in range(len(sizes)-1):
act = activation if j < len(sizes)-2 else output_activation
layers += [nn.Linear(sizes[j], sizes[j+1]), act()]
return nn.Sequential(*layers)

and loss is calculated this way:

logits_net = mlp(sizes=[obs_dim]+hidden_sizes+[n_acts])



def get_policy(obs):
        logits = logits_net(obs)
        return Categorical(logits=logits)
def get_action(obs):
        return get_policy(obs).sample().item()
def Logp(obs, act):
        logp = get_policy(obs).log_prob(act)
        return logp
def compute_loss(logp, weights):
        return -(logp * weights).mean()

 opt = MultipleOptimizer(SGD(model.parameters(), lr=1, momentum=0.9), Adam(logits_net.parameters(), lr=lr)

for i, data in enumerate(train_loader):
           
            
            x, label = data
            x = model(x.cuda())
            obs = x.data.cpu().numpy()
            batch_obs.append(obs.copy())
          
            #act in the environment
            act = get_action(torch.as_tensor(obs, dtype=torch.float32))
            
            #log probability
            logp = Logp(torch.as_tensor(obs, dtype=torch.float32),act = torch.as_tensor(act, dtype=torch.int32))
                
          
            rew = reward(obs, act+2)
           

            # save action, reward
            batch_acts.append(act)
            batch_weights.append(rew)#episode rewards
            batch_logp.append(logp)
            
        
        opt.zero_grad()
        batch_logp = torch.stack(batch_logp, dim=0)
       
        batch_loss = compute_loss(logp = torch.as_tensor(batch_logp, dtype=torch.float32), 
                                  weights = torch.as_tensor(batch_weights, dtype=torch.float32))
        batch_loss.backward()
        opt.step()