Loss.grad always prints None

class Parent(nn.Module):
    """Container that chains a ``Child1`` stage into a ``Child2`` stage.

    Args:
        in_features: Passed through to the ``Child2`` constructor.
        z_dim: Passed through to the ``Child1`` constructor.
        img_dim: Passed through to the ``Child1`` constructor.
    """

    def __init__(self, in_features, z_dim, img_dim):
        super().__init__()
        self.my_child1 = Child1(z_dim, img_dim)
        self.my_child2 = Child2(in_features)

    def forward(self, input):
        """Apply ``my_child1`` and feed its result to ``my_child2``."""
        return self.my_child2(self.my_child1(input))

    def forward1(self, input):
        """Apply only the first stage, ``my_child1``."""
        return self.my_child1(input)

    def forward2(self, input):
        """Apply only the second stage, ``my_child2``."""
        return self.my_child2(input)


class Child2(nn.Module):
    """Small MLP: Linear(in_features, 128) -> LeakyReLU(0.01) -> Linear(128, 1) -> Sigmoid.

    Args:
        in_features: Size of the last dimension of the input to the first
            linear layer.
    """

    def __init__(self, in_features):
        super().__init__()
        # Layers are created in application order and wrapped in a single
        # Sequential stored under the ``child2`` attribute.
        stack = [
            nn.Linear(in_features, 128),
            nn.LeakyReLU(negative_slope=0.01),
            nn.Linear(128, 1),
            nn.Sigmoid(),
        ]
        self.child2 = nn.Sequential(*stack)

    def forward(self, x):
        """Return the output of the ``child2`` layer stack applied to ``x``."""
        result = self.child2(x)
        return result


class Child1(nn.Module):
    """Small MLP: Linear(z_dim, 256) -> LeakyReLU(0.01) -> Linear(256, img_dim) -> Tanh.

    Args:
        z_dim: Size of the last dimension of the input to the first linear
            layer.
        img_dim: Number of output features of the second linear layer.
    """

    def __init__(self, z_dim, img_dim):
        super().__init__()
        # Layers are created in application order and wrapped in a single
        # Sequential stored under the ``child1`` attribute.
        stack = [
            nn.Linear(z_dim, 256),
            nn.LeakyReLU(negative_slope=0.01),
            nn.Linear(256, img_dim),
            nn.Tanh(),
        ]
        self.child1 = nn.Sequential(*stack)

    def forward(self, x):
        """Return the output of the ``child1`` layer stack applied to ``x``."""
        result = self.child1(x)
        return result

# Compute the averaged BCE loss of the model output against all-ones and
# all-zeros targets, back-propagate it, and inspect the resulting gradients.
criterion = nn.BCELoss()
noise = torch.randn(batch_size, z_dim).to(device)
# The model must live on the same device as its input, otherwise the forward
# pass fails with a device mismatch whenever `device` is not the CPU.
model = Parent(in_features, z_dim, img_dim).to(device)
output1 = model(noise)
loss1 = criterion(output1, torch.ones_like(output1))
# `torch.zeroes_like` does not exist (AttributeError); the function is
# `torch.zeros_like`.
loss2 = criterion(output1, torch.zeros_like(output1))
loss3 = (loss1 + loss2) / 2
model.zero_grad()
loss3.backward(retain_graph=True)
# `loss3` is a non-leaf tensor (the result of operations), so autograd does
# not populate `loss3.grad`; this prints None.
print(loss3.grad)
# backward() stores the gradients of loss3 on the model's parameters, in each
# parameter's `.grad` attribute.
for name, param in model.named_parameters():
    print(name, param.grad)

I have not used an optimizer here because the parameters are updated using a separate formula, which I will apply only after I obtain the gradients. The formula requires the gradients to be stored in a matrix. However, the gradient always prints “None”.

What is the issue here?

Hi Jeet!

First, be aware of the following:

When you call loss.backward(), the gradients (that I think) you want
won’t be stored in loss.grad. Rather, the gradient of loss with respect
to, for example, my_child1's first Linear's bias parameter will be stored
in that parameter’s .grad property, .bias.grad.

(You won’t have any single tensor that has all of the gradients in it unless
you assemble such a tensor yourself.)

As an aside, if you want to get an actual gradient value for loss itself, call
loss.retain_grad() before calling loss.backward(), but, as loss is the
root of the computation graph, loss.grad will always be set to the trivial
value 1.0 (which makes sense, but isn’t useful).

Best.

K. Frank