Gradients for the biases are None, but the network still seems to be training

class Lag(nn.Module):
  """Scalar model: a quadratic kinetic term plus an MLP correction.

  The output is ``0.5 * q_dot^T M q_dot`` (with ``M`` obtained from the
  externally defined ``m``) added to the output of a 5 -> 200 -> 1 ReLU
  network evaluated on five hand-built features of ``x``.
  """

  def __init__(self):
    super().__init__()
    # Layer order is kept so the parameter names stay
    # ``linear_relu_stack.0.*`` and ``linear_relu_stack.2.*``.
    layers = [nn.Linear(5, 200), nn.ReLU(), nn.Linear(200, 1)]
    self.linear_relu_stack = nn.Sequential(*layers)

  def forward(self, x_dot, x):
    """Return the kinetic term plus the network output.

    NOTE(review): the slicing and ``view`` calls below only line up with
    ``nn.Linear(5, ...)`` when ``x`` is a (6, 1) column; ``x_dot`` is
    presumably a column with at least two rows -- confirm with the caller.
    """
    velocities = x_dot[:2]
    inertia = m(x[0], x[1])
    # 0.5 * q_dot^T (M q_dot)
    kinetic = 0.5 * (velocities.t() @ (inertia @ velocities))

    # Squares of rows 0, 1 and 4 onward, followed by the x[5] * x[0] product.
    squared = torch.cat([x[:2], x[4:]]).square()
    cross_term = (x[5] * x[0]).view(1, 1)
    features = torch.cat((squared, cross_term), 0).view(1, -1)

    return self.linear_relu_stack(features) + kinetic

When I checked the gradients with the following code:

# Print every registered parameter's name next to its .grad attribute;
# .grad is None for a parameter that has not received a gradient.
for param_name, tensor in L.named_parameters():
    print(param_name, tensor.grad)

I got

linear_relu_stack.2.bias None

I am confused: why is my network training even though some of the gradients are None? Please help me out here. Thanks in advance.

@ptrblck @KFrank, Could you please help me with it? Thanks.

The bias gradient is not None using your code:

class Lag(nn.Module):
  """Reproduction of the model with a random stand-in mass matrix.

  The output is ``0.5 * q_dot^T M q_dot`` plus the output of a
  5 -> 200 -> 1 ReLU network evaluated on five features built from ``x``.
  ``M`` is a fresh ``torch.randn(2, 2)`` drawn on every forward call.
  """

  def __init__(self):
    super().__init__()
    # Layer order is kept so the parameter names stay
    # ``linear_relu_stack.0.*`` and ``linear_relu_stack.2.*``.
    hidden = nn.Linear(5, 200)
    head = nn.Linear(200, 1)
    self.linear_relu_stack = nn.Sequential(hidden, nn.ReLU(), head)

  def forward(self, x_dot, x):
    """Return a (1, 1) tensor for (6, 1) column inputs ``x_dot`` and ``x``."""
    velocities = x_dot[:2]
    # Stand-in for the real mass matrix, which is not part of this snippet.
    inertia = torch.randn(2, 2)
    kinetic = 0.5 * (velocities.t() @ (inertia @ velocities))

    # Squares of rows 0, 1, 4, 5 followed by the x[5] * x[0] product.
    squared = torch.cat([x[:2], x[4:]]).square()
    cross_term = (x[5] * x[0]).view(1, 1)
    features = torch.cat((squared, cross_term), 0).view(1, -1)

    return self.linear_relu_stack(features) + kinetic


# Minimal reproduction: one forward/backward pass on random (6, 1) inputs,
# then print the summed gradient of every parameter.
model = Lag()
x_dot, x = torch.randn(6, 1), torch.randn(6, 1)

out = model(x_dot, x)
out.mean().backward()

for param_name, tensor in model.named_parameters():
    grad_total = tensor.grad.sum()
    print(param_name, grad_total)
    
# linear_relu_stack.0.weight tensor(-1.8680)
# linear_relu_stack.0.bias tensor(-0.6111)
# linear_relu_stack.2.weight tensor(39.8814)
# linear_relu_stack.2.bias tensor(1.)

Note that I needed to replace the undefined `m(...)` call that builds mass_matrix with a random tensor, but I don’t see how this would affect the gradients.

Also, I would recommend avoiding tagging specific users, as it could discourage others from posting a valid response.

Got it. Thanks. I will recheck it.



def EOM(L, D, x, x_dot, x_dd):
  """Assemble four equation-of-motion expressions from ``L`` and ``D``.

  Every derivative is taken with ``create_graph=True`` so that each returned
  expression stays connected to the autograd graph and can be differentiated
  again (for example with respect to the parameters that produced ``L``).

  Args:
    L: single-element tensor that depends on ``x`` and ``x_dot``.
    D: single-element tensor that depends on ``x`` and ``x_dot``.
    x: tensor requiring grad; entries 0, 1, 4 and 5 are read.
    x_dot: tensor requiring grad; entries 0, 1, 4 and 5 are read.
    x_dd: indexable with at least two entries (used for q1 and q2).

  Returns:
    Tuple ``(q1_eom, q2_eom, i_eom, qc_eom)``.
  """
  # Original author's layout note: x = [q1, q2, I, qc],
  # x_dot = [q1_dot, q2_dot, I_dot, qc_dot], x_dd = [q1_dd, q2_dd].
  # NOTE(review): that note lists four entries, but I is read at index 4 and
  # qc at index 5 -- presumably x and x_dot have six entries; confirm.

  # First derivatives, all kept differentiable.
  dL_dxdot = grad(L, x_dot, create_graph=True, retain_graph=True)
  dL_dx = grad(L, x, create_graph=True, retain_graph=True)
  dD_xdot = grad(D, x_dot, create_graph=True, retain_graph=True)
  dD_x = grad(D, x, create_graph=True, retain_graph=True)

  # Second derivatives of L.
  dL_q1dot_xdot = grad(dL_dxdot[0][0], x_dot, create_graph=True, retain_graph=True)
  dL_q2dot_xdot = grad(dL_dxdot[0][1], x_dot, create_graph=True, retain_graph=True)
  dL_I_x = grad(dL_dx[0][4], x, create_graph=True, retain_graph=True)

  # Reuse dL_dx here. Recomputing grad(L, x) without create_graph=True gives
  # the same numbers but a tensor with no history, which silently drops this
  # term from any later backward pass.
  # NOTE(review): q1/q2 add dL/dx while qc below subtracts it -- confirm the
  # intended sign convention.
  q1_eom = dL_q1dot_xdot[0][0] * x_dd[0] + dL_dx[0][0] + dD_xdot[0][0]
  q2_eom = dL_q2dot_xdot[0][1] * x_dd[1] + dL_dx[0][1] + dD_xdot[0][1]

  i_eom = dL_I_x[0][4] * x_dot[4] + dD_x[0][4]

  qc_eom = -dL_dx[0][5] + dD_xdot[0][5]

  return q1_eom, q2_eom, i_eom, qc_eom

I feel that I am making some mistakes in calculating the gradients. Can anyone tell me whether the computational graph is being detached somewhere, or whether the code is fine as it is?

Please help me out here. I could give more information if needed. Thanks.