PyTorch autograd returns wrong values for the Coriolis & centrifugal matrix

Dear reader,

I am trying to calculate the Euler-Lagrange matrices of a double pendulum using pytorch. This works fine in the original coordinate system q (q0, q1 are the absolute angles of the links). However, when I apply a coordinate transformation I obtain incorrect values of the Coriolis- & Centrifugal matrix.

My code is as follows:

I calculate the dynamical matrices (M(q), C(q, q_d), G(q)) as follows (checked with symbolic tools):

def dynamical_matrices(rp: dict, q, q_d):
    """
    Computes the dynamical matrices in the equation
    M_q @ q_dd + C_q @ q_d + G_q = tau_q

    Args:
        rp: dictionary of robot parameters ("m1", "m2", "l1", "l2", "g",
            plain Python floats)
        q: link angles of shape (2, ) -- absolute angles of both links
        q_d: link angular velocities of shape (2, )
    Returns:
        M_q: inertia matrix of shape (2, 2)
        C_q: Coriolis and centrifugal matrix of shape (2, 2)
        G_q: gravitational vector of shape (2, ), as documented
             (the previous implementation returned shape (2, 1))
    """
    s12 = torch.sin(q[0] - q[1])
    c12 = torch.cos(q[0] - q[1])

    # Recurring parameter combinations.
    a = rp["l1"] * rp["l2"] * rp["m2"]            # inertial coupling l1*l2*m2
    m11 = rp["l1"] ** 2 * (rp["m1"] + rp["m2"])   # constant (0,0) inertia entry
    m22 = rp["l2"] ** 2 * rp["m2"]                # constant (1,1) inertia entry

    # Constant entries as 0-dim tensors matching q's dtype.  The original code
    # used torch.tensor([0]) (int64) for the zeros, mixing dtypes inside
    # torch.cat; here everything is floating point like q.
    zero = torch.zeros_like(q[0])

    M_q = torch.stack([
        torch.stack([torch.full_like(q[0], m11), a * c12]),
        torch.stack([a * c12, torch.full_like(q[0], m22)]),
    ])

    C_q = torch.stack([
        torch.stack([zero, a * q_d[1] * s12]),
        torch.stack([-a * q_d[0] * s12, zero]),
    ])

    G_q = torch.stack([
        rp["g"] * rp["l1"] * (rp["m1"] + rp["m2"]) * torch.cos(q[0]),
        rp["g"] * rp["l2"] * rp["m2"] * torch.cos(q[1]),
    ])

    return M_q, C_q, G_q

The new coordinates theta are a function of q:

def theta_func(rp, q):
    """
    Map joint angles q to the new coordinates theta.

    theta parameterises the vector that runs from the end effector to the
    anchor point (xa, ya) in polar form: th0 is its length, th1 its angle.

    Args:
        rp: dictionary of robot parameters ("xa", "ya", "l1", "l2")
        q: link angles of shape (2, )
    Returns:
        tensor of shape (2, ) holding (th0, th1)
    """
    # Components of the anchor-to-end-effector vector.
    rx = rp["xa"] - rp["l1"] * torch.cos(q[0]) - rp["l2"] * torch.cos(q[1])
    ry = rp["ya"] - rp["l1"] * torch.sin(q[0]) - rp["l2"] * torch.sin(q[1])

    distance = torch.sqrt(rx ** 2 + ry ** 2)
    angle = torch.atan2(ry, rx)
    return torch.stack([distance, angle])

The inverse function is as follows:

def analytic_inverse(rp: dict, th):
    """
    Inverse kinematics from theta back to joint angles q.

    The end-effector position is first recovered from theta (distance and
    angle of the anchor-to-end-effector vector), then the standard two-link
    closed-form IK is solved.

    Args:
        rp: dictionary of robot parameters ("xa", "ya", "l1", "l2")
        th: theta coordinates of shape (2, )
    Returns:
        q: joint angles of shape (..., 2) for the counter-clockwise
           configuration.  (The original docstring promised both
           configurations, but only the counter-clockwise one was ever
           returned; the docstring now matches the actual contract.)
    """
    # End-effector position implied by theta.
    xend = rp["xa"] - th[0] * torch.cos(th[1])
    yend = rp["ya"] - th[0] * torch.sin(th[1])

    # Law of cosines for the inter-link angle beta.
    fraction = (xend ** 2 + yend ** 2 - rp["l1"] ** 2 - rp["l2"] ** 2) / (
        2 * rp["l1"] * rp["l2"]
    )
    # Clamp: floating-point noise can push |fraction| marginally above 1 at
    # the workspace boundary, which would make arccos return NaN.
    beta = torch.arccos(torch.clamp(fraction, -1.0, 1.0))

    # Counter-clockwise configuration.
    gamma = torch.atan2(rp["l2"] * torch.sin(beta),
                        rp["l1"] + rp["l2"] * torch.cos(beta))
    q1 = torch.atan2(yend, xend) + gamma
    q2 = q1 - beta

    return torch.stack([q1, q2], dim=-1)

I then obtain the Jacobian of this inverse like so:

def Jh_inv_func(analytic_inverse, rp, th):
    """
    Jacobian dq/dtheta of the inverse coordinate map at th.

    Args:
        analytic_inverse: callable (rp, th) -> q
        rp: dictionary of robot parameters, held fixed
        th: theta coordinates the Jacobian is evaluated at
    Returns:
        the Jacobian tensor; create_graph=True keeps it differentiable
        with respect to th so it can be differentiated again downstream.
    """
    def _inverse_at(th_point):
        # Fix rp so the Jacobian is taken with respect to th only.
        return analytic_inverse(rp, th_point)

    return torch.autograd.functional.jacobian(_inverse_at, th,
                                              create_graph=True)

The dynamical matrices in theta-coordinates can then be calculated as:

def transform_dynamical_from_inverse(M_q, C_q, G_q, theta, theta_d,
                                     J_h_inv, J_h_inv_trans):
    """
    Transform the dynamical matrices from q-coordinates to theta-coordinates,
    given J_h_inv = dq/dtheta (Jacobian of the inverse map) and its transpose.

    M_th is the congruence transform J^T M_q J.  C_th is rebuilt from the
    Christoffel symbols of M_th:
        C_th[i, j] = sum_k 0.5 * (dM_ij/dth_k + dM_ik/dth_j
                                  - dM_jk/dth_i) * theta_d[k]
    which requires every entry of M_th to be connected to `theta` in the
    autograd graph.

    NOTE(review): torch.autograd.grad only differentiates along the recorded
    graph.  If M_q was evaluated at a q that was NOT itself computed from
    `theta` (e.g. q was the leaf and theta derived from it), the dM_q/dtheta
    contribution is silently missing from grad(M_th[i,j], theta) and C_th
    comes out wrong -- confirm M_q is built from q = analytic_inverse(rp, theta)
    with the graph intact.

    NOTE(review): the Coriolis matrix is not unique; only the product
    C @ theta_d (and the skew-symmetry of M_dot - 2C) is.  An entrywise
    mismatch against a differently-derived C(theta) is not by itself an
    error -- compare C_th @ theta_d instead.
    """

    # @ performs batched matrix multiplication on the terms

    # Inertia matrix in theta-coordinates: J^T M_q J.
    M_th = J_h_inv_trans @ M_q @ J_h_inv

    # Assemble C_th entry by entry from first derivatives of M_th w.r.t. theta.
    # create_graph=True keeps each gradient differentiable for later use.
    C_th = torch.zeros(M_th.size()).to(M_q.device)
    for i in range(C_th.size(0)):
        for j in range(C_th.size(1)):
            for k in range(C_th.size(1)):
                # dM_th[i,j]/dtheta_k, dM_th[i,k]/dtheta_j, dM_th[j,k]/dtheta_i
                M_th_dot_ijk = torch.autograd.grad(M_th[i,j], theta, create_graph=True)[0][k]
                M_th_dot_ikj = torch.autograd.grad(M_th[i,k], theta, create_graph=True)[0][j]
                M_th_dot_jki = torch.autograd.grad(M_th[j,k], theta, create_graph=True)[0][i]
                C_th[i, j] += 0.5 * (M_th_dot_ijk + M_th_dot_ikj - M_th_dot_jki) * theta_d[k]

    # Gravity vector transforms covariantly: J^T G_q.
    G_th = J_h_inv_trans @ G_q

    
    return M_th, C_th, G_th

When I run this code, the computation of theta, the (inverse) Jacobians, and the dynamical matrices in q-space are all correct!

Also the calculation of M(th) and G(th) is correct.
However, something goes wrong in the calculation of C(th, th_d).

Does anyone understand what I am doing wrong? Or is there some problem with taking such derivatives using torch.autograd.grad()?