Autograd loses track of the gradient when torch::autograd::grad calls are nested

I have the following function, which is meant to compute the third-order derivative of a Hamiltonian H with respect to p:

#include <torch/torch.h>

using torch::indexing::Slice;

template<typename T>
torch::Tensor ppppppH(const torch::Tensor &x, 
                      const torch::Tensor &p, 
                      T W,
                      std::function<torch::Tensor(const torch::Tensor&, const torch::Tensor&, T)> H)
{
    // Clone the inputs: pt will track gradients, xt will not
    auto xt = x.clone();
    xt.detach_();
    xt=xt.set_requires_grad(false);
    auto pt = p.clone();
    pt.detach_();
    pt=pt.set_requires_grad(true);

    std::cerr << "After setting gradient in ppppppH" << std::endl;
    std::cerr << "xt.requires_grad() = " << xt.requires_grad() << std::endl;
    std::cerr << "pt.requires_grad() = " << pt.requires_grad() << std::endl;

    // Compute the Hamiltonian value
    auto Hvalue = H(xt, pt, W);
    std::cerr << "Hvalue.requires_grad() = " << Hvalue.requires_grad() << std::endl;

    // Compute the first-order gradient of H with respect to p
    auto grad_H_p = torch::autograd::grad({Hvalue}, 
                                          {pt}, 
                                          {torch::ones_like(Hvalue)}, 
                                          true, 
                                          true, 
                                          true)[0];
    // Initialize a tensor to hold the third-order gradients
    std::cerr << "grad_H_p.requires_grad() = " << grad_H_p.requires_grad() << std::endl;
    std::cerr << "grad_H_p.grad_fn() = " << grad_H_p.grad_fn() << std::endl;
    std::cerr << "grad_H_p shape = " << grad_H_p.sizes() << std::endl;
    std::cerr << "grad_H_p values = " << grad_H_p << std::endl;

    auto batch_size = p.size(0);
    auto p_dim = p.size(1);
    torch::Tensor third_order_derivative = torch::zeros({batch_size, p_dim, p_dim, p_dim}, p.options());

    // Compute the second-order and third-order gradients
    for (int i = 0; i < p_dim; ++i)
    {
        // Zero out the gradients of pt before computing the gradient for the next dimension
        if (pt.grad().defined()) {
            pt.grad().zero_();
        }

        // Compute the gradient of the i-th component of grad_H_p with respect to p
        auto grad_H_pati = grad_H_p.index({Slice(), i});
        auto grad_H_p_i = torch::autograd::grad({grad_H_pati}, 
                                                {pt}, 
                                                {torch::ones_like(grad_H_pati)}, 
                                                true, 
                                                true, 
                                                true)[0];
        std::cerr << "grad_H_p_i.requires_grad() = " << grad_H_p_i.requires_grad() << std::endl;
        std::cerr << "pt.requires_grad() = " << pt.requires_grad() << std::endl;
        for (int j = 0; j < p_dim; ++j)
        {
            // Zero out the gradients of pt before computing the gradient for the next dimension
            if (pt.grad().defined()) {
                pt.grad().zero_();
            }
            // Check whether grad_H_p_i lost the graph
            if (!grad_H_p_i.requires_grad()) {
                std::cerr << "grad_H_p_i lost gradient tracking at this point." << std::endl;
                grad_H_p_i.set_requires_grad(true);
            }

            // Compute the gradient of the j-th component of grad_H_p_i with respect to p
            auto grad_H_p_ij = torch::autograd::grad({grad_H_p_i.index({Slice(), j})}, 
                                                     {pt}, 
                                                     {torch::ones_like(grad_H_p_i.index({Slice(), j}))}, 
                                                     true, 
                                                     false, 
                                                     true)[0];
            std::cerr << "grad_H_p_ij.requires_grad() = " << grad_H_p_ij.requires_grad() << std::endl;

            if (!grad_H_p_ij.requires_grad()) {
                std::cerr << "grad_H_p_ij lost gradient tracking at this point." << std::endl;
                continue;
            }

            //third_order_derivative.select(1, i).select(2, j).copy_(grad_H_p_ij);
            third_order_derivative.index_put_({Slice(), i, j}, grad_H_p_ij);
        }
    }

    // Return the third-order derivative tensor
    return third_order_derivative;
}
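
As an aside, the clone/detach/set_requires_grad sequence at the top of ppppppH can be written more compactly; as far as I can tell this is purely stylistic and behaves the same way:

    auto xt = x.clone().detach();                       // no gradient tracking for x
    auto pt = p.clone().detach().requires_grad_(true);  // fresh leaf that tracks gradients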

The code is tested using the following functions:

torch::Tensor control(const torch::Tensor& x1, 
                      const torch::Tensor& x2,
                      const torch::Tensor& p1,
                      const torch::Tensor& p2, 
                      double W=1.0) {
    auto u = -p2 * ((1 - x1 * x1) * x2 - x1) / W;
    return u; // Return through copy elision
}

torch::Tensor hamiltonian(const torch::Tensor& x, 
                          const torch::Tensor& p, 
                          double W) {
    torch::Tensor p1 = p.index({Slice(), 0});  
    torch::Tensor p2 = p.index({Slice(), 1});  
    torch::Tensor x1 = x.index({Slice(), 0});  
    torch::Tensor x2 = x.index({Slice(), 1});  
    torch::Tensor u = control(x1, x2, p1, p2, W);

    auto H = p1 * x2 + p2 * u * ((1 - x1 * x1) * x2 - x1) + W * u * u / 2 + 1;
    return H;
}



#include <gtest/gtest.h>

TEST(HamiltonianTest, ppppppHTest)
{
    torch::Tensor x = torch::tensor({0.5, -0.5}).view({1, -1}); // example tensor
    torch::Tensor p = torch::tensor({1.0, -1.0}).view({1, -1}); // example tensor
    double W = 1.0; // example parameter

    auto result = ppppppH<double>(x, p, W, hamiltonian);

    // Print the result for verification
    std::cout << result << std::endl;

}

Autograd appears to lose track of the gradient in the inner loop: grad_H_p_i comes back with requires_grad() == false, so I have to call set_requires_grad(true) on it manually before the next grad call. This is easy to reproduce, and I believe it should not be necessary.
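
For what it's worth, here is a minimal standalone sketch of what I mean by easy to reproduce. It is my assumption that it shows the same behavior as the full Hamiltonian (which, I believe, is only quadratic in p); the names y, g1, g2 are just for this example:

#include <torch/torch.h>
#include <iostream>

int main() {
    // pt is the only leaf that tracks gradients
    auto pt = torch::tensor({1.0, -1.0}, torch::requires_grad(true));

    // A scalar that is quadratic in pt, standing in for the Hamiltonian
    auto y = (pt * pt).sum();

    // First-order gradient: 2*pt, still attached to the graph
    auto g1 = torch::autograd::grad({y}, {pt}, {torch::ones_like(y)},
                                    true, true, true)[0];
    std::cerr << "g1.requires_grad() = " << g1.requires_grad() << std::endl;  // expect 1

    // Second-order gradient: constant in pt, and requires_grad() comes back false,
    // which looks like the same loss of tracking I see for grad_H_p_i
    auto g1_sum = g1.sum();
    auto g2 = torch::autograd::grad({g1_sum}, {pt}, {torch::ones_like(g1_sum)},
                                    true, true, true)[0];
    std::cerr << "g2.requires_grad() = " << g2.requires_grad() << std::endl;  // expect 0
    return 0;
}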