Backward step in the generator regularization is much slower in LibTorch than in PyTorch

I compute the path length penalty loss in the function g_path_regularize (listed at the end of this post), and the generator regularization step looks like this:

        std::vector<torch::Tensor> res = generator->forward(noise, true);
        fake_img = res[0];
        latents = res[1];

        // path_loss is the path_penalty returned by g_path_regularize (listed below)
        weighted_path_loss = args.parameter * path_loss;
        generator->zero_grad();

        // synchronize the current CUDA stream so the timer brackets only the backward pass
        at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream();
        AT_CUDA_CHECK(cudaStreamSynchronize(stream));
        std::clock_t start = std::clock();

        weighted_path_loss.backward();
        stream = at::cuda::getCurrentCUDAStream();
        AT_CUDA_CHECK(cudaStreamSynchronize(stream));

        double duration = (std::clock() - start) / (double) CLOCKS_PER_SEC;
        std::cout << "Time spent backward: " << duration << '\n';

From the output window I saw “Time spent backward: 294.73”, which means one regularization step takes around five minutes to finish. In Python with PyTorch, the same regularization step finishes in less than 10 seconds. I wonder what the bottleneck could be. Is there any way to debug the backward step? Any input would be greatly appreciated!

Here is g_path_regularize for reference:

    // Path length regularization penalty.
    // Returns {path_penalty, detached running mean of the path length, path_lengths}.
    std::vector<torch::Tensor> g_path_regularize(
            const torch::Tensor& fake_img,
            const torch::Tensor& latents,
            const torch::Tensor& mean_path_length,
            torch::Scalar decay = 0.01) {
        auto noise = torch::randn_like(fake_img) /
                std::sqrt(static_cast<double>(fake_img.size(2) * fake_img.size(3)));

        // d((fake_img * noise).sum()) / d(latents); the graph is kept so the penalty stays differentiable
        auto grad = torch::autograd::grad(
                {(fake_img * noise).sum()}, {latents},
                /*grad_outputs=*/{}, /*retain_graph=*/true, /*create_graph=*/true)[0];

        auto path_lengths = torch::sqrt(torch::mean(torch::sum(torch::mul(grad, grad), 2), 1));

        // exponential moving average of the path length (a tensor)
        auto path_mean = mean_path_length + decay * (path_lengths.mean() - mean_path_length);

        auto path_penalty = torch::mean(torch::mul(path_lengths - path_mean, path_lengths - path_mean));

        return {path_penalty, path_mean.detach(), path_lengths};
    }
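
For completeness, this is roughly how the function is wired into the step above; the call site isn't shown in the first snippet, so this is only a sketch based on the return order {path_penalty, path_mean.detach(), path_lengths}:

    // Sketch of the assumed call site; mean_path_length is carried across iterations.
    std::vector<torch::Tensor> reg = g_path_regularize(fake_img, latents, mean_path_length);
    torch::Tensor path_loss = reg[0];          // the penalty that gets weighted and backpropagated
    mean_path_length = reg[1];                 // detached running mean, reused in the next step
    torch::Tensor path_lengths = reg[2];       // per-sample path lengths

    weighted_path_loss = args.parameter * path_loss;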