I compute a path length penalty loss in the function `g_path_regularize` (shown at the bottom). The generator regularization step looks like this:

```
// Generator path-length regularization step with the backward pass timed.
//
// Timing uses std::chrono::steady_clock (requires #include <chrono>) instead of
// std::clock(): std::clock() reports processor time consumed by the process,
// which on POSIX systems is accumulated over all threads, so it is not a
// measure of elapsed wall-clock time.
std::vector<torch::Tensor> res = generator->forward(noise, true);
fake_img = res[0];
latents = res[1];
// NOTE(review): path_loss is not computed anywhere in this snippet; presumably
// it is the penalty returned by g_path_regularize(fake_img, latents, ...) —
// confirm against the surrounding code.
weighted_path_loss = args.parameter * path_loss;
generator->zero_grad();

// Synchronize the current CUDA stream before starting the timer so that work
// queued earlier is not attributed to the backward pass.
at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream();
AT_CUDA_CHECK(cudaStreamSynchronize(stream));
const auto wall_start = std::chrono::steady_clock::now();

weighted_path_loss.backward();

// Synchronize again before stopping the timer so that GPU work launched by
// backward() has finished and is included in the measurement.
stream = at::cuda::getCurrentCUDAStream();
AT_CUDA_CHECK(cudaStreamSynchronize(stream));
const double duration =
    std::chrono::duration<double>(std::chrono::steady_clock::now() - wall_start).count();
std::cout<<"Time spent backward: "<< duration <<'\n';
```

The output window shows “Time spent backward: 294.73”, i.e. the backward pass of a single regularization step took about 5 minutes. The equivalent regularization step in Python with PyTorch finishes in less than 10 seconds. What could be the bottleneck, and is there any way to debug or profile the backward step? Any input would be greatly appreciated!

```
/// Path-length regularization term for the generator.
///
/// Multiplies `fake_img` by random noise scaled by 1 / sqrt(size(2) * size(3)),
/// differentiates the summed product with respect to `latents` (keeping the
/// graph so the result can itself be differentiated), and reduces the squared
/// gradient to one length per entry of dimension 0 (sum over dim 2, mean over
/// dim 1, square root). The penalty is the mean squared deviation of those
/// lengths from the updated running mean.
///
/// @param fake_img          Generator output; dimensions 2 and 3 set the noise scale
///                          (presumably height and width — confirm with caller).
/// @param latents           Tensor the gradient is taken with respect to; must be
///                          part of the graph that produced `fake_img`.
/// @param mean_path_length  Running mean of the path lengths from earlier steps.
/// @param decay             Weight of the current batch mean in the running-mean update.
/// @return {path_penalty, updated running mean (detached), path_lengths}
std::vector<torch::Tensor> g_path_regularize(const torch::Tensor& fake_img, const torch::Tensor& latents,
                                             const torch::Tensor& mean_path_length, Scalar decay=0.01){
    // Random projection of the image, normalized by the dim-2 x dim-3 element count.
    const auto pixel_count = fake_img.size(2) * fake_img.size(3);
    auto perturbation = torch::randn_like(fake_img) / sqrt(pixel_count);
    auto projected = (fake_img * perturbation).sum();

    // Gradient of the projection w.r.t. the latents; the graph is retained and
    // extended so that the penalty below can be backpropagated through it.
    auto latent_grad = torch::autograd::grad(
        {projected}, {latents}, /*grad_outputs=*/{}, /*retain_graph=*/true, /*create_graph=*/true)[0];

    // Length of the gradient: sum of squares over dim 2, mean over dim 1, sqrt.
    auto squared_grad = torch::mul(latent_grad, latent_grad);
    auto path_lengths = torch::sqrt(torch::mean(torch::sum(squared_grad, 2), 1));

    // Move the running mean towards the current batch mean by `decay`.
    auto updated_mean = mean_path_length + decay * (path_lengths.mean() - mean_path_length);

    // Mean squared deviation of each path length from the updated running mean.
    auto deviation = path_lengths - updated_mean;
    auto path_penalty = torch::mean(torch::mul(deviation, deviation));

    return {path_penalty, updated_mean.detach(), path_lengths};
}
```