Segfault Using Kineto With Libtorch

I have the following minimal code:

#include <torch/csrc/autograd/profiler.h>
#include <torch/torch.h>

int main(){

    torch::autograd::profiler::ProfilerConfig config(
        torch::autograd::profiler::ProfilerState::KINETO, // Change to ProfilerState::CUDA for GPU profiling
        false, // Record input shapes
        false, // Profile memory
        false, // With stack
        false, // with flops
        false // with modules
        // experimental config see: https://github.com/pytorch/pytorch/blob/90d5a6f001ef3ea40ef91ae20e050e39a6d550de/torch/csrc/profiler/orchestration/observer.cpp#L16
    );
    std::set<torch::profiler::impl::ActivityType> activities = {torch::autograd::profiler::ActivityType::CPU, torch::autograd::profiler::ActivityType::CUDA};
    std::unordered_set<at::RecordScope> scopes = {};
    torch::autograd::profiler::enableProfiler(config,activities,scopes);
    //----------------------------------------------------------------------
    // CPU operations
    torch::Tensor tensor_cpu = torch::randn({1000, 1000});
    for (int i = 0; i < 10; ++i) {
        tensor_cpu = tensor_cpu.matmul(tensor_cpu);
    }

    // GPU operations
    if (torch::cuda::is_available()) {
        torch::Tensor tensor_gpu = torch::randn({1000, 1000}, torch::device(torch::kCUDA));
        for (int i = 0; i < 10; ++i) {
            tensor_gpu = tensor_gpu.matmul(tensor_gpu);
        }
    } else {
        std::cout << "CUDA is not available. Skipping GPU operations." << std::endl;
    }
    //----------------------------------------------------------------------

    auto profiler_result = torch::autograd::profiler::disableProfiler();
    profiler_result->save("profile_results.json");
}

When I run it, I get the following error:

[ac630c8274b8:3344 :0:3344] Caught signal 11 (Segmentation fault: address not mapped to object at address (nil))
==== backtrace (tid:   3344) ====
 0 0x0000000000042520 __sigaction()  ???:0
 1 0x0000000004d5dbc4 torch::profiler::impl::kineto::startTrace()  :0
 2 0x0000000000005c12 main()  /workspace/src/scratch2.cu:17
 3 0x0000000000029d90 __libc_init_first()  ???:0
 4 0x0000000000029e40 __libc_start_main()  ???:0
 5 0x0000000000005925 _start()  ???:0
=================================
Segmentation fault (core dumped)

I cannot figure out how to solve this. libkineto.a is in the linked libraries and is found when I look at the paths.

kineto_LIBRARY is set as follows in CMakeCache.txt:

kineto_LIBRARY:FILEPATH=/usr/local/lib/libkineto.a

Here is the compile_commands.json entry for the debug build:

  "directory": "/workspace/build",
  "command": "/usr/local/cuda/bin/nvcc -forward-unknown-to-host-compiler -DUSE_C10D_GLOO -DUSE_C10D_MPI -DUSE_C10D_NCCL -DUSE_C10D_UCC -DUSE_DISTRIBUTED -DUSE_RPC -DUSE_TENSORPIPE --options-file CMakeFiles/ppo_optim_scratch2.dir/includes_CUDA.rsp  -DONNX_NAMESPACE=onnx_c2 -gencode arch=compute_90,code=sm_90 -Xcudafe --diag_suppress=cc_clobber_ignored,--diag_suppress=field_without_dll_interface,--diag_suppress=base_class_has_different_dll_interface,--diag_suppress=dll_interface_conflict_none_assumed,--diag_suppress=dll_interface_conflict_dllexport_assumed,--diag_suppress=bad_friend_decl --expt-relaxed-constexpr --expt-extended-lambda -g -std=c++20 -g -G --maxrregcount 64 -gencode arch=compute_90,code=sm_90 -std=c++20 -fconcepts -march=native -Xptxas -v -Xcompiler -fopenmp -D_GLIBCXX_USE_CXX11_ABI=1 -x cu -c /workspace/src/scratch2.cu -o CMakeFiles/ppo_optim_scratch2.dir/src/scratch2.cu.o",
  "file": "/workspace/src/scratch2.cu",
  "output": "CMakeFiles/ppo_optim_scratch2.dir/src/scratch2.cu.o"

Output of nvidia-smi:

Wed Jun 26 02:38:30 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA H100 PCIe               Off |   00000000:C1:00.0 Off |                    0 |
| N/A   31C    P0             47W /  350W |      13MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                                                         
+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
+-----------------------------------------------------------------------------------------+

I am using torch 2.3.0.

What could be causing the segfault?

Solved: I needed to download Kineto and link it explicitly.