Segmentation Fault at end of execution Libtorch

Hi, I’m new to libtorch and I’m trying to run the model I trained in PyTorch in C++.
The problem I’m running into is that, while the code runs correctly and the model evaluates properly, the program crashes with a segmentation fault at the end of execution:

(pytorch2.5.0) epimetheus@epimetheus-B650-EAGLE-AX:~/pytorch$ ./libtest
HSA_OVERRIDE_GFX_VERSION: 10.3.0
Model loaded successfully.
[W1021 23:59:39.241343365 Context.cpp:296] Warning: Attempting to use hipBLASLt on an unsupported architecture! Overriding blas backend to hipblas (function operator())
Model output: -34.4764
-26.7665
 87.0915
[ CPUFloatType{3} ]
Segmentation fault (core dumped)

my code is the following:

#include <torch/script.h> // One-stop header.
#include <memory>
#include <cstdlib>
#include <iostream>

/// Loads the TorchScript model "iris_model_torchscript.pt", runs it on one
/// fixed four-element input tensor and prints the output.
///
/// @return 0 on success; -1 if the environment variable cannot be set or the
///         model cannot be loaded.
int main() {
    // Set HSA_OVERRIDE_GFX_VERSION for this process (the final argument 1
    // means an already-present value is overwritten).
    if (setenv("HSA_OVERRIDE_GFX_VERSION", "10.3.0", 1) != 0) {
        std::cerr << "Error setting environment variable\n";
        return -1;
    }

    // Read the variable back and echo it so the effective value shows up in
    // the program output. A missing value is reported but is not fatal.
    const char* value = std::getenv("HSA_OVERRIDE_GFX_VERSION");
    if (value) {
        std::cout << "HSA_OVERRIDE_GFX_VERSION: " << value << std::endl;
    } else {
        std::cerr << "Environment variable not found\n";
    }

    torch::jit::script::Module module;
    try {
        module = torch::jit::load("iris_model_torchscript.pt");
    } catch (const c10::Error& e) {
        // Print the exception text as well: without it there is no way to
        // tell a missing file from a corrupt or incompatible archive.
        std::cerr << "Error loading the model\n" << e.what() << '\n';
        return -1;
    }

    std::cout << "Model loaded successfully.\n";

    // Fixed (not random) sample of four values, created on the at::kCUDA device.
    // NOTE(review): the module is never moved to a device explicitly, so this
    // presumably relies on the model having been saved from the GPU - confirm.
    at::Tensor input = torch::tensor({1.4, 0.2, 4.5, 9.6}, at::kCUDA);

    at::Tensor output = module.forward({input}).toTensor();

    // Copy the result to the CPU before printing it.
    std::cout << "Model output: " << output.cpu() << std::endl;

    return 0;
}

Is there anything obvious I’m missing? I don’t think the program does much of anything yet.

I don’t see anything obviously wrong, so could you try to grab the stacktrace via:

gdb --args ./application
...
r
...
bt
(gdb) r
Starting program: /home/epimetheus/pytorch/libtest 
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
warning: could not find '.gnu_debugaltlink' file for /lib/x86_64-linux-gnu/libtinfo.so.6
warning: could not find '.gnu_debugaltlink' file for /home/epimetheus/libtorch/lib/libtinfo.so
HSA_OVERRIDE_GFX_VERSION: 10.3.0
[New Thread 0x7ffd9c6006c0 (LWP 12077)]
[New Thread 0x7ffd9bc006c0 (LWP 12078)]
[Thread 0x7ffd9bc006c0 (LWP 12078) exited]
[New Thread 0x7ffd996006c0 (LWP 12086)]
Model loaded successfully.
[W1022 11:48:43.997286476 Context.cpp:296] Warning: Attempting to use hipBLASLt on an unsupported architecture! Overriding blas backend to hipblas (function operator())
[New Thread 0x7ffc4be006c0 (LWP 12090)]
[Thread 0x7ffc4be006c0 (LWP 12090) exited]
Model output: -34.4764
-26.7665
 87.0915
[ CPUFloatType{3} ]
[Thread 0x7ffd996006c0 (LWP 12086) exited]
[Thread 0x7ffd9c6006c0 (LWP 12077) exited]

Thread 1 "libtest" received signal SIGSEGV, Segmentation fault.
0x000055555b2302e0 in ?? ()
(gdb) bt
#0  0x000055555b2302e0 in ?? ()
#1  0x00007fffa1592b7c in ?? () from /opt/rocm-6.2.2/lib/libamdhip64.so.6
#2  0x00007fffa1592c9d in ?? () from /opt/rocm-6.2.2/lib/libamdhip64.so.6
#3  0x00007fffa15a39ef in ?? () from /opt/rocm-6.2.2/lib/libamdhip64.so.6
#4  0x00007fffa159f253 in ?? () from /opt/rocm-6.2.2/lib/libamdhip64.so.6
#5  0x00007fffa159f26d in ?? () from /opt/rocm-6.2.2/lib/libamdhip64.so.6
#6  0x00007fffa15a39ef in ?? () from /opt/rocm-6.2.2/lib/libamdhip64.so.6
#7  0x00007fffa131797f in ?? () from /opt/rocm-6.2.2/lib/libamdhip64.so.6
#8  0x00007fffa1318180 in ?? () from /opt/rocm-6.2.2/lib/libamdhip64.so.6
#9  0x00007fffa12cbfca in ?? () from /opt/rocm-6.2.2/lib/libamdhip64.so.6
#10 0x00007fffa14a3950 in ?? () from /opt/rocm-6.2.2/lib/libamdhip64.so.6
#11 0x00007ffdef523212 in ?? () from /home/epimetheus/libtorch/lib/librocsparse.so
#12 0x00007fffa0a47372 in __cxa_finalize (d=0x7ffdf1f3a758) at ./stdlib/cxa_finalize.c:82
#13 0x00007ffdef51b91e in ?? () from /home/epimetheus/libtorch/lib/librocsparse.so
#14 0x00007ffff7fc60f2 in _dl_call_fini (closure_map=closure_map@entry=0x7fffa2868000) at ./elf/dl-call_fini.c:43
#15 0x00007ffff7fca578 in _dl_fini () at ./elf/dl-fini.c:114
#16 0x00007fffa0a47a66 in __run_exit_handlers (status=0, listp=<optimized out>, run_list_atexit=run_list_atexit@entry=true, run_dtors=run_dtors@entry=true) at ./stdlib/exit.c:108
#17 0x00007fffa0a47bae in __GI_exit (status=<optimized out>) at ./stdlib/exit.c:138
#18 0x00007fffa0a2a1d1 in __libc_start_call_main (main=main@entry=0x555555559740 <main>, argc=argc@entry=1, argv=argv@entry=0x7fffffffdeb8)
    at ../sysdeps/nptl/libc_start_call_main.h:74
#19 0x00007fffa0a2a28b in __libc_start_main_impl (main=0x555555559740 <main>, argc=1, argv=0x7fffffffdeb8, init=<optimized out>, fini=<optimized out>, rtld_fini=<optimized out>, 
    stack_end=0x7fffffffdea8) at ../csu/libc-start.c:360
#20 0x0000555555559675 in _start ()

It seems your application segfaults during the teardown of librocsparse.so. I’m not an AMD expert so wouldn’t know what might be causing it.

I noticed the errors with the libraries so I tried compiling using this cmake:

cmake_minimum_required(VERSION 3.10)

# Use Clang++ as the C++ compiler. The compiler has to be chosen before
# project(), because project() is where CMake detects and configures it;
# changing it afterwards leaves the detected settings and the compiler that is
# actually invoked out of sync. A compiler given on the command line
# (-DCMAKE_CXX_COMPILER=...) or already stored in the cache takes precedence.
if(NOT DEFINED CMAKE_CXX_COMPILER)
  set(CMAKE_CXX_COMPILER clang++)
endif()

project(example-app)

# Set the C++ standard to C++17
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED YES)

# Path to the LibTorch CMake package (replace with your path, or override it
# with -DTorch_DIR=... when configuring).
set(Torch_DIR "/home/epimetheus/libtorch/share/cmake/Torch"
    CACHE PATH "Directory containing TorchConfig.cmake")

# Find LibTorch package
find_package(Torch REQUIRED)

# Append the compile flags exported by the Torch package
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")

# Add the executable you want to compile
add_executable(libtest2 libtest.cpp)

# Link the executable with LibTorch; the library list comes from the Torch
# package rather than being spelled out by hand.
target_link_libraries(libtest2 "${TORCH_LIBRARIES}")

instead of this:

# Manual build: compiles libtest.cpp against the libtorch headers and links the
# libtorch libraries from /home/epimetheus/libtorch/lib (also used as rpath),
# and additionally links libamdhip64 found via -L/opt/rocm/lib.
clang++ -std=c++17     -I/home/epimetheus/libtorch/include     -I/home/epimetheus/libtorch/include/torch/csrc/api/include     -L/home/epimetheus/libtorch/lib     -Wl,-rpath=/home/epimetheus/libtorch/lib     libtest.cpp -o libtest     -ltorch -ltorch_cpu -ltorch_hip -lc10 -ldl -pthread -lgomp -L/opt/rocm/lib -lamdhip64

and now it no longer crashes with a segmentation fault. It’s weird, though, because I would have expected a problem like this to show up as an error at link time rather than at runtime.

@Epimetheus I had a similar issue today. The root cause might be a linking issue.

I would suggest running ldd a.out | grep libamdhip64 and checking that the library is not linked twice (once from torch, once from /opt/rocm/lib).

In my case, the root cause was this - libamdhip64.so linked twice, once from the wrong location:

$ ldd a.out | grep libamdhip64
        libamdhip64.so.6 => /opt/rocm-6.2.4/lib/llvm/bin/../../../lib/libamdhip64.so.6 (0x000078f857c00000)
        libamdhip64.so => /scratch/felmarty/miniconda3/envs/py310/lib/python3.10/site-packages/torch/lib/libamdhip64.so (0x000078f854e00000)