Segmentation fault when calling torch::jit::Module::forward

Hello! I was loading an nn.Module generated by GPyTorch.

I am using the example at https://github.com/cornellius-gp/gpytorch/blob/master/examples/04_Variational_and_Approximate_GPs/Modifying_the_variational_strategy_and_distribution.ipynb, with the CPU as the training device.
I traced the model with the same code as in https://github.com/cornellius-gp/gpytorch/blob/master/examples/08_Advanced_Usage/TorchScript_Variational_Models.ipynb (CPU only).

The traced model works fine in Python.

My C++ front-end implementation is:
torch::Device device_ = torch::kCPU
try {
torch_model_ = torch::jit::load(torch_model_file, torch::device_);
} catch (const c10::Error& e) {
return;
}
std::vector<torch::jit::IValue> torch_inputs;
torch_inputs.push_back(torch::ones({1024, 18}));
auto torch_output = torch_model_.forward(torch_inputs);
I got a segmentation fault in the forward function.

Any suggestions? Thanks!

Here are the gdb backtrace results:
#0 0x00007ffff7445903 in spotrf () from /opt/xxx/sysroot/lib/libqpOASES.so
#1 0x00007fffe76f8615 in void at::native::lapackCholesky(char, int, float*, int, int*) ()
from /usr/local/libtorch_gpu/lib/libtorch_cpu.so
#2 0x00007fffe76ff13a in at::native::cholesky_helper_cpu(at::Tensor const&, bool) ()
from /usr/local/libtorch_gpu/lib/libtorch_cpu.so
#3 0x00007fffe7cf0605 in at::CPUType::cholesky_helper(at::Tensor const&, bool) ()
from /usr/local/libtorch_gpu/lib/libtorch_cpu.so
#4 0x00007fffe7d2d727 in c10::detail::wrap_kernel_functor_unboxed_<c10::detail::WrapRuntimeKernelFunctor_<at::Tensor (*)(at::Tensor const&, bool), at::Tensor, c10::guts::typelist::typelist<at::Tensor const&, bool> >, at::Tensor (at::Tensor const&, bool)>::call(c10::OperatorKernel*, at::Tensor const&, bool) () from /usr/local/libtorch_gpu/lib/libtorch_cpu.so
#5 0x00007fffe770111a in at::native::cholesky(at::Tensor const&, bool) ()
from /usr/local/libtorch_gpu/lib/libtorch_cpu.so
#6 0x00007fffe7dfeb35 in at::TypeDefault::cholesky(at::Tensor const&, bool) ()
from /usr/local/libtorch_gpu/lib/libtorch_cpu.so
#7 0x00007fffe7d2d727 in c10::detail::wrap_kernel_functor_unboxed_<c10::detail::WrapRuntimeKernelFunctor_<at::Tensor (*)(at::Tensor const&, bool), at::Tensor, c10::guts::typelist::typelist<at::Tensor const&, bool> >, at::Tensor (at::Tensor const&, bool)>::call(c10::OperatorKernel*, at::Tensor const&, bool) () from /usr/local/libtorch_gpu/lib/libtorch_cpu.so
#8 0x00007fffe97ee200 in torch::autograd::VariableType::cholesky(at::Tensor const&, bool) ()
from /usr/local/libtorch_gpu/lib/libtorch_cpu.so
#9 0x00007fffe7d2d727 in c10::detail::wrap_kernel_functor_unboxed_<c10::detail::WrapRuntimeKernelFunctor_<at::Tensor (*)(at::Tensor const&, bool), at::Tensor, c10::guts::typelist::typelist<at::Tensor const&, bool> >, at::Tensor (at::Tensor const&, bool)>::call(c10::OperatorKernel*, at::Tensor const&, bool) () from /usr/local/libtorch_gpu/lib/libtorch_cpu.so
#10 0x00007fffe96c289a in torch::jit::(anonymous namespace)::{lambda(std::vector<c10::IValue, std::allocator<c10::IValue> >&)#108}::_FUN () from /usr/local/libtorch_gpu/lib/libtorch_cpu.so
#11 0x00007fffe9ac8c7d in torch::jit::InterpreterStateImpl::runImpl(std::vector<c10::IValue, std::allocator<c10::IValue> >&) () from /usr/local/libtorch_gpu/lib/libtorch_cpu.so
#12 0x00007fffe9abf9d1 in torch::jit::InterpreterState::run(std::vector<c10::IValue, std::allocator<c10::IValue> >&) () from /usr/local/libtorch_gpu/lib/libtorch_cpu.so
#13 0x00007fffe9a9d341 in torch::jit::GraphExecutorImplBase::run(std::vector<c10::IValue, std::allocator<c10::IValue> >&) () from /usr/local/libtorch_gpu/lib/libtorch_cpu.so
#14 0x00007fffe9dbac0a in torch::jit::GraphFunction::operator()(std::vector<c10::IValue, std::allocator<c10::IValue> >, std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, c10::IValue, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, c10::IValue> > > const&) ()
from /usr/local/libtorch_gpu/lib/libtorch_cpu.so
#15 0x00007fffe9d6f514 in torch::jit::Method::operator()(std::vector<c10::IValue, std::allocator<c10::IValue> >, std::unordered_map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, c10::IValue, std::hash<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::equal_to<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > >, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, c10::IValue> > > const&) () from /usr/local/libtorch_gpu/lib/libtorch_cpu.so
#16 0x0000555555b2f22d in torch::jit::Module::forward (this=0x5555cb3673a0,
inputs=std::vector of length 0, capacity 0)