I’m getting a segmentation fault when running loss backward.
int n_a = 31;
int n_b = 100;
torch::Tensor dynamic_parameters = torch::full({1}, /*value=*/0.0002, torch::dtype(torch::kFloat64).requires_grad(true));
torch::optim::SGD optimizer({dynamic_parameters}, /*lr=*/0.01);
torch::Tensor target = torch::zeros({n_a}, torch::dtype(torch::kFloat64).requires_grad(false));
torch::Tensor dots = torch::zeros({n_a, n_b}, torch::dtype(torch::kFloat64).requires_grad(false));
// I fill this with some data
torch::Tensor magnitudes = torch::zeros({n_b}, torch::dtype(torch::kFloat64).requires_grad(false));
// I fill this out with some data
torch::Tensor prediction = torch::matmul(dots, magnitudes);
torch::Tensor loss = torch::nn::functional::mse_loss(prediction, target);
loss.backward();
I’m getting
Abnormal program termination: received signal 11 (Segmentation fault)
I printed the loss, in case that helps:
1.31119e+07
[ CPUDoubleType{} ]
I’m only setting requires_grad on the parameter I’m using SGD on - should I be setting it on all of these tensors?
Which libtorch version are you using? If it is an older one, could you try the latest release and see if you are still getting this seg fault?
If so, could you get the backtrace via:
Thread 1 "python" received signal SIGSEGV, Segmentation fault.
_PyMethodDef_RawFastCallDict ()
at /tmp/build/80754af9/python_1565725737370/work/Objects/call.c:464
464 /tmp/build/80754af9/python_1565725737370/work/Objects/call.c: No such file or directory.
Thread 50 (Thread 0x7fff00868700 (LWP 18409)):
#0 0x00007ffff7bc1ad3 in futex_wait_cancelable (private=<optimised out>, expected=0, futex_word=0x55555a5de548)
at ../sysdeps/unix/sysv/linux/futex-internal.h:88
#1 __pthread_cond_wait_common (abstime=0x0, mutex=0x55555a5de550, cond=0x55555a5de520) at pthread_cond_wait.c:502
#2 __pthread_cond_wait (cond=0x55555a5de520, mutex=0x55555a5de550) at pthread_cond_wait.c:655
#3 0x00007fffb57f54cb in __gthread_cond_wait (__mutex=<optimised out>, __cond=<optimised out>)
at /home/nwani/m3/conda-bld/compilers_linux-64_1560109574129/work/.build/x86_64-conda_cos6-linux-gnu/build/build-cc-gcc-final/x86_64-conda_cos6-linux-gnu/libstdc++-v3/include/x86_64-conda_cos6-linux-gnu/bits/gthr-default.h:878
#4 std::condition_variable::wait (this=<optimised out>, __lock=...)
at /home/nwani/m3/conda-bld/compilers_linux-64_1560109574129/work/.build/x86_64-conda_cos6-linux-gnu/src/gcc/libstdc++-v3/src/c++11/condition_variable.cc:53
#5 0x00007fff8cf9cb8b in torch::autograd::ReadyQueue::pop() () from /home/ian/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so
#6 0x00007fff8cfa1cd9 in torch::autograd::Engine::thread_main(std::shared_ptr<torch::autograd::GraphTask> const&) ()
from /home/ian/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so
#7 0x00007fff8cf996a9 in torch::autograd::Engine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) ()
from /home/ian/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_cpu.so
#8 0x00007fff91ec451a in torch::autograd::python::PythonEngine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) ()
from /home/ian/anaconda3/lib/python3.7/site-packages/torch/lib/libtorch_python.so
#9 0x00007fffb57f919d in std::execute_native_thread_routine (__p=0x55555a5aec80)
at /home/nwani/m3/conda-bld/compilers_linux-64_1560109574129/work/.build/x86_64-conda_cos6-linux-gnu/src/gcc/libstdc++-v3/src/c++11/thread.cc:80
#10 0x00007ffff7bbb6db in start_thread (arg=0x7fff00868700) at pthread_create.c:463
#11 0x00007ffff78e471f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
Thread 49 (Thread 0x7fff00067700 (LWP 18408)):
#0 0x00007ffff78e60c7 in accept4 (fd=15, addr=..., addr_len=0x7fff00066df8, flags=524288) at ../sysdeps/unix/sysv/linux/accept4.c:32
#1 0x00007ffeeed009c6 in ?? () from /usr/lib/x86_64-linux-gnu/libcuda.so.1
#2 0x00007ffeeecf1f6d in ?? () from /usr/lib/x86_64-linux-gnu/libcuda.so.1
#3 0x00007ffeeed02a18 in ?? () from /usr/lib/x86_64-linux-gnu/libcuda.so.1
#4 0x00007ffff7bbb6db in start_thread (arg=0x7fff00067700) at pthread_create.c:463
#5 0x00007ffff78e471f in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:95