Hi All,
I have C++ module subclass torch::nn::Module. I built the module into libmnist_torch.so.
// LeNet-style network for MNIST: two conv+avg-pool stages followed by
// three fully-connected layers ending in 10 class logits.
// NOTE: the original paste used Unicode curly quotes (“conv1_”) around the
// submodule names; C++ string literals require plain ASCII double quotes,
// so the code below would not have compiled as posted.
MNISTImpl::MNISTImpl() {
  // register_module() both stores the submodule and makes its parameters
  // visible to Module::to(), Module::parameters(), serialization, etc.
  conv1_ = register_module("conv1_",
      torch::nn::Conv2d(torch::nn::Conv2dOptions(1, 6, 5).stride(1)));
  pool1_ = register_module("pool1_",
      torch::nn::AvgPool2d(torch::nn::AvgPool2dOptions(2).stride(2)));
  conv2_ = register_module("conv2_",
      torch::nn::Conv2d(torch::nn::Conv2dOptions(6, 16, 5).stride(1)));
  pool2_ = register_module("pool2_",
      torch::nn::AvgPool2d(torch::nn::AvgPool2dOptions(2).stride(2)));
  // 16 channels * 5x5 spatial = 400 flattened features into the classifier.
  fc1_ = register_module("fc1_", torch::nn::Linear(400, 120));
  fc2_ = register_module("fc2_", torch::nn::Linear(120, 84));
  fc3_ = register_module("fc3_", torch::nn::Linear(84, 10));
}
…
Then from another project’s build script, I link libmnist_torch.so,
// Pick the compute device once, up front: the first CUDA GPU when one is
// present, otherwise fall back to the CPU.
const torch::Device device = torch::cuda::is_available()
    ? torch::Device(torch::kCUDA, 0)
    : torch::Device(torch::kCPU);
…
myMNIST = MNIST();
myMNIST->to(device);
myMNIST->to(device)
triggers SEGV exception. The stacktrace is
#1 0x00007fc8029ca181 in std::unique_ptr<THCState, void ()(THCState)>::reset (
this=0x7fc7fef36c58 at::globalContext()::globalContext_+24,
__p=0x7fc7f8ce40f0 at::Context::Context():{lambda(THHState*)#2}::_FUN(THHState*))
at …/gcc-7.4.0/bin/…/include/c++/7.4.0/bits/unique_ptr.h:376
#2 0x00007fc8029c9e82 in std::unique_ptr<THCState, void ()(THCState)>::operator= (
this=0x7fc7fef36c58 at::globalContext()::globalContext_+24, __u=…)
at …/gcc-7.4.0/bin/…/include/c++/7.4.0/bits/unique_ptr.h:283
#3 0x00007fc8029c4064 in at::Context::lazyInitCUDA()::{lambda()#1}::operator()() const (this=0x7fc8091bd150)
** at …/pytorch/torch/include/ATen/Context.h:74**
#4 0x00007fc8029d7def in std::__invoke_impl<void, at::Context::lazyInitCUDA()::{lambda()#1}>(std::__invoke_other, at::Context::lazyInitCUDA()::{lambda()#1}&&) (__f=…)
at …/gcc-7.4.0/bin/…/include/c++/7.4.0/bits/invoke.h:60
…
Frame 3 shows
(gdb) f 3
#3 0x00007fc8029c4064 in at::Context::lazyInitCUDA()::{lambda()#1}::operator()() const (this=0x7fc8091bd150)
at …/pytorch/torch/include/ATen/Context.h:74
74 thc_state = detail::getCUDAHooks().initCUDA();
It seems that the myMNIST->to(device) call (which lazily runs lazyInitCUDA(), per frame #3 above) tries to initialize CUDA a second time and fails.
My questions are:
- Why does the second CUDA initialization, triggered by
myMNIST->to(device)
inside the .so, fail when the caller of that .so has already initialized CUDA via `device = torch::Device(torch::kCUDA, 0);`?
- How can I pass a flag to torch so that
myMNIST->to(device)
does not call initCUDA()
again?
Thank you very much.