Problems with CPU inference in C++ (Expected object of backend CUDA but got backend CPU for argument #2 'weight')

I am having problems with CPU inference in C++. Everything works fine on the GPU, but after moving the model to the CPU, it fails with this error:

terminate called after throwing an instance of 'std::runtime_error'
  what():  
Expected object of backend CUDA but got backend CPU for argument #2 'weight' (checked_tensor_unwrap at /pytorch/aten/src/ATen/Utils.h:70)
frame #0: std::function<std::string ()>::operator()() const + 0x11 (0x7fffe8abafe1 in /home/gulyaev/Work/build/pytorch/libtorch/lib/libc10.so)
frame #1: c10::Error::Error(c10::SourceLocation, std::string const&) + 0x2a (0x7fffe8abadfa in /home/gulyaev/Work/build/pytorch/libtorch/lib/libc10.so)
frame #2: <unknown function> + 0xcad2d8 (0x7fffb73132d8 in /home/gulyaev/Work/build/pytorch/libtorch/lib/libcaffe2_gpu.so)
frame #3: at::CUDAFloatType::_thnn_conv2d_forward(at::Tensor const&, at::Tensor const&, c10::ArrayRef<long>, at::Tensor const&, c10::ArrayRef<long>, c10::ArrayRef<long>) const + 0xdf (0x7fffb7352e7f in /home/gulyaev/Work/build/pytorch/libtorch/lib/libcaffe2_gpu.so)
frame #4: at::native::thnn_conv2d_forward(at::Tensor const&, at::Tensor const&, c10::ArrayRef<long>, at::Tensor const&, c10::ArrayRef<long>, c10::ArrayRef<long>) + 0x5a (0x7fffebab476a in /home/gulyaev/Work/build/pytorch/libtorch/lib/libcaffe2.so)
frame #5: at::TypeDefault::thnn_conv2d_forward(at::Tensor const&, at::Tensor const&, c10::ArrayRef<long>, at::Tensor const&, c10::ArrayRef<long>, c10::ArrayRef<long>) const + 0x88 (0x7fffebd2b2a8 in /home/gulyaev/Work/build/pytorch/libtorch/lib/libcaffe2.so)
frame #6: torch::autograd::VariableType::thnn_conv2d_forward(at::Tensor const&, at::Tensor const&, c10::ArrayRef<long>, at::Tensor const&, c10::ArrayRef<long>, c10::ArrayRef<long>) const + 0x598 (0x7fffea883ce8 in /home/gulyaev/Work/build/pytorch/libtorch/lib/libtorch.so.1)
frame #7: at::native::thnn_conv2d(at::Tensor const&, at::Tensor const&, c10::ArrayRef<long>, at::Tensor const&, c10::ArrayRef<long>, c10::ArrayRef<long>) + 0x66 (0x7fffebab45c6 in /home/gulyaev/Work/build/pytorch/libtorch/lib/libcaffe2.so)
frame #8: at::TypeDefault::thnn_conv2d(at::Tensor const&, at::Tensor const&, c10::ArrayRef<long>, at::Tensor const&, c10::ArrayRef<long>, c10::ArrayRef<long>) const + 0x88 (0x7fffebd2b0d8 in /home/gulyaev/Work/build/pytorch/libtorch/lib/libcaffe2.so)
frame #9: torch::autograd::VariableType::thnn_conv2d(at::Tensor const&, at::Tensor const&, c10::ArrayRef<long>, at::Tensor const&, c10::ArrayRef<long>, c10::ArrayRef<long>) const + 0x1ac (0x7fffea9cc13c in /home/gulyaev/Work/build/pytorch/libtorch/lib/libtorch.so.1)
frame #10: at::native::_convolution_nogroup(at::Tensor const&, at::Tensor const&, at::Tensor const&, c10::ArrayRef<long>, c10::ArrayRef<long>, c10::ArrayRef<long>, bool, c10::ArrayRef<long>) + 0x72f (0x7fffeba325af in /home/gulyaev/Work/build/pytorch/libtorch/lib/libcaffe2.so)
frame #11: at::TypeDefault::_convolution_nogroup(at::Tensor const&, at::Tensor const&, at::Tensor const&, c10::ArrayRef<long>, c10::ArrayRef<long>, c10::ArrayRef<long>, bool, c10::ArrayRef<long>) const + 0x98 (0x7fffebd0a248 in /home/gulyaev/Work/build/pytorch/libtorch/lib/libcaffe2.so)
frame #12: torch::autograd::VariableType::_convolution_nogroup(at::Tensor const&, at::Tensor const&, at::Tensor const&, c10::ArrayRef<long>, c10::ArrayRef<long>, c10::ArrayRef<long>, bool, c10::ArrayRef<long>) const + 0x1e0 (0x7fffeaa1e2a0 in /home/gulyaev/Work/build/pytorch/libtorch/lib/libtorch.so.1)
frame #13: at::native::_convolution(at::Tensor const&, at::Tensor const&, at::Tensor const&, c10::ArrayRef<long>, c10::ArrayRef<long>, c10::ArrayRef<long>, bool, c10::ArrayRef<long>, long, bool, bool, bool) + 0x1ee2 (0x7fffeba36c52 in /home/gulyaev/Work/build/pytorch/libtorch/lib/libcaffe2.so)
frame #14: at::TypeDefault::_convolution(at::Tensor const&, at::Tensor const&, at::Tensor const&, c10::ArrayRef<long>, c10::ArrayRef<long>, c10::ArrayRef<long>, bool, c10::ArrayRef<long>, long, bool, bool, bool) const + 0xce (0x7fffebd0a15e in /home/gulyaev/Work/build/pytorch/libtorch/lib/libcaffe2.so)
frame #15: torch::autograd::VariableType::_convolution(at::Tensor const&, at::Tensor const&, at::Tensor const&, c10::ArrayRef<long>, c10::ArrayRef<long>, c10::ArrayRef<long>, bool, c10::ArrayRef<long>, long, bool, bool, bool) const + 0x26a (0x7fffea9087aa in /home/gulyaev/Work/build/pytorch/libtorch/lib/libtorch.so.1)
frame #16: <unknown function> + 0x56826b (0x7fffeaa7b26b in /home/gulyaev/Work/build/pytorch/libtorch/lib/libtorch.so.1)
frame #17: <unknown function> + 0x686166 (0x7fffeab99166 in /home/gulyaev/Work/build/pytorch/libtorch/lib/libtorch.so.1)
frame #18: torch::jit::InterpreterState::run(std::vector<c10::IValue, std::allocator<c10::IValue> >&) + 0x22 (0x7fffeab94222 in /home/gulyaev/Work/build/pytorch/libtorch/lib/libtorch.so.1)
frame #19: <unknown function> + 0x664d6c (0x7fffeab77d6c in /home/gulyaev/Work/build/pytorch/libtorch/lib/libtorch.so.1)
frame #20: <unknown function> + 0x1aad8 (0x55555556ead8 in /home/gulyaev/Work/build/pytorch/build-pytorch_inference-Desktop-Debug/FeatureTesting)
frame #21: <unknown function> + 0x1ab3a (0x55555556eb3a in /home/gulyaev/Work/build/pytorch/build-pytorch_inference-Desktop-Debug/FeatureTesting)
frame #22: <unknown function> + 0x1ba15 (0x55555556fa15 in /home/gulyaev/Work/build/pytorch/build-pytorch_inference-Desktop-Debug/FeatureTesting)
frame #23: <unknown function> + 0x15c41 (0x555555569c41 in /home/gulyaev/Work/build/pytorch/build-pytorch_inference-Desktop-Debug/FeatureTesting)
frame #24: <unknown function> + 0x1509f (0x55555556909f in /home/gulyaev/Work/build/pytorch/build-pytorch_inference-Desktop-Debug/FeatureTesting)
frame #25: <unknown function> + 0x2a76a (0x55555557e76a in /home/gulyaev/Work/build/pytorch/build-pytorch_inference-Desktop-Debug/FeatureTesting)
frame #26: <unknown function> + 0x29e76 (0x55555557de76 in /home/gulyaev/Work/build/pytorch/build-pytorch_inference-Desktop-Debug/FeatureTesting)
frame #27: <unknown function> + 0xa81d (0x55555555e81d in /home/gulyaev/Work/build/pytorch/build-pytorch_inference-Desktop-Debug/FeatureTesting)
frame #28: __libc_start_main + 0xe7 (0x7fffe7597b97 in /lib/x86_64-linux-gnu/libc.so.6)
frame #29: <unknown function> + 0xa49a (0x55555555e49a in /home/gulyaev/Work/build/pytorch/build-pytorch_inference-Desktop-Debug/FeatureTesting)
:
operation failed in interpreter:
op_version_set = 0
def forward(self,
    x_1: Tensor) -> Tensor:
  data = torch.slice(x_1, 0, 0, 9223372036854775807, 1)
  input_1 = torch.to(data, dtype=6, layout=0, device=torch.device("cuda"), non_blocking=False, copy=False)
  input_2 = torch._convolution(input_1, self.extractor.init_block.conv.weight, None, [2, 2], [1, 1], [1, 1], False, [0, 0], 1, False, False, True)
            ~~~~~~~~~~~~~~~~~~ <--- HERE
  _0 = torch.add_(self.extractor.init_block.bn.num_batches_tracked, CONSTANTS.c0, alpha=1)
  input_3 = torch.batch_norm(input_2, self.extractor.init_block.bn.weight, self.extractor.init_block.bn.bias, self.extractor.init_block.bn.running_mean, self.extractor.init_block.bn.running_var, True, 0.10000000000000001, 1.0000000000000001e-05, True)
  input_4 = torch.threshold_(input_3, 0., 0.)
  input_5 = torch._convolution(input_4, self.extractor.stage1.unit1.dw_conv.conv.weight, None, [1, 1], [1, 1], [1, 1], False, [0, 0], 32, False, False, True)
  _1 = torch.add_(self.extractor.stage1.unit1.dw_conv.bn.num_batches_tracked, CONSTANTS.c0, alpha=1)
  input_6 = torch.batch_norm(input_5, self.extractor.stage1.unit1.dw_conv.bn.weight, self.extractor.stage1.unit1.dw_conv.bn.bias, self.extractor.stage1.unit1.dw_conv.bn.running_mean, self.extractor.stage1.unit1.dw_conv.bn.running_var, True, 0.10000000000000001, 1.0000000000000001e-05, True)
  input_7 = torch.threshold_(input_6, 0., 0.)
  input_8 = torch._convolution(input_7, self.extractor.stage1.unit1.pw_conv.conv.weight, None, [1, 1], [0, 0], [1, 1], False, [0, 0], 1, False, False, True)
  _2 = torch.add_(self.extractor.stage1.unit1.pw_conv.bn.num_batches_tracked, CONSTANTS.c0, alpha=1)

I am using libtorch for C++ from the official site. I have tried different versions, both with and without CUDA. The same model runs on the CPU in Python without any problems.

The code where the problem occurs:

dst_data = torch::from_blob(img_float.data, {1, 224, 224, 3}).to(torch::kCPU);
dst_data = dst_data.permute({0, 3, 1, 2});
std::vector<torch::jit::IValue> inputs({dst_data});
model->to(torch::kCPU);
torch::Tensor out_tensor = model->forward(inputs).toTensor();
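
For context, here is a minimal self-contained version of that snippet. The model-loading part is not shown above, so the file name "model.pt", the float image buffer standing in for img_float, and the torch::jit::load / forward calls (libtorch ~1.0-style API, matching the model-> usage in the snippet) are assumptions, not the exact original code:

#include <torch/script.h>

#include <memory>
#include <vector>

int main() {
  // Load the traced module; "model.pt" is a placeholder file name.
  std::shared_ptr<torch::jit::script::Module> model = torch::jit::load("model.pt");
  model->to(torch::kCPU);  // move all parameters and buffers to the CPU

  // Placeholder image data; in the original code, img_float is an existing
  // float image container exposing a raw data pointer.
  std::vector<float> img_float(1 * 224 * 224 * 3, 0.0f);

  // Wrap the HWC buffer in a tensor and permute to NCHW, as in the question.
  torch::Tensor dst_data =
      torch::from_blob(img_float.data(), {1, 224, 224, 3}).to(torch::kCPU);
  dst_data = dst_data.permute({0, 3, 1, 2});

  std::vector<torch::jit::IValue> inputs({dst_data});
  torch::Tensor out_tensor = model->forward(inputs).toTensor();
  return 0;
}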

This seems like a bug. Could you open a bug report at https://github.com/pytorch/pytorch/issues?