Hi all,
C++ is far outside my comfort zone. So far I have been using torch.nn.grad.conv2d_weight, but it seems very slow. I want to know whether using cudnn_convolution_backward_weight is faster. My C++ extension looks like this:
#include <torch/extension.h>
#include <vector>
#include <ATen/NativeFunctions.h>
#include <ATen/Config.h>
/// Computes the weight gradient of a convolution by forwarding to
/// at::cudnn_convolution_backward_weight.
///
/// All arguments are passed through in the order the ATen function expects;
/// note that this order differs from torch.nn.grad.conv2d_weight on the
/// Python side.
///
/// @param weight_size   Forwarded as the size of the returned gradient tensor
///                      (presumably the shape of the convolution weight --
///                      confirm against the ATen declaration).
/// @param grad_output   Gradient with respect to the convolution output.
/// @param input         Input that was fed to the forward convolution.
/// @param padding       Padding used by the forward convolution.
/// @param stride        Stride used by the forward convolution.
/// @param dilation      Dilation used by the forward convolution.
/// @param groups        Number of convolution groups.
/// @param benchmark     Forwarded to the cuDNN op's `benchmark` flag.
/// @param deterministic Forwarded to the cuDNN op's `deterministic` flag.
/// @return The tensor returned by at::cudnn_convolution_backward_weight.
at::Tensor backward_weight(
    c10::ArrayRef<int64_t> weight_size,
    const at::Tensor& grad_output,
    const at::Tensor& input,
    c10::ArrayRef<int64_t> padding,
    c10::ArrayRef<int64_t> stride,
    c10::ArrayRef<int64_t> dilation,
    int64_t groups,
    bool benchmark,
    bool deterministic) {
  // The size/stride lists are spelled with int64_t (as `groups` already is)
  // instead of `long int`: ATen's integer-list type is ArrayRef<int64_t>, and
  // `long` only has that width on LP64 platforms.

  // NOTE(review): ATen's own convolution code makes its tensor arguments
  // contiguous before handing them to cuDNN; a direct caller of the cuDNN op
  // has to do the same. contiguous() returns the same tensor when the layout
  // is already contiguous, so this is free in the common case. Verify against
  // the installed PyTorch version.
  const at::Tensor grad_output_contig = grad_output.contiguous();
  const at::Tensor input_contig = input.contiguous();

  return at::cudnn_convolution_backward_weight(
      weight_size,
      grad_output_contig,
      input_contig,
      padding,
      stride,
      dilation,
      groups,
      benchmark,
      deterministic);
}
// Python bindings: exposes backward_weight to Python as `backward`.
// Every parameter is given a name so callers can pass keyword arguments
// instead of relying on the positional order (which differs from
// torch.nn.grad.conv2d_weight). Positional calls keep working unchanged.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def(
      "backward",
      &backward_weight,
      "Conv2d backward cudnn",
      pybind11::arg("weight_size"),
      pybind11::arg("grad_output"),
      pybind11::arg("input"),
      pybind11::arg("padding"),
      pybind11::arg("stride"),
      pybind11::arg("dilation"),
      pybind11::arg("groups"),
      pybind11::arg("benchmark"),
      pybind11::arg("deterministic"));
}
I JIT-compile it like this:
from torch.utils.cpp_extension import load

# JIT-compile the C++ source and import the resulting extension module.
# verbose=True asks the build helper to print its progress.
conv2d_cudnn = load(
    name="conv2d_backward",
    sources=["conv2d_backward.cpp"],
    verbose=True,
)
I can then call it from my Python code as conv2d_cudnn.backward.
All my parameters seem correct (they are identical to those of torch.nn.grad.conv2d_weight, although the order has to be a bit different).
I receive the following error:
RuntimeError: cuDNN error: CUDNN_STATUS_BAD_PARAM (getWorkspaceSize at /opt/conda/conda-bld/pytorch_1549628766161/work/aten/src/ATen/native/cudnn/Conv.cpp:653)
frame #0: c10::Error::Error(c10::SourceLocation, std::string const&) + 0x45 (0x7f1d5fc60cf5 in /home/hans/anaconda/lib/python3.6/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x1082df9 (0x7f1d63d8bdf9 in /home/hans/anaconda/lib/python3.6/site-packages/torch/lib/libcaffe2_gpu.so)
frame #2: at::native::raw_cudnn_convolution_backward_weight_out(at::Tensor const&, at::Tensor const&, at::Tensor const&, c10::ArrayRef<long>, c10::ArrayRef<long>, c10::ArrayRef<long>, long, bool, bool) + 0x19b (0x7f1d63d89eeb in /home/hans/anaconda/lib/python3.6/site-packages/torch/lib/libcaffe2_gpu.so)
frame #3: at::native::cudnn_convolution_backward_weight(char const*, c10::ArrayRef<long>, at::TensorArg const&, at::TensorArg const&, c10::ArrayRef<long>, c10::ArrayRef<long>, c10::ArrayRef<long>, long, bool, bool) + 0x3f7 (0x7f1d63d8a797 in /home/hans/anaconda/lib/python3.6/site-packages/torch/lib/libcaffe2_gpu.so)
frame #4: at::native::cudnn_convolution_backward_weight(c10::ArrayRef<long>, at::Tensor const&, at::Tensor const&, c10::ArrayRef<long>, c10::ArrayRef<long>, c10::ArrayRef<long>, long, bool, bool) + 0xf7 (0x7f1d63d8aac7 in /home/hans/anaconda/lib/python3.6/site-packages/torch/lib/libcaffe2_gpu.so)
frame #5: at::CUDAFloatType::cudnn_convolution_backward_weight(c10::ArrayRef<long>, at::Tensor const&, at::Tensor const&, c10::ArrayRef<long>, c10::ArrayRef<long>, c10::ArrayRef<long>, long, bool, bool) const + 0xab (0x7f1d63e62fdb in /home/hans/anaconda/lib/python3.6/site-packages/torch/lib/libcaffe2_gpu.so)
frame #6: torch::autograd::VariableType::cudnn_convolution_backward_weight(c10::ArrayRef<long>, at::Tensor const&, at::Tensor const&, c10::ArrayRef<long>, c10::ArrayRef<long>, c10::ArrayRef<long>, long, bool, bool) const + 0x336 (0x7f1d5dacc436 in /home/hans/anaconda/lib/python3.6/site-packages/torch/lib/libtorch.so.1)
frame #7: <unknown function> + 0x34cb3 (0x7f1d42018cb3 in /tmp/torch_extensions/conv2d_backward/conv2d_backward.so)
frame #8: backward_weight(c10::ArrayRef<long>, at::Tensor const&, at::Tensor const&, c10::ArrayRef<long>, c10::ArrayRef<long>, c10::ArrayRef<long>, long, bool, bool) + 0x85 (0x7f1d42019015 in /tmp/torch_extensions/conv2d_backward/conv2d_backward.so)
frame #9: <unknown function> + 0x57f10 (0x7f1d4203bf10 in /tmp/torch_extensions/conv2d_backward/conv2d_backward.so)
frame #10: <unknown function> + 0x54a4e (0x7f1d42038a4e in /tmp/torch_extensions/conv2d_backward/conv2d_backward.so)
frame #11: <unknown function> + 0x50259 (0x7f1d42034259 in /tmp/torch_extensions/conv2d_backward/conv2d_backward.so)
frame #12: <unknown function> + 0x5093f (0x7f1d4203493f in /tmp/torch_extensions/conv2d_backward/conv2d_backward.so)
frame #13: <unknown function> + 0x412aa (0x7f1d420252aa in /tmp/torch_extensions/conv2d_backward/conv2d_backward.so)
<omitting python frames>
frame #26: torch::autograd::PyFunctionPostHook::operator()(std::vector<torch::autograd::Variable, std::allocator<torch::autograd::Variable> > const&, std::vector<torch::autograd::Variable, std::allocator<torch::autograd::Variable> > const&) + 0xe4 (0x7f1d81086b84 in /home/hans/anaconda/lib/python3.6/site-packages/torch/lib/libtorch_python.so)
frame #27: torch::autograd::Engine::evaluate_function(torch::autograd::FunctionTask&) + 0x1711 (0x7f1d5d9b40c1 in /home/hans/anaconda/lib/python3.6/site-packages/torch/lib/libtorch.so.1)
frame #28: torch::autograd::Engine::thread_main(torch::autograd::GraphTask*) + 0xc0 (0x7f1d5d9b4e80 in /home/hans/anaconda/lib/python3.6/site-packages/torch/lib/libtorch.so.1)
frame #29: torch::autograd::Engine::thread_init(int) + 0xc7 (0x7f1d5d9b1a47 in /home/hans/anaconda/lib/python3.6/site-packages/torch/lib/libtorch.so.1)
frame #30: torch::autograd::python::PythonEngine::thread_init(int) + 0x2a (0x7f1d8107633a in /home/hans/anaconda/lib/python3.6/site-packages/torch/lib/libtorch_python.so)
frame #31: <unknown function> + 0xb8678 (0x7f1d81d08678 in /home/hans/anaconda/lib/python3.6/site-packages/torch/../../../libstdc++.so.6)
frame #32: <unknown function> + 0x76ba (0x7f1d91d7e6ba in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #33: clone + 0x6d (0x7f1d91ab441d in /lib/x86_64-linux-gnu/libc.so.6)
Does anybody know where to look to debug?
Thanks!