Hi,
I am trying to create a custom module that uses CUDA Dynamic Parallelism. I have followed the basic module extension example from the PyTorch documentation.
My newmodule.cpp is as follows:
#include <torch/extension.h>
// Forward declaration of the CUDA-side entry point; its definition appears in the
// newmodule_cuda.cu listing. The parameter list is elided in this post.
cudaError_t newmodule_cuda(...);
// Host-side entry point that is bound to Python as "forward" by PYBIND11_MODULE.
// Takes three tensors (self, weight, bias) and an integer pad; their shapes and
// meaning are not shown in this listing.
// NOTE(review): the declared return type is torch::Tensor, but no return statement
// is visible here — presumably elided with the "do some operations" steps; confirm.
torch::Tensor newmodule(
const torch::Tensor& self,
const torch::Tensor& weight,
const torch::Tensor& bias,
int64_t pad)
{
// do some operations
// Hand off to the CUDA implementation; the cudaError_t it returns is not inspected here.
newmodule_cuda(...);
// do some operations
}
// Python binding: exposes newmodule under the name "forward" with the
// docstring "newmodule CUDA".
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &newmodule, "newmodule CUDA");
}
My newmodule_cuda.cu is as follows:
#include <cuda.h>
#include <cuda_runtime.h>
#include <driver_functions.h>
#include <torch/extension.h>
#include <ATen/ATen.h>
// Kernel templated on the element type scalar_t. It is launched from inside
// parent (device-side launch); its parameters and body are elided in this post.
template <typename scalar_t>
__global__
void child(.....)
{
// do operation with tensor
}
// Kernel templated on the element type scalar_t. It is launched from the host in
// newmodule_cuda and itself launches child twice from device code.
// These kernel launches from inside a __global__ function are what trigger the
// "requires separate compilation mode" build error quoted later in the post.
template <typename scalar_t>
__global__
void parent(...)
{
// do parent operation
// launching child
// Convolve
// NOTE(review): child is a template, but no template argument is written at these
// launch sites; whether it can be deduced depends on the elided arguments — confirm.
child<<<numBlock, numThread>>>(...);
child<<<numBlock, numThread>>>(...);
// do parent operation
}
// CUDA-side entry point called from newmodule. When CUDA is available it launches
// the parent kernel for the floating-point dtype of `self` and then waits for the
// device; otherwise it takes a CPU path whose code is elided.
// As shown, the function returns cudaSuccess on every path.
cudaError_t newmodule_cuda(...)
{
// NOTE(review): both device objects are constructed from torch::kCPU and neither is
// used in the visible code; deviceGPU was presumably meant to be a CUDA device — confirm.
torch::Device deviceCPU(torch::kCPU);
torch::Device deviceGPU(torch::kCPU);
if (torch::cuda::is_available())
{
std::cout << "CUDA is available! Run on GPU." << std::endl;
// do preparation
// Dispatch on the dtype of `self` so that scalar_t is defined inside the lambda.
// `self` presumably comes from the elided parameter list.
AT_DISPATCH_FLOATING_TYPES(self.type(), "newmodule_cuda", ([&] {
parent<scalar_t><<<numBlock,numThread>>>(...);
}));
// The result of cudaDeviceSynchronize() is discarded, so a launch or execution
// failure would not be reflected in this function's return value.
cudaDeviceSynchronize();
}
else
{
// CPU code
}
return cudaSuccess;
}
My setup.py is as follows:
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
# First version of the build script: builds one CUDA extension from the C++ binding
# file and the CUDA source, using PyTorch's BuildExtension as the build_ext command.
setup(
name='newmodule',
ext_modules=[
# The extension module is named 'convtbcglu_cuda', while the distribution is 'newmodule'.
CUDAExtension('convtbcglu_cuda', [
'newmodule.cpp',
'newmodule_cuda.cu',
],
# Only the target architecture is passed to nvcc here; the post reports that this
# configuration fails with the "requires separate compilation mode" error.
extra_compile_args={'cxx': ['-Wall'], 'nvcc': ['-arch=sm_70']})
],
cmdclass={
'build_ext': BuildExtension
})
With this setup, I get the following error when executing python setup.py install:
error: kernel launch from __device__ or __global__ functions requires separate compilation mode
Then I added '-rdc=true' and '-lcudadevrt' to the nvcc arguments in my setup.py, as follows:
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
# Second version of the build script: same as the first, with '-rdc=true' and
# '-lcudadevrt' added to the nvcc compile arguments.
setup(
name='newmodule',
ext_modules=[
CUDAExtension('convtbcglu_cuda', [
'newmodule.cpp',
# NOTE(review): the first listing used 'newmodule_cuda.cu' here; confirm which
# file name is the real one.
'convtbcglu_cuda.cu',
],
# NOTE(review): '-lcudadevrt' is a link-time library flag, but it is passed here as an
# nvcc *compile* argument, and nothing in this script requests a device-link step for
# the relocatable device code produced by '-rdc=true'. That is a plausible cause of the
# undefined __cudaRegisterLinkedBinary_* symbol reported below; check the CUDAExtension
# documentation for its device-link options (dlink / dlink_libraries) — to be verified.
extra_compile_args={'cxx': ['-Wall'], 'nvcc': ['-arch=sm_70', '-rdc=true', '-lcudadevrt']})
],
cmdclass={
'build_ext': BuildExtension
})
It compiles successfully, but when I try to run newmodule, it fails with the following error:
undefined symbol: __cudaRegisterLinkedBinary_50
I have read this issue, but I still cannot figure out the solution.
Any help would be appreciated.
Thank you.