Custom CUDA Extension with Dynamic Parallelism

Hi,

I am trying to create a custom module using CUDA with Dynamic Parallelism. I followed the basic extension module example from the PyTorch documentation.

My newmodule.cpp is as follows

#include <torch/extension.h>

// Forward declaration of the CUDA entry point defined in newmodule_cuda.cu.
// The parameter list is elided in this post.
cudaError_t newmodule_cuda(...);

// Host-side entry point of the extension; it is bound to Python as "forward"
// in the PYBIND11_MODULE block below.
//
// Takes three tensors (`self`, `weight`, `bias`) and an integer `pad`, and
// calls newmodule_cuda() somewhere in the middle of its (elided) work.
//
// NOTE(review): no return statement is shown although the function returns
// torch::Tensor — presumably it is part of the elided operations; confirm.
torch::Tensor newmodule(
    const torch::Tensor& self,
    const torch::Tensor& weight,
    const torch::Tensor& bias,
    int64_t pad)
{
	// do some operations
        // NOTE(review): the cudaError_t returned by this call is discarded —
        // confirm whether it should be checked and reported to the caller.
        newmodule_cuda(...);
        // do some operations
}

// Python binding: registers newmodule() on the extension module under the
// Python-visible name "forward".
PYBIND11_MODULE(TORCH_EXTENSION_NAME, ext) {
  ext.def("forward", &newmodule, "newmodule CUDA");
}

My newmodule_cuda.cu is as follows.

#include <cuda.h>
#include <cuda_runtime.h>
#include <driver_functions.h>
#include <torch/extension.h>
#include <ATen/ATen.h>

// Kernel templated on the element type `scalar_t`. It is the kernel that
// `parent` launches from device code (see the child<<<...>>> calls there).
// Parameters and body are elided in this post.
template <typename scalar_t>
__global__ 
void child(.....)
{
	// do operation with tensor
}

// Kernel templated on the element type `scalar_t`; it is launched from the
// host by newmodule_cuda(). Parameters and most of the body are elided in
// this post.
template <typename scalar_t>
__global__
void parent(...)
{
	// do parent operation
	
	// launching child
		
	// Convolve
	// Both launches below are issued from inside a __global__ function, i.e.
	// from device code. nvcc rejects this unless separate compilation is
	// enabled (see the build error quoted later in this post).
	// NOTE(review): the launch arguments are elided, so it cannot be told
	// from here whether the two launches differ — confirm.
	child<<<numBlock, numThread>>>(...);
	child<<<numBlock, numThread>>>(...);
	// do parent operation	
}

/// Host-side launcher for the `parent` kernel.
///
/// When CUDA is available, dispatches on the floating-point element type of
/// `self`, launches `parent`, and waits for the device to finish. Otherwise
/// falls through to the (elided) CPU path.
///
/// @return The error reported by the kernel launch or by the following device
///         synchronization when the GPU path is taken; cudaSuccess on the CPU
///         path. (The parameter list is elided in this post.)
cudaError_t newmodule_cuda(...)
{
    torch::Device deviceCPU(torch::kCPU);
    // NOTE(review): deviceGPU starts out as a CPU device; presumably it is
    // switched to torch::kCUDA in the elided preparation step — confirm.
    torch::Device deviceGPU(torch::kCPU);

    if (torch::cuda::is_available())
    {
        std::cout << "CUDA is available! Run on GPU." << std::endl;
        // do preparation

        // scalar_type() replaces the deprecated Tensor::type() as the
        // dispatch key for AT_DISPATCH_FLOATING_TYPES.
        AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "newmodule_cuda", ([&] {
            parent<scalar_t><<<numBlock, numThread>>>(...);
        }));

        // A kernel launch does not return a status itself; an invalid launch
        // has to be picked up explicitly right after the <<<...>>> call.
        const cudaError_t launchStatus = cudaGetLastError();
        if (launchStatus != cudaSuccess)
        {
            return launchStatus;
        }

        // Failures that occur while the kernels execute are reported by the
        // synchronization call, so its result is returned instead of being
        // dropped.
        return cudaDeviceSynchronize();
    }
    else
    {
        // CPU code
    }
    return cudaSuccess;
}

My setup.py is as follows.

from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
 
# First build script: compiles newmodule.cpp and newmodule_cuda.cu into a
# CUDA extension module named 'convtbcglu_cuda', distributed as 'newmodule'.
# With these flags the build stops with the "requires separate compilation
# mode" error quoted right after this listing.
setup(
    name='newmodule',
    ext_modules=[
        CUDAExtension('convtbcglu_cuda', [
            'newmodule.cpp',
            'newmodule_cuda.cu',
        ],
		# Per-compiler flags: '-Wall' for the host C++ compiler ('cxx'),
		# '-arch=sm_70' for nvcc.
		extra_compile_args={'cxx': ['-Wall'], 'nvcc': ['-arch=sm_70']})                  
    ],
    # Use PyTorch's BuildExtension for the build_ext command.
    cmdclass={
        'build_ext': BuildExtension
    })

With this setup, I get the following error when running `python setup.py install`:

error: kernel launch from __device__ or __global__ functions requires separate compilation mode

Then I added '-rdc=true' and '-lcudadevrt' to the nvcc arguments in my setup.py as follows.

from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
 
# Second build script: identical to the first one except that nvcc is also
# given '-rdc=true' (relocatable device code) and '-lcudadevrt'.
setup(
    name='newmodule',
    ext_modules=[
        CUDAExtension(
            'convtbcglu_cuda',
            [
                'newmodule.cpp',
                # Same CUDA source file as in the first build script and as
                # named in the post (was listed here as 'convtbcglu_cuda.cu').
                'newmodule_cuda.cu',
            ],
            # NOTE(review): '-lcudadevrt' is a link-time library option but is
            # passed here with the nvcc compile flags, and no device-link step
            # is requested. That likely explains the undefined
            # __cudaRegisterLinkedBinary symbol reported at import time. Newer
            # PyTorch releases document `dlink=True` / `dlink_libraries` on
            # CUDAExtension for this — confirm against the installed version.
            extra_compile_args={
                'cxx': ['-Wall'],
                'nvcc': ['-arch=sm_70', '-rdc=true', '-lcudadevrt'],
            },
        )
    ],
    # Use PyTorch's BuildExtension for the build_ext command.
    cmdclass={
        'build_ext': BuildExtension
    })

It compiles successfully, but when I try to run newmodule, it returns an error:

undefined symbol: __cudaRegisterLinkedBinary_50

I have read this Issue but still cannot figure out the solution.

Any help would be appreciated.

Thank you.

Try changing
__global__
void child(…)
to

__device__
void child(…)
Let me know if that helps