Custom CUDA Extension with Dynamic Parallelism

(Bagus Hanindhito) #1

Hi,

I am trying to create a custom module using CUDA with Dynamic Parallelism. I have followed the basic extension-module example from the PyTorch documentation.

My newmodule.cpp is as follows

#include <torch/extension.h>

cudaError_t newmodule_cuda(...);

// Host-side function registered with Python under the name "forward" by the
// PYBIND11_MODULE block in this listing.
//
// Takes three tensors (self, weight, bias) and an integer pad. The surrounding
// operations and the arguments forwarded to newmodule_cuda are elided here.
//
// NOTE(review): the cudaError_t returned by newmodule_cuda is discarded, and no
// return statement is shown although the function is declared to return
// torch::Tensor — presumably both live in the elided parts; confirm.
torch::Tensor newmodule(
    const torch::Tensor& self,
    const torch::Tensor& weight,
    const torch::Tensor& bias,
    int64_t pad)
{
	// do some operations
        // Hands off to the launcher declared above (defined in the .cu listing).
        newmodule_cuda(...);
        // do some operations
}

// Python bindings for the extension: registers newmodule() as "forward"
// with the docstring "newmodule CUDA".
PYBIND11_MODULE(TORCH_EXTENSION_NAME, ext) {
  ext.def(
      "forward",
      &newmodule,
      "newmodule CUDA");
}

My newmodule_cuda.cu is as follows.

#include <cuda.h>
#include <cuda_runtime.h>
#include <driver_functions.h>
#include <torch/extension.h>
#include <ATen/ATen.h>

// Kernel template parameterised on the element type scalar_t.
// Its parameter list and body are elided in this listing; it is launched from
// inside the parent kernel below.
template <typename scalar_t>
__global__ 
void child(.....)
{
	// do operation with tensor
}

// Kernel template parameterised on the element type scalar_t that itself
// launches the child kernel twice from device code. These device-side launches
// are what the compiler error quoted later in the post refers to ("kernel
// launch from __device__ or __global__ functions requires separate compilation
// mode"). Parameters and the surrounding parent work are elided.
//
// NOTE(review): child is launched without an explicit <scalar_t> argument;
// presumably it is deduced from the elided arguments — confirm.
template <typename scalar_t>
__global__
void parent(...)
{
	// do parent operation
	
	// launching child
		
	// Convolve
	child<<<numBlock, numThread>>>(...);
	child<<<numBlock, numThread>>>(...);
	// do parent operation	
}

// Host-side launcher for the parent kernel. Parameters are elided in this listing.
//
// When CUDA is available, dispatches parent<scalar_t> on the floating-point
// element type of `self` and blocks until the device has finished; otherwise
// falls through to the (elided) CPU path.
//
// Returns cudaSuccess on the CPU path. On the GPU path it returns the first
// error reported by the launch or by the synchronisation, so failures are no
// longer hidden behind an unconditional cudaSuccess.
cudaError_t newmodule_cuda(...)
{
    torch::Device deviceCPU(torch::kCPU);
    // Previously constructed with torch::kCPU, which made "deviceGPU" refer to the CPU.
    torch::Device deviceGPU(torch::kCUDA);

    cudaError_t status = cudaSuccess;
    if (torch::cuda::is_available())
    {
        std::cout << "CUDA is available! Run on GPU." << std::endl;
        // do preparation
        // scalar_type() replaces the deprecated Tensor::type() as the dispatch key.
        AT_DISPATCH_FLOATING_TYPES(self.scalar_type(), "newmodule_cuda", ([&] {
            parent<scalar_t><<<numBlock, numThread>>>(...);
        }));
        // Launch-configuration errors are only visible through cudaGetLastError().
        status = cudaGetLastError();
        if (status == cudaSuccess)
        {
            // Errors raised while the kernels execute surface at synchronisation.
            status = cudaDeviceSynchronize();
        }
    }
    else
    {
        // CPU code
    }
    return status;
}

My setup.py is as follows.

from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
 
# Builds one CUDAExtension from newmodule.cpp and newmodule_cuda.cu.
# The C++ compiler gets -Wall and nvcc gets -arch=sm_70; no relocatable-device-code
# flag is passed, and this is the configuration that produces the "requires
# separate compilation mode" error quoted below.
# NOTE(review): the extension is named 'convtbcglu_cuda' while the package is
# 'newmodule' — presumably a leftover from the original project; confirm which
# name the Python side imports.
setup(
    name='newmodule',
    ext_modules=[
        CUDAExtension('convtbcglu_cuda', [
            'newmodule.cpp',
            'newmodule_cuda.cu',
        ],
		extra_compile_args={'cxx': ['-Wall'], 'nvcc': ['-arch=sm_70']})                  
    ],
    cmdclass={
        'build_ext': BuildExtension
    })

With this setup, I get an error when executing python setup.py install

error: kernel launch from __device__ or __global__ functions requires separate compilation mode

Then I added '-rdc=true' and '-lcudadevrt' to the nvcc flags in my setup.py, as follows.

from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
 
# Same build as the previous listing, with '-rdc=true' and '-lcudadevrt' appended
# to the nvcc entry of extra_compile_args.
# NOTE(review): the CUDA source is listed here as 'convtbcglu_cuda.cu', whereas the
# earlier listing used 'newmodule_cuda.cu' — confirm which file name is intended.
# NOTE(review): both new flags are passed only as nvcc *compile* arguments;
# '-lcudadevrt' is a link-time library flag, and no device-link step is configured
# here. Presumably that is related to the undefined
# __cudaRegisterLinkedBinary symbol reported below — confirm against the
# CUDAExtension documentation on relocatable device code.
setup(
    name='newmodule',
    ext_modules=[
        CUDAExtension('convtbcglu_cuda', [
            'newmodule.cpp',
            'convtbcglu_cuda.cu',
        ],
		extra_compile_args={'cxx': ['-Wall'], 'nvcc': ['-arch=sm_70', '-rdc=true', '-lcudadevrt']})                   
    ],
    cmdclass={
        'build_ext': BuildExtension
    })

It compiles successfully, but when I try to run newmodule, it returns an error.

undefined symbol: __cudaRegisterLinkedBinary_50

I have read this issue, but I still cannot figure out the solution.

Any help would be appreciated.

Thank you.