I’m writing a cuda extension for a customized function. It requires an atomicAdd() function during the calculation. Meanwhile, I want to make it possible for half-precision support. for example, in my .cuda file I have something like:
#include <c10/util/Half.h>
#include "sample.h"
//------------------------------------------------------------------------
// Helpers.
template <class T> struct InternalType;
template <> struct InternalType<double> { typedef double scalar_t; };
template <> struct InternalType<float> { typedef float scalar_t; };
template <> struct InternalType<c10::Half> { typedef float scalar_t; };
//------------------------------------------------------------------------
// Generic CUDA implementation.
template <class T> static __global__ void my_kernel(const void* x, void* y)
{
typedef typename InternalType<T>::scalar_t scalar_t;
// some operation includes the atomicAdd operation:
for (...)
{
...
scalar_t final_value = (scalar_t)(*x);
atomicAdd(y, (T)final_value);
...
}
}
//------------------------------------------------------------------------
// Kernel select function that's callable from .cpp file.
template <class T> kernel_spec choose_kernel()
{
spec = {(void*)my_kernel<T>};
return spec;
}
//------------------------------------------------------------------------
// Template specializations.
template kernel_spec choose_kernel<double> ();
template kernel_spec choose_kernel<float> ();
template kernel_spec choose_kernel<c10::Half>();
And in my .cpp file, I have something like:
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include "sample.h"
...
// Choose CUDA kernel.
kernel_spec spec;
AT_DISPATCH_FLOATING_TYPES_AND_HALF(y.scalar_type(), "my_kernel_cuda", [&]
{
spec = choose_kernel<scalar_t>();
});
But I got error during the compiling:
RuntimeError: Error building extension 'sample_plugin': [1/3] /usr/local/cuda/bin/nvcc -DTORCH_EXTENSION_NAME=sample_plugin -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1013\" -isystem /opt/conda/lib/python3.8/site-packages/torch/include -isystem /opt/conda/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /opt/conda/lib/python3.8/site-packages/torch/include/TH -isystem /opt/conda/lib/python3.8/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /opt/conda/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_75,code=sm_75 --compiler-options '-fPIC' --use_fast_math -std=c++14 -c /workspace/torch_utils/ops/sample.cu -o sample.cuda.o
FAILED: sample.cuda.o
/usr/local/cuda/bin/nvcc -DTORCH_EXTENSION_NAME=sample_plugin -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1013\" -isystem /opt/conda/lib/python3.8/site-packages/torch/include -isystem /opt/conda/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /opt/conda/lib/python3.8/site-packages/torch/include/TH -isystem /opt/conda/lib/python3.8/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /opt/conda/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_75,code=sm_75 --compiler-options '-fPIC' --use_fast_math -std=c++14 -c /workspace/torch_utils/ops/sample.cu -o sample.cuda.o
/workspace/torch_utils/ops/sample.cu(72): error: no instance of overloaded function "atomicAdd" matches the argument list
argument types are: (c10::Half *, c10::Half)
detected during:
instantiation of "void kernel_backward<T>(kernel_params) [with T=c10::Half]"
(84): here
instantiation of "kernel_spec choose_kernel<T>(int) [with T=c10::Half]"
(93): here
atomicAdd should support half-precision since compute capability 7.* and I have 7.5 (GeForce RTX 2080) in my case.
Is there anything I did wrong? or any suggestions I can fix it? Thanks!