I see that `acos` has a GPU implementation here.
But when I tried to compile the following code:
// Type your code here, or load an example.
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
// Minimal reproducer: for every index below `size`, stores ::acos(input[tid])
// into output[tid], with both buffers holding `half` values.
// NOTE(review): the kernel is named asin_kernel but the body calls acos --
// presumably a naming slip in the reproducer; confirm with the author.
__global__ void asin_kernel(half *input, half *output, int size) {
// Flat element index built from the block and thread position along x.
int tid = blockIdx.x * blockDim.x + threadIdx.x;
// Threads whose index is at or past `size` do nothing.
if (tid < size) {
// This is the call the compiler rejects: for a `half` argument it reports
// both acos(float) and acos(long double) as matching overloads.
output[tid] = ::acos(input[tid]);
}
}
I got the following error:
<source>(9): error: more than one instance of overloaded function "acos" matches the argument list:
function "std::acos(long double)" (declared at line 91 of /opt/compiler-explorer/gcc-10.2.0/include/c++/10.2.0/cmath)
function "std::acos(float)" (declared at line 87 of /opt/compiler-explorer/gcc-10.2.0/include/c++/10.2.0/cmath)
argument types are: (half)
output[tid] = ::acos(input[tid]);
^
1 error detected in the compilation of "<source>".
Compiler returned: 2
How does PyTorch make `acos` work with `half` input?