Hi,
I am relatively new to PyTorch C++ and cuda implementation. I wrote the following code to compute the dot product between two vectors.
#include <torch/extension.h>
#include
using namespace std;__global__
void dot_cuda_kernel(
torch::PackedTensorAccessor<float, 1> &x_a,
torch::PackedTensorAccessor<float, 1> &y_a,
int n,
float *c)
{
int index = threadIdx.x;
int stride = blockDim.x;// for(int i = index; i < n; i += stride) for(int i = 0; i < n; i++) atomicAdd(c, x_a[i] * y_a[i]);
}
float dot_cuda(torch::Tensor &x, torch::Tensor &y)
{
auto x_a = x.packed_accessor<float, 1>();
auto y_a = y.packed_accessor<float, 1>();float c = 0; dot_cuda_kernel<<<1, 1>>>(x_a, y_a, x.size(0), &c); cout<<c<<endl; return c;
}
But it does not work. The function dot_cuda always return 0. What could possibly go wrong? Thanks in advance!