Hi,

I am relatively new to PyTorch C++ and cuda implementation. I wrote the following code to compute the dot product between two vectors.

#include <torch/extension.h>

#include

using namespace std;__global__

void dot_cuda_kernel(

torch::PackedTensorAccessor<float, 1> &x_a,

torch::PackedTensorAccessor<float, 1> &y_a,

int n,

float *c)

{

int index = threadIdx.x;

int stride = blockDim.x;`// for(int i = index; i < n; i += stride) for(int i = 0; i < n; i++) atomicAdd(c, x_a[i] * y_a[i]);`

}

float dot_cuda(torch::Tensor &x, torch::Tensor &y)

{

auto x_a = x.packed_accessor<float, 1>();

auto y_a = y.packed_accessor<float, 1>();`float c = 0; dot_cuda_kernel<<<1, 1>>>(x_a, y_a, x.size(0), &c); cout<<c<<endl; return c;`

}

But it does not work. The function dot_cuda always return 0. What could possibly go wrong? Thanks in advance!