atomicAdd does not work


I am relatively new to PyTorch C++ and cuda implementation. I wrote the following code to compute the dot product between two vectors.

#include <torch/extension.h>
using namespace std;

void dot_cuda_kernel(
torch::PackedTensorAccessor<float, 1> &x_a,
torch::PackedTensorAccessor<float, 1> &y_a,
int n,
float *c)
int index = threadIdx.x;
int stride = blockDim.x;

// for(int i = index; i < n; i += stride)
for(int i = 0; i < n; i++)
   atomicAdd(c, x_a[i] * y_a[i]);


float dot_cuda(torch::Tensor &x, torch::Tensor &y)
auto x_a = x.packed_accessor<float, 1>();
auto y_a = y.packed_accessor<float, 1>();

float c = 0;

dot_cuda_kernel<<<1, 1>>>(x_a, y_a, x.size(0), &c);


return c;


But it does not work. The function dot_cuda always return 0. What could possibly go wrong? Thanks in advance!

c is in cpu memory. you can’t add to it in a cuda kernel.