Custom cuda module undefined behaviour

I’m working with a remote system where cuda works just fine when used normally, however when I try to implement a custom cuda module it doesn’t seem to work. The code i’m using is from this post

cuda kernel implement?

import torch
from torch.utils.cpp_extension import load_inline

print ("Cuda available: " + str(torch.cuda.is_available()))

cpp_src = """
void add_cu(torch::Tensor a, torch::Tensor b, torch::Tensor c);

#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

void add(torch::Tensor a, torch::Tensor b, torch::Tensor c){
    CHECK_INPUT(a);
    CHECK_INPUT(b);
    CHECK_INPUT(c);
    add_cu(a, b, c);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("add", &add, "add(CUDA)");
}
"""

cuda_src = """
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>

void add_cu(torch::Tensor a, torch::Tensor b, torch::Tensor c);


template <typename scalar_t>
__global__ void add_kernel(
    scalar_t* __restrict__ a, 
    scalar_t* __restrict__ b, 
    scalar_t* __restrict__ c, 
    size_t size
){
    const int index = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = blockDim.x * gridDim.x;
    for (int i = index; i < size; i += stride){
        a[i] = b[i] + c[i];
    }
}


void add_cu(torch::Tensor a, torch::Tensor b, torch::Tensor c){
    const auto size = a.size(0);

    const int threads = 8;
    const dim3 blocks((size + threads - 1) / threads);

    AT_DISPATCH_FLOATING_TYPES(a.type(), "add cuda", ([&] {
        add_kernel<scalar_t><<<blocks, threads>>>(
            a.data<scalar_t>(), 
            b.data<scalar_t>(), 
            c.data<scalar_t>(), 
            size
        );
    }));
}
"""

add = load_inline(name='add', cpp_sources=[cpp_src],
                   cuda_sources=[cuda_src])

a = torch.zeros((100, ))
b = torch.ones((100, )) * 10
c = torch.ones((100, ))
a = a.cuda(0)
b = b.cuda(0)
c = c.cuda(0)
add.add(a, b, c)
print(a)

Output

Cuda available: True
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.], device='cuda:0')

what might be the reason for this?

It does give me this warning, but im not sure if its related (the system is a linux machine with g++).

                               !! WARNING !!

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
Your compiler (c++) is not compatible with the compiler Pytorch was
built with for this platform, which is g++ on linux. Please
use g++ to to compile your extension. Alternatively, you may
compile PyTorch from source using c++, and then you can also use
c++ to compile your extension.

See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help
with compiling PyTorch from source.
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

I also tried using setuptools instead of inline and got the same result.

Update; adding this before kernel launch will output:

cudaError_t err = cudaGetLastError();
printf("CUDA Error: %s\\n", cudaGetErrorString(err));

CUDA Error: initialization error. The cuda runtime api as described here Tutorial 01: Say Hello to CUDA - CUDA Tutorial works as expected.