I’m working with a remote system where cuda works just fine when used normally, however when I try to implement a custom cuda module it doesn’t seem to work. The code i’m using is from this post
cuda kernel implement?
import torch
from torch.utils.cpp_extension import load_inline
print ("Cuda available: " + str(torch.cuda.is_available()))
cpp_src = """
void add_cu(torch::Tensor a, torch::Tensor b, torch::Tensor c);
#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
void add(torch::Tensor a, torch::Tensor b, torch::Tensor c){
CHECK_INPUT(a);
CHECK_INPUT(b);
CHECK_INPUT(c);
add_cu(a, b, c);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("add", &add, "add(CUDA)");
}
"""
cuda_src = """
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
void add_cu(torch::Tensor a, torch::Tensor b, torch::Tensor c);
template <typename scalar_t>
__global__ void add_kernel(
scalar_t* __restrict__ a,
scalar_t* __restrict__ b,
scalar_t* __restrict__ c,
size_t size
){
const int index = blockIdx.x * blockDim.x + threadIdx.x;
const int stride = blockDim.x * gridDim.x;
for (int i = index; i < size; i += stride){
a[i] = b[i] + c[i];
}
}
void add_cu(torch::Tensor a, torch::Tensor b, torch::Tensor c){
const auto size = a.size(0);
const int threads = 8;
const dim3 blocks((size + threads - 1) / threads);
AT_DISPATCH_FLOATING_TYPES(a.type(), "add cuda", ([&] {
add_kernel<scalar_t><<<blocks, threads>>>(
a.data<scalar_t>(),
b.data<scalar_t>(),
c.data<scalar_t>(),
size
);
}));
}
"""
add = load_inline(name='add', cpp_sources=[cpp_src],
cuda_sources=[cuda_src])
a = torch.zeros((100, ))
b = torch.ones((100, )) * 10
c = torch.ones((100, ))
a = a.cuda(0)
b = b.cuda(0)
c = c.cuda(0)
add.add(a, b, c)
print(a)
Output
Cuda available: True
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0.], device='cuda:0')
what might be the reason for this?
It does give me this warning, but im not sure if its related (the system is a linux machine with g++).
!! WARNING !!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
Your compiler (c++) is not compatible with the compiler Pytorch was
built with for this platform, which is g++ on linux. Please
use g++ to to compile your extension. Alternatively, you may
compile PyTorch from source using c++, and then you can also use
c++ to compile your extension.
See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help
with compiling PyTorch from source.
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
I also tried using setuptools instead of inline and got the same result.