I’m new about cuda programming but I need to make some pytorch cuda programming now.
So I looked the pytorch cuda extension tutorial and had a try. First I’d like to try the add
example, so I tried:
// add_cuda.h
#ifndef _ADD_CUDA
#define _ADD_CUDA
#include <torch/extension.h>
void add(torch::Tensor a, torch::Tensor b, torch::Tensor c);
#endif
// add_wrapper.cpp
#include "add_cuda.h"
#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
void add(torch::Tensor a, torch::Tensor b, torch::Tensor c){
CHECK_INPUT(a);
CHECK_INPUT(b);
CHECK_INPUT(c);
add_cu(a, b, c);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("add", &add, "add(CUDA)");
}
// add_cuda.cu
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include "add_cuda.h"
template <typename scalar_t>
__global__ void add_kernel(
scalar_t* __restrict__ a,
scalar_t* __restrict__ b,
scalar_t* __restrict__ c,
size_t size
){
const int index = blockIdx.x * blockDim.x + threadIdx.x;
const int stride = blockIdx.x * gridDim.x;
for (int i = index; i < size; i += stride){
a[i] = b[i] + c[i];
}
}
void add_cu(torch::Tensor a, torch::Tensor b, torch::Tensor c){
const auto size = a.size(0);
const int threads = 8;
const dim3 blocks((size + threads - 1) / threads);
AT_DISPATCH_FLOATING_TYPES(a.type(), "add cuda", ([&] {
add_kernel<scalar_t><<<blocks, threads>>>(
a.data<scalar_t>(),
b.data<scalar_t>(),
c.data<scalar_t>(),
size
);
}));
}
And the test py file:
# test.py
import torch
import add
if __name__ == "__main__":
a = torch.zeros((100, ))
b = torch.ones((100, )) * 10
c = torch.ones((100, ))
a = a.cuda(1)
b = b.cuda(1)
c = c.cuda(1)
add.add(a, b, c)
print(a)
Theoretically the print output should be a torch tensor with 100 11
s, but actually the output is
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0.], device='cuda:1')
It seems that the add_kernel
in add_cuda.cu
didn’t implement at all, but I don’t know why it didn’t implement.
Any help would be appreciated. Thank you!