Is it allowed / possible to call cublas functions (e.g. dot product) inside a kernel?
As a minimal example (based on the c++ extensions tutorial):
#include <THC/THCGeneral.h>
#include <THC/THCBlas.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <vector>
namespace {
__global__ void my_op_forward_kernel(
cublasHandle_t handle,
const float* x,
const float* y,
float* output,
size_t n, size_t d) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i + d < n) {
float result;
cublasSdot(handle, d, x, 1, y, 1, &result);
output[i] = result;
}
}
} // namespace
std::vector<at::Tensor> my_op_cuda_forward(THCState *state, at::Tensor x, at::Tensor y, int d) {
cublasHandle_t handle = THCState_getCurrentBlasHandle(state);
cublasSetStream(handle, THCState_getCurrentStream(state));
auto n = x.size(0);
auto output = at::zeros_like(x);
const int threads = 1024;
const dim3 blocks((n + threads - 1) / threads, 1);
my_op_forward_kernel<<<blocks, threads, 0, THCState_getCurrentStream(state)>>>(
handle,
x.data<float>(),
y.data<float>(),
output.data<float>(),
n, d);
THCudaCheck(cudaGetLastError());
return {output};
}
I can get this to compile, but I get an error when I try to call the function (“cuda runtime error (77): an illegal memory access was encountered at torch/csrc/cuda/Module.cpp”).
Is there anything special I should be doing?