Call cublas functions in custom kernel?

Is it allowed / possible to call cublas functions (e.g. dot product) inside a kernel?

As a minimal example (based on the c++ extensions tutorial):

#include <THC/THCGeneral.h>
#include <THC/THCBlas.h>

#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

#include <vector>

namespace {
  __global__ void my_op_forward_kernel(
      cublasHandle_t handle,
      const float* x,
      const float* y,
      float* output,
      size_t n, size_t d) {

    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i + d < n) {
      float result;
      cublasSdot(handle, d, x, 1, y, 1, &result);
      output[i] = result;
    }
  }

} // namespace

std::vector<at::Tensor> my_op_cuda_forward(THCState *state, at::Tensor x, at::Tensor y, int d) {

  cublasHandle_t handle = THCState_getCurrentBlasHandle(state);
  cublasSetStream(handle, THCState_getCurrentStream(state));

  auto n = x.size(0);
  auto output = at::zeros_like(x);

  const int threads = 1024;
  const dim3 blocks((n + threads - 1) / threads, 1);

  my_op_forward_kernel<<<blocks, threads, 0, THCState_getCurrentStream(state)>>>(
      handle,
      x.data<float>(),
      y.data<float>(),
      output.data<float>(),
      n, d);

  THCudaCheck(cudaGetLastError());
  return {output};
}

I can get this to compile, but I get an error when I try to call the function (“cuda runtime error (77): an illegal memory access was encountered at torch/csrc/cuda/Module.cpp”).

Is there anything special I should be doing?