Hello
I am trying to build a matrix multiplication CUDA extension for PyTorch using cuBLAS.
Here is my code:
#include <torch/extension.h>

#include <cstdint>
void matmul_cuda_forward(
float* A,
float* B,
float* C,
int N);
void matmul_forward(
float* A,
float* B,
float* C,
int N) {
matmul_cuda_forward(A, B, C, N);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &matmul_forward, "matmul forward (CUDA)");
}
#include <torch/extension.h>
#include <cuda_runtime_api.h>
#include <cublas_v2.h>
/**
 * Computes C = A * B for N x N single-precision matrices with cuBLAS SGEMM.
 *
 * @param A  device pointer to the first operand (N*N floats)
 * @param B  device pointer to the second operand (N*N floats)
 * @param C  device pointer to the output (N*N floats)
 * @param N  matrix dimension; must be non-negative, 0 is a no-op
 *
 * cuBLAS interprets all three buffers as column-major with leading
 * dimension N. If the caller's data is row-major, the product written to C
 * corresponds to B * A in row-major terms.
 *
 * A cuBLAS handle is created and destroyed on every call. Every cuBLAS
 * status is checked, and a failure raises a c10::Error via TORCH_CHECK.
 *
 * NOTE(review): the handle is left on its default stream; confirm that this
 * matches the stream the caller's tensors are produced/consumed on.
 */
void matmul_cuda_forward(
    float* A,
    float* B,
    float* C,
    int N) {
  TORCH_CHECK(N >= 0, "matmul_cuda_forward: N must be non-negative, got ", N);
  if (N == 0) {
    // Nothing to compute; cuBLAS would also reject a leading dimension of 0.
    return;
  }
  TORCH_CHECK(A != nullptr && B != nullptr && C != nullptr,
              "matmul_cuda_forward: A, B and C must be non-null device pointers");

  const int lda = N;
  const int ldb = N;
  const int ldc = N;
  const float alpha = 1.0f;
  const float beta = 0.0f;

  // Create a handle for cuBLAS.
  cublasHandle_t handle = nullptr;
  const cublasStatus_t create_status = cublasCreate(&handle);
  TORCH_CHECK(create_status == CUBLAS_STATUS_SUCCESS,
              "cublasCreate failed with status ",
              static_cast<int>(create_status));

  // Do the actual multiplication.
  const cublasStatus_t gemm_status = cublasSgemm(
      handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
      &alpha, A, lda, B, ldb, &beta, C, ldc);

  // Destroy the handle before reporting any GEMM failure so it never leaks.
  const cublasStatus_t destroy_status = cublasDestroy(handle);

  TORCH_CHECK(gemm_status == CUBLAS_STATUS_SUCCESS,
              "cublasSgemm failed with status ",
              static_cast<int>(gemm_status));
  TORCH_CHECK(destroy_status == CUBLAS_STATUS_SUCCESS,
              "cublasDestroy failed with status ",
              static_cast<int>(destroy_status));
}
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
# Build script for the `matmul_cuda` extension: compiles the pybind11 binding
# (matmul_cuda.cpp) and the cuBLAS implementation (matmul_cublas_kernel.cu).
setup(
    name='matmul_cuda',
    ext_modules=[
        CUDAExtension(
            'matmul_cuda',
            ['matmul_cuda.cpp', 'matmul_cublas_kernel.cu'],
            # '-arch=sm_80' targets compute capability 8.0 GPUs only.
            extra_compile_args={'cxx': [], 'nvcc': ['-arch=sm_80']},
            # cuBLAS is a link-time dependency, so it is requested through
            # `libraries` instead of passing '-lcublas' to the nvcc compile step.
            libraries=['cublas'],
        )
    ],
    cmdclass={
        'build_ext': BuildExtension
    })
import math
from torch import nn
from torch.autograd import Function
import torch
import matmul_cuda
# Driver script: fills two flat N*N float32 buffers on the GPU, passes their
# raw device addresses to the extension, and prints the output buffer.
torch.manual_seed(42)
device = torch.device("cuda")
N = 4
num_elems = N * N

# Vectorized form of the element-wise fill: A cycles through 1..4 and
# B cycles through 5..8 along the flat index.
cycle = torch.arange(num_elems) % 4
A = (cycle + 1).to(dtype=torch.float32, device=device)
B = (cycle + 5).to(dtype=torch.float32, device=device)
C = torch.zeros(num_elems, dtype=torch.float32, device=device)

# Freshly created 1-D tensors are contiguous, so .contiguous() returns the
# same storage and the addresses stay valid while A, B and C are alive.
A_p, B_p, C_p = (t.contiguous().data_ptr() for t in (A, B, C))

matmul_cuda.forward(A_p, B_p, C_p, N)
print(C)
It builds the CUDA extension successfully, but when I run the Python code
I get this error:
RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Can anyone tell me what is wrong with this code?
Is a torch CUDA extension unable to receive a float* argument?
Thank you