I’m trying to share tensors from pytorch with a cuda kernel that I have compiled separately, and I’m seeing illegal memory access errors when passing the data pointer for the torch tensor into the kernel.
As far as I can tell, from the perspective of the kernel, the pointer I get from tensor.data_ptr()
isn’t actually a real CUDA pointer.
Minimal repro w/ a bunch of debugging prints.
When run, this prints the “Writing to i=” line for each thread, but does not print the line afterwards "A[i] = " for any thread, which leads me to believe that is the line where all of the threads are crashing (first access to the data pointer).
Makefile
CUDA_PATH ?= /usr/local/cuda
.PHONY: run clean
run: zero.so
CUDA_LAUNCH_BLOCKING=1 python zero.py
zero.so: zero.o
nvcc -shared $^ -o $@ -lcuda
zero.o: zero.cu
nvcc -I $(CUDA_PATH)/include -I$(CUDA_PATH)/samples/common/inc --compiler-options '-fPIC' $^ -c $@
clean:
rm -f *.o *.so
zero.cu:
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
namespace
{
__global__ void _zero(float *A, int n)
{
printf("Hello from block %d, thread %d\n", blockIdx.x, threadIdx.x);
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < n)
{
printf("Writing to i=%d.\n", i);
printf("A[i] = %f.\n", A[i]);
A[i] = 0;
}
else
{
printf("Skipping index %d\n", i);
}
}
}
extern "C" void zero(float *A, int n, int threads)
{
const char *res_str;
CUdeviceptr pbase;
size_t psize;
CUresult res = cuMemGetAddressRange(&pbase, &psize, (CUdeviceptr)A);
cuGetErrorName(res, &res_str);
if (res != CUDA_SUCCESS)
printf("Error getting A mem address range: (%d) %s\n", res, res_str);
else
printf("A mem address range: 0x%p - 0x%p, size: %zu\n", (void *)pbase, (void *)pbase + psize, psize);
printf("Running the kernel\n");
const int blocks = (n + threads - 1) / threads;
_zero<<<blocks, threads>>>(A, n);
printf("Finished running the kernel\n");
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
printf("Failed to launch zero kernel (error code %s)!\n", cudaGetErrorString(err));
else
printf("Successfully launched zero kernel!\n");
}
zero.py
#!/usr/bin/env python
import torch
from ctypes import CDLL
zero = CDLL("./zero.so")
N = 10
A = torch.ones(N, dtype=float).cuda()
threads = 32
Ap = A.data_ptr()
print('Ap', hex(Ap), 'A', A, 'A.shape', A.shape)
zero.zero(Ap, N, threads)
print('A', A)
Edit: this is CUDA C++ code interfacing with PyTorch Python code, so I’m not totally sure if this belongs in the C++ category or not. I’m happy to re-file elsewhere if this isn’t the proper place.