I wrote a simple CUDA extension to multiply a tensor in-place on GPU, it works fine on single process but in multiprocessing mode it gets the illegal memory access error. I’ve tried compute-sanitizer
as well but didn’t get any more meaningful info. Here’s a minimal reproducible code package, simply run python setup.py install && python test_inplacemul.py
.
Here are the main snippets for a quick preview:
CUDA implementation:
__global__ void raw_mul_cuda_forward_kernel(float* x) {
const int r = blockIdx.x;
const int c = blockIdx.y;
const int d = threadIdx.x;
const int D = blockDim.x;
x[r * D * D + c * D + d] *= 2.f;
}
void mul_cuda_forward(torch::Tensor x) {
const auto H = x.size(0);
const auto W = x.size(1);
const auto D = x.size(2);
const int threads = D;
const dim3 blocks(H, W);
raw_mul_cuda_forward_kernel<<<blocks, threads>>>(x.data<float>());
cudaDeviceSynchronize();
}
In Python:
import torch
import inplacemul_cuda
from tqdm import tqdm
def error_callback(error):
print(error, flush=True)
def process(rank):
device = torch.device(f"cuda:{rank}")
x = torch.rand(32, 32, 32).to(device)
gold = x * 2
inplacemul_cuda.forward(x)
assert torch.abs(gold - x).sum() == 0
if __name__ == '__main__':
assert torch.cuda.is_available()
# sequential test
for _ in tqdm(range(10)):
process(0)
print('Single-process test passed')
# multiproc test
import torch.multiprocessing as mp
ctx = mp.get_context("spawn")
N = 1000
pool = ctx.Pool(processes=2)
pbar = tqdm(total=N)
rets = []
num_gpus = torch.cuda.device_count()
gpu = 0
for _ in range(N):
ret = pool.apply_async(process, args=(gpu,),
callback=lambda _: pbar.update(1),
error_callback=error_callback)
rets.append(ret)
gpu = (gpu + 1) % num_gpus
pool.close()
pool.join()
for ret in rets:
ret.get()