CUDA error: an illegal memory access was encountered in a CUDA extension

I am implementing a sparse median filter as a CUDA extension. It works when the kernel size is 5, but fails when the kernel size is 7. Here is my code:

#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>


template <typename scalar_t>
__device__ void quickSort(scalar_t * src, int begin, int end, int idn, int idh, int idw, int H, int W, int C) {
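    // Recursive in-place quicksort of the C values stored along the channel
    // dimension for pixel (idn, idh, idw); begin/end are channel indices.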
    if (begin >= end) {
        return ;
    }
    auto pivot = src[idn * C * H * W + begin * H * W + idh * W + idw];
    int i = begin;
    int j = end;
    while (i < j) {
        while (i < j && src[idn * C * H * W + j * H * W + idh * W + idw] >= pivot)
            j--;
        while (i < j && src[idn * C * H * W + i * H * W + idh * W + idw] <= pivot)
            i++;
        if (i < j) {
            scalar_t tmp = src[idn * C * H * W + i * H * W + idh * W + idw];
            src[idn * C * H * W + i * H * W + idh * W + idw] = src[idn * C * H * W + j * H * W + idh * W + idw];
            src[idn * C * H * W + j * H * W + idh * W + idw] = tmp;
        }
    }
    scalar_t tmp = src[idn * C * H * W + begin * H * W + idh * W + idw];
    src[idn * C * H * W + begin * H * W + idh * W + idw] = src[idn * C * H * W + i * H * W + idh * W + idw];
    src[idn * C * H * W + i * H * W + idh * W + idw] = tmp;

    quickSort(src, begin, i-1, idn, idh, idw, H, W, C);
    quickSort(src, i+1, end, idn, idh, idw, H, W, C);
}


template <typename scalar_t>
__global__ void median_cuda_ker(scalar_t *__restrict__ depth,
                              const scalar_t *__restrict__ mask,
                              scalar_t *__restrict__ output,
                              int N, int C, int H, int W, int ksize)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = blockDim.x * gridDim.x;
    // printf("blockIdx.x: %d, blockDim.x: %d, threadIdx.x:%d, gridDim.x: %d\n", blockIdx.x, blockDim.x, threadIdx.x, gridDim.x);
    for (int ids = index; ids < N * 1 * H * W; ids += stride)
    {
        // ids = nCHW + cHW + hW + w
        int idw = ids % W;
        ids = (ids - idw) / W;
        int idh = ids % H;
        ids = (ids - idh) / H;
        int idc = ids % 1;
        ids = (ids - idc) / 1;
        int idn = ids;

        // printf("%d, %d, %d, %d\n", idn, idc, idh, idw);
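        // Centre of the patch (channel C/2) is already valid: keep the existing
        // output value for this pixel.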
        if (mask[idn * C * H * W + C/2 * H * W + idh * W + idw] > 0){
            return;
        }

        int count = 0;
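        // Count how many of the C patch entries at this pixel are valid (mask > 0).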
        for (int idx = 0; idx < C; idx++)
        {
            float m = mask[idn * C * H * W + idx * H * W + idh * W + idw];
            if (idh > -1 && idh < H && idw > -1 && idw < W && m > 0)
            {
                count += 1;
            }
        }
        if (count > 0){
            quickSort(depth, 0, C - 1, idn, idh, idw, H, W, C);
            int mid = (C - count) + count / 2;  // skip the (C - count) zeroed-out entries, then index the median of the valid ones
            // printf("%d, %d\n", mid, C, depth[idn * C * H * W + mid * H * W + idh * W + idw]);
            output[idn * 1 * H * W + 0 * H * W + idh * W + idw] = depth[idn * C * H * W + mid * H * W + idh * W + idw];
        }
    }
}


std::vector<torch::Tensor> median_cuda(torch::Tensor depth, torch::Tensor mask, int ksize, torch::Tensor output)
{
    const int N = depth.size(0);
    const int C = depth.size(1);
    const int H = depth.size(2);
    const int W = depth.size(3);

    // std::cout << "N=" << N << ", C=" << C << ", H=" << H << ", W=" << W << std::endl;

    const int threads = 1024;
    const int blocks = (N * 1 * H * W + threads - 1) / threads;

    AT_DISPATCH_FLOATING_TYPES(depth.scalar_type(), "median_cuda", ([&] {
                                   median_cuda_ker<scalar_t><<<blocks, threads>>>(
                                       depth.data_ptr<scalar_t>(),
                                       mask.data_ptr<scalar_t>(),
                                       output.data_ptr<scalar_t>(),
                                       N, C, H, W, ksize);
                               }));

    return {output};
}

And the Python code is:

import torch.nn.functional as F
from time import time

def median_filter(depth, continuity_map, ksize=3):
    B, _, H, W = depth.shape
    radius = ksize // 2
    depth_pad = F.pad(depth, (radius, radius, radius, radius), mode='replicate')
    cmap_pad = F.pad(continuity_map, (radius, radius, radius, radius), mode='replicate')
    
    depth_patch = F.unfold(depth_pad, ksize, stride=1, padding=0).view(B, -1, H, W).contiguous()   # BC(N=HW)
    cmap_patch = F.unfold(cmap_pad, ksize, stride=1, padding=0).view(B, -1, H, W).contiguous()   # BC(N=HW)
    depth_patch[cmap_patch == 0] = 0
    
    filtered_depth = depth.clone()
    t0 = time()
    # print(depth_patch.shape, cmap_patch.shape)
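    # `median` is the bound entry point of the compiled CUDA extension (median_cuda above)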
    filtered_depth = median(depth_patch, cmap_patch, ksize, filtered_depth)[0]  # BCHW
    # print("median: ", time() - t0)

    return filtered_depth.unsqueeze(0)

What is the reason?

Run the code through compute-sanitizer, which should point you towards the illegal memory access. Alternatively, post a minimal and executable code snippet reproducing the issue, ideally using torch.utils.cpp_extension.load_inline.
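For example, assuming the script below is saved as repro.py (a placeholder name), the sanitizer can be invoked as

compute-sanitizer python repro.py

which reports the kernel, block/thread, and address of each invalid access. A minimal load_inline skeleton might look like this (module name and test shapes are placeholders; the CUDA source is the code from the question):

import torch
from torch.utils.cpp_extension import load_inline

cuda_source = r"""
// ... paste the full CUDA code from the question here ...
"""

cpp_source = r"""
#include <vector>
std::vector<torch::Tensor> median_cuda(torch::Tensor depth, torch::Tensor mask,
                                       int ksize, torch::Tensor output);
"""

ext = load_inline(
    name="median_ext",          # placeholder module name
    cpp_sources=cpp_source,
    cuda_sources=cuda_source,
    functions=["median_cuda"],  # generate a Python binding for median_cuda
    verbose=True,
)

ksize = 7                                  # the failing case
B, H, W = 1, 64, 64                        # arbitrary test shapes
depth = torch.rand(B, ksize * ksize, H, W, device="cuda")
mask = (torch.rand_like(depth) > 0.5).to(depth.dtype)
output = torch.zeros(B, 1, H, W, device="cuda")
ext.median_cuda(depth, mask, ksize, output)
torch.cuda.synchronize()  # kernel launches are asynchronous; force the error to surface here

Note the final torch.cuda.synchronize(): without it, an illegal access inside the kernel is often reported at some later, unrelated CUDA call, which makes the traceback misleading.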