I am realize sparse median filter with cuda extension, it is ok when kernel size is 5, but failed when kernel size 7, here is my code
#include <torch/extension.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>
template <typename scalar_t>
__device__ void quickSort(scalar_t * src, int begin, int end, int idn, int idh, int idw, int H, int W, int C) {
if (begin >= end) {
return ;
}
auto pivot = src[idn * C * H * W + begin * H * W + idh * W + idw];
int i = begin;
int j = end;
while (i < j) {
while (i < j && src[idn * C * H * W + j * H * W + idh * W + idw] >= pivot)
j--;
while (i < j && src[idn * C * H * W + i * H * W + idh * W + idw] <= pivot)
i++;
if (i < j) {
float tmp = src[idn * C * H * W + i * H * W + idh * W + idw];
src[idn * C * H * W + i * H * W + idh * W + idw] = src[idn * C * H * W + j * H * W + idh * W + idw];
src[idn * C * H * W + j * H * W + idh * W + idw] = tmp;
}
}
float tmp = src[idn * C * H * W + begin * H * W + idh * W + idw];
src[idn * C * H * W + begin * H * W + idh * W + idw] = src[idn * C * H * W + i * H * W + idh * W + idw];
src[idn * C * H * W + i * H * W + idh * W + idw] = tmp;
quickSort(src, begin, i-1, idn, idh, idw, H, W, C);
quickSort(src, i+1, end, idn, idh, idw, H, W, C);
}
template <typename scalar_t>
__global__ void median_cuda_ker(scalar_t *__restrict__ depth,
const scalar_t *__restrict__ mask,
scalar_t *__restrict__ output,
int N, int C, int H, int W, int ksize)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
// printf("blockIdx.x: %d, blockDim.x: %d, threadIdx.x:%d, gridDim.x: %d\n", blockIdx.x, blockDim.x, threadIdx.x, gridDim.x);
for (int ids = index; ids < N * 1 * H * W; ids += stride)
{
// ids = nCHW + cHW + hW + w
int idw = ids % W;
ids = (ids - idw) / W;
int idh = ids % H;
ids = (ids - idh) / H;
int idc = ids % 1;
ids = (ids - idc) / 1;
int idn = ids;
// printf("%d, %d, %d, %d\n", idn, idc, idh, idw);
if (mask[idn * C * H * W + C/2 * H * W + idh * W + idw] > 0){
return;
}
int count = 0;
for (int idx = 0; idx < C; idx++)
{
float m = mask[idn * C * H * W + idx * H * W + idh * W + idw];
if (idh > -1 && idh < H && idw > -1 && idw < W && m > 0)
{
count += 1;
}
}
if (count > 0){
quickSort(depth, 0, C - 1, idn, idh, idw, H, W, C);
int mid = (C - count) + count / 2; // 0 num + count / 2
// printf("%d, %d\n", mid, C, depth[idn * C * H * W + mid * H * W + idh * W + idw]);
output[idn * 1 * H * W + 0 * H * W + idh * W + idw] = depth[idn * C * H * W + mid * H * W + idh * W + idw];
}
}
}
std::vector<torch::Tensor> median_cuda(torch::Tensor depth, torch::Tensor mask, int ksize, torch::Tensor output)
{
const int N = depth.size(0);
const int C = depth.size(1);
const int H = depth.size(2);
const int W = depth.size(3);
// std::cout << "N=" << N << ", C=" << C << ", H=" << H << ", W=" << W << std::endl;
const int threads = 1024;
const int blocks = (N * 1 * H * W + threads - 1) / threads;
AT_DISPATCH_FLOATING_TYPES(depth.type(), "median_cuda", ([&] {
median_cuda_ker<scalar_t><<<blocks, threads>>>(
depth.data<scalar_t>(),
mask.data<scalar_t>(),
output.data<scalar_t>(),
N, C, H, W, ksize);
}));
return {output};
}
and python code is
def median_filter(depth, continuity_map, ksize=3):
B, _, H, W = depth.shape
radius = ksize // 2
depth_pad = F.pad(depth, (radius, radius, radius, radius), mode='replicate')
cmap_pad = F.pad(continuity_map, (radius, radius, radius, radius), mode='replicate')
depth_patch = F.unfold(depth_pad, ksize, stride=1, padding=0).view(B, -1, H, W).contiguous() # BC(N=HW)
cmap_patch = F.unfold(cmap_pad, ksize, stride=1, padding=0).view(B, -1, H, W).contiguous() # BC(N=HW)
depth_patch[cmap_patch == 0] = 0
filtered_depth = depth.clone()
t0 = time()
# print(depth_patch.shape, cmap_patch.shape)
filtered_depth = median(depth_patch, cmap_patch, ksize, filtered_depth)[0] # BCHW
# print("median: ", time() - t0)
return filtered_depth.unsqueeze(0)
what is the reason?