I am working from the following code in https://github.com/xingyizhou/CenterNet/blob/master/src/lib/models/networks/DCNv2/src/dcn_v2_cuda.c :
....
const int block = 128;
const int grid = (batch + block - 1) / block;
createBatchGemmBuffer<<<grid, block, 0, THCState_getCurrentStream(state)>>>(
input_b, output_b,
columns_b, ones_b,
weight_b, bias_b,
input.data<scalar_t>(),
output.data<scalar_t>(),
columns.data<scalar_t>(),
ones.data<scalar_t>(),
weight.data<scalar_t>(),
bias.data<scalar_t>(),
channels * width * height,
channels_out * width_out * height_out,
channels * kernel_h * kernel_w * height_out * width_out,
height_out * width_out,
batch);
long m_ = channels_out;
long n_ = height_out * width_out;
long k_ = 1;
THCudaBlas_SgemmBatched(state,
't',
'n',
n_,
m_,
k_,
1.0f,
ones_b, k_,
bias_b, k_,
0.0f,
output_b, n_,
batch);
modulated_deformable_im2col_cuda(THCState_getCurrentStream(state),
input.data<scalar_t>(),
offset.data<scalar_t>(),
mask.data<scalar_t>(),
batch, channels, height, width,
height_out, width_out, kernel_h, kernel_w,
pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
deformable_group,
columns.data<scalar_t>());
long m = channels_out;
long n = height_out * width_out;
....
This code compiles with PyTorch 1.2, but fails to compile with PyTorch 1.5.
Following https://github.com/pytorch/pytorch/issues/36132 , I tried replacing `THCState_getCurrentStream` with `c10::cuda::getCurrentCUDAStream`, with `cuda::getCurrentCUDAStream`, and with `at::cuda::getCurrentCUDAStream`. All of these attempts failed.