I am new to CUDA programming and I am trying to create a custom CUDA kernel for an operation I am working on but I can’t figure how to optimize my code. Below is a simple example of my own ceiling function and it seems to be much slower than the internal function implemented in Pytorch. I have noticed that the latency comes when I write the output_value to the output tensor inside the kernel. Running Pytorch ceiling operation for 100,000 iterations on a tensor with shape [100, 50, 100] takes 7.22s where mine takes 9.75s.
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>#include <THC/THC.h>
#include “THC/THCDeviceUtils.cuh”#include <cuda.h>
#include <cuda_runtime.h>#include <assert.h>
#include <cmath>
/**
input: [T, B, R]
output: [T, B, R]
**/
template <typename scalar_t>
__global__ void MethodKernel(
const at::PackedTensorAccessor32<scalar_t, 3, torch::RestrictPtrTraits> input,
scalar_t* output){const int length = input.size(0); const int batchSize = input.size(1); const int r_dim = input.size(2); const int index = blockDim.x * blockIdx.x + threadIdx.x; const int rIdx = index % r_dim; const int batchIdx = (index / r_dim) % batchSize; const int tokenIdx = ((index / r_dim) / batchSize); if (batchIdx < batchSize and tokenIdx < length and rIdx < r_dim) { scalar_t output_value = std::ceil(input[tokenIdx][batchIdx][rIdx]); output[(tokenIdx * batchSize + batchIdx) * r_dim + rIdx] = output_value; }
}
at::Tensor method_encoder_forward(at::Tensor input) {
const int length = input.size(0); const int batchSize = input.size(1); const int r_dim = input.size(2); const dim3 blockSize(1024); const dim3 gridSize( (length*batchSize*r_dim + blockSize.x - 1) / blockSize.x); auto output = at::zeros_like(input); AT_DISPATCH_FLOATING_TYPES_AND_HALF(output.type(), "method_encoder_forward", ([&] { auto inputAcsr = input.packed_accessor32<scalar_t, 3, torch::RestrictPtrTraits>(); MethodKernel <<<gridSize, blockSize, 0, at::cuda::getCurrentCUDAStream()>>> ( inputAcsr, output.data<scalar_t>()); })); AT_CUDA_CHECK(cudaGetLastError()); return output;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def(“method_encoder_forward”, &method_encoder_forward, “Method Encoder Forward”);
}