Optimize CUDA kernel

I am new to CUDA programming and I am trying to write a custom CUDA kernel for an operation I am working on, but I can't figure out how to optimize my code. Below is a simple example: my own ceiling function, which turns out to be much slower than the built-in PyTorch implementation. I have noticed that the latency comes from writing output_value to the output tensor inside the kernel. Running PyTorch's ceil for 100,000 iterations on a tensor of shape [100, 50, 100] takes 7.22 s, whereas mine takes 9.75 s.
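For reference, this is roughly the timing loop I use for both versions (synchronizing before reading the clock; the exact script may differ slightly):

import time
import torch

x = torch.rand(100, 50, 100, device="cuda")
torch.ceil(x)                      # warm-up so one-time launch overhead is not timed
torch.cuda.synchronize()

start = time.time()
for _ in range(100000):
    y = torch.ceil(x)              # or method_encoder_forward(x) for my kernel
torch.cuda.synchronize()
print(f"{time.time() - start:.2f}s")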

#include <torch/extension.h>

#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>

#include <cuda.h>
#include <cuda_runtime.h>

#include <assert.h>

#include <cmath>

/**
 * input:  [T, B, R]   (T = length, B = batchSize, R = r_dim)
 * output: [T, B, R]
 */
template <typename scalar_t>
__global__ void MethodKernel(
    const at::PackedTensorAccessor32<scalar_t, 3, torch::RestrictPtrTraits> input,
    scalar_t* output) {

  const int length = input.size(0);
  const int batchSize = input.size(1);
  const int r_dim = input.size(2);

  // Decompose the flat thread index into (token, batch, r) coordinates.
  const int index = blockDim.x * blockIdx.x + threadIdx.x;
  const int rIdx = index % r_dim;
  const int batchIdx = (index / r_dim) % batchSize;
  const int tokenIdx = (index / r_dim) / batchSize;

  // Guard against threads launched past the end of the tensor.
  if (tokenIdx < length && batchIdx < batchSize && rIdx < r_dim) {
    scalar_t output_value = std::ceil(input[tokenIdx][batchIdx][rIdx]);

    // Writing the result back is where the slowdown appears.
    output[(tokenIdx * batchSize + batchIdx) * r_dim + rIdx] = output_value;
  }
}

at::Tensor method_encoder_forward(at::Tensor input) {

  const int length = input.size(0);
  const int batchSize = input.size(1);
  const int r_dim = input.size(2);

  // One thread per element, 1024 threads per block.
  const dim3 blockSize(1024);
  const dim3 gridSize(
      (length * batchSize * r_dim + blockSize.x - 1) / blockSize.x);

  auto output = at::zeros_like(input);

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(output.scalar_type(), "method_encoder_forward", ([&] {

    auto inputAcsr = input.packed_accessor32<scalar_t, 3, torch::RestrictPtrTraits>();

    MethodKernel<<<gridSize, blockSize, 0, at::cuda::getCurrentCUDAStream()>>>(
        inputAcsr, output.data_ptr<scalar_t>());
  }));

  AT_CUDA_CHECK(cudaGetLastError());

  return output;
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("method_encoder_forward", &method_encoder_forward, "Method Encoder Forward");
}