Running CUDA through a torch inline extension: the .so file is not created when loading the module

I am new to CUDA programming and am trying to run a hello-world program from a Python notebook, like this:

%load_ext wurlitzer

import torch, os
from torch.utils.cpp_extension import load_inline


# Debugging aid: CUDA_LAUNCH_BLOCKING=1 asks the CUDA runtime to run kernel launches
# synchronously, so a failing kernel is reported at the call that launched it.
# NOTE(review): this should be set before CUDA is first initialised — confirm nothing
# earlier in the notebook has already touched the GPU.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

def load_cuda(cuda_src, cpp_src, funcs, opt=False, verbose=False):
    """Build and load a single CUDA/C++ source pair as the "inline_ext" extension.

    Args:
        cuda_src: CUDA source text; passed as the only entry of ``cuda_sources``.
        cpp_src: C++ source text; passed as the only entry of ``cpp_sources``.
        funcs: Forwarded unchanged as ``functions``.
        opt: When truthy, ``-O2`` is supplied through ``extra_cuda_cflags``;
            otherwise no extra CUDA flags are passed.
        verbose: Forwarded unchanged to ``load_inline``.

    Returns:
        Whatever ``load_inline`` returns for the built extension.
    """
    if opt:
        cuda_flags = ["-O2"]
    else:
        cuda_flags = []

    return load_inline(
        name="inline_ext",
        cpp_sources=[cpp_src],
        cuda_sources=[cuda_src],
        functions=funcs,
        extra_cuda_cflags=cuda_flags,
        verbose=verbose,
    )



# Preamble prepended to each inline CUDA source: the headers the macros below rely on,
# tensor-validation macros, and a ceiling-division helper for sizing the launch grid.
# NOTE(review): the header names were missing from the pasted snippet (text inside
# <...> was stripped); torch/extension.h provides TORCH_CHECK and
# c10/cuda/CUDAException.h provides C10_CUDA_KERNEL_LAUNCH_CHECK — confirm stdio.h.
cuda_begin = r"""
#include <torch/extension.h>
#include <stdio.h>
#include <c10/cuda/CUDAException.h>

#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

inline unsigned int cdiv(unsigned int a, unsigned int b) { return (a + b - 1) / b;}
"""

# Kernel plus host wrapper for the RGB -> grayscale example.
# NOTE(review): the pasted snippet lost everything between '<' and '>' (the bounds check,
# the kernel body, the host-function prologue, the <<<...>>> launch configuration and the
# data_ptr template arguments). The text below is reconstructed from the surviving
# fragments; the luminance weights, the planar channel layout (input indexed as
# channel-major, size(1)=h, size(2)=w) and the block size of 256 are assumptions — confirm
# against the original notebook.
cuda_src = cuda_begin + r"""
__global__ void rgb_to_grayscale_kernel(unsigned char* x, unsigned char* out, int n) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    // The grid is rounded up to whole blocks, so threads past the last pixel must do nothing.
    if (i<n) out[i] = 0.2989*x[i] + 0.5870*x[i+n] + 0.1140*x[i+2*n];
}

torch::Tensor rgb_to_grayscale(torch::Tensor input) {
    CHECK_INPUT(input);
    int h = input.size(1);
    int w = input.size(2);
    auto output = torch::empty({h,w}, input.options());
    int threads = 256;
    rgb_to_grayscale_kernel<<<cdiv(w*h,threads), threads>>>(
        input.data_ptr<unsigned char>(), output.data_ptr<unsigned char>(), w*h);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
    return output;
}"""


# Declaration only: the C++ side just needs the prototype of the function that is
# expected to be defined in cuda_src.
cpp_src = "torch::Tensor rgb_to_grayscale(torch::Tensor input);"
# verbose=True is forwarded by load_cuda to load_inline; presumably this prints the
# build log, which is the first place to look when the .so is never produced.
module = load_cuda(cuda_src, cpp_src, ['rgb_to_grayscale'], verbose=True)

This fails with the following error:

ImportError: /home/zeus/.cache/torch_extensions/py311_cu121/inline_ext/inline_ext.so: cannot open shared object file: No such file or directory

I have already tried deleting the cache directory and re-running the code, but I keep getting the same error. I'm not sure how to debug this; can someone help?

Could you check the permissions of the cache folder and make sure your Python process has the needed rights?