CUDA Extension: Illegal memory access was encoutered

I have an Ubuntu 18.04 OS with miniconda3, python 3.7, CUDA 10.1, CuDNN 7.4 and GCC 7.4 installed. I also have compiled PyTorch 1.4 from source.

I wrote a PyTorch C++/CUDA extension code for a specific task that I had using the exact steps mentioned in the tutorial page. My extension looks like this:

// This is the .cpp file
#include <torch/extension.h>
#include <vector>

std::vector<torch::Tensor> zbuffertri_cuda_forward(
                                torch::Tensor s2d,
                                torch::Tensor tri,
                                torch::Tensor visible,
                                int img_size = 224);

// C++ interface

#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

std::vector<torch::Tensor> zbuffertri_forward(torch::Tensor s2d, torch::Tensor tri, torch::Tensor visible, int img_size = 224)
{
    CHECK_INPUT(s2d);
    CHECK_INPUT(tri);
    CHECK_INPUT(visible);

    return zbuffertri_cuda_forward(s2d, tri, visible, img_size);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &zbuffertri_forward, "ZBufferTri Operation (CUDA)");}
// This is the .cu file

# include <torch/types.h>
# include <cuda.h>
# include <cuda_runtime.h>
# include <vector>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=false)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

__global__ void convert_to_mask(float *zbuffer, int img_size)
{
    for(int i=blockIdx.x*blockDim.x+threadIdx.x; i<img_size*img_size; i+=blockDim.x*gridDim.x)
    {
        if(zbuffer[i] == -INFINITY)
        {
            zbuffer[i] = 0;
        }
        else
        {
            zbuffer[i] = 1;
        }
    }
}

/* Forward Function */
std::vector<torch::Tensor> zbuffertri_cuda_forward(torch::Tensor s2d, torch::Tensor tri, torch::Tensor visible, int img_size = 224)
{
    auto s2d_data = s2d.data<float>();
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk( cudaDeviceSynchronize() );

    const int tri_num = tri.size(1);
    const int vertex_num = s2d.size(1);

    auto out = torch::ones({img_size, img_size}, torch::device(s2d.device())) * (tri_num-1);
    auto zbuffer = torch::ones({img_size, img_size}, torch::device(s2d.device())) * (-INFINITY);;

    int N = img_size*img_size;
    const int threads = 256;
    const dim3 blocks((img_size + threads - 1) / threads, img_size);
    int blockSize = 256;
    int numBlocks = (N + blockSize -1) / (2*blockSize);
    std::cout<<numBlocks<<" "<<blockSize<<std::endl;

    AT_DISPATCH_FLOATING_TYPES(zbuffer.type(), "zbuffer_tri_dispatch", ([&] {
        convert_to_mask<scalar_t><<<blocks, threads>>>(zbuffer.packed_accessor<scalar_t,2,torch::RestrictPtrTraits,size_t>());
    }));

    return {out, zbuffer};
}

It gets compiled correctly without any errors. I can also import the compiled library into my code successfully. However, when I run its forward function, it doesn’t work. (There are a few other function calls happening inside the .cu file that I have commented out for now because even the simplest code is not working.) When I use gdb or cuda-gdb to debug into the cuda code by adding a breakpoint at the cuda function, I see an error saying “s2d=<error reading variable: Cannot access memory at address 0x2>, … at zbuffertri_implementation.cu:190”

Single stepping until exit from function _Z23zbuffertri_cuda_forwardN2at6TensorES0_S0_i@plt,
which has no line number information.
zbuffertri_cuda_forward (s2d=<error reading variable: Cannot access memory at address 0x2>, tri=..., visible=..., img_size=32767) at zbuffertri_implementation.cu:190

However, I can run through the function completely without any errors. When I return to the python line where the C++ function was called and try to access the tensors returned from the call, I get the following error:

tri_map_2d, mask_i = zbuffertri.forward(vertex2d_i.contiguous(), self.tri.float(), visible_tri[i].contiguous(), output_size)
(gdb) n
(Pdb) p tri_map_2d
THCudaCheck FAIL file=../aten/src/THC/THCCachingHostAllocator.cpp line=278 error=700 : an illegal memory access was encountered

Been dealing with this issues for days now. Any help would be much appreciated.