CUDA kernel segfaults on dispatch

Here is my code. Even if I set the kernel code to be empty, it still segfaults. An equivalent file using only CPU works great so I am not sure what I am doing wrong.

Any ideas what may be the problem?

#include <ATen/ATen.h>
#include <ATen/TensorUtils.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>

#include <iostream>
#include <stdio.h>
#include "cuda_helpers.h"

template <typename T>
__global__ void RgbGradientForward(
    const int nthreads,
    const T* data,
    const int batch_size,
    const int channels,
    const int height,
    const int width,
    T* Ix,
    T* Iy,
    T* output) {
  printf("inside kernel");
  int x =10;


at::Tensor RGBGradient_forward_cuda(const at::Tensor& input) {
  // NOTE We assume input and output are NHWC to make accessing the data a lot
  // easier
  AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor");

  at::TensorArg input_t{input, "input", 1};

  at::CheckedFrom c = "RgbGradient_forward_cuda";
  at::checkAllSameGPU(c, {input_t});
  at::checkAllSameType(c, {input_t});

  at::cuda::CUDAGuard device_guard(input.device());

  auto batch_size = input.size(0);
  auto height = input.size(1);
  auto width = input.size(2);
  auto channels = input.size(3);

  // Define the gradient variables
  at::Tensor Ix = at::zeros({batch_size, height, width}, input.options());
  at::Tensor Iy = at::zeros({batch_size, height, width}, input.options());
  at::Tensor output = at::zeros({batch_size, height, width}, input.options());

  auto output_size = batch_size * height * width;
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  dim3 grid(std::min(
          static_cast<int64_t>(output_size), static_cast<int64_t>(512)),
  dim3 block(512);

  if (output.numel() == 0) {
    return output;

  std::cout << "launching kernel" << std::endl;

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "RGBGradient_forward", [&] {
    RgbGradientForward<scalar_t><<<grid, block, 0, stream>>>(
  return output;

@goldsborough @albanD any thoughts?

@varunagrawal If the CPU version works, it seems to suggest that it’s a CUDA memory pointer problem. Could you give us the backtrace for the segfault to further debug?

I found the issue. I had to rename the CUDA kernel function to RgbGradientForwardKernel since I was using RgbGradientForward for the CPU version. This shouldn’t be an issue though since torchvision uses the same function names for the CPU and GPU kernel.

Maybe a bug somewhere?