Even if I set the kernel body to be empty, my CUDA extension still segfaults. An equivalent CPU-only implementation works fine, so I am not sure what I am doing wrong. Any ideas what the problem might be?
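To rule out the toolchain itself, I also put together a minimal standalone test (hypothetical file name minimal_launch.cu, separate from the extension) that launches an empty kernel and prints the CUDA error state. Compiling it with plain nvcc should tell me whether a bare launch even works on this machine:

#include <cstdio>
#include <cuda_runtime.h>

// Empty kernel: if launching this already fails, the problem is the
// build/driver setup rather than my extension code.
__global__ void EmptyKernel() {}

int main() {
  EmptyKernel<<<1, 1>>>();
  // cudaGetLastError reports launch/configuration errors...
  printf("launch: %s\n", cudaGetErrorString(cudaGetLastError()));
  // ...while cudaDeviceSynchronize surfaces errors raised during execution.
  printf("sync:   %s\n", cudaGetErrorString(cudaDeviceSynchronize()));
  return 0;
}

Here is the actual extension file that segfaults: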
#include <ATen/ATen.h>
#include <ATen/TensorUtils.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
#include <iostream>
#include <stdio.h>
#include "cuda_helpers.h"
template <typename T>
__global__ void RgbGradientForward(
    const int nthreads,
    const T* data,
    const int batch_size,
    const int channels,
    const int height,
    const int width,
    T* Ix,
    T* Iy,
    T* output) {
  // Stub body while debugging the segfault. Note that device-side printf
  // output is only flushed to the host at a synchronization point.
  printf("inside kernel\n");
  int x = 10; // placeholder, unused
}
at::Tensor RGBGradient_forward_cuda(const at::Tensor& input) {
  // NOTE: we assume input and output are NHWC to make accessing the data
  // a lot easier.
  AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor");

  at::TensorArg input_t{input, "input", 1};
  at::CheckedFrom c = "RGBGradient_forward_cuda";
  at::checkAllSameGPU(c, {input_t});
  at::checkAllSameType(c, {input_t});

  at::cuda::CUDAGuard device_guard(input.device());

  auto batch_size = input.size(0);
  auto height = input.size(1);
  auto width = input.size(2);
  auto channels = input.size(3);

  // Gradient buffers; at::zeros already returns contiguous tensors.
  at::Tensor Ix = at::zeros({batch_size, height, width}, input.options());
  at::Tensor Iy = at::zeros({batch_size, height, width}, input.options());
  at::Tensor output = at::zeros({batch_size, height, width}, input.options());

  auto output_size = batch_size * height * width;
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  dim3 grid(std::min(
      at::cuda::ATenCeilDiv(
          static_cast<int64_t>(output_size), static_cast<int64_t>(512)),
      static_cast<int64_t>(4096)));
  dim3 block(512);

  if (output.numel() == 0) {
    AT_CUDA_CHECK(cudaGetLastError());
    return output;
  }

  std::cout << "launching kernel" << std::endl;
  // Hold the contiguous copy in a named tensor instead of calling
  // .contiguous() inline, so the pointer passed to the kernel clearly
  // outlives the asynchronous launch.
  auto input_c = input.contiguous();
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      input.scalar_type(), "RGBGradient_forward", [&] {
        RgbGradientForward<scalar_t><<<grid, block, 0, stream>>>(
            output_size,
            input_c.data_ptr<scalar_t>(),
            batch_size,
            channels,
            height,
            width,
            Ix.data_ptr<scalar_t>(),
            Iy.data_ptr<scalar_t>(),
            output.data_ptr<scalar_t>());
      });
  AT_CUDA_CHECK(cudaGetLastError());
  return output;
}
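One thing I have not tried yet: as I understand it, cudaGetLastError only catches launch-configuration problems, while errors raised during kernel execution only surface at the next synchronizing call. So my next step is to add a debugging-only synchronize right after the dispatch (sketch below, not in the file above) to pin down where the failure actually happens:

  // Debugging only: block until the kernel has finished so that any
  // asynchronous error is reported here rather than at some later,
  // unrelated CUDA call.
  AT_CUDA_CHECK(cudaGetLastError());
  AT_CUDA_CHECK(cudaDeviceSynchronize());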