Suggested Environment for Developing C++/CUDA Extensions on Windows 10

I am trying to write a C++ extension using CUDA libraries on Windows 10, following the tutorial here.

I have Python 3.6.11, PyTorch 1.8.0.dev20201021, an RTX 3080 GPU, and CUDA 11.1.

I ended up getting the PyTorch C++ 1.7 libraries (stable/debug) with CUDA 11.0 working in Microsoft Visual Studio 2019 v16.6.5 with the cl.exe compiler, and I am wondering what the best way is to write the code with syntax completion and debugging support, so that when I run python setup.py install I can be sure it will work.
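
For quick iteration, one workflow I am considering is to build with the JIT helper torch.utils.cpp_extension.load before committing to python setup.py install, since it prints the exact cl.exe/nvcc invocations it runs. This is only an untested sketch of what I have in mind; DAConvolution_cuda.cu is a placeholder name for the kernel file I still have to write:

# quick_build.py - untested sketch
from torch.utils.cpp_extension import load

DAConvolution = load(
    name="DAConvolution",
    sources=["DAConvolution.cpp", "DAConvolution_cuda.cu"],  # .cu file is a placeholder
    verbose=True,  # show the compiler commands being run
)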

My questions are: what is the suggested environment and debugging tooling for Windows 10?
Do I need to use NVIDIA Nsight Compute to debug the CUDA code?
What flags will I need to pass to the nvcc compiler (C++11, and maybe --gpu-architecture=compute_86 --gpu-code=sm_86)? My current guess at a setup.py is sketched below.
Should I use the debug or release PyTorch C++ libraries?
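
On the flags question, this is the kind of setup.py I have in mind (untested; I believe -gencode=arch=compute_86,code=sm_86 is what matches the RTX 3080, and DAConvolution_cuda.cu is again a placeholder for the kernel file):

from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

setup(
    name="DAConvolution",
    ext_modules=[
        CUDAExtension(
            name="DAConvolution",
            sources=["DAConvolution.cpp", "DAConvolution_cuda.cu"],
            extra_compile_args={
                "cxx": [],  # extra flags for cl.exe, if any
                "nvcc": ["-gencode=arch=compute_86,code=sm_86"],
            },
        )
    ],
    cmdclass={"build_ext": BuildExtension},
)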

My CMake file at the moment looks like:

cmake_minimum_required (VERSION 3.8)

project ("DAConvolution")

find_package(Torch REQUIRED)

add_library (DAConvolution "DAConvolution.cpp" )

# directory for python includes
include_directories("C:/Users/James/Anaconda3/envs/masters/include")

target_link_libraries(DAConvolution "${TORCH_LIBRARIES}" "C:/Users/James/Anaconda3/envs/masters/libs/python36.lib")

if (MSVC)
  # copy the Torch DLLs next to the built target so it can be loaded at runtime
  file(GLOB TORCH_DLLS "${TORCH_INSTALL_PREFIX}/lib/*.dll")
  add_custom_command(TARGET DAConvolution
                     POST_BUILD
                     COMMAND ${CMAKE_COMMAND} -E copy_if_different
                     ${TORCH_DLLS}
                     $<TARGET_FILE_DIR:DAConvolution>)
endif (MSVC)

And my code looks like this (so far it compiles with no errors, as a library):

// DAConvolution.cpp
// Temporarily undefine _DEBUG so that Python.h does not force a link
// against the debug Python library (python36_d.lib).
#ifdef _DEBUG
#undef _DEBUG
#include <Python.h>
#define _DEBUG
#else
#include <Python.h>
#endif

#include <torch/extension.h>

using namespace std;

#define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

// Forward declarations of the CUDA implementations

std::vector<torch::Tensor> DA_Conv_forward(
	torch::Tensor input_image,
	torch::Tensor depth_image,
	torch::Tensor bias,
	torch::Tensor weight,
	int stride,
	int dilation,
	int padding,
	float alpha
);

std::vector<torch::Tensor> DA_Conv_backward(
	torch::Tensor grad_output,
	torch::Tensor input_image,
	torch::Tensor depth_image,
	torch::Tensor bias,
	torch::Tensor weight,
	int stride,
	int dilation,
	int padding,
	float alpha
);

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
	m.def("forward", &DA_Conv_forward, "DAConvolution forward");
	m.def("backward", &DA_Conv_backward, "DAConvolution backward");
}
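
Once it builds, my plan is to smoke-test the bindings from Python with something like this (the shapes and argument values are just placeholders to exercise the forward call, since the CUDA implementations are still to be filled in):

import torch
import DAConvolution  # the module produced by setup.py / load

# dummy CUDA tensors (the CHECK_CUDA / CHECK_CONTIGUOUS macros expect GPU, contiguous inputs)
input_image = torch.randn(1, 3, 32, 32, device="cuda")
depth_image = torch.randn(1, 1, 32, 32, device="cuda")
weight = torch.randn(8, 3, 3, 3, device="cuda")
bias = torch.randn(8, device="cuda")

# (input, depth, bias, weight, stride, dilation, padding, alpha)
outputs = DAConvolution.forward(input_image, depth_image, bias, weight, 1, 1, 1, 0.5)
print([o.shape for o in outputs])  # forward returns a vector of tensors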

Any tips or insights would be very much appreciated; I would be happy to provide more information.