I am using the C++ API of LibTorch 2.5.1 and have a few hand-written CUDA/HIP kernels. I managed to get my code running on Intel GPUs using the XPU device. Now I would like to know how to port the CUDA kernels to kernels for Intel GPUs. It's really not much code. Here is the first kernel:
/**
@brief Compute Greville abscissae
*/
template <typename real_t>
__global__ void
greville_kernel(torch::PackedTensorAccessor64<real_t, 1> greville,
                const torch::PackedTensorAccessor64<real_t, 1> knots,
                int64_t ncoeffs, short_t degree, bool interior) {
  // Grid-stride loop; greville is expected to be zero-initialized by the caller
  for (int64_t k = blockIdx.x * blockDim.x + threadIdx.x;
       k < ncoeffs - (interior ? 2 : 0); k += blockDim.x * gridDim.x) {
    // Each abscissa is the average of `degree` consecutive knots
    for (short_t l = 1; l <= degree; ++l)
      greville[k] += knots[k + (interior ? 1 : 0) + l];
    greville[k] /= real_t(degree);
  }
}
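For this kernel, the following is roughly what I imagine a SYCL version could look like. This is an untested sketch: I am assuming that the packed accessors can be captured by value in a SYCL kernel, and that the underlying sycl::queue can be obtained via c10::xpu::getCurrentXPUStream().queue() from c10/xpu/XPUStream.h. I also replaced the grid-stride loop with a plain parallel_for over the full index range and left the work-group size to the runtime:

#include <sycl/sycl.hpp>

#include <c10/xpu/XPUStream.h>
#include <torch/torch.h>

// short_t is a project-wide integer typedef from my code base
template <typename real_t>
void greville_xpu(torch::PackedTensorAccessor64<real_t, 1> greville,
                  const torch::PackedTensorAccessor64<real_t, 1> knots,
                  int64_t ncoeffs, short_t degree, bool interior) {
  // Assumption: the current XPU stream exposes its SYCL queue like this
  sycl::queue& q = c10::xpu::getCurrentXPUStream().queue();
  q.parallel_for(sycl::range<1>(ncoeffs - (interior ? 2 : 0)),
                 [=](sycl::id<1> idx) {
                   // SYCL kernel lambdas are const, so take a mutable copy of
                   // the output accessor (it still points to the same data)
                   auto g = greville;
                   const int64_t k = idx[0];
                   for (short_t l = 1; l <= degree; ++l)
                     g[k] += knots[k + (interior ? 1 : 0) + l];
                   g[k] /= real_t(degree);
                 });
}

And here is the second kernel: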
/**
@brief Compute knot vector
*/
template <typename real_t>
__global__ void knots_kernel(torch::PackedTensorAccessor64<real_t, 1> knots,
                             int64_t ncoeffs, short_t degree) {
  // Grid-stride loop over the ncoeffs + degree + 1 entries of an open
  // uniform knot vector on [0, 1] (degree + 1 clamped knots at each end)
  for (int64_t k = blockIdx.x * blockDim.x + threadIdx.x;
       k < ncoeffs + degree + 1; k += blockDim.x * gridDim.x) {
    knots[k] = (k < degree ? static_cast<real_t>(0)
                : k < ncoeffs + 1 ? static_cast<real_t>(k - degree) /
                                        static_cast<real_t>(ncoeffs - degree)
                                  : static_cast<real_t>(1));
  }
}
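My guess for its SYCL counterpart, under the same assumptions as above:

template <typename real_t>
void knots_xpu(torch::PackedTensorAccessor64<real_t, 1> knots,
               int64_t ncoeffs, short_t degree) {
  sycl::queue& q = c10::xpu::getCurrentXPUStream().queue();
  q.parallel_for(sycl::range<1>(ncoeffs + degree + 1), [=](sycl::id<1> idx) {
    auto kn = knots; // mutable copy of the accessor, same underlying data
    const int64_t k = idx[0];
    kn[k] = (k < degree ? static_cast<real_t>(0)
             : k < ncoeffs + 1 ? static_cast<real_t>(k - degree) /
                                     static_cast<real_t>(ncoeffs - degree)
                               : static_cast<real_t>(1));
  });
}

On the host side I would then call it like this, assuming packed_accessor64 works for XPU tensors the same way it does for CUDA tensors:

auto knots = torch::zeros({ncoeffs + degree + 1},
                          torch::dtype(torch::kFloat64).device(torch::kXPU));
knots_xpu(knots.packed_accessor64<double, 1>(), ncoeffs, degree);

Is this the right approach, or is there a recommended way to write custom device kernels against the XPU backend?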