Hello, I have implemented a custom layer for PyTorch and want to speed it up by using CUDA.
I was following the tutorial given here https://pytorch.org/tutorials/advanced/cpp_extension.html.
However, it is not mentioned how to save tensors in the forward pass (via `ctx.save_for_backward`) when using CUDA.
I would really appreciate it if someone could explain how to do this, or suggest another way to achieve it.
Here is the code for the Python implementation:
class NeighborFill(torch.autograd.Function):
    """Gather per-point features according to 1-based neighbor indices.

    ``forward`` maps each feature tensor of shape ``(1, C, P)`` to an output
    of shape ``(1, C, N*K)``, where ``neighbors`` has shape ``(1, N, K)`` and
    holds 1-based point indices; an entry of 0 means "no neighbor" and the
    corresponding output column stays zero.

    The original element-wise Python double loop is replaced by a single
    vectorized gather (forward) / ``index_add_`` scatter (backward), which is
    the idiomatic way to make this fast on CUDA without a C++ extension.
    """

    @staticmethod
    def forward(ctx, x, y, z, neighbors):
        # Flatten (1, N, K) -> (N*K,) and convert 1-based -> 0-based;
        # former 0 entries become -1 and are masked out below.
        n_slots = neighbors.shape[1] * neighbors.shape[2]
        idx = neighbors.reshape(-1).long() - 1
        valid = idx >= 0

        def _gather(t):
            # One vectorized gather per tensor; output follows the input's
            # dtype/device (generalizes the original hard-coded `.cuda()`).
            out = torch.zeros((1, t.shape[1], n_slots),
                              dtype=t.dtype, device=t.device)
            out[0, :, valid] = t[0, :, idx[valid]]
            return out

        # Only `neighbors` is needed in backward; the inputs themselves are
        # not (just their shapes), so avoid keeping x/y/z alive.
        # NOTE(review): the original also called ctx.mark_non_differentiable
        # on `neighbors`, but that API marks *outputs*, not inputs; the
        # correct way to declare the input non-differentiable is to return
        # None for it from backward (done below).
        ctx.save_for_backward(neighbors)
        ctx.input_shapes = (x.shape, y.shape, z.shape)
        return _gather(x), _gather(y), _gather(z)

    @staticmethod
    def backward(ctx, gradoutx, gradouty, gradoutz):
        (neighbors,) = ctx.saved_tensors
        shape_x, shape_y, shape_z = ctx.input_shapes
        idx = neighbors.reshape(-1).long() - 1
        valid = idx >= 0

        def _scatter(gout, shape):
            # Transpose of the forward gather: accumulate each output slot's
            # gradient back onto the source point it was copied from.
            # NOTE(review): the original accumulated onto row `i` instead of
            # the gathered source index — that did not match forward and
            # would fail gradcheck; fixed here.
            gin = torch.zeros(shape, dtype=gout.dtype, device=gout.device)
            gin[0].index_add_(1, idx[valid], gout[0][:, valid])
            return gin

        # `neighbors` is an integer index tensor with no gradient, so its
        # slot is None (the original wrongly returned the tensor itself).
        return (_scatter(gradoutx, shape_x),
                _scatter(gradouty, shape_y),
                _scatter(gradoutz, shape_z),
                None)