How to utilize GPUs to accelerate my code?

Hi, I have a function render_texture that implements the z-buffer algorithm to render texture, but it’s very slow. I need to backpropagate through this function. How can I accelerate it by utilizing GPU’s parallelization?

def render_texture(vertices, colors, triangles, h, w, c=3, device="cpu"):
    """render mesh by z buffer. vertices,colors, and triangles are tensors
    Args:
        shape of vertices: 3 x n vertices
        shape of colors: 3 x n vertices
        shape of triangles: 3 x n triangles
        h: height
        w: width
    """
    # initial
    image = torch.zeros((h, w, c)).to(device)

    depth_buffer = torch.zeros([h, w]) - 999999.0
    # triangle depth: approximate the depth to the average value of z in each
    # vertex(v0, v1, v2), since the vertices are closed to each other
    tri_depth = (
        vertices[2, triangles[0, :]]
        + vertices[2, triangles[1, :]]
        + vertices[2, triangles[2, :]]
    ) / 3.0
    tri_tex = (
        colors[:, triangles[0, :]]
        + colors[:, triangles[1, :]]
        + colors[:, triangles[2, :]]
    ) / 3.0

    for i in range(triangles.shape[1]):
        tri = triangles[:, i]  # 3 vertex indices

        # the inner bounding box
        umin = max(int(torch.ceil(torch.min(vertices[0, tri]))), 0)
        umax = min(int(torch.floor(torch.max(vertices[0, tri]))), w - 1)

        vmin = max(int(torch.ceil(torch.min(vertices[1, tri]))), 0)
        vmax = min(int(torch.floor(torch.max(vertices[1, tri]))), h - 1)

        if umax < umin or vmax < vmin:
            continue

        for u in range(umin, umax + 1):
            for v in range(vmin, vmax + 1):
                if tri_depth[i] > depth_buffer[v, u] and isPointInTriangle(
                    [u, v], vertices[:2, tri].cpu().numpy()
                ):
                    depth_buffer[v, u] = tri_depth[i]
                    image[v, u, :] = tri_tex[:, i]
    return image

def isPointInTriangle(point, tri_points):
    """Judge whether the point is in the triangle
    Method:
        http://blackpawn.com/texts/pointinpoly/
    Args:
        point: [u, v] or [x, y]
        tri_points: three vertices(2d points) of a triangle. 2 coords x 3 vertices
    Returns:
        bool: true for in triangle
    """
    tp = tri_points

    # vectors
    v0 = tp[:, 2] - tp[:, 0]
    v1 = tp[:, 1] - tp[:, 0]
    v2 = point - tp[:, 0]

    # dot products
    dot00 = np.matmul(v0.T, v0)
    dot01 = np.matmul(v0.T, v1)
    dot02 = np.matmul(v0.T, v2)
    dot11 = np.matmul(v1.T, v1)
    dot12 = np.matmul(v1.T, v2)

    # barycentric coordinates
    if dot00 * dot11 - dot01 * dot01 == 0:
        inverDeno = 0
    else:
        inverDeno = 1 / (dot00 * dot11 - dot01 * dot01)

    u = (dot11 * dot02 - dot01 * dot12) * inverDeno
    v = (dot00 * dot12 - dot01 * dot02) * inverDeno

    # check if point in triangle
    return (u >= 0) & (v >= 0) & (u + v < 1)