Compute Gaussian Density Map on a 2D Point Set

Hi all, I have a question about how to efficiently compute a Gaussian density image on a given 2D point set. My current implementation is as the following with a plot:

import torch
import numpy as np
from time import time
import matplotlib.pyplot as plt
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def computeGaussian(p, res=128, kernel_sigma=0.05):
    ksize = round(res * kernel_sigma * 6)
    sigma = kernel_sigma * res
    x = np.linspace(0, 1, int(res))
    y = np.linspace(0, 1, int(res))
    xv, yv = np.meshgrid(x, y)
    txv = torch.from_numpy(xv).unsqueeze(0).float()
    tyv = torch.from_numpy(yv).unsqueeze(0).float()
    mesh = torch.cat((txv, tyv), 0).to(device)   # positions
    img = torch.zeros((3, res, res)).to(device)  # create an empty gaussian density image

    for i in range(p.size()[0]):
        center = p[i, 0:2]  # go through each point
        hw = round(3 * kernel_sigma * res)  # only consider band-limited gaussian with 3sigma
        coord_center = torch.floor(center * res)

        up = torch.max(coord_center[1] - hw, torch.Tensor([0]).to(device))  # take the boundary into account
        down = torch.min(coord_center[1] + hw + 1, torch.Tensor([res]).to(device))
        left = torch.max(coord_center[0] - hw, torch.Tensor([0]).to(device))
        right = torch.min(coord_center[0] + hw + 1, torch.Tensor([res]).to(device))
        up = up.long()
        down = down.long()
        left = left.long()
        right = right.long()

        r, g, b = 1, 1, 1

        # apply gaussian kernel on the pixels based on their distance to the center
        img[0, up:down, left:right] += r * torch.exp(
            -(mesh.permute(1, 2, 0)[up:down, left:right, :] - center).pow(2).sum(2) / (2 * kernel_sigma ** 2))
        img[1, up:down, left:right] += g * torch.exp(
            -(mesh.permute(1, 2, 0)[up:down, left:right, :] - center).pow(2).sum(2) / (2 * kernel_sigma ** 2))
        img[2, up:down, left:right] += b * torch.exp(
            -(mesh.permute(1, 2, 0)[up:down, left:right, :] - center).pow(2).sum(2) / (2 * kernel_sigma ** 2))

    return img


pts = torch.rand(16, 2).to(device)
start = time()
density_map = computeGaussian(pts)
end = time()
print('Time: {:.4f}'.format(end - start))
plt.figure(1)
plt.subplot(121)
plt.imshow(density_map.permute(1, 2, 0).detach().cpu().numpy())
plt.subplot(122)
plt.scatter(pts[:, 0].detach().cpu().numpy(), (1 - pts[:, 1]).detach().cpu().numpy())
plt.axis('equal')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.show()

The problem is that I need to loop over all the points since they have different size of grid(i.e. [down:up, left:right]) to consider, because of the boundary. This makes it slow for a large point set and it’s even faster to run on CPU than CUDA. Is there any way to make it without the loop? Thanks in advance.