Parzen kernel density estimation for multivariate time series data

I have a dataset of shape (N, C, T, V): N samples, C channels, T frames, V points. I want to estimate each point's density with the Parzen window technique, to get the spatial distribution of each frame of each sequence. My expected output shape is (N, T, V).
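For reference, the Parzen (kernel density) estimate of a point x from the V points x_1, ..., x_V of a frame is p(x) = (1/V) * sum_i K_h(x - x_i), with K_h a Gaussian kernel of width h. A minimal sketch of the shapes involved (all sizes here are dummies):

import numpy as np

N, C, T, V = 2, 3, 16, 25          # samples, channels, frames, points (dummy sizes)
data = np.random.rand(N, C, T, V)  # one C-dimensional point per (sample, frame, joint)
densities = np.zeros((N, T, V))    # expected output: one density per point of each frame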

I have working code for the case N = 1, but with multiple samples (a batch) it becomes very slow because it relies on nested for loops. Is there a way to optimize it?

import numpy as np
import torch
from scipy.stats import multivariate_normal

def density_estimation(self, data):
    # data is assumed to be laid out as (N, T, V, C) at this point
    print('data = ', data.shape)
    N, T, V, C = data.shape

    frames = self._normal_skeleton(data)
    frames = list(frames[0])  # select only one sequence (N = 1)

    # Window width: scipy's `cov` argument is a covariance,
    # so this is the variance of the Gaussian kernel, not its std
    window_width = 0.01

    # Apply the Parzen window to each joint of each frame
    joint_densities = torch.zeros((len(frames), frames[0].shape[0]))
    for frame_idx, points in enumerate(frames):
        for joint_idx in range(points.shape[0]):
            joint_densities[frame_idx, joint_idx] = self.parzen_window(points[joint_idx], points, window_width)
    print('joint densities', joint_densities.shape)  # (T, V)

# Parzen window function (Gaussian kernel): the average of N Gaussians,
# one centered at each data point, evaluated at x
def parzen_window(self, x, data, window_width):
    N, d = data.shape
    density = 0.0
    for i in range(N):
        density += multivariate_normal.pdf(x, mean=data[i], cov=window_width)
    return density / N
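One speedup I found, independent of batching: the inner loop in parzen_window can be dropped, because multivariate_normal.pdf already vectorizes over an (N, d) array of points, and by symmetry of the Gaussian, evaluating one kernel centered at x on all data points gives the same values as evaluating the N kernels at x. A sketch under those assumptions:

from scipy.stats import multivariate_normal

def parzen_window(self, x, data, window_width):
    # The pdf of all rows of `data` under a single Gaussian centered at x
    # equals the pdf of x under Gaussians centered at each row (symmetry),
    # so the mean of the returned (N,) vector is the Parzen estimate.
    return multivariate_normal.pdf(data, mean=x, cov=window_width).mean()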

To process the whole batch, I modified it as follows:

density = torch.zeros((N, T, V))
for i in range(N):
    for j in range(T):
        for k in range(V):
            joint_data = data[i, j, k]  # one point, shape (C,)
            frame = data[i, j]          # all V points of the frame, shape (V, C)
            density[i, j, k] = self.parzen_window(joint_data, frame, window_width)
print('joint densities', density.shape)  # (N, T, V)

But as the number of points and the number of samples grows, this becomes far too slow. How can I vectorize or otherwise optimize this code?
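For what it's worth, here is a sketch of how I think the whole batch could be done in one shot with PyTorch broadcasting (batched_parzen is a made-up name, and I'm assuming the same (N, T, V, C) layout and isotropic covariance window_width * I as above). With an isotropic Gaussian kernel the pdf depends only on squared distances, so all pairwise kernel evaluations within each frame reduce to tensor arithmetic:

import math
import torch

def batched_parzen(data, window_width):
    # data: float tensor of shape (N, T, V, C); returns densities of shape (N, T, V)
    N, T, V, C = data.shape
    # Pairwise differences within each frame:
    # (N, T, V, 1, C) - (N, T, 1, V, C) -> (N, T, V, V, C)
    diff = data.unsqueeze(3) - data.unsqueeze(2)
    sq_dist = (diff ** 2).sum(dim=-1)  # (N, T, V, V)
    # Isotropic Gaussian pdf with covariance window_width * I in C dims:
    # (2*pi*h)^(-C/2) * exp(-||x - mu||^2 / (2*h))
    norm = (2 * math.pi * window_width) ** (-C / 2)
    kernels = norm * torch.exp(-sq_dist / (2 * window_width))
    # Parzen estimate at each point: mean over the V kernel centers of its frame
    return kernels.mean(dim=-1)

One caveat: the intermediate (N, T, V, V, C) tensor can get large; if memory becomes a problem, torch.cdist(data, data) squared gives the same pairwise distances without materializing the channel dimension.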