Batch mode of Jacobian calculation: how can I change my output shape as the output shape of torch.autograd.functional.jacobian

Hi all I am trying to calculate Jacobian matrix in the form of batch mode as my input ‘x’ has 128x1000 dimension where 128 is the batch size and 1000 is the number of classes. When I used torch.autograd.functional.jacobian it gave me ‘cuda out of memory’ error. So the batch mode was preferred. the output of the torch.autograd.functional.jacobian function in lower size is like [batch_size, n_classes, batch_size, n_classes] so I need to change my output like this 4D while my output is [batch_size, n_classes, n_classes].
my code is as follows;

import torch
import torch.nn.functional as F

def gradient(y, x, grad_outputs=None):
    """Compute dy/dx @ grad_outputs"""
    if grad_outputs is None:
        grad_outputs = torch.ones_like(y)
    grad = torch.autograd.grad(y, [x], grad_outputs = grad_outputs, create_graph=True)[0]
    return grad

def jacobian(y, x):
    """Compute dy/dx = dy/dx @ grad_outputs; 
    for grad_outputs in [1, 0, ..., 0], [0, 1, 0, ..., 0], ...., [0, ..., 0, 1]"""
    jac = torch.zeros(y.shape[0], x.shape[0]) 
    for i in range(y.shape[0]):
        grad_outputs = torch.zeros_like(y)
        grad_outputs[i] = 1
        jac[i] = gradient(y, x, grad_outputs = grad_outputs)
    return jac

def divergence(y, x):
    div = 0.
    for i in range(y.shape[-1]):
        div += torch.autograd.grad(y[..., i], x, torch.ones_like(y[..., i]), create_graph=True)[0][..., i:i+1]
    return div

def laplace(y, x):
    grad = gradient(y, x)
    return divergence(grad, x)

x = torch.rand(128,1000)
w = torch.rand(1000, 1000)
b = torch.rand(1000)
j=[]
for i in range(128):
    x_p = torch.tensor(x[i,:], requires_grad=True)
    y = F.linear(x_p, w, b)
    j_p=jacobian(y,x_p)
    j.append(j_p)

j=torch.stack(j,0)        

Thanks.