I want to train a neural network that can map the two-dimensional uniform distribution to the 512-dimensional normal distribution

I want to train a neural network that can map the two-dimensional uniform distribution to the 512-dimensional normal distribution. How should I do this? I have constructed a fully-connected network and I am using the KL divergence as the loss function. It seems that the loss does not decrease while training the network.
The following is my code. Can anyone give some advice?

from cmath import tanh
from torch import nn
import torch


class Uniform_to_latent_Network(nn.Module):
    """Fully-connected network ending in a softmax over the feature axis.

    The stack is four ``Linear -> BatchNorm1d -> LeakyReLU`` stages of width
    ``middle_dim`` followed by a final ``Linear`` to ``out_dim`` and a
    ``Softmax``.  The output is therefore a vector of probabilities per
    sample (non-negative, summing to 1 along dim 1), not log-probabilities
    and not an unconstrained real-valued vector.

    Args:
        in_dim: Number of input features per sample.
        middle_dim: Width of every hidden layer.
        out_dim: Number of output features per sample.
    """

    def __init__(self, in_dim=2, middle_dim=512, out_dim=128) -> None:
        super().__init__()
        # (input width, LeakyReLU negative slope) for each hidden stage, in
        # order.  Building the list in this order keeps the Sequential
        # indices (and hence the state_dict keys) the same as a hand-written
        # 14-module Sequential.
        hidden_stages = (
            (in_dim, 0.1),
            (middle_dim, 0.1),
            (middle_dim, 0.05),
            (middle_dim, 0.05),
        )
        layers = []
        for width_in, negative_slope in hidden_stages:
            layers += [
                nn.Linear(in_features=width_in, out_features=middle_dim),
                nn.BatchNorm1d(middle_dim),
                nn.LeakyReLU(negative_slope),
            ]
        layers.append(nn.Linear(middle_dim, out_features=out_dim))
        # An explicit dim avoids the "implicit dimension choice" deprecation
        # warning; for a 2-D (batch, features) input the implicit choice was
        # also dim 1, so the result is unchanged.
        layers.append(nn.Softmax(dim=1))
        self.linear_layers = nn.Sequential(*layers)

    def forward(self, x):
        """Return the per-sample softmax output for a (batch, in_dim) input."""
        return self.linear_layers(x)



from operator import mod
import os,sys
from textwrap import indent
o_path = os.getcwd()
import torch
sys.path.append(o_path)
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from torch.utils.data import TensorDataset,DataLoader
from torch.nn import MSELoss,CrossEntropyLoss

from umap import UMAP
from torch import optim
from torch.utils.data import random_split


# Training script: builds the network, then for 30000 iterations draws a
# fresh batch of uniform inputs and Gaussian targets, takes one Adam step on
# a KLDivLoss between them, prints the loss and overwrites the checkpoint.

# Use the third CUDA device when CUDA is available, otherwise the CPU.
# NOTE(review): "cuda:2" is hard-coded; presumably this fails on a machine
# with fewer than three GPUs -- confirm.
device = "cuda:2" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")


model = Uniform_to_latent_Network().to(device)
print(model)


learning_rate=0.1
# NOTE(review): `momentum` is not passed to the optimizer or used anywhere
# else in the visible code.
momentum = 0.5
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# NOTE(review): per the PyTorch documentation, KLDivLoss expects `input` to
# hold log-probabilities and `target` to hold probabilities (unless
# log_target=True).  Below, `input` is the model's Softmax output and
# `target` is a batch of raw standard-normal samples.
loss_fn=nn.KLDivLoss(reduction="batchmean")
# These two lists are created but never appended to in the visible code.
train_losses=[]
train_counter=[]
# The model was already moved to `device` when it was constructed above.
model=model.to(device)

for epoch in range(30000):
    # 20000 two-dimensional inputs, each coordinate uniform on [-80, 80).
    in_data=((torch.rand((20000,2))*160)-80).to(device=device)
    # print(torch.min(in_data))
    # print(torch.max(in_data))
    # 20000 targets of 128 standard-normal values each, drawn independently
    # of in_data (there is no pairing between an input row and a target row).
    out_data=torch.randn(20000,128).to(device=device)
    optimizer.zero_grad()
    output = model(in_data)
    loss = loss_fn(output,out_data)
    loss.backward()
    optimizer.step()
    print(loss.item())
    # Runs inside the loop: the same checkpoint file is rewritten on every
    # iteration.
    torch.save(model.state_dict(), './DirectionDiscovery/train/trained_model/1.pth')

Hi zwj!

The essence of your problem is that KLDivLoss compares two (discrete)
probability distributions as described by actual (log) probabilities, rather
than as described by sets of samples from those distributions.

On a lower technical level, a symptom of your core problem is that
you can’t backpropagate through samples from a distribution, so your
loss.backward() isn’t going to do anything.

Best.

K. Frank

Hello, sir, I appreciate your reply. Do you have a better solution?