Bayesian Hierarchical models in pytorch (BayesianGMM)

I am aware of pyro facilitating probabilistic models through standard SVI inference. But is it possible to write Bayesian models in pure pytorch? Say for instance, MAP training in Bayesian GMM.
I specify a bunch of priors and a likelihood, provide a MAP objective and learn point estimates but I am missing something key in my attempt here, perhaps whats confusing is that means, scales and weights have priors but they still need to be trainable as params in MAP.

import numpy as np
import torch
from tqdm import trange
import torch.distributions as D
from matplotlib.patches import Ellipse
import matplotlib.pyplot as plt
import sklearn.datasets as skd

class BayesianGMM(torch.nn.Module):
    def __init__(self, Y, K, dim=2):
          :param Y (torch.tensor): data
          :param K (int): number of fixed components
          :param D (int): the dimension of data space.
        super(BayesianGMM, self).__init__()
        self.Y = Y
        self.K = K
        self.dim = dim
        self.mu_prior = torch.zeros((self.K,self.dim))
        self.sig_prior = torch.ones((self.K, self.dim))*3.0
        self.concentration = torch.randint(high=100,size=(1,))
        self.alpha = torch.nn.Parameter(torch.tensor([self.concentration]*self.K).float())
        self.means = D.Normal(self.mu_prior, self.sig_prior).sample()
        self.scales = D.Gamma(torch.tensor([1.0]), torch.tensor([1.0])).sample()
        self.weights = D.Dirichlet(self.alpha).sample()
        self.z = D.Categorical(self.weights)
    def forward(self, Y):
        comp = D.Independent(D.Normal(self.means, self.scales), 1)
        return D.MixtureSameFamily(self.z, comp).log_prob(Y)
    def log_prior(self, Y):
        self.means.log_prob(Y) + self.scales.log_prob(Y) + self.weights.log_prob(Y)
    def get_trainable_param_names(self):
      ''' Prints a list of parameters which will be 
      learnt in the process of optimising the objective '''
      for name, value in self.named_parameters():
    def loglikelihood(self, Y):
        return self.forward(Y).mean()

    def map_objective(self, Y):
        return self.loglikelihood(Y) + self.log_prior(Y)
    def train(self, Y, optimizer, n_steps):
        losses = np.zeros(n_steps)
        bar = trange(n_steps, leave=True)
        for step in bar:
            loss = self.map_objective(Y)
            losses[step] = loss
        return losses
if __name__ == '__main__':
    # Generate some data
    Y, labels = skd.make_blobs(n_samples=2000, random_state=42,
                              cluster_std=[2.5, 0.5, 1.0])
    Y = torch.tensor(Y).float()
    model = BayesianGMM(Y, K=3)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    losses = model.train(Y, optimizer, 4000)
  1. priors are normally added semi-implicitly by adding KL loss terms - same way as in VAE
  2. using distribution objects or sampling in __init__ is incorrect. you also need to use rsample() for gradient flows
  3. these changes still won’t be enough for this model, as discrete distributions won’t train without implementing REINFORCE (surrogate loss) - this is something that pyro would handle with less effort