torch.optim.Adam([params]).step() not updating Variables

Hi all – I know that this question has been asked several times on these forums before, but it seems to me that I am not making any of the same mistakes as the previous askers. I am doing some relatively simple computations: I initialize some Variables, define a loss function over them, and then initialize an optimizer and attempt to update those Variables with respect to the loss function. However, the parameters come out of the “optimization” completely unchanged from their initial values.
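For reference, the basic pattern I am expecting to work looks something like this toy sketch (not my actual code, just to illustrate what I mean by “updating”):

import torch

w = torch.randn(3, requires_grad=True)   # leaf tensor to be optimized
optimizer = torch.optim.Adam([w], lr=0.1)

for _ in range(100):
    optimizer.zero_grad()
    loss = (w ** 2).sum()   # loss built directly from w, so gradients flow back to it
    loss.backward()
    optimizer.step()        # w is updated in place here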

Can anyone see what I’m doing incorrectly? My code is here; the Variables under “VARIATIONAL PARAMETERS” are the ones that I want to be updating:

import numpy as np  # for np.inf used in the clamps below
import torch
from torch.distributions import *
from torch.autograd import Variable

data1 = MultivariateNormal(-5 * torch.ones(2), torch.eye(2)).sample([100])
data2 = MultivariateNormal(5 * torch.ones(2), torch.eye(2)).sample([100])
data3 = MultivariateNormal(torch.zeros(2), torch.eye(2)).sample([100])
data = torch.cat((data1, data2, data3))

N = data.shape[0]
X = torch.tensor(data, dtype=torch.float)

def mix_weights(beta): 
    weights = [beta[t] * torch.prod(1. - beta[:t], dim=0) for t in range(beta.shape[0])]
    weights += [1. - sum(weights)]
    return weights

### HYPERPARAMETERS ###
alpha = 1.2
T = 3
n_iter = 2000
num_samples = 100
lr = 0.01

### PRIORS ###
p_beta = Beta(1, alpha)
p_mu = MultivariateNormal(torch.zeros(2), torch.eye(2))
p_zeta = Categorical(torch.tensor(mix_weights(p_beta.rsample([T-1]))))

### VARIATIONAL PARAMETERS ###
kappa = Variable(Uniform(0, 2).rsample([T-1]), requires_grad=True) 
tau = Variable(MultivariateNormal(torch.zeros(2), 10 * torch.eye(2)).rsample([T]), requires_grad=True) 
phi = Variable(Dirichlet(1/T * torch.ones(T)).rsample([N]), requires_grad=True) 

def elbo(kappa, tau, phi):
  ### VARIATIONAL FACTORS ###
  q_beta = Beta(torch.ones(T-1), kappa) # T-1 batch size Beta dist (one for each variational factor)
  q_mu = MultivariateNormal(tau, torch.stack([torch.eye(2) for _ in range(T)])) # T batch size Normal dist (one for each variational factor)
  q_zeta = Categorical(phi) # N batch size Categorical dist (of length T)

  z_mc = q_zeta.sample([num_samples]) # num_samples x N 

  mu_mc = q_mu.sample([num_samples]) # num_samples x T x 2
  centers_mc = torch.zeros(num_samples, N, 2) # num samples x N x 2
  
  for s in range(num_samples):
    for n in range(N):
      centers_mc[s, n, :] = mu_mc[s, z_mc[s, n], :]

  px_mc = [MultivariateNormal(centers_mc[:, n, :], torch.stack([torch.eye(2) for _ in range(num_samples)])) for n in range(N)]


  log_probs = torch.zeros(num_samples, N)

  for n in range(N):
    log_probs[:, n] = px_mc[n].log_prob(X[n])
  
  mean_log_prob = torch.mean(log_probs, dim=0)
  sum_mean_log_prob = torch.sum(mean_log_prob)

  pq_pairs = [[q_beta, p_beta], [q_mu, p_mu], [q_zeta, p_zeta]]
  kl_qp = sum([sum(kl_divergence(q, p)) for q, p in pq_pairs])
  elbo_loss = Variable(kl_qp - sum_mean_log_prob, requires_grad=True)
  return elbo_loss

optimizer = torch.optim.Adam([kappa, tau, phi], lr=lr)

print(tau)
print(phi)
print(kappa)

for i in range(n_iter):
    optimizer.zero_grad()
    loss = elbo(kappa, tau, phi)
    loss.backward(retain_graph=True)
    optimizer.step()
    with torch.no_grad():
      kappa = kappa.clamp(0, np.inf)
      tau = tau.clamp(-10, 10)   
      phi = phi.clamp(0, np.inf)
      phi = phi / torch.sum(phi, dim=1).view(N, 1)

print(tau)
print(phi)
print(kappa)

The print statements before and after the optimization loop print identical values.

BTW, Variable has been deprecated since PyTorch 0.4.
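These days you can create the leaf tensors directly; for example, a sketch of the same initializations without Variable:

# instead of Variable(Uniform(0, 2).rsample([T-1]), requires_grad=True)
kappa = Uniform(0, 2).sample([T-1]).requires_grad_()
tau = MultivariateNormal(torch.zeros(2), 10 * torch.eye(2)).sample([T]).requires_grad_()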
Besides, I think that this line:

elbo_loss = Variable(kl_qp - sum_mean_log_prob, requires_grad=True)

This line breaks the computational graph, since you are re-initializing the loss as a brand-new Variable from the already-computed value of kl_qp - sum_mean_log_prob, rather than returning that expression itself.
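If that is the issue, a minimal sketch of the fix (keeping the rest of elbo as it is) would be to return the expression directly, since the subtraction already produces a tensor with a grad_fn:

# inside elbo(), instead of wrapping the result in a new Variable:
elbo_loss = kl_qp - sum_mean_log_prob   # stays attached to the graph through kl_qp
return elbo_loss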