In the first repo there is a jupyter notebook containing several variations of dropout including this:

```
class VariationalDropout(nn.Module):
def __init__(self, alpha=1.0, dim=None):
super(VariationalDropout, self).__init__()
self.dim = dim
self.max_alpha = alpha
# Initial alpha
log_alpha = (torch.ones(dim) * alpha).log()
self.log_alpha = nn.Parameter(log_alpha)
def kl(self):
c1 = 1.16145124
c2 = -1.50204118
c3 = 0.58629921
alpha = self.log_alpha.exp()
negative_kl = 0.5 * self.log_alpha + c1 * alpha + c2 * alpha**2 + c3 * alpha**3
kl = -negative_kl
return kl.mean()
def forward(self, x):
"""
Sample noise e ~ N(1, alpha)
Multiply noise h = h_ * e
"""
if self.train():
# N(0,1)
epsilon = Variable(torch.randn(x.size()))
if x.is_cuda:
epsilon = epsilon.cuda()
# Clip alpha
self.log_alpha.data = torch.clamp(self.log_alpha.data, max=self.max_alpha)
alpha = self.log_alpha.exp()
# N(1, alpha)
epsilon = epsilon * alpha
return x * epsilon
else:
return x
```

And the second repo contains this implementation:

```
class VariationalDropout(nn.Module):
def __init__(self, input_size, out_size, log_sigma2=-10, threshold=3):
"""
:param input_size: An int of input size
:param log_sigma2: Initial value of log sigma ^ 2.
It is crusial for training since it determines initial value of alpha
:param threshold: Value for thresholding of validation. If log_alpha > threshold, then weight is zeroed
:param out_size: An int of output size
"""
super(VariationalDropout, self).__init__()
self.input_size = input_size
self.out_size = out_size
self.theta = Parameter(t.FloatTensor(input_size, out_size))
self.bias = Parameter(t.Tensor(out_size))
self.log_sigma2 = Parameter(t.FloatTensor(input_size, out_size).fill_(log_sigma2))
self.reset_parameters()
self.k = [0.63576, 1.87320, 1.48695]
self.threshold = threshold
def reset_parameters(self):
stdv = 1. / math.sqrt(self.out_size)
self.theta.data.uniform_(-stdv, stdv)
self.bias.data.uniform_(-stdv, stdv)
@staticmethod
def clip(input, to=8):
input = input.masked_fill(input < -to, -to)
input = input.masked_fill(input > to, to)
return input
def kld(self, log_alpha):
first_term = self.k[0] * F.sigmoid(self.k[1] + self.k[2] * log_alpha)
second_term = 0.5 * t.log(1 + t.exp(-log_alpha))
return -(first_term - second_term - self.k[0]).sum() / (self.input_size * self.out_size)
def forward(self, input):
"""
:param input: An float tensor with shape of [batch_size, input_size]
:return: An float tensor with shape of [batch_size, out_size] and negative layer-kld estimation
"""
log_alpha = self.clip(self.log_sigma2 - t.log(self.theta ** 2))
kld = self.kld(log_alpha)
if not self.training:
mask = log_alpha > self.threshold
return t.addmm(self.bias, input, self.theta.masked_fill(mask, 0))
mu = t.mm(input, self.theta)
std = t.sqrt(t.mm(input ** 2, self.log_sigma2.exp()) + 1e-6)
eps = Variable(t.randn(*mu.size()))
if input.is_cuda:
eps = eps.cuda()
return std * eps + mu + self.bias, kld
def max_alpha(self):
log_alpha = self.log_sigma2 - self.theta ** 2
return t.max(log_alpha.exp())
```

From scrolling through the paper and skimming the equations, it looks like they are both fine although I might also misread something.