Hi all,
I am currently trying to implement the paper on compressing word embeddings via deep compositional code learning (https://arxiv.org/pdf/1711.01068.pdf). It may be a bit odd to ask such a specific question about implementing an architecture, but I am stuck and not sure where to go from here. This is what I have so far:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Encoder(nn.Module):
    def __init__(self, args):
        super(Encoder, self).__init__()
        self.E = 300   # embedding dimension
        self.K = 2     # codewords per codebook
        self.M = 256   # number of codebooks
        self.encode = nn.Sequential(
            nn.Linear(self.E, (self.K * self.M) // 2),
            nn.Tanh(),
            nn.Linear((self.K * self.M) // 2, self.K * self.M),
            nn.Softplus()
        )

    def forward(self, x):
        encoded = self.encode(x)
        # one row of K logits per codebook: (batch, M, K)
        a = encoded.reshape(-1, self.M, self.K)
        a = self._gumbel_softmax(a)
        return a

    def _gumbel_softmax(self, x, eps=1e-10):
        # the Softplus output is positive, so its log gives the logits
        x = torch.log(x + eps)
        g = self._sample_gumbel(x.size(0))
        x = x + g
        # softmax over the K codewords of each codebook
        x = F.softmax(x, dim=2)
        return x

    def _sample_gumbel(self, batch_size, eps=1e-10):
        # Gumbel(0, 1) noise: -log(-log(u)), u ~ Uniform(0, 1)
        u = torch.zeros(batch_size, self.M, self.K).uniform_()
        return -torch.log(-torch.log(u + eps) + eps)
class Decoder(nn.Module):
    def __init__(self, args):
        super(Decoder, self).__init__()
        self.E = 300
        self.K = 2
        self.M = 256
        # codebook matrix of shape (M * K, E); initialise it explicitly,
        # since torch.FloatTensor(...) alone leaves the values uninitialised
        self.A = nn.Parameter(torch.randn(self.M * self.K, self.E) * 0.01)

    def forward(self, y):
        # flatten the (batch, M, K) code weights and mix the corresponding codewords
        y = y.reshape(-1, self.M * self.K)
        reconstruction = torch.matmul(y, self.A)
        return reconstruction
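One thing I am unsure about is my hand-rolled Gumbel-softmax. As far as I can tell, the same sampling step could also be done with PyTorch's built-in F.gumbel_softmax, roughly like the sketch below (sample_codes is just an illustrative helper, and tau=1.0 and dim=2 are my own choices to match the reshape above):

import torch
import torch.nn.functional as F

def sample_codes(encoded, M=256, K=2, eps=1e-10):
    # encoded: (batch, M * K) positive outputs of the Softplus layer
    logits = torch.log(encoded.reshape(-1, M, K) + eps)
    # relaxed one-hot samples over the K codewords of each codebook
    return F.gumbel_softmax(logits, tau=1.0, hard=False, dim=2)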
The code runs, but over time the reconstruction error only increases, so I am wondering whether I made any mistakes in the encoder-decoder architecture.
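In case the problem is outside these two modules, this is roughly the kind of training loop I have in mind (heavily simplified; the Adam learning rate, batch size, and plain MSE reconstruction loss are placeholders rather than my exact setup, and embeddings stands for the pretrained (vocab_size, 300) word vectors):

import torch
import torch.nn as nn

encoder, decoder = Encoder(args), Decoder(args)    # args as in the classes above
params = list(encoder.parameters()) + list(decoder.parameters())
optimizer = torch.optim.Adam(params, lr=1e-3)      # placeholder learning rate
criterion = nn.MSELoss()

for epoch in range(10):
    for batch in embeddings.split(128):            # mini-batches of word vectors
        optimizer.zero_grad()
        codes = encoder(batch)                     # (batch, M, K) relaxed one-hot codes
        reconstruction = decoder(codes)            # (batch, 300)
        loss = criterion(reconstruction, batch)
        loss.backward()
        optimizer.step()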