Trying to implement compositional code learning

Hi all,

I am trying to implement the paper "Compressing Word Embeddings via Deep Compositional Code Learning" (https://arxiv.org/pdf/1711.01068.pdf). It may be a bit odd to ask such a specific implementation question, but I am stuck and not sure where to go. This is what I have so far:

import torch
import torch.nn as nn
import torch.nn.functional as F


class Encoder(nn.Module):
    def __init__(self, args):
        super(Encoder, self).__init__()
        self.E = 300   # embedding dimension
        self.K = 2     # codes per codebook
        self.M = 256   # number of codebooks

        # E -> MK/2 -> MK, as in the paper's encoder
        self.encode = nn.Sequential(
            nn.Linear(self.E, (self.K * self.M) // 2),
            nn.Tanh(),
            nn.Linear((self.K * self.M) // 2, self.K * self.M),
            nn.Softplus()
        )

    def forward(self, x):
        encoded = self.encode(x)                 # (batch, M*K), non-negative
        a = encoded.reshape(-1, self.M, self.K)  # one K-way distribution per codebook
        a = self._gumbel_softmax(a)

        return a

    def _gumbel_softmax(self, x, tau=1.0, eps=1e-10):
        # logits = log(alpha); perturb with Gumbel noise, soften with temperature tau
        x = torch.log(x + eps)
        g = self._sample_gumbel(x)
        x = (x + g) / tau
        x = F.softmax(x, dim=2)

        return x

    def _sample_gumbel(self, x, eps=1e-10):
        # Gumbel(0, 1) noise on the same device/dtype as x
        # (rand_like replaces the old Variable-based version, which is deprecated)
        u = torch.rand_like(x)

        return -torch.log(-torch.log(u + eps) + eps)
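
(As an aside: recent PyTorch versions ship a built-in Gumbel-softmax, so I believe the two helper methods could be replaced with something like the snippet below; treating dim=2 as the per-codebook axis is my assumption here.)

    # alternative forward body using the built-in (recent PyTorch)
    encoded = self.encode(x)
    logits = torch.log(encoded + 1e-10).reshape(-1, self.M, self.K)
    a = F.gumbel_softmax(logits, tau=1.0, hard=False, dim=2)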


class Decoder(nn.Module):
    def __init__(self, args):
        super(Decoder, self).__init__()
        self.E = 300
        self.K = 2
        self.M = 256
        # Codebook matrix: the M codebooks of shape (K, E) stacked into (M*K, E).
        # Note: torch.FloatTensor(...) leaves the weights uninitialized (arbitrary
        # memory, possibly huge values), which alone can make the loss diverge,
        # so initialize explicitly. nn.Parameter already sets requires_grad=True.
        self.A = nn.Parameter(torch.empty(self.M * self.K, self.E))
        nn.init.xavier_uniform_(self.A)

    def forward(self, y):
        # Flatten the (M, K) code distributions; a single matmul then sums the
        # selected codebook vectors, i.e. sum_m A_m^T d_m from the paper
        y = y.reshape(-1, self.M * self.K)
        reconstruction = torch.matmul(y, self.A)

        return reconstruction
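
For reference, a quick shape sanity check with dummy inputs (batch of 8 vectors, dimensions as above; args is unused so I pass None):

    enc, dec = Encoder(None), Decoder(None)
    x = torch.randn(8, 300)   # stand-in for pretrained embeddings
    codes = enc(x)            # (8, 256, 2), each row sums to 1 over dim 2
    x_hat = dec(codes)        # (8, 300) reconstruction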

The code runs, but the reconstruction error only increases over time, so I am wondering whether I made a mistake somewhere in the encoder-decoder architecture.
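
For context, my training setup is essentially a minimal sketch like the one below (using the classes and imports above); `embeddings` is a placeholder name for a (V, 300) tensor of pretrained vectors, and the squared-distance loss with Adam at lr 1e-4 follows the paper:

    encoder, decoder = Encoder(None), Decoder(None)
    params = list(encoder.parameters()) + list(decoder.parameters())
    opt = torch.optim.Adam(params, lr=1e-4)

    for step in range(10000):
        idx = torch.randint(0, embeddings.size(0), (128,))  # random minibatch
        x = embeddings[idx]
        loss = F.mse_loss(decoder(encoder(x)), x)  # reconstruction error
        opt.zero_grad()
        loss.backward()
        opt.step()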