Is this a correct reimplementation of Seq2Seq model?

I wrote some code that modifies the seq2seq tutorial script provided by PyTorch. Here's the model:

import random

import numpy as np
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F

# sample_gumbel, use_cuda and gpu are defined elsewhere in the script

class Seq2Seq(nn.Module):
    def __init__(self, encoder, batch_size, vocab_size, input_size, output_size, hidden_dim, embedding_dim, n_layers=2, dropout_p=0.5):
        super(Seq2Seq, self).__init__()

        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.input_length = input_size
        self.output_length = output_size
        self.vocab_size = vocab_size

        self.encoder = encoder
        self.dropout = nn.Dropout(dropout_p)
        self.selu = nn.SELU()
        self.decoder_embeddings = nn.Embedding(vocab_size, hidden_dim)
        self.decoder_gru = nn.GRU(hidden_dim, hidden_dim)
        self.out = nn.Linear(hidden_dim, vocab_size)
        self.softmax = nn.LogSoftmax()

    def decode(self, SOS_token, encoder_hidden, target_output, teacher_forcing_ratio=0.8):
        decoder_output_full = autograd.Variable(torch.zeros(self.output_length, self.batch_size, self.vocab_size))
        decoder_output_full = decoder_output_full.cuda() if use_cuda else decoder_output_full
        target = target_output.permute(1,0)

        use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

        for idx in range(self.output_length):
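            # on the first step, feed the SOS token and take the initial hidden state from the encoder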
            if idx == 0:
                decoder_input = SOS_token
                decoder_hidden = encoder_hidden.unsqueeze(0)
            output = self.decoder_embeddings(decoder_input).view(1, self.batch_size, -1)
            output = self.dropout(output)

            output = self.selu(output)

            if use_teacher_forcing:
                decoder_output, decoder_hidden = self.decoder_gru(output, decoder_hidden)
                temp = 1
                out = self.out(decoder_output[0])
                out = out + sample_gumbel(out.shape)
                decoder_output = F.softmax(out / temp, dim=1)
                # decoder_output = (self.decoder_embeddings.weight * decoder_output.unsqueeze(1)).sum(0).view(1, 1, -1)
                decoder_output_full[idx, :, :] = decoder_output
                decoder_input = target[idx-1]  # Teacher forcing

            else:
                decoder_output, decoder_hidden = self.decoder_gru(output, decoder_hidden)
                temp = 1
                out = self.out(decoder_output[0])
                out = out + sample_gumbel(out.shape)
                decoder_output = F.softmax(out / temp, dim=1)
                # decoder_output = (self.decoder_embeddings.weight * decoder_output.unsqueeze(1)).sum(0).view(1, 1, -1)
                topv, topi = decoder_output.data.topk(1)
                # print topi
                ni = topi
                # decoder_input_v = autograd.Variable(torch.LongTensor([[ni]]))
                decoder_input = autograd.Variable(ni)
                # decoder_input = decoder_input.cuda() if use_cuda else decoder_input
                # print decoder_input
                decoder_output_full[idx, :, :] = decoder_output

        decoder_output_full = decoder_output_full.permute(1,0,2)

        # gen_output = self.softmax(self.out(decoder_output_full))

        return decoder_output_full

    def forward(self, input, target_output, teacher_forcing_ratio=0.8):
        encoder_feat, _ = self.encoder(input)

        SOS_token = np.zeros((self.batch_size,1), dtype=np.int32)
        SOS_token = torch.LongTensor(SOS_token.tolist())
        SOS_token = autograd.Variable(SOS_token)
        if use_cuda:
            SOS_token = SOS_token.cuda(gpu)

        gen_output = self.decode(SOS_token, encoder_feat, target_output, teacher_forcing_ratio)

        return gen_output

    def initHidden(self):
        result = autograd.Variable(torch.zeros(1, self.batch_size, self.hidden_dim))
        if use_cuda:
            return result.cuda()
        else:
            return result

The way I calculate the NLL loss is to generate the whole output sequence first and then compare it with the target output. Here's the loss function:

class batchNLLLoss(nn.Module):
    def __init__(self):
        super(batchNLLLoss, self).__init__()

    def forward(self, synt, target, claim_length=20):
        loss_fn = nn.NLLLoss()

        loss = 0

        for i in range(synt.shape[0]):
            for j in range(claim_length):
                loss += loss_fn(synt[i][j].unsqueeze(0), target[i][j])

        return loss

The current problem is that the loss value is really small and it seems like the network learns nothing (the output is the same word repeated over and over). Any thoughts on this? Thanks in advance!

I have no real experience with word models. Nevertheless, here are a couple of thoughts.

I can’t see why you need a custom batchNLLLoss. If you pass tensors of the right shape to NLLLoss then it should calculate the loss over an entire batch.
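For example, here is a minimal sketch of that idea, assuming synt holds log-probabilities of shape (batch, seq_len, vocab_size) (NLLLoss expects log-probabilities) and target holds token indices of shape (batch, seq_len); flattening both lets a single NLLLoss call cover the whole batch:

import torch
import torch.nn as nn

loss_fn = nn.NLLLoss()

# toy shapes; in the post these would come from the model and the data loader
batch_size, seq_len, vocab_size = 4, 20, 1000
synt = torch.randn(batch_size, seq_len, vocab_size).log_softmax(dim=-1)  # log-probabilities
target = torch.randint(0, vocab_size, (batch_size, seq_len))             # token indices

# flatten to (batch * seq_len, vocab_size) and (batch * seq_len,) so one call covers the whole batch
loss = loss_fn(synt.view(-1, vocab_size), target.view(-1))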

If the loss is really small, then the gradients will probably all be really small too, and you could try increasing the learning rate to counterbalance that and speed up learning.
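For example (a sketch only; the optimizer isn't shown in the post, so Adam and the model variable name are assumptions):

import torch.optim as optim

# bump the learning rate, e.g. from 1e-4 to 1e-3, if the gradients are tiny
optimizer = optim.Adam(model.parameters(), lr=1e-3)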