Seq2seq model (encoder and decoder input)

I decided to venture into NLP in machine learning after giving it some thought, so I am curious about how the encoder and decoder of a simple seq2seq model work. Precisely, I want to know how data is fed into the encoder and decoder, given that the input data is of shape (batch_size, input_len), the output is of shape (batch_size, output_len), the text is vectorized with each token's unique index from the vocabulary (e.g. vocab: {'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3, …}, vectorized_text('i am …'): [1, 6, 33, 4, …, 2, 0, 0, 0, 0]), and the RNN layer has batch_first=True.
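
For concreteness, here is a minimal sketch of the input format I mean (the vocab and sizes are made up):

import torch

# made-up vocabulary: special tokens first, then words
vocab = {'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3, 'i': 4, 'am': 5}

# a batch of 2 sentences, each padded to input_len = 5
SRC = torch.tensor([
    [1, 4, 5, 2, 0],   # <sos> i am <eos> <pad>
    [1, 4, 2, 0, 0],   # <sos> i <eos> <pad> <pad>
])
print(SRC.shape)       # torch.Size([2, 5]) -> (batch_size, input_len)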
Thanks to anyone who answers this.

This is a very general question, but a good starting point is PyTorch’s seq2seq tutorial.
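
The key point there is that the encoder consumes a whole batch of source sequences in a single call and hands its final hidden state to the decoder. Roughly like this (a sketch with made-up sizes, using batch_first=True rather than the tutorial's default):

import torch
import torch.nn as nn

vocab_size, embedding_dim, hidden_dim = 100, 32, 64
embedding = nn.Embedding(vocab_size, embedding_dim)
encoder_rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)

SRC = torch.randint(0, vocab_size, (8, 10))   # (batch_size, input_len)
embedded = embedding(SRC)                     # (8, 10, embedding_dim)
_, (hidden, cell) = encoder_rnn(embedded)     # hidden: (1, 8, hidden_dim)
# hidden/cell summarize every sentence in the batch at once
# and are passed to the decoder as its initial state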


Thanks for your reply. I've actually looked at that link, but I still don't really get how it was done. To me it seemed as if they iterated through each sentence in a batch rather than through each word in the sentences of a batch, and batch_first is set to False, so it's kind of confusing.

Anyway, here's the code I wrote for two instances.
Instance 1:

import torch
import torch.nn as nn
import random


r"""The encoder takes in the SRC(feature_language)
    as input as ecodes them in form of a context 
    vector and sends them to the decoder
"""
#Encodder Model
class ModelEncoder(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, num_layers, dropout):
        super(ModelEncoder, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout
        self.embedding_dim = embedding_dim

        self.Embedding_Layer = nn.Sequential(
                                    nn.Embedding(self.input_dim, self.embedding_dim),
                                    nn.Dropout(self.dropout)
                                )
        self.Recurrent_Layers = nn.LSTM(self.embedding_dim, self.hidden_dim, 
                                    num_layers=self.num_layers, dropout=self.dropout,
                                    batch_first=True
                                )

    def forward(self, input):
        output = self.Embedding_Layer(input)
        _, (hidden, cell) = self.Recurrent_Layers(output)

        return hidden, cell



r"""The Decoder takes in the context vector
    generated by the encoder as hidden and 
    cell state and takes in Target labels as input
"""
#Decoder model
class ModelDecoder(nn.Module):
    def __init__(self, output_dim, embedding_dim, hidden_dim, num_layers, dropout):
        super(ModelDecoder, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.output_dim = output_dim
        self.num_layers = num_layers
        self.dropout = dropout

        self.Embedding_Layer = nn.Sequential(
                                    nn.Embedding(self.output_dim, self.embedding_dim),
                                    nn.Dropout(self.dropout)
                                )
        self.Recurrent_Layers = nn.LSTM(self.embedding_dim, self.hidden_dim,
                                    num_layers=self.num_layers, dropout=self.dropout,
                                    batch_first=True
                                )
        self.fc_Layer = nn.Sequential(
                            nn.Linear(self.hidden_dim, self.output_dim),
                            nn.Dropout(self.dropout)
                        )

    def forward(self, input, hidden, cell):
        output = self.Embedding_Layer(input)
        output, (hidden, cell) = self.Recurrent_Layers(output, (hidden, cell))
        output = self.fc_Layer(output.squeeze(0))

        return output, hidden, cell


#Seq-2-Seq model
class translator_seq2seq(nn.Module):
    def __init__(self, encoder, decoder, device = ('cuda' if torch.cuda.is_available() else 'cpu')):
        super(translator_seq2seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    #work to be done
    def forward(self, SRC, TRG, teacher_force_ratio = 0.5):
        batch_size = TRG.shape[0]
        SRC_len = SRC.shape[1]
        TRG_len = TRG.shape[1]
        TRG_vocab_size = self.decoder.output_dim  #size of the target vocabulary, not TRG.shape[1]

        #initialize variable to hold output
        outputs = torch.zeros(batch_size, TRG_len, TRG_vocab_size)

        for batch in range(batch_size):
            #encode one token at a time; each encoder call starts from a fresh
            #(zero) LSTM state, so hidden/cell reflect only the current token
            for src_batch in range(SRC_len):
                hidden, cell = self.encoder(SRC[batch, src_batch].reshape(1, 1))
            
            #initialize input to the decoder
            input = SRC[0, 0].reshape(1, 1)

            for trg_batch in range(TRG_len):
                if trg_batch == 0:
                    #first input to the decoder == <sos>
                    output, hidden, cell = self.decoder(SRC[0, 0].reshape(1, 1), hidden, cell)
                else:
                    #for subsequent inputs to the next state
                    output, hidden, cell = self.decoder(input, hidden, cell)

                teacher_force = random.random() < teacher_force_ratio
                top_1 = output.argmax(1)
                input = TRG[batch, trg_batch] if teacher_force else top_1
                input = input.reshape(1, 1)
                outputs[batch, trg_batch] = output   
        return outputs

In this instance I iterated through each sentence in the input batch, and through each word in the sentence, to get a hidden and cell state (context vector) from the encoder. I then used the context vector as the first hidden and cell state of the decoder, used the token's unique index as the first input of the decoder, and fed the subsequent outputs of the decoder back in as the next input (based on the teacher-force ratio).
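
To sanity-check the shapes I ran it with some made-up hyperparameters, something like:

# hypothetical sizes, just to check that the shapes line up
enc = ModelEncoder(input_dim=50, embedding_dim=16, hidden_dim=32, num_layers=1, dropout=0.0)
dec = ModelDecoder(output_dim=60, embedding_dim=16, hidden_dim=32, num_layers=1, dropout=0.0)
model = translator_seq2seq(enc, dec)

SRC = torch.randint(0, 50, (4, 7))   # (batch_size, input_len)
TRG = torch.randint(0, 60, (4, 9))   # (batch_size, output_len)
outputs = model(SRC, TRG)
print(outputs.shape)                 # torch.Size([4, 9, 60])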

Instance 2:

import torch
import torch.nn as nn
import random


r"""The encoder takes in the SRC (feature language)
    as input and encodes it into a context
    vector (hidden and cell state) that is
    sent to the decoder
"""
#Encoder Model
class ModelEncoder(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, num_layers, dropout):
        super(ModelEncoder, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout
        self.embedding_dim = embedding_dim

        self.Embedding_Layer = nn.Sequential(
                                    nn.Embedding(self.input_dim, self.embedding_dim),
                                    nn.Dropout(self.dropout)
                                )
        self.Recurrent_Layers = nn.LSTM(self.embedding_dim, self.hidden_dim,
                                    num_layers=self.num_layers, dropout=self.dropout,
                                    batch_first=True
                                )

    def forward(self, input):
        output = self.Embedding_Layer(input)
        _, (hidden, cell) = self.Recurrent_Layers(output)

        return hidden, cell

r"""The Decoder takes in the context vector

    generated by the encoder as hidden and 

    cell state and takes in Target labels as input

"""

#Decoder model

class ModelDecoder(nn.Module):

    def __init__(self, output_dim, embedding_dim, hidden_dim, num_layers, dropout):

        super(ModelDecoder, self).__init__()

        self.hidden_dim = hidden_dim

        self.embedding_dim = embedding_dim

        self.output_dim = output_dim

        self.num_layers = num_layers

        self.dropout = dropout

        self.Embedding_Layer = nn.Sequential(

                                    nn.Embedding(self.output_dim, self.embedding_dim),

                                    nn.Dropout(self.dropout)

                                )

        self.Recurrent_Layers = nn.LSTM(self.embedding_dim, self.hidden_dim,

                                    num_layers=self.num_layers, dropout=self.dropout,

                                    batch_first=True

                                )

        self.fc_Layer = nn.Sequential(

                            nn.Linear(self.hidden_dim, self.output_dim),

                            nn.Dropout(self.dropout)

                        )

    def forward(self, input, hidden, cell):

        output = self.Embedding_Layer(input)

        output, (hidden, cell) = self.Recurrent_Layers(output, (hidden, cell))

        output = self.fc_Layer(output.squeeze(0))

        return output, hidden, cell

#Seq-2-Seq model
class translator_seq2seq(nn.Module):
    def __init__(self, encoder, decoder, device = ('cuda' if torch.cuda.is_available() else 'cpu')):
        super(translator_seq2seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    #work to be done
    def forward(self, SRC, TRG, teacher_force_ratio = 0.5):
        batch_size = TRG.shape[0]
        SRC_len = SRC.shape[1]
        TRG_len = TRG.shape[1]
        TRG_vocab_size = self.decoder.output_dim  #size of the target vocabulary, not TRG.shape[1]

        #initialize variable to hold output
        outputs = torch.zeros(batch_size, TRG_len, TRG_vocab_size)

        #encode each whole sentence and return the last hidden, cell state
        #(note: hidden/cell are overwritten every iteration, so only the
        #last sentence's context survives this loop)
        for src_batch in SRC:
            hidden, cell = self.encoder(src_batch.reshape(1, SRC_len))

        #use the <sos> token as the first input of the decoder
        input = SRC[0, 0].reshape(1, 1)

        for trg in range(batch_size):
            for trg_idx in range(1, TRG_len):
                output, hidden, cell = self.decoder(input, hidden, cell)

                teacher_force = random.random() < teacher_force_ratio
                top1 = output.argmax(1)
                input = TRG[trg, trg_idx] if teacher_force else top1
                input = input.reshape(1, 1)

            #only the last decoder output is stored for this sentence
            outputs[trg] = output

        return outputs

This instance is similar to instance 1, but the difference is that the context vector was generated by iterating through each sentence in the batch rather than each word in the sentence (i.e. the context vector was generated using the whole sentence as input at once).
The decoder loop is the same in both instances.
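
For clarity, a single encoder call in instance 2 looks like this (a sketch with made-up sizes, using the ModelEncoder defined above):

SRC = torch.randint(0, 50, (4, 7))    # (batch_size, SRC_len), made-up data
enc = ModelEncoder(input_dim=50, embedding_dim=16, hidden_dim=32, num_layers=1, dropout=0.0)

sentence = SRC[0].reshape(1, 7)       # one whole sentence at once, shape (1, SRC_len)
hidden, cell = enc(sentence)          # hidden: (num_layers, 1, hidden_dim)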

Now, the thing is that instance 1 is working (the loss is decreasing) while instance 2 is not (the loss is not decreasing), despite the fact that there are no runtime errors in instance 2.
The loss function I use in both instances is CrossEntropyLoss.
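
For reference, since the outputs tensor is (batch_size, TRG_len, vocab_size), CrossEntropyLoss needs the logits flattened to (N, C) and the targets to (N,); a sketch with made-up shapes:

import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss()

outputs = torch.randn(4, 9, 60)          # (batch_size, TRG_len, vocab_size)
TRG = torch.randint(0, 60, (4, 9))       # (batch_size, TRG_len)

# flatten so that logits are (N, C) and targets are (N,)
loss = criterion(outputs.reshape(-1, 60), TRG.reshape(-1))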
So my issue is this: is the implementation of instance 1 correct?
What do you think about the code? Are there some inappropriate things I did?