I decided to venture into NLP in machine learning after giving it some thought, so I'm curious about how the encoder and decoder of a simple seq2seq model work, and specifically how data is fed into them. My input data has shape (batch_size, input_len) and my output has shape (batch_size, output_len). The text is vectorized, with each token replaced by its unique index in the vocabulary, e.g. vocab: {'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3, ...}, vectorized_text('i am ...'): [1, 6, 33, 4, ..., 2, 0, 0, 0, 0], and the RNN layers have batch_first=True.
Thanks to anyone who answers this.
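For example, this is the kind of layout I mean (the tokens and indices here are made up):

import torch

# a tiny made-up vocabulary with the usual special tokens
vocab = {'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3, 'i': 4, 'am': 5}
sentence = [vocab['<sos>'], vocab['i'], vocab['am'], vocab['<eos>'], vocab['<pad>'], vocab['<pad>']]
SRC = torch.tensor([sentence])   # shape: (batch_size=1, input_len=6)
print(SRC.shape)                 # torch.Size([1, 6])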
This is a very general question, but a good starting point is PyTorch’s seq2seq tutorial.
Thanks for your reply. I've actually looked at that link, but I still don't really get how it was done. To me it seems as if they iterate through each sentence in a batch rather than through each word in the sentences of a batch, and batch_first is set to False, so it's kind of confusing.
Anyway, here's the code I wrote for two instances.
Instance 1:
import torch
import torch.nn as nn
import random
r"""The encoder takes in the SRC(feature_language)
as input as ecodes them in form of a context
vector and sends them to the decoder
"""
#Encodder Model
class ModelEncoder(nn.Module):
def __init__(self, input_dim, embedding_dim, hidden_dim, num_layers, dropout):
super(ModelEncoder, self).__init__()
self.input_dim = input_dim
self.hidden_dim = hidden_dim
self.num_layers = num_layers
self.dropout = dropout
self.embedding_dim = embedding_dim
self.Embedding_Layer = nn.Sequential(
nn.Embedding(self.input_dim, self.embedding_dim),
nn.Dropout(self.dropout)
)
self.Recurrent_Layers = nn.LSTM(self.embedding_dim, self.hidden_dim,
num_layers=self.num_layers, dropout=self.dropout,
batch_first=True
)
def forward(self, input):
output = self.Embedding_Layer(input)
_, (hidden, cell) = self.Recurrent_Layers(output)
return hidden, cell
r"""The Decoder takes in the context vector
generated by the encoder as hidden and
cell state and takes in Target labels as input
"""
#Decoder model
class ModelDecoder(nn.Module):
def __init__(self, output_dim, embedding_dim, hidden_dim, num_layers, dropout):
super(ModelDecoder, self).__init__()
self.hidden_dim = hidden_dim
self.embedding_dim = embedding_dim
self.output_dim = output_dim
self.num_layers = num_layers
self.dropout = dropout
self.Embedding_Layer = nn.Sequential(
nn.Embedding(self.output_dim, self.embedding_dim),
nn.Dropout(self.dropout)
)
self.Recurrent_Layers = nn.LSTM(self.embedding_dim, self.hidden_dim,
num_layers=self.num_layers, dropout=self.dropout,
batch_first=True
)
self.fc_Layer = nn.Sequential(
nn.Linear(self.hidden_dim, self.output_dim),
nn.Dropout(self.dropout)
)
def forward(self, input, hidden, cell):
output = self.Embedding_Layer(input)
output, (hidden, cell) = self.Recurrent_Layers(output, (hidden, cell))
output = self.fc_Layer(output.squeeze(0))
return output, hidden, cell
# Seq2Seq model
class translator_seq2seq(nn.Module):
    def __init__(self, encoder, decoder, device=('cuda' if torch.cuda.is_available() else 'cpu')):
        super(translator_seq2seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, SRC, TRG, teacher_force_ratio=0.5):
        batch_size = TRG.shape[0]
        SRC_len = SRC.shape[1]
        TRG_len = TRG.shape[1]
        # size of the target vocabulary (taken from the decoder)
        TRG_vocab_size = self.decoder.output_dim
        # initialize a tensor to hold the decoder outputs
        outputs = torch.zeros(batch_size, TRG_len, TRG_vocab_size)
        for batch in range(batch_size):
            # feed the encoder one word at a time; keep the last hidden/cell state
            for src_batch in range(SRC_len):
                hidden, cell = self.encoder(SRC[batch, src_batch].reshape(1, 1))
            # initialize the input to the decoder
            input = SRC[0, 0].reshape(1, 1)
            for trg_batch in range(TRG_len):
                if trg_batch == 0:
                    # first input to the decoder == <sos>
                    output, hidden, cell = self.decoder(SRC[0, 0].reshape(1, 1), hidden, cell)
                else:
                    # subsequent steps use the previous prediction (or the target, with teacher forcing)
                    output, hidden, cell = self.decoder(input, hidden, cell)
                teacher_force = random.random() < teacher_force_ratio
                top_1 = output.argmax(1)
                input = TRG[batch, trg_batch] if teacher_force else top_1
                input = input.reshape(1, 1)
                outputs[batch, trg_batch] = output
        return outputs
In this instance I iterated through each sentence in the input batch, and through each word in that sentence, to get a hidden and cell state (the context vectors) from the encoder. I then used those context vectors as the initial hidden and cell state of the decoder, used the <sos> token's index as the decoder's first input, and fed each subsequent decoder output back in as the next input (depending on the teacher forcing ratio).
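To make the shapes concrete, this is roughly how I'd instantiate and call instance 1 (all the sizes here are made up):

import torch

SRC_VOCAB, TRG_VOCAB = 100, 120
enc = ModelEncoder(input_dim=SRC_VOCAB, embedding_dim=32, hidden_dim=64, num_layers=1, dropout=0.0)
dec = ModelDecoder(output_dim=TRG_VOCAB, embedding_dim=32, hidden_dim=64, num_layers=1, dropout=0.0)
model = translator_seq2seq(enc, dec)

SRC = torch.randint(0, SRC_VOCAB, (8, 10))   # (batch_size, input_len)
TRG = torch.randint(0, TRG_VOCAB, (8, 12))   # (batch_size, output_len)
out = model(SRC, TRG)
print(out.shape)                             # should be torch.Size([8, 12, 120])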
Instance 2:
(The imports, ModelEncoder, and ModelDecoder are exactly the same as in instance 1, so I'm not repeating them here.)
# Seq2Seq model
class translator_seq2seq(nn.Module):
    def __init__(self, encoder, decoder, device=('cuda' if torch.cuda.is_available() else 'cpu')):
        super(translator_seq2seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, SRC, TRG, teacher_force_ratio=0.5):
        batch_size = TRG.shape[0]
        SRC_len = SRC.shape[1]
        TRG_len = TRG.shape[1]
        # size of the target vocabulary (taken from the decoder)
        TRG_vocab_size = self.decoder.output_dim
        # initialize a tensor to hold the decoder outputs
        outputs = torch.zeros(batch_size, TRG_len, TRG_vocab_size)
        # encode each sentence with a single call; keep the last hidden/cell state
        for src_batch in SRC:
            hidden, cell = self.encoder(src_batch.reshape(1, SRC_len))
        # use the <sos> token as the first input to the decoder
        input = SRC[0, 0].reshape(1, 1)
        for trg in range(batch_size):
            for trg_idx in range(1, TRG_len):
                output, hidden, cell = self.decoder(input, hidden, cell)
                teacher_force = random.random() < teacher_force_ratio
                top1 = output.argmax(1)
                input = TRG[trg, trg_idx] if teacher_force else top1
                input = input.reshape(1, 1)
                outputs[trg] = output
        return outputs
This instance is similar to instance 1, but here the context vector is generated by iterating through each sentence in the batch rather than through each word in the sentence (i.e. the context vector is produced from the whole sentence in a single encoder call).
The decoder is used in exactly the same way in both instances.
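For reference, this is a rough, self-contained sketch of the fully batched flow I was trying to imitate in instance 2 (plain nn.Embedding/nn.LSTM modules stand in for the classes above, and all sizes are made up):

import torch
import torch.nn as nn
import random

batch_size, src_len, trg_len = 8, 10, 12
src_vocab, trg_vocab, emb_dim, hid_dim = 100, 120, 32, 64

src_emb = nn.Embedding(src_vocab, emb_dim)
trg_emb = nn.Embedding(trg_vocab, emb_dim)
enc_rnn = nn.LSTM(emb_dim, hid_dim, batch_first=True)
dec_rnn = nn.LSTM(emb_dim, hid_dim, batch_first=True)
fc = nn.Linear(hid_dim, trg_vocab)

SRC = torch.randint(0, src_vocab, (batch_size, src_len))
TRG = torch.randint(0, trg_vocab, (batch_size, trg_len))

# encode every sentence in the batch with a single call; keep only the final states
_, (hidden, cell) = enc_rnn(src_emb(SRC))

outputs = torch.zeros(batch_size, trg_len, trg_vocab)
input = TRG[:, 0].unsqueeze(1)                  # (batch_size, 1): the <sos> column
for t in range(1, trg_len):
    step_out, (hidden, cell) = dec_rnn(trg_emb(input), (hidden, cell))
    logits = fc(step_out.squeeze(1))            # (batch_size, trg_vocab)
    outputs[:, t] = logits
    teacher_force = random.random() < 0.5
    top1 = logits.argmax(1)                     # (batch_size,)
    input = (TRG[:, t] if teacher_force else top1).unsqueeze(1)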
Now the thing is that instance 1 is working (the loss is decreasing), while instance 2 is not (the loss is not decreasing), even though instance 2 raises no runtime errors.
The loss function I use in both instances is CrossEntropyLoss.
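I haven't included my training loop here, but the way I understand it, CrossEntropyLoss expects the logits flattened to (batch_size * TRG_len, vocab_size) and the targets flattened to (batch_size * TRG_len,), roughly like this (assuming 0 is the <pad> index):

import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss(ignore_index=0)   # ignore padding positions (assumed <pad> = 0)
outputs = torch.randn(8, 12, 120)                 # dummy logits standing in for the seq2seq output
TRG = torch.randint(0, 120, (8, 12))              # dummy targets of shape (batch_size, output_len)
loss = criterion(outputs.reshape(-1, outputs.shape[-1]), TRG.reshape(-1))
print(loss.item())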
So my questions are: is the implementation of instance 1 correct?
And what do you think of the code in general? Are there things I did that are inappropriate?