I am new to Seq2Seq and hope to find proper guidance and advice.
I am doing a project from an online course, so I cannot share the course material, but my project notebook is on GitHub.
I want to ask about my understanding about the architecture as well as the data dimension after each layer. Suppose I have a Seq2Seq model as below:
Seq2Seq( (encoder): Encoder( (embedding): Embedding(5678, 512) (lstm): LSTM(512, 512, batch_first=True) ) (decoder): Decoder( (embedding): Embedding(4297, 512) (lstm): LSTM(512, 512, batch_first=True) (fc): Linear(in_features=512, out_features=4297, bias=True) (dropout): Dropout(p=0.2, inplace=False) (softmax): LogSoftmax(dim=1) ) )
Where 5678 is source_vocab size, 512 is desired embedding size, 4297 is target_vocab size. You can check my Encoder, Decoder, Seq2Seq class as below:
# Select the compute device: CUDA when available, otherwise fall back to CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
# Uncomment to force CPU execution:
# device = "cpu"
print(device)
class Encoder(nn.Module):
    """Encoder: embeds source token indices and runs them through an LSTM,
    returning the final hidden and cell states to seed the decoder.

    Args:
        input_size: source vocabulary size.
        hidden_size: embedding dimension and LSTM hidden dimension.
    """
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        # nn.Embedding takes INDEX tensors, not one-hot vectors: it maps
        # (batch, seq_len) LongTensor -> (batch, seq_len, hidden_size).
        self.embedding = nn.Embedding(self.input_size, self.hidden_size)
        self.lstm = nn.LSTM(self.hidden_size, self.hidden_size, batch_first=True)

    def forward(self, i):
        """Encode a batch of source sequences.

        i: (batch, seq_len) LongTensor of token indices.
        Returns (h, c): final LSTM hidden and cell states, each of shape
        (num_layers=1, batch, hidden_size).
        """
        # Defect fixed: removed stray debug print(i.size()) / print(embedded.size())
        # calls that were left in the forward pass.
        embedded = self.embedding(i)
        _, (h, c) = self.lstm(embedded)
        return h, c
class Decoder(nn.Module):
    """Single-step decoder: embeds target token indices, advances the LSTM,
    and returns log-probabilities over the target vocabulary.

    Args:
        hidden_size: embedding dimension and LSTM hidden dimension.
        output_size: target vocabulary size.
    """
    def __init__(self, hidden_size, output_size):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.lstm = nn.LSTM(self.hidden_size, self.hidden_size, batch_first=True)
        self.fc = nn.Linear(self.hidden_size, self.output_size)
        self.dropout = nn.Dropout(0.2)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, i, h, c):
        """Run one decoding step.

        i: (batch, seq_len) LongTensor of token indices (seq_len is 1 when
           decoding one token at a time).
        h, c: LSTM hidden/cell states, each (1, batch, hidden_size).
        Returns (log_probs, h, c) where log_probs is (batch, output_size)
        for the last time step.
        """
        embedded = self.embedding(i)              # (batch, seq, hidden)
        o, (h, c) = self.lstm(embedded, (h, c))   # o: (batch, seq, hidden)
        # BUG FIX: the original used o[0], which selects the FIRST BATCH
        # ELEMENT (shape (seq, hidden)) and silently drops the rest of the
        # batch; it only happened to work because batch size was 1.
        # o[:, -1, :] takes the last time step for EVERY batch element and
        # is identical to o[0] in the batch=1, seq=1 case.
        o = self.fc(o[:, -1, :])                  # (batch, output_size)
        # NOTE(review): dropout applied to the output logits (after fc) is
        # unusual; it is more commonly applied to the embedding or the LSTM
        # output — confirm this placement is intended.
        o = self.dropout(o)
        o = self.softmax(o)                       # log-probs over the vocabulary
        return o, h, c
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper: encodes `src`, then decodes one step at a
    time, using teacher forcing with probability `teacher_forcing_ratio`
    while in training mode."""
    def __init__(self, encoder_input_size, encoder_hidden_size, decoder_hidden_size, decoder_output_size):
        super(Seq2Seq, self).__init__()
        self.input_size= encoder_input_size
        self.hidden_size= encoder_hidden_size
        # NOTE(review): decoder_hidden_size is accepted but never used; the
        # decoder is built with encoder_hidden_size below — confirm intended.
        self.output_size= decoder_output_size
        self.encoder= Encoder(self.input_size, self.hidden_size)
        self.decoder= Decoder(self.hidden_size, self.output_size)
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        # Per-step decoder outputs (list of log-probability tensors).
        output_seq= []
        # The encoder's final hidden/cell states seed the decoder state.
        encoder_hidden, encoder_cell= self.encoder(src)
        decoder_hidden= encoder_hidden
        decoder_cell= encoder_cell
        # First decoder input is the <SOS> token. `target_vocab` and `device`
        # are module-level globals defined elsewhere in this file.
        decoder_input= torch.Tensor([[target_vocab.token_to_index("<SOS>")]]).long().to(device)
        # NOTE(review): with batch_first data, trg.size(0) is the BATCH
        # dimension, not the time dimension — confirm trg's layout.
        # NOTE(review): outside training mode, decoder_input is never
        # advanced, so every step re-feeds <SOS> — looks like a bug; verify.
        for time_step in range(trg.size(0)):
            output_token, decoder_hidden, decoder_cell= self.decoder(
                decoder_input,
                decoder_hidden,
                decoder_cell
            )
            output_seq.append(output_token)
            if self.training:
                if random.random() < teacher_forcing_ratio:
                    # Teacher forcing: feed the ground-truth next token.
                    # NOTE(review): trg[time_step] may not have the
                    # (batch, 1) shape the decoder's embedding expects — verify.
                    decoder_input= trg[time_step]
                else:
                    # Greedy decoding: feed the model's own top prediction.
                    # NOTE(review): .squeeze() yields a 0-d tensor when batch
                    # size is 1, which breaks nn.Embedding's expected
                    # (batch, seq) input on the next step — verify.
                    _, top_index= output_token.data.topk(1)
                    decoder_input= top_index.squeeze().detach()
        return output_seq
My question is: the input size of the Encoder is the source vocabulary size — does that mean each token in the input sequence should be converted into a one-hot vector before being passed to the Encoder? (For example, should a batch have dimensions (batch_size, seq_len, vocab_size) instead of (batch_size, seq_len)?)
I searched other notebooks and saw that they just pass a batch of (batch_size, seq_len) into the Encoder, and I got confused.
Any help is appreciated.
I have tried passing (batch_size, seq_len, vocab_size), and the Embedding layer's output dimension was (batch_size, seq_len, vocab_size, embedding_dim), which confused me even more — shouldn't it be (batch_size, seq_len, embedding_dim)?