I want to implement a seq2seq model which is learning to generate text (source and target sequences are the same). Some parts of my code are shown below:
hyperparameters:
#Training hyperparameters
num_epochs = 1
learning_rate = 0.001
batch_size = 64
#Model hyperparameters
load_model = False
save_model = False
save_best_model = True
vocab_size = voc.count #===> input_size
output_size = voc.count
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 2
enc_dropout_p = 0.5
dec_dropout_p = 0.5
teacher_force_ratio = 0.5
max_length = 50
Encoder, Decoder, and Seq2Seq models:
class Encoder(nn.Module):
def __init__(self, input_size, embedding_size, hidden_size, num_layers, dropout_p):
super(Encoder, self).__init__()
self.hidden_size = hidden_size #LSTM output size
self.num_layers = num_layers
self.dropout = nn.Dropout(dropout_p)
self.embedding = nn.Embedding(input_size, embedding_size)
self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout = dropout_p)
def forward(self, x):
embedding = self.dropout(self.embedding(x))
output, (hidden, cell) = self.rnn(embedding)
return hidden, cell
class Decoder(nn.Module):
def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, dropout_p):
super(Decoder, self).__init__()
self.hidden_size = hidden_size #LSTM output size
self.num_layers = num_layers
self.dropout = nn.Dropout(dropout_p)
self.embedding = nn.Embedding(input_size, embedding_size)
self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout = dropout_p)
self.fc = nn.Linear(hidden_size, output_size)
def forward(self, x, hidden, cell): #x : (batch_size)
x = x.unsqueeze(0)
embedding = self.dropout(self.embedding(x))
outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell))
predictions = self.fc(outputs)
predictions = predictions.squeeze(0) #we predict one word at each time
return predictions, hidden, cell
class Seq2Seq(nn.Module):
def __init__(self, encoder,decoder):
super(Seq2Seq, self).__init__()
self.encoder = encoder
self.decoder = decoder
def forward(self, source, target, target_vocab_size, teacher_force_ratio = 0.5):
target_len = target.shape[0]
batch_size = source.shape[1]
outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(DEVICE)
hidden, cell = self.encoder(source)
x = target[0]
for t in range(1, target_len):
output, hidden, cell = self.decoder(x, hidden, cell)
outputs[t] = output
best_guess = output.argmax(1)
x = target[t] if random.random() < teacher_force_ratio else best_guess
return outputs
Declaration of model:
encoder_net = Encoder(vocab_size, encoder_embedding_size, hidden_size, num_layers, enc_dropout_p).to(DEVICE)
decoder_net = Decoder(vocab_size, decoder_embedding_size, hidden_size, output_size, num_layers, dec_dropout_p).to(DEVICE)
model = Seq2Seq(encoder_net, decoder_net).to(DEVICE)
criterion = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr = learning_rate)
Dataloader:
train_iterator = torch.utils.data.DataLoader(train_ds, batch_size=64, drop_last=True)
test_iterator = torch.utils.data.DataLoader(test_ds, batch_size=64, drop_last=True)
Train model:
print('Training starts...')
best_loss = float('inf')
for epoch in range(1 , 1 + num_epochs):
print(f'Epoch {epoch} / {num_epochs}')
sum_loss = 0
total = len(train_iterator)
if save_model:
checkpoint = {'state_dic' : model.state_dict(), 'optimizer' : optimizer.state_dict()}
save_checkpoint(checkpoint)
model.train()
for i, (train_inputs, train_labels) in tqdm(enumerate(train_iterator), desc='Training ', total=len(train_iterator)):
input = train_inputs.to(DEVICE)
target = input
output = model(input, target, vocab_size, teacher_force_ratio)
output = output[1:].reshape(-1, output.shape[2])
target = target[1:].reshape(-1)
optimizer.zero_grad()
# #calculate the loss from a predicted sentence with the expected result
# for ot in range(num_iter):
# loss += criterion(output[ot], target_tensor[ot])
loss = criterion(output, target)
loss.backward()#(retain_graph=True)
loss = loss.data.item()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
optimizer.step()
writer.add_scalar('Training loss', loss, global_step = step)
step += 1
# loss= loss.data.item()
sum_loss += loss
# print("loss = ", loss)
epoch_loss = sum_loss/total
print(f"LOSS: {epoch_loss:.5f}\n")
Test model:
def translate(model):
model.eval()
translated_sentences = []
for i, (test_inputs, test_labels) in tqdm(enumerate(test_iterator), desc='Test ', total=len(test_iterator)):
input = test_inputs.to(DEVICE)
target = input
# Build encoder hidden, cell state
with torch.no_grad():
hidden, cell = model.encoder(input)
print(hidden.shape)
outputs = [voc.word2index[SOS]]
output = model(input, target, vocab_size, teacher_force_ratio)
for _ in range(max_length):
previous_word = torch.LongTensor([outputs[-1]]).to(DEVICE)
with torch.no_grad():
output, hidden, cell = model.decoder(previous_word, hidden, cell)
best_guess = output.argmax(1).item()
outputs.append(best_guess)
if output.argmax(1).item() == voc.word2index[EOS]:
break
translated_sentence = [voc.index2word[idx] for idx in outputs]
# remove start token
translated_sentences.append(translated_sentence[1:])
return translated_sentences
translate(model)
but a runtime error occurs during the translate method:
RuntimeError Traceback (most recent call last)
<ipython-input-14-dbe83c5e4640> in <module>()
----> 1 translate(model)
6 frames
<ipython-input-13-cfd4f97b53f5> in translate(model)
24 previous_word = torch.LongTensor([outputs[-1]]).to(DEVICE)
25 with torch.no_grad():
---> 26 output, hidden, cell = model.decoder(previous_word, hidden, cell)
27 best_guess = output.argmax(1).item()
28
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1100 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1101 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102 return forward_call(*input, **kwargs)
1103 # Do not call functions when jit is used
1104 full_backward_hooks, non_full_backward_hooks = [], []
<ipython-input-9-8c3f838beb9d> in forward(self, x, hidden, cell)
38 embedding = self.dropout(self.embedding(x))
39
---> 40 outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell))
42 predictions = self.fc(outputs)
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1100 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1101 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102 return forward_call(*input, **kwargs)
1103 # Do not call functions when jit is used
1104 full_backward_hooks, non_full_backward_hooks = [], []
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/rnn.py in forward(self, input, hx)
687 hx = self.permute_hidden(hx, sorted_indices)
688
--> 689 self.check_forward_args(input, hx, batch_sizes)
690 if batch_sizes is None:
691 result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/rnn.py in check_forward_args(self, input, hidden, batch_sizes)
632 self.check_input(input, batch_sizes)
633 self.check_hidden_size(hidden[0], self.get_expected_hidden_size(input, batch_sizes),
--> 634 'Expected hidden[0] size {}, got {}')
635 self.check_hidden_size(hidden[1], self.get_expected_cell_size(input, batch_sizes),
636 'Expected hidden[1] size {}, got {}')
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/rnn.py in check_hidden_size(self, hx, expected_hidden_size, msg)
224 msg: str = 'Expected hidden size {}, got {}') -> None:
225 if hx.size() != expected_hidden_size:
--> 226 raise RuntimeError(msg.format(expected_hidden_size, list(hx.size())))
227
228 def check_forward_args(self, input: Tensor, hidden: Tensor, batch_sizes: Optional[Tensor]):
RuntimeError: Expected hidden[0] size (2, 1, 1024), got [2, 100, 1024]
Whatâs the problem? did I do sth wrong in my codes and methods? How can I fix this error?