Expected hidden[0] size (2, 1, 1024), got [2, 100, 1024] runtime error in Seq2Seq model

I want to implement a seq2seq model which is learning to generate text (source and target sequences are the same). Some parts of my code are shown below:

hyperparameters:

#Training hyperparameters
num_epochs = 1
learning_rate = 0.001
batch_size = 64

#Model hyperparameters
load_model = False
save_model = False
save_best_model = True
vocab_size = voc.count #===> input_size
output_size = voc.count
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 2
enc_dropout_p = 0.5
dec_dropout_p = 0.5
teacher_force_ratio = 0.5
max_length = 50

Encoder, Decoder, and Seq2Seq models:

class Encoder(nn.Module):
  def __init__(self, input_size, embedding_size, hidden_size, num_layers, dropout_p):
    super(Encoder, self).__init__()
    self.hidden_size = hidden_size #LSTM output size
    self.num_layers = num_layers

    self.dropout = nn.Dropout(dropout_p)
    self.embedding = nn.Embedding(input_size, embedding_size)
    self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout = dropout_p) 
   
  
  def forward(self, x):
    embedding = self.dropout(self.embedding(x))
    output, (hidden, cell) = self.rnn(embedding) 
    return hidden, cell


class Decoder(nn.Module):
  def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, dropout_p): 
    super(Decoder, self).__init__()
    self.hidden_size = hidden_size #LSTM output size
    self.num_layers = num_layers

    self.dropout = nn.Dropout(dropout_p)
    self.embedding = nn.Embedding(input_size, embedding_size)
    self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout = dropout_p)
    self.fc = nn.Linear(hidden_size, output_size) 



  def forward(self, x, hidden, cell): #x : (batch_size)
    
    x = x.unsqueeze(0)
   
    embedding = self.dropout(self.embedding(x)) 
    
    outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell)) 
    
    predictions = self.fc(outputs)
    predictions = predictions.squeeze(0) #we predict one word at each time

    return predictions, hidden, cell


class Seq2Seq(nn.Module):
  def __init__(self, encoder,decoder):
    super(Seq2Seq, self).__init__()

    self.encoder = encoder
    self.decoder = decoder

  def forward(self, source, target, target_vocab_size, teacher_force_ratio = 0.5):
    target_len = target.shape[0]
    batch_size = source.shape[1]
    outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(DEVICE)
    hidden, cell = self.encoder(source)
    x = target[0]
    for t in range(1, target_len):
      output, hidden, cell = self.decoder(x, hidden, cell)
      outputs[t] = output
      best_guess = output.argmax(1) 
      x = target[t] if random.random() < teacher_force_ratio else best_guess
    return outputs

Declaration of model:

encoder_net = Encoder(vocab_size, encoder_embedding_size, hidden_size, num_layers, enc_dropout_p).to(DEVICE)
decoder_net = Decoder(vocab_size, decoder_embedding_size, hidden_size, output_size, num_layers, dec_dropout_p).to(DEVICE)
model = Seq2Seq(encoder_net, decoder_net).to(DEVICE)
criterion = nn.NLLLoss() 
optimizer = optim.SGD(model.parameters(), lr = learning_rate) 

Dataloader:

train_iterator = torch.utils.data.DataLoader(train_ds, batch_size=64, drop_last=True) 
test_iterator = torch.utils.data.DataLoader(test_ds, batch_size=64, drop_last=True) 

Train model:

print('Training starts...')
best_loss = float('inf')

for epoch in range(1 , 1 + num_epochs):
  print(f'Epoch {epoch} / {num_epochs}')

  sum_loss = 0
  total = len(train_iterator)

  if save_model:
    checkpoint = {'state_dic' : model.state_dict(), 'optimizer' : optimizer.state_dict()}
    save_checkpoint(checkpoint)
  
  model.train()
  for i, (train_inputs, train_labels) in tqdm(enumerate(train_iterator), desc='Training ', total=len(train_iterator)):
    
    input = train_inputs.to(DEVICE)
    target = input

    output = model(input, target, vocab_size, teacher_force_ratio)

    output = output[1:].reshape(-1, output.shape[2])
    target = target[1:].reshape(-1)

    optimizer.zero_grad()

  # #calculate the loss from a predicted sentence with the expected result
  #  for ot in range(num_iter):
  #      loss += criterion(output[ot], target_tensor[ot])

    loss = criterion(output, target)
    loss.backward()#(retain_graph=True)
    loss = loss.data.item()

    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
    optimizer.step()

    writer.add_scalar('Training loss', loss, global_step = step) 
    step += 1
    
    # loss= loss.data.item()
    sum_loss += loss
    # print("loss = ", loss)
    

  epoch_loss = sum_loss/total
  print(f"LOSS: {epoch_loss:.5f}\n")

Test model:

def translate(model):
  model.eval()
  translated_sentences = []
  for i, (test_inputs, test_labels) in tqdm(enumerate(test_iterator), desc='Test ', total=len(test_iterator)):
    input = test_inputs.to(DEVICE)
    target = input

    # Build encoder hidden, cell state
    with torch.no_grad():
          hidden, cell = model.encoder(input)
          print(hidden.shape)


    outputs = [voc.word2index[SOS]]
    output = model(input, target, vocab_size, teacher_force_ratio)


    for _ in range(max_length):
      previous_word = torch.LongTensor([outputs[-1]]).to(DEVICE)
      with torch.no_grad():
        output, hidden, cell = model.decoder(previous_word, hidden, cell)
        best_guess = output.argmax(1).item()

      outputs.append(best_guess)

      if output.argmax(1).item() == voc.word2index[EOS]:
          break

    translated_sentence = [voc.index2word[idx] for idx in outputs]
    # remove start token
    translated_sentences.append(translated_sentence[1:])
    
    return translated_sentences

translate(model)

but a runtime error occurs during the translate method:

RuntimeError                              Traceback (most recent call last)

<ipython-input-14-dbe83c5e4640> in <module>()
----> 1 translate(model)

6 frames

<ipython-input-13-cfd4f97b53f5> in translate(model)
     24       previous_word = torch.LongTensor([outputs[-1]]).to(DEVICE)
     25       with torch.no_grad():
---> 26         output, hidden, cell = model.decoder(previous_word, hidden, cell)
     27         best_guess = output.argmax(1).item()
     28 

/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1100         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1101                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102             return forward_call(*input, **kwargs)
   1103         # Do not call functions when jit is used
   1104         full_backward_hooks, non_full_backward_hooks = [], []

<ipython-input-9-8c3f838beb9d> in forward(self, x, hidden, cell)
     38     embedding = self.dropout(self.embedding(x)) 
     39 
---> 40     outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell)) 
     42     predictions = self.fc(outputs) 

/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
   1100         if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1101                 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1102             return forward_call(*input, **kwargs)
   1103         # Do not call functions when jit is used
   1104         full_backward_hooks, non_full_backward_hooks = [], []

/usr/local/lib/python3.7/dist-packages/torch/nn/modules/rnn.py in forward(self, input, hx)
    687             hx = self.permute_hidden(hx, sorted_indices)
    688 
--> 689         self.check_forward_args(input, hx, batch_sizes)
    690         if batch_sizes is None:
    691             result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,

/usr/local/lib/python3.7/dist-packages/torch/nn/modules/rnn.py in check_forward_args(self, input, hidden, batch_sizes)
    632         self.check_input(input, batch_sizes)
    633         self.check_hidden_size(hidden[0], self.get_expected_hidden_size(input, batch_sizes),
--> 634                                'Expected hidden[0] size {}, got {}')
    635         self.check_hidden_size(hidden[1], self.get_expected_cell_size(input, batch_sizes),
    636                                'Expected hidden[1] size {}, got {}')

/usr/local/lib/python3.7/dist-packages/torch/nn/modules/rnn.py in check_hidden_size(self, hx, expected_hidden_size, msg)
    224                           msg: str = 'Expected hidden size {}, got {}') -> None:
    225         if hx.size() != expected_hidden_size:
--> 226             raise RuntimeError(msg.format(expected_hidden_size, list(hx.size())))
    227 
    228     def check_forward_args(self, input: Tensor, hidden: Tensor, batch_sizes: Optional[Tensor]):

RuntimeError: Expected hidden[0] size (2, 1, 1024), got [2, 100, 1024]

What’s the problem? did I do sth wrong in my codes and methods? How can I fix this error?

The error seems to be raised in the nn.LSTM layer.
The docs describe the expected inputs as:

input: tensor of shape (L,Hin) for unbatched input, (L,N,Hin) when batch_first=False or (N,L,Hin) when batch_first=True containing the features of the input sequence…

h_0: tensor of shape (D∗num_layers,Hout) for unbatched input or (D∗num_layers,N,Hout) containing the initial hidden state for each element in the input sequence…

c_0: tensor of shape (D∗num_layers,Hcell) for unbatched input or (D∗num_layers,N,Hcell) containing the initial cell state for each element in the input sequence.

Based on your error message it seems the batch size doesn’t match between the input and hidden state of the nn.LSTM layer.

Have you got any solution to this ? I am getting the same error.

i get same error as mentioned above, and I read that you get the same error, could you solve it? if yes then can you please tell me how?

No, I have not solved it yet!

outputs will contain the last hidden states (“last” with respect to number of layers) for all time steps, see here:

I assume that your sequence length is 100, so you would see the shape (2, 100, 1024) in the error message

I’m pretty sure you only want the last hidden state (“last” w.r.t. to the number of layers) of the last layer. In your encoder code this would the hidden[-1] (this does no longer work with bidirectional=True!). So the line should be:

predictions = self.fc(hidden[-1])