I am trying to replace GRU’s with LSTM’s in an Encoder-Decoder architecture and it results in an error. I don’t understand why the error is there. **Could someone shed some light on what is going on here?**

Here’s the code which works, along with its outputs. The code pertains to Exercise 5, Chapter 9.7 of the book Dive into Deep Learning:

```
#@save
class Seq2SeqEncoder(d2l.Encoder):
"""The RNN encoder for sequence to sequence learning."""
def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
dropout=0, **kwargs):
super(Seq2SeqEncoder, self).__init__(**kwargs)
# Embedding layer
self.embedding = nn.Embedding(vocab_size, embed_size)
self.rnn = nn.GRU(embed_size, num_hiddens, num_layers,
dropout=dropout)
def forward(self, X, *args):
# The output `X` shape: (`batch_size`, `num_steps`, `embed_size`)
X = self.embedding(X)
# In RNN models, the first axis corresponds to time steps
X = X.permute(1, 0, 2)
# When state is not mentioned, it defaults to zeros
output, state = self.rnn(X)
# `output` shape: (`num_steps`, `batch_size`, `num_hiddens`)
# `state` shape: (`num_layers`, `batch_size`, `num_hiddens`)
return output, state
```

```
encoder = Seq2SeqEncoder(vocab_size=10, embed_size=8, num_hiddens=16,
num_layers=2)
encoder.eval()
X = torch.zeros((4, 7), dtype=torch.long)
output, state = encoder(X)
output.shape
```

`torch.Size([7, 4, 16])`

`state.shape`

`torch.Size([2, 4, 16])`

```
class Seq2SeqDecoder(d2l.Decoder):
"""The RNN decoder for sequence to sequence learning."""
def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
dropout=0, **kwargs):
super(Seq2SeqDecoder, self).__init__(**kwargs)
self.embedding = nn.Embedding(vocab_size, embed_size)
self.rnn = nn.GRU(embed_size + num_hiddens, num_hiddens, num_layers,
dropout=dropout)
self.dense = nn.Linear(num_hiddens, vocab_size)
def init_state(self, enc_outputs, *args):
return enc_outputs[1]
def forward(self, X, state):
# The output `X` shape: (`num_steps`, `batch_size`, `embed_size`)
X = self.embedding(X).permute(1, 0, 2)
print("X.shape")
print(X.shape)
# Broadcast `context` so it has the same `num_steps` as `X`
context = state[-1].repeat(X.shape[0], 1, 1)
X_and_context = torch.cat((X, context), 2)
output, state = self.rnn(X_and_context, state)
output = self.dense(output).permute(1, 0, 2)
# `output` shape: (`batch_size`, `num_steps`, `vocab_size`)
# `state` shape: (`num_layers`, `batch_size`, `num_hiddens`)
return output, state
```

```
decoder = Seq2SeqDecoder(vocab_size=10, embed_size=8, num_hiddens=16,
num_layers=2)
decoder.eval()
state = decoder.init_state(encoder(X))
output, state = decoder(X, state)
output.shape, state.shape
```

```
X.shape
torch.Size([7, 4, 8])
(torch.Size([4, 7, 10]), torch.Size([2, 4, 16]))
```

Now, here is my code. Notice how I just replaced `nn.GRU`

with `nn.LSTM`

:

```
#@save
class Seq2SeqEncoder(d2l.Encoder):
"""The RNN encoder for sequence to sequence learning."""
def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
dropout=0, **kwargs):
super(Seq2SeqEncoder, self).__init__(**kwargs)
# Embedding layer
self.embedding = nn.Embedding(vocab_size, embed_size)
self.rnn = nn.LSTM(embed_size, num_hiddens, num_layers, # not GRU, but LSTM
dropout=dropout)
def forward(self, X, *args):
# The output `X` shape: (`batch_size`, `num_steps`, `embed_size`)
X = self.embedding(X)
# In RNN models, the first axis corresponds to time steps
X = X.permute(1, 0, 2)
# When state is not mentioned, it defaults to zeros
output, state = self.rnn(X)
# `output` shape: (`num_steps`, `batch_size`, `num_hiddens`)
# `state` shape: (`num_layers`, `batch_size`, `num_hiddens`)
return output, state
```

```
encoder = Seq2SeqEncoder(vocab_size=10, embed_size=8, num_hiddens=16,
num_layers=2)
encoder.eval()
X = torch.zeros((4, 7), dtype=torch.long)
output, state = encoder(X)
output.shape
```

`torch.Size([7, 4, 16])`

Now in the next code cell you can already see some issues:

`state.shape`

```
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
/tmp/ipykernel_83249/2608527015.py in <module>
----> 1 state.shape
AttributeError: 'tuple' object has no attribute 'shape'
```

Here’s my decoder class:

```
class Seq2SeqDecoder(d2l.Decoder):
"""The RNN decoder for sequence to sequence learning."""
def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
dropout=0, **kwargs):
super(Seq2SeqDecoder, self).__init__(**kwargs)
self.embedding = nn.Embedding(vocab_size, embed_size)
self.rnn = nn.LSTM(embed_size + num_hiddens, num_hiddens, num_layers, # not GRU, but LSTM
dropout=dropout)
self.dense = nn.Linear(num_hiddens, vocab_size)
def init_state(self, enc_outputs, *args):
return enc_outputs[1]
def forward(self, X, state):
# The output `X` shape: (`num_steps`, `batch_size`, `embed_size`)
X = self.embedding(X).permute(1, 0, 2)
print("X.shape:")
print(X.shape)
# Broadcast `context` so it has the same `num_steps` as `X`
context = state[-1].repeat(X.shape[0], 1, 1)
#print("state[-1].shape:")
#print(state[-1].shape)
#context = state[-1]
X_and_context = torch.cat((X, context), 2)
output, state = self.rnn(X_and_context, state)
output = self.dense(output).permute(1, 0, 2)
# `output` shape: (`batch_size`, `num_steps`, `vocab_size`)
# `state` shape: (`num_layers`, `batch_size`, `num_hiddens`)
return output, state
```

```
decoder = Seq2SeqDecoder(vocab_size=10, embed_size=8, num_hiddens=16,
num_layers=2)
decoder.eval()
state = decoder.init_state(encoder(X))
output, state = decoder(X, state)
output.shape, state.shape
```

```
X.shape:
torch.Size([7, 4, 8])
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
/tmp/ipykernel_83249/3682229435.py in <module>
3 decoder.eval()
4 state = decoder.init_state(encoder(X))
----> 5 output, state = decoder(X, state)
6 output.shape, state.shape
~/anaconda3/envs/d2l/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
1052 # Do not call functions when jit is used
1053 full_backward_hooks, non_full_backward_hooks = [], []
/tmp/ipykernel_83249/4080674284.py in forward(self, X, state)
22 #print(state[-1].shape)
23 #context = state[-1]
---> 24 X_and_context = torch.cat((X, context), 2)
25 output, state = self.rnn(X_and_context, state)
26 output = self.dense(output).permute(1, 0, 2)
RuntimeError: torch.cat(): Sizes of tensors must match except in dimension 2. Got 7 and 14 in dimension 0 (The offending index is 1)
```

As you can see, I get a `RuntimeError`

.

**What is going on here?** Whenever I was modfying some code by replacing nn.GRU with nn.LSTM, the integration was seamless. I thought that nn.GRU and nn.LSTM differed only in their internal implementation and that their inputs and outptus were the same.

Thank you in advance!