Hello PyTorch developers,
I am trying to replace GRUs with LSTMs in an encoder-decoder architecture, but doing so results in an error that I don't understand. Could someone shed some light on what is going on here?
Here’s the code which works, along with its outputs. The code pertains to Exercise 5, Chapter 9.7 of the book Dive into Deep Learning:
# GRU-based encoder: embeds token indices and feeds them, time-major, to an nn.GRU.
class Seq2SeqEncoder(d2l.Encoder):
"""The RNN encoder for sequence to sequence learning."""
def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
dropout=0, **kwargs):
super(Seq2SeqEncoder, self).__init__(**kwargs)
# Embedding layer: maps token indices to `embed_size`-dimensional vectors
self.embedding = nn.Embedding(vocab_size, embed_size)
# NOTE(review): the call below is cut off in this paste -- its remaining
# arguments and closing parenthesis are missing (presumably
# `dropout=dropout)`; confirm against the original notebook).
self.rnn = nn.GRU(embed_size, num_hiddens, num_layers,
def forward(self, X, *args):
# After embedding, `X` has shape (`batch_size`, `num_steps`, `embed_size`)
X = self.embedding(X)
# The RNN expects the time axis first, so swap the batch and time axes:
# `X` becomes (`num_steps`, `batch_size`, `embed_size`)
X = X.permute(1, 0, 2)
# No initial state is passed, so the GRU starts from an all-zero state
output, state = self.rnn(X)
# `output` shape: (`num_steps`, `batch_size`, `num_hiddens`)
# `state` shape: (`num_layers`, `batch_size`, `num_hiddens`) -- for nn.GRU
# this is a single tensor holding the final hidden state of every layer
return output, state
encoder = Seq2SeqEncoder(vocab_size=10, embed_size=8, num_hiddens=16,
X = torch.zeros((4, 7), dtype=torch.long)
output, state = encoder(X)
torch.Size([7, 4, 16])
torch.Size([2, 4, 16])
# GRU-based decoder: conditions every input step on the encoder's final
# top-layer hidden state and projects the GRU outputs onto the vocabulary.
class Seq2SeqDecoder(d2l.Decoder):
"""The RNN decoder for sequence to sequence learning."""
def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
dropout=0, **kwargs):
super(Seq2SeqDecoder, self).__init__(**kwargs)
self.embedding = nn.Embedding(vocab_size, embed_size)
# The GRU input is the token embedding concatenated with the context
# vector, hence the input size `embed_size + num_hiddens`.
# NOTE(review): the call below is cut off in this paste -- its remaining
# arguments and closing parenthesis are missing (presumably
# `dropout=dropout)`; confirm against the original notebook).
self.rnn = nn.GRU(embed_size + num_hiddens, num_hiddens, num_layers,
self.dense = nn.Linear(num_hiddens, vocab_size)
def init_state(self, enc_outputs, *args):
# `enc_outputs` is the encoder's `(output, state)` pair; the decoder
# starts from the encoder's final state
return enc_outputs[1]
def forward(self, X, state):
# After embedding and permuting, `X` has shape
# (`num_steps`, `batch_size`, `embed_size`)
X = self.embedding(X).permute(1, 0, 2)
# `state[-1]` is the last layer's hidden state, shape
# (`batch_size`, `num_hiddens`); repeat it along a new leading axis so
# `context` has the same `num_steps` as `X`
context = state[-1].repeat(X.shape[0], 1, 1)
# Concatenate along the feature axis:
# (`num_steps`, `batch_size`, `embed_size + num_hiddens`)
X_and_context = torch.cat((X, context), 2)
output, state = self.rnn(X_and_context, state)
# Project onto the vocabulary and move the batch axis back to the front
output = self.dense(output).permute(1, 0, 2)
# `output` shape: (`batch_size`, `num_steps`, `vocab_size`)
# `state` shape: (`num_layers`, `batch_size`, `num_hiddens`)
return output, state
decoder = Seq2SeqDecoder(vocab_size=10, embed_size=8, num_hiddens=16,
state = decoder.init_state(encoder(X))
output, state = decoder(X, state)
output.shape, state.shape
torch.Size([7, 4, 8])
(torch.Size([4, 7, 10]), torch.Size([2, 4, 16]))
Now, here is my code. Notice that the only change is replacing nn.GRU with nn.LSTM:
# LSTM variant of the encoder: identical to the GRU version except for the
# recurrent layer, which changes the type of the returned `state`.
class Seq2SeqEncoder(d2l.Encoder):
"""The RNN encoder for sequence to sequence learning."""
def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
dropout=0, **kwargs):
super(Seq2SeqEncoder, self).__init__(**kwargs)
# Embedding layer: maps token indices to `embed_size`-dimensional vectors
self.embedding = nn.Embedding(vocab_size, embed_size)
# NOTE(review): the call below is cut off in this paste -- its remaining
# arguments and closing parenthesis are missing (presumably
# `dropout=dropout)`; confirm against the original notebook).
self.rnn = nn.LSTM(embed_size, num_hiddens, num_layers, # not GRU, but LSTM
def forward(self, X, *args):
# After embedding, `X` has shape (`batch_size`, `num_steps`, `embed_size`)
X = self.embedding(X)
# The RNN expects the time axis first, so swap the batch and time axes:
# `X` becomes (`num_steps`, `batch_size`, `embed_size`)
X = X.permute(1, 0, 2)
# No initial state is passed, so the LSTM starts from all-zero states
output, state = self.rnn(X)
# `output` shape: (`num_steps`, `batch_size`, `num_hiddens`)
# Unlike nn.GRU, nn.LSTM returns `state` as a tuple `(h_n, c_n)` of the
# final hidden and cell states, each of shape
# (`num_layers`, `batch_size`, `num_hiddens`) -- so `state.shape` does
# not exist; use `state[0].shape` / `state[1].shape` instead
return output, state
encoder = Seq2SeqEncoder(vocab_size=10, embed_size=8, num_hiddens=16,
X = torch.zeros((4, 7), dtype=torch.long)
output, state = encoder(X)
torch.Size([7, 4, 16])
In the next code cell, evaluating state.shape already reveals a problem:
AttributeError Traceback (most recent call last)
/tmp/ipykernel_83249/2608527015.py in <module>
----> 1 state.shape
AttributeError: 'tuple' object has no attribute 'shape'
Here’s my decoder class:
# LSTM variant of the decoder: identical to the GRU version except for the
# recurrent layer. The `state` handling below still assumes a single tensor.
class Seq2SeqDecoder(d2l.Decoder):
"""The RNN decoder for sequence to sequence learning."""
def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
dropout=0, **kwargs):
super(Seq2SeqDecoder, self).__init__(**kwargs)
self.embedding = nn.Embedding(vocab_size, embed_size)
# The LSTM input is the token embedding concatenated with the context
# vector, hence the input size `embed_size + num_hiddens`.
# NOTE(review): the call below is cut off in this paste -- its remaining
# arguments and closing parenthesis are missing (presumably
# `dropout=dropout)`; confirm against the original notebook).
self.rnn = nn.LSTM(embed_size + num_hiddens, num_hiddens, num_layers, # not GRU, but LSTM
self.dense = nn.Linear(num_hiddens, vocab_size)
def init_state(self, enc_outputs, *args):
# `enc_outputs` is the encoder's `(output, state)` pair; with an nn.LSTM
# encoder, `state` is itself a tuple `(h_n, c_n)`
return enc_outputs[1]
def forward(self, X, state):
# After embedding and permuting, `X` has shape
# (`num_steps`, `batch_size`, `embed_size`)
X = self.embedding(X).permute(1, 0, 2)
# Broadcast `context` so it has the same `num_steps` as `X`
# NOTE(review): when `state` is the `(h_n, c_n)` tuple produced by nn.LSTM,
# `state[-1]` selects `c_n` for ALL layers, shape
# (`num_layers`, `batch_size`, `num_hiddens`), rather than the last
# layer's hidden state. `repeat` then yields `num_steps * num_layers`
# rows, so the `torch.cat` below fails on dimension 0 (7 vs 14 in the
# example run). Indexing the hidden state first (`state[0][-1]`) would
# give a (`batch_size`, `num_hiddens`) context as in the GRU version.
context = state[-1].repeat(X.shape[0], 1, 1)
#context = state[-1]
# Concatenate along the feature axis; requires `context` to have shape
# (`num_steps`, `batch_size`, `num_hiddens`)
X_and_context = torch.cat((X, context), 2)
output, state = self.rnn(X_and_context, state)
# Project onto the vocabulary and move the batch axis back to the front
output = self.dense(output).permute(1, 0, 2)
# `output` shape: (`batch_size`, `num_steps`, `vocab_size`)
# `state`: for nn.LSTM, a tuple `(h_n, c_n)`, each of shape
# (`num_layers`, `batch_size`, `num_hiddens`)
return output, state
decoder = Seq2SeqDecoder(vocab_size=10, embed_size=8, num_hiddens=16,
state = decoder.init_state(encoder(X))
output, state = decoder(X, state)
output.shape, state.shape
torch.Size([7, 4, 8])
RuntimeError Traceback (most recent call last)
/tmp/ipykernel_83249/3682229435.py in <module>
3 decoder.eval()
4 state = decoder.init_state(encoder(X))
----> 5 output, state = decoder(X, state)
6 output.shape, state.shape
~/anaconda3/envs/d2l/lib/python3.8/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1049 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1050 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1051 return forward_call(*input, **kwargs)
1052 # Do not call functions when jit is used
1053 full_backward_hooks, non_full_backward_hooks = [], []
/tmp/ipykernel_83249/4080674284.py in forward(self, X, state)
22 #print(state[-1].shape)
23 #context = state[-1]
---> 24 X_and_context = torch.cat((X, context), 2)
25 output, state = self.rnn(X_and_context, state)
26 output = self.dense(output).permute(1, 0, 2)
RuntimeError: torch.cat(): Sizes of tensors must match except in dimension 2. Got 7 and 14 in dimension 0 (The offending index is 1)
As you can see, I get a RuntimeError.
What is going on here? Whenever I modified code by replacing nn.GRU with nn.LSTM in the past, the integration was seamless. I thought that nn.GRU and nn.LSTM differed only in their internal implementation and that their inputs and outputs were the same.
Thank you in advance!