Warning for Dropout LSTM model (nlayers = 2)

I’m trying to deploy a simple LSTM network, but I get an error whenever I set nlayers = 2.

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class RNN_ENCODER(nn.Module):
    """Caption encoder: embedding -> dropout -> (multi-layer) LSTM or GRU.

    ``forward`` returns per-word features taken from the top recurrent layer
    and one sentence feature per caption taken from the final hidden state of
    that same layer.

    Args:
        ntoken: size of the dictionary (number of embedding rows).
        ninput: size of each embedding vector.
        drop_prob: probability of an element to be zeroed; used both for the
            embedding dropout and for the dropout between recurrent layers.
        nhidden: total number of hidden features; it is split evenly between
            the two directions when ``bidirectional`` is true.
        nlayers: number of recurrent layers.
        bidirectional: whether the recurrent network runs in both directions.
        rnn_type: ``'LSTM'`` (default) or ``'GRU'``.

    Raises:
        NotImplementedError: if ``rnn_type`` is neither ``'LSTM'`` nor ``'GRU'``.
    """

    def __init__(self, ntoken, ninput=300, drop_prob=0.5,
                 nhidden=128, nlayers=2, bidirectional=False,
                 rnn_type='LSTM'):
        super(RNN_ENCODER, self).__init__()
        self.n_steps = 10  # stored only; not read anywhere in this class
        self.ntoken = ntoken  # size of the dictionary
        self.ninput = ninput  # size of each embedding vector
        self.drop_prob = drop_prob  # probability of an element to be zeroed
        self.nlayers = nlayers  # number of recurrent layers
        self.bidirectional = bidirectional
        self.rnn_type = rnn_type
        self.num_directions = 2 if bidirectional else 1
        # number of features in the hidden state, per direction
        self.nhidden = nhidden // self.num_directions

        self.define_module()
        self.init_weights()

    def define_module(self):
        """Create the embedding, dropout and recurrent sub-modules."""
        self.encoder = nn.Embedding(self.ntoken, self.ninput)
        self.drop = nn.Dropout(self.drop_prob)
        if self.rnn_type == 'LSTM':
            rnn_cls = nn.LSTM
        elif self.rnn_type == 'GRU':
            rnn_cls = nn.GRU
        else:
            raise NotImplementedError(
                'Unsupported rnn_type: %r' % (self.rnn_type,))
        # dropout: if non-zero, introduces a dropout layer on the outputs of
        # each RNN layer except the last one. With nlayers == 1 PyTorch emits
        # a UserWarning about a non-zero dropout value.
        self.rnn = rnn_cls(self.ninput, self.nhidden,
                           self.nlayers, batch_first=True,
                           dropout=self.drop_prob,
                           bidirectional=self.bidirectional)

    def init_weights(self):
        """Initialise the embedding weights uniformly in [-0.1, 0.1].

        The RNN parameters keep the initialisation PyTorch gives them.
        """
        initrange = 0.1
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)

    def init_hidden(self, bsz):
        """Return all-zero initial state(s) for a batch of ``bsz`` captions.

        Each state has size ``(nlayers * num_directions, bsz, nhidden)`` and
        the dtype/device of the module's first parameter. An ``(h0, c0)``
        tuple is returned for an LSTM, a single tensor otherwise.
        """
        weight = next(self.parameters())
        shape = (self.nlayers * self.num_directions, bsz, self.nhidden)
        if self.rnn_type == 'LSTM':
            return (weight.new_zeros(shape), weight.new_zeros(shape))
        return weight.new_zeros(shape)

    def forward(self, captions, cap_lens, hidden, mask=None):
        """Encode a padded batch of captions.

        Args:
            captions: LongTensor of token ids, batch x n_steps.
            cap_lens: caption lengths, sorted in decreasing order as
                ``pack_padded_sequence`` expects.
            hidden: initial state as produced by ``init_hidden``: an
                ``(h0, c0)`` tuple for an LSTM, a single tensor for a GRU,
                each of size (nlayers * num_directions, batch, nhidden).
            mask: accepted for interface compatibility; not used.

        Returns:
            words_emb: batch x (nhidden * num_directions) x seq_len.
            sent_emb: batch x (nhidden * num_directions).
        """
        # batch x n_steps --> batch x n_steps x ninput
        emb = self.drop(self.encoder(captions))
        cap_lens = cap_lens.data
        # PackedSequence, so the RNN stops at each caption's true length
        emb = pack_padded_sequence(emb, cap_lens, batch_first=True)
        output, hidden = self.rnn(emb, hidden)
        # PackedSequence --> batch x seq_len x (nhidden * num_directions)
        output = pad_packed_sequence(output, batch_first=True)[0]
        # --> batch x (nhidden * num_directions) x seq_len
        words_emb = output.transpose(1, 2)

        # h_n: (nlayers * num_directions, batch, nhidden), ordered layer by
        # layer. Keep only the last layer; reshaping every layer would yield
        # batch * nlayers rows instead of one sentence vector per caption.
        h_n = hidden[0] if self.rnn_type == 'LSTM' else hidden
        last_layer = h_n[-self.num_directions:]
        sent_emb = last_layer.transpose(0, 1).contiguous()
        # --> batch x (num_directions * nhidden)
        sent_emb = sent_emb.view(-1, self.nhidden * self.num_directions)
        return words_emb, sent_emb


# Build the encoder with a 27297-token vocabulary; every other argument keeps
# its default, so nlayers=2 and the LSTM is unidirectional.
model = RNN_ENCODER(27297)
# Random example inputs for tracing: 48 captions of 15 token ids each.
captions = torch.empty(48, 15, dtype=torch.long).random_(27297)
# One random length per caption, sorted in descending order.
cap_lens = torch.sort(torch.empty(48, dtype=torch.long).random_(1, 15), descending=True)[0]
# (h0, c0) pair whose leading dimension is 1. The reported RuntimeError says
# the 2-layer LSTM expects hidden[0] of size (2, 48, 128), so this shape only
# fits nlayers=1; model.init_hidden(48) builds states of the expected size.
hidden = (torch.randn(1, 48, 128), torch.randn(1, 48, 128))


# Trace forward() with the example inputs (trace checking disabled).
traced_script_module = torch.jit.trace(model, (captions, cap_lens, hidden), check_trace=False)

# Serialise the traced module to disk.
traced_script_module.save("lstm.pt")
Traceback (most recent call last):
  File "lstm.py", line 98, in <module>
    traced_script_module = torch.jit.trace(model, (captions, cap_lens, hidden), check_trace=False)

  File "/Users/user/anaconda2/lib/python2.7/site-packages/torch/jit/__init__.py", line 565, in trace
    module._create_method_from_trace('forward', func, example_inputs)

  File "/Users/user/anaconda2/lib/python2.7/site-packages/torch/nn/modules/module.py", line 475, in __call__
    result = self._slow_forward(*input, **kwargs)

  File "/Users/user/anaconda2/lib/python2.7/site-packages/torch/nn/modules/module.py", line 465, in _slow_forward
    result = self.forward(*input, **kwargs)

  File "lstm.py", line 76, in forward
    output, hidden = self.rnn(emb, hidden)

  File "/Users/user/anaconda2/lib/python2.7/site-packages/torch/nn/modules/module.py", line 475, in __call__
    result = self._slow_forward(*input, **kwargs)

  File "/Users/user/anaconda2/lib/python2.7/site-packages/torch/nn/modules/module.py", line 465, in _slow_forward
    result = self.forward(*input, **kwargs)

  File "/Users/user/anaconda2/lib/python2.7/site-packages/torch/nn/modules/rnn.py", line 175, in forward
    self.check_forward_args(input, hx, batch_sizes)

  File "/Users/user/anaconda2/lib/python2.7/site-packages/torch/nn/modules/rnn.py", line 152, in check_forward_args
    'Expected hidden[0] size {}, got {}')

  File "/Users/user/anaconda2/lib/python2.7/site-packages/torch/nn/modules/rnn.py", line 148, in check_hidden_size
    raise RuntimeError(msg.format(expected_hidden_size, tuple(hx.size())))
RuntimeError: Expected hidden[0] size (2, 48, 128), got (tensor(1), tensor(48), tensor(128))

However, if I set nlayers = 1, I only get a warning that I think I can live with. I would still like to know why nlayers = 2 is an issue for my code.

/Users/user/anaconda2/lib/python2.7/site-packages/torch/nn/modules/rnn.py:46: UserWarning: dropout option adds dropout after all but last recurrent layer, so non-zero dropout expects num_layers greater than 1, but got dropout=0.5 and num_layers=1
  "num_layers={}".format(dropout, num_layers))

I think it’s because you’re not concatenating the hidden tensors at

hidden = (torch.randn(1, 48, 128), torch.randn(1, 48, 128))

but just creating a list. This means you get a list of length 2, whereas Torch expects a tensor of size [2,48,128].
Instead, use the torch.cat command:

a = torch.randn(1, 48, 128)
b = torch.randn(1, 48, 128)
hidden = torch.cat((a,b))

hidden.size() will give you a shape of (2,48,128). Hope this works.

I changed hidden to

hidden = torch.cat(torch.randn(1, 48, 128), torch.randn(1, 48, 128))

and I get the following error

Traceback (most recent call last):
  File "lstm.py", line 95, in <module>
    hidden = torch.cat(torch.randn(1, 48, 128), torch.randn(1, 48, 128))
TypeError: cat(): argument 'tensors' (position 1) must be tuple of Tensors, not Tensor

Yes, that’s why I defined the two tensors separately and then used torch.cat - it seems to work that way.

Sadly it doesn’t work for me :frowning:

Traceback (most recent call last):
  File "lstm.py", line 102, in <module>
    traced_script_module = torch.jit.trace(model, (captions, cap_lens, hidden), check_trace=False)
  File "/Users/user/anaconda2/lib/python2.7/site-packages/torch/jit/__init__.py", line 565, in trace
    module._create_method_from_trace('forward', func, example_inputs)
  File "/Users/user/anaconda2/lib/python2.7/site-packages/torch/nn/modules/module.py", line 475, in __call__
    result = self._slow_forward(*input, **kwargs)
  File "/Users/user/anaconda2/lib/python2.7/site-packages/torch/nn/modules/module.py", line 465, in _slow_forward
    result = self.forward(*input, **kwargs)
  File "lstm.py", line 76, in forward
    output, hidden = self.rnn(emb, hidden)
  File "/Users/user/anaconda2/lib/python2.7/site-packages/torch/nn/modules/module.py", line 475, in __call__
    result = self._slow_forward(*input, **kwargs)
  File "/Users/user/anaconda2/lib/python2.7/site-packages/torch/nn/modules/module.py", line 465, in _slow_forward
    result = self.forward(*input, **kwargs)
  File "/Users/user/anaconda2/lib/python2.7/site-packages/torch/nn/modules/rnn.py", line 175, in forward
    self.check_forward_args(input, hx, batch_sizes)
  File "/Users/user/anaconda2/lib/python2.7/site-packages/torch/nn/modules/rnn.py", line 152, in check_forward_args
    'Expected hidden[0] size {}, got {}')
  File "/Users/user/anaconda2/lib/python2.7/site-packages/torch/nn/modules/rnn.py", line 148, in check_hidden_size
    raise RuntimeError(msg.format(expected_hidden_size, tuple(hx.size())))
RuntimeError: Expected hidden[0] size (2, 48, 128), got (tensor(48), tensor(128))

Can you show the code? I am certain you are concatenating it wrong: This will give you a tensor of (2,48,128)

Keep in mind that I’m using the preview version of PyTorch 1.0.

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class RNN_ENCODER(nn.Module):
    """Caption encoder: embedding -> dropout -> (multi-layer) LSTM or GRU.

    ``forward`` returns per-word features taken from the top recurrent layer
    and one sentence feature per caption taken from the final hidden state of
    that same layer.

    Args:
        ntoken: size of the dictionary (number of embedding rows).
        ninput: size of each embedding vector.
        drop_prob: probability of an element to be zeroed; used both for the
            embedding dropout and for the dropout between recurrent layers.
        nhidden: total number of hidden features; it is split evenly between
            the two directions when ``bidirectional`` is true.
        nlayers: number of recurrent layers.
        bidirectional: whether the recurrent network runs in both directions.
        rnn_type: ``'LSTM'`` (default) or ``'GRU'``.

    Raises:
        NotImplementedError: if ``rnn_type`` is neither ``'LSTM'`` nor ``'GRU'``.
    """

    def __init__(self, ntoken, ninput=300, drop_prob=0.5,
                 nhidden=128, nlayers=2, bidirectional=False,
                 rnn_type='LSTM'):
        super(RNN_ENCODER, self).__init__()
        self.n_steps = 10  # stored only; not read anywhere in this class
        self.ntoken = ntoken  # size of the dictionary
        self.ninput = ninput  # size of each embedding vector
        self.drop_prob = drop_prob  # probability of an element to be zeroed
        self.nlayers = nlayers  # number of recurrent layers
        self.bidirectional = bidirectional
        self.rnn_type = rnn_type
        self.num_directions = 2 if bidirectional else 1
        # number of features in the hidden state, per direction
        self.nhidden = nhidden // self.num_directions

        self.define_module()
        self.init_weights()

    def define_module(self):
        """Create the embedding, dropout and recurrent sub-modules."""
        self.encoder = nn.Embedding(self.ntoken, self.ninput)
        self.drop = nn.Dropout(self.drop_prob)
        if self.rnn_type == 'LSTM':
            rnn_cls = nn.LSTM
        elif self.rnn_type == 'GRU':
            rnn_cls = nn.GRU
        else:
            raise NotImplementedError(
                'Unsupported rnn_type: %r' % (self.rnn_type,))
        # dropout: if non-zero, introduces a dropout layer on the outputs of
        # each RNN layer except the last one. With nlayers == 1 PyTorch emits
        # a UserWarning about a non-zero dropout value.
        self.rnn = rnn_cls(self.ninput, self.nhidden,
                           self.nlayers, batch_first=True,
                           dropout=self.drop_prob,
                           bidirectional=self.bidirectional)

    def init_weights(self):
        """Initialise the embedding weights uniformly in [-0.1, 0.1].

        The RNN parameters keep the initialisation PyTorch gives them.
        """
        initrange = 0.1
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)

    def init_hidden(self, bsz):
        """Return all-zero initial state(s) for a batch of ``bsz`` captions.

        Each state has size ``(nlayers * num_directions, bsz, nhidden)`` and
        the dtype/device of the module's first parameter. An ``(h0, c0)``
        tuple is returned for an LSTM, a single tensor otherwise.
        """
        weight = next(self.parameters())
        shape = (self.nlayers * self.num_directions, bsz, self.nhidden)
        if self.rnn_type == 'LSTM':
            return (weight.new_zeros(shape), weight.new_zeros(shape))
        return weight.new_zeros(shape)

    def forward(self, captions, cap_lens, hidden, mask=None):
        """Encode a padded batch of captions.

        Args:
            captions: LongTensor of token ids, batch x n_steps.
            cap_lens: caption lengths, sorted in decreasing order as
                ``pack_padded_sequence`` expects.
            hidden: initial state as produced by ``init_hidden``: an
                ``(h0, c0)`` tuple for an LSTM, a single tensor for a GRU,
                each of size (nlayers * num_directions, batch, nhidden).
            mask: accepted for interface compatibility; not used.

        Returns:
            words_emb: batch x (nhidden * num_directions) x seq_len.
            sent_emb: batch x (nhidden * num_directions).
        """
        # batch x n_steps --> batch x n_steps x ninput
        emb = self.drop(self.encoder(captions))
        cap_lens = cap_lens.data
        # PackedSequence, so the RNN stops at each caption's true length
        emb = pack_padded_sequence(emb, cap_lens, batch_first=True)
        output, hidden = self.rnn(emb, hidden)
        # PackedSequence --> batch x seq_len x (nhidden * num_directions)
        output = pad_packed_sequence(output, batch_first=True)[0]
        # --> batch x (nhidden * num_directions) x seq_len
        words_emb = output.transpose(1, 2)

        # h_n: (nlayers * num_directions, batch, nhidden), ordered layer by
        # layer. Keep only the last layer; reshaping every layer would yield
        # batch * nlayers rows instead of one sentence vector per caption.
        h_n = hidden[0] if self.rnn_type == 'LSTM' else hidden
        last_layer = h_n[-self.num_directions:]
        sent_emb = last_layer.transpose(0, 1).contiguous()
        # --> batch x (num_directions * nhidden)
        sent_emb = sent_emb.view(-1, self.nhidden * self.num_directions)
        return words_emb, sent_emb


# Build the encoder with a 27297-token vocabulary; every other argument keeps
# its default, so nlayers=2 and the LSTM is unidirectional.
model = RNN_ENCODER(27297)
# Random example inputs for tracing: 48 captions of 15 token ids each.
captions = torch.empty(48, 15, dtype=torch.long).random_(27297)
# One random length per caption, sorted in descending order.
cap_lens = torch.sort(torch.empty(48, dtype=torch.long).random_(1, 15), descending=True)[0]

# Voodoo magic
# NOTE: torch.cat joins a and b along dim 0 into ONE tensor of size
# (2, 48, 128). forward() hands `hidden` straight to the LSTM, which needs an
# (h0, c0) tuple with each state of size (nlayers * num_directions, batch,
# nhidden) -- the structure RNN_ENCODER.init_hidden returns.
a = torch.randn(1, 48, 128)
b = torch.randn(1, 48, 128)
hidden = torch.cat((a,b))


# Trace forward() with the example inputs (trace checking disabled).
traced_script_module = torch.jit.trace(model, (captions, cap_lens, hidden), check_trace=False)

# Serialise the traced module to disk.
traced_script_module.save("lstm.pt")

Ah, I have not yet migrated to the 1.0 preview - maybe that’s why we get different results. Personally, I am going to wait until 1.0 becomes a full release, just to avoid issues like these.

I had the same issues for version 0.4

Really weird - I can’t see why. It works for me, and your error message points to the same issue. Let’s see if someone else can chip in.

Can you try specifying the dimension to concatenate? Like

a = torch.randn(1, 48, 128)
b = torch.randn(1, 48, 128)
hidden = torch.cat((a,b), dim=0)

This decides which dimension to concatenate over. The default is dim=0, but I just wanted to check.

This is missing a pair of parentheses to make a tuple of two tensors rather than two single ones. Personally, I would always include the dimension, but your style might differ from mine and still be excellent.
I’m not sure there is much reason to not use torch.randn(2, 48, 128), though.

Best regards

Thomas

1 Like

If I’m missing a pair of parentheses, can you show me where? I’m not seeing the error.

# wrong!
hidden = torch.cat(torch.randn(1, 48, 128), torch.randn(1, 48, 128), dim=0)

vs:

  hidden = torch.cat((torch.randn(1, 48, 128), torch.randn(1, 48, 128)), dim=0)

Best regards

Thomas

1 Like
/Users/user/anaconda2/lib/python2.7/site-packages/torch/nn/modules/rnn.py:46: UserWarning: dropout option adds dropout after all but last recurrent layer, so non-zero dropout expects num_layers greater than 1, but got dropout=0.5 and num_layers=1
  "num_layers={}".format(dropout, num_layers))
Traceback (most recent call last):
  File "lstm.py", line 647, in <module>
    words_embs, sent_emb = text_encoder(captions, cap_lens, hidden)
  File "/Users/user/anaconda2/lib/python2.7/site-packages/torch/nn/modules/module.py", line 477, in __call__
    result = self.forward(*input, **kwargs)
  File "lstm.py", line 87, in forward
    output, hidden = self.rnn(emb, hidden)
  File "/Users/user/anaconda2/lib/python2.7/site-packages/torch/nn/modules/module.py", line 477, in __call__
    result = self.forward(*input, **kwargs)
  File "/Users/user/anaconda2/lib/python2.7/site-packages/torch/nn/modules/rnn.py", line 175, in forward
    self.check_forward_args(input, hx, batch_sizes)
  File "/Users/user/anaconda2/lib/python2.7/site-packages/torch/nn/modules/rnn.py", line 152, in check_forward_args
    'Expected hidden[0] size {}, got {}')
  File "/Users/user/anaconda2/lib/python2.7/site-packages/torch/nn/modules/rnn.py", line 148, in check_hidden_size
    raise RuntimeError(msg.format(expected_hidden_size, tuple(hx.size())))
RuntimeError: Expected hidden[0] size (1, 48, 128), got (48, 128)

Yeah, the LSTM doc says you need to pass in h and c as a tuple for the hidden state and gives the shapes under “Inputs”.
You only give it one Tensor and it tries to take the first dimension apart to get h and c.

Best regards

Thomas

1 Like

You said “give it”. What exactly are you referring when passing in one tensor? The hidden?

He means that your code passes only the hidden state to the LSTM, but not the cell state.

  output, hidden = self.rnn(emb, hidden)

should be

  output, hidden = self.rnn(emb,  (hidden,  cell)) 

Refer to the LSTM documentation on the inputs needed.

So why does my code work for one layer with no cell state?

Probably because you didn’t edit the 2-layer initialization of hidden and that was used as h and c.
If you go the route of separating the two, the LSTM case should probably be

 output, (h, cl) = self.rnn(emb,  (h,  c)) 

but you can also define hidden to be the tuple of (h,c) and go with that.

The init_hidden seems relatively correct (even if you’d use weight.new_zeros these days), but your manual definition of hidden isn’t. Maybe you can just inspect the output of init_hidden in an interactive shell and draw your own conclusions.

Best regards

Thomas