[SOLVED] LSTM RNN bugs with dimensionality?

Dear Sir/Madam at PyTorch,

I have a dimensionality problem which might be due to a bug in LSTM. I have a text input of
Sample input size: torch.Size([256, 20]) in my training and test DataLoader. (My text sequence length is only 20, which is very short, and my batch size is 256.)

Here is the error:

Error: Expected hidden dimension of (2, 229, 256) but got (2, 256, 256)

I find it strange that the hidden dimension changes in the LSTM. I have n_layers=2, batch_size=256, and hidden_dim=256. The error occurs during the validation calculation (I have marked the erroneous line in the training code I post below), but there is no problem at all during backprop in training. Please refer to the last code snippet for the training code.
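For reference, here is a small sketch of the shape contract nn.LSTM enforces (my own illustration, not part of my code; the numbers mirror my hyperparameters). Note that in the error message the middle number is the batch dimension, not the hidden dimension:

import torch
import torch.nn as nn

# nn.LSTM expects h_0 and c_0 of shape (num_layers, batch_size, hidden_dim),
# even with batch_first=True (batch_first only affects the input/output tensors).
lstm = nn.LSTM(input_size=40, hidden_size=256, num_layers=2, batch_first=True)
x = torch.randn(256, 20, 40)       # (batch, seq_len, embedding_dim)
h0 = torch.zeros(2, 256, 256)      # (n_layers, batch_size, hidden_dim)
c0 = torch.zeros(2, 256, 256)
out, (hn, cn) = lstm(x, (h0, c0))  # fine: batch sizes agree

x_small = torch.randn(229, 20, 40) # a smaller batch with the same hidden state
# lstm(x_small, (h0, c0))          # would raise: Expected hidden[0] size (2, 229, 256)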

Here is my model:

import torch
import torch.nn as nn

class SentimentRNN(nn.Module):
    """
    The RNN model that will be used to perform Sentiment analysis.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.
        """
        super(SentimentRNN, self).__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        # dropout layer
        self.dropout = nn.Dropout(0.3)
        
        # linear and log-softmax layers
        self.fc = nn.Linear(hidden_dim, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        

    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        batch_size = x.size(0)

        # embeddings and lstm_out
        x = x.long()
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
    
        # stack up lstm outputs
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        # dropout and fully-connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)
        # log-softmax over the class dimension
        softmax_out = self.softmax(out)
        
        # reshape to be batch_size first
        softmax_out = softmax_out.view(batch_size, -1, self.output_size)
        softmax_out = softmax_out[:, -1] # take the output at the last time step
        
        # return the last log-softmax output and the hidden state
        return softmax_out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        
        return hidden
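As a quick sanity check of init_hidden, a small usage sketch (my own check; train_on_gpu is a global flag I define elsewhere, set to False here for illustration):

train_on_gpu = False  # assumed global read inside init_hidden
net = SentimentRNN(vocab_size=19940, output_size=17,
                   embedding_dim=40, hidden_dim=256, n_layers=2)
h = net.init_hidden(256)
print(h[0].shape, h[1].shape)  # torch.Size([2, 256, 256]) for both h and c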

And the training code:

# Instantiate the model w/ hyperparams
vocab_size = len(embedding_int)+1 # +1 for the 0 padding + our word tokens
output_size = 17
embedding_dim = 40
hidden_dim = 256
n_layers = 2

net = SentimentRNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
# loss and optimization functions
lr=0.001

criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
def train():
    # training params
    epochs = 10
    counter = 0
    print_every = 10000
    clip=5 # gradient clipping

    # move model to GPU, if available
    if(train_on_gpu):
        net.cuda()

    for e in range(epochs):
        net.train()
        h = net.init_hidden(batch_size)

        # batch loop
        for inputs, labels in train_loader:
            counter += 1

            if(train_on_gpu):
                inputs, labels = inputs.cuda(), labels.cuda()

            # Creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history
            h = tuple([each.data for each in h])

            # zero accumulated gradients
            net.zero_grad()

            # get the output from the model
            output, h = net(inputs, h)
            # calculate the loss and perform backprop
            loss = criterion(output.squeeze(), labels)
            loss.backward()
            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            optimizer.step()

            if counter % print_every == 0:
                # Get validation loss
                val_h = net.init_hidden(batch_size)
                val_losses = []
                net.eval()

                for inputs, labels in valid_loader:
                    # Creating new variables for the hidden state, otherwise
                    # we'd backprop through the entire training history
                    val_h = tuple([each.data for each in val_h])

                    if(train_on_gpu):
                        inputs, labels = inputs.cuda(), labels.cuda()

                    output, val_h = net(inputs, val_h) ######### ERROR HERE #############
                    val_loss = criterion(output.squeeze(), labels)

                    val_losses.append(val_loss.item())

                net.train()
                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.6f}...".format(loss.item()),
                      "Val Loss: {:.6f}".format(np.mean(val_losses)))

Could you print the shapes of inputs and val_h in your validation loop before passing them to your model?
The code runs fine if I use random inputs, so there seems to be some kind of shape mismatch coming from the valid_loader.
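For reference, a random-input check along those lines might look like this (a sketch, using the hyperparameters from your post):

train_on_gpu = False
net = SentimentRNN(vocab_size=19940, output_size=17,
                   embedding_dim=40, hidden_dim=256, n_layers=2)
inputs = torch.randint(0, 19940, (256, 20))  # same shape as the real batches
h = net.init_hidden(256)
out, h = net(inputs, h)
print(out.shape)  # torch.Size([256, 17])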

EDIT: You are currently using nn.Softmax as your last activation and nn.NLLLoss as your criterion.
Note that nn.NLLLoss expects log-probabilities (i.e. the output of nn.LogSoftmax) as its input, so you should change this activation.
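To illustrate the point (a minimal sketch, not from the thread's code): nn.LogSoftmax followed by nn.NLLLoss is equivalent to nn.CrossEntropyLoss applied to the raw logits:

import torch
import torch.nn as nn

logits = torch.randn(4, 17)            # (batch, num_classes)
targets = torch.randint(0, 17, (4,))

log_probs = nn.LogSoftmax(dim=1)(logits)
loss_a = nn.NLLLoss()(log_probs, targets)
loss_b = nn.CrossEntropyLoss()(logits, targets)
print(torch.allclose(loss_a, loss_b))  # True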

Hi @ptrblck,
Here are the shapes of inputs and val_h in my validation loop just before I pass them to my model:

INPUTS SHAPE: torch.Size([256, 20])
VAL_H SHAPE: 2, torch.Size([2, 256, 256]), torch.Size([2, 256, 256]) #len(val_h), val_h[0].shape, val_h[1].shape

If it helps, I also printed the Model Summary

SentimentRNN(
  (embedding): Embedding(19940, 40)
  (lstm): LSTM(40, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3)
  (fc): Linear(in_features=256, out_features=17, bias=True)
  (softmax): LogSoftmax()
)

Here is the full error log from the validation block (line 56):

RuntimeError                              Traceback (most recent call last)
<ipython-input-60-2da0ffaf5447> in <module>()
----> 1 train()

<ipython-input-59-f8c736390366> in train()
     54                     print("INPUTS SHAPE:", inputs.shape)
     55                     print("VAL_H SHAPE:", len(val_h), val_h[0].shape, val_h[1].shape)
---> 56                     output, val_h = net(inputs, val_h)
     57                     val_loss = criterion(output.squeeze(), labels)
     58 

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    487             result = self._slow_forward(*input, **kwargs)
    488         else:
--> 489             result = self.forward(*input, **kwargs)
    490         for hook in self._forward_hooks.values():
    491             hook_result = hook(self, input, result)

<ipython-input-31-f475a7e7b5f5> in forward(self, x, hidden)
     39         x = x.long()
     40         embeds = self.embedding(x)
---> 41         lstm_out, hidden = self.lstm(embeds, hidden)
     42 
     43         # stack up lstm outputs

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    487             result = self._slow_forward(*input, **kwargs)
    488         else:
--> 489             result = self.forward(*input, **kwargs)
    490         for hook in self._forward_hooks.values():
    491             hook_result = hook(self, input, result)

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/rnn.py in forward(self, input, hx)
    173                 hx = (hx, hx)
    174 
--> 175         self.check_forward_args(input, hx, batch_sizes)
    176         _impl = _rnn_impls[self.mode]
    177         if batch_sizes is None:

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/rnn.py in check_forward_args(self, input, hidden, batch_sizes)
    150         if self.mode == 'LSTM':
    151             check_hidden_size(hidden[0], expected_hidden_size,
--> 152                               'Expected hidden[0] size {}, got {}')
    153             check_hidden_size(hidden[1], expected_hidden_size,
    154                               'Expected hidden[1] size {}, got {}')

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/rnn.py in check_hidden_size(hx, expected_hidden_size, msg)
    146         def check_hidden_size(hx, expected_hidden_size, msg='Expected hidden size {}, got {}'):
    147             if tuple(hx.size()) != expected_hidden_size:
--> 148                 raise RuntimeError(msg.format(expected_hidden_size, tuple(hx.size())))
    149 
    150         if self.mode == 'LSTM':

RuntimeError: Expected hidden[0] size (2, 229, 256), got (2, 256, 256)

I’m not sure how my hidden size became 229.
In addition to that, I have fixed my softmax function. Thanks.

In addition to the above code, I tried my own debugging using only one batch, but had no luck reproducing the bug; still, it might be helpful for pinpointing what I missed. I use the following code to simulate the training & validation session with one batch each from the train_loader and the valid_loader.

This is similar to my training code above; I just removed the epoch and batch loops in the training and validation blocks. It works just fine, without the LSTM raising the expected-hidden-size exception:

epochs = 10
counter = 0
print_every = 10000
clip=5 # gradient clipping

net.train()
h = net.init_hidden(batch_size)

######## DEBUG #############
print("INIT_TRAIN_H :{}".format(len(h), h[0].shape))
############################

# batch loop
inputs, labels = next(iter(train_loader))

train_on_gpu = torch.cuda.is_available()
if(train_on_gpu):
    inputs, labels = inputs.cuda(), labels.cuda()

# Creating new variables for the hidden state, otherwise
# we'd backprop through the entire training history
h = tuple([each.data for each in h])

# zero accumulated gradients
net.zero_grad()

# get the output from the model
output, h = net(inputs, h)

######## DEBUG #############
print("OUTPUT_TRAIN: {}, TRAIN_H: {}".format(output.shape, (len(h), h[0].shape)))
############################

# calculate the loss and perform backprop
loss = criterion(output.squeeze(), labels)
loss.backward()

# `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
nn.utils.clip_grad_norm_(net.parameters(), clip)
optimizer.step()


# Get validation loss
val_h = net.init_hidden(batch_size)


######## DEBUG #############
print("INIT_VAL_H :{}".format((len(val_h), val_h[0].shape)))
############################

val_losses = []
net.eval()

val_inputs, val_labels = next(iter(valid_loader))
# Creating new variables for the hidden state, otherwise
# we'd backprop through the entire training history
val_h = tuple([each.data for each in val_h])

if(train_on_gpu):
    val_inputs, val_labels = val_inputs.cuda(), val_labels.cuda()

output, val_h = net(val_inputs, val_h)  # note: use val_inputs here, not the training inputs

######## DEBUG #############
print("OUTPUT_VAL: {},  H_VAL: {}".format(output.shape, (len(val_h) ,val_h[0].shape)))
###########################

val_loss = criterion(output.squeeze(), val_labels)

val_losses.append(val_loss.item())

net.train()
# print("Epoch: {}/{}...".format(1, epochs),
#       "Step: {}...".format(counter),
#       "Loss: {:.6f}...".format(loss.item()),
#       "Val Loss: {:.6f}".format(np.mean(val_losses)))

Thanks for the code and debugging.
If the length of your Dataset is not divisible by the batch size without a remainder, the last batch might be smaller than the rest.
Try to set drop_last=True in your DataLoader and run the code again.
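A minimal sketch of what happens (the dataset length here is made up so that it isn't divisible by 256):

import torch
from torch.utils.data import DataLoader, TensorDataset

dataset = TensorDataset(torch.zeros(1253, 20))  # 1253 = 4*256 + 229

loader = DataLoader(dataset, batch_size=256, drop_last=False)
print([batch[0].shape[0] for batch in loader])  # [256, 256, 256, 256, 229]

loader = DataLoader(dataset, batch_size=256, drop_last=True)
print([batch[0].shape[0] for batch in loader])  # [256, 256, 256, 256]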


Shouldn’t net.init_hidden(batch_size) be inside the inner loop? I thought you have to initialize the hidden state before/after each batch, since you do a forward and backward pass per batch. After all, you do zero_grad() and step() in the inner loop.

This should also get rid of your error, even if the last batch is not “full”. Currently, you set the size of the hidden state according to a fixed, predefined value batch_size that never changes.

So either you ignore the last batch that is smaller than batch_size, as suggested by @ptrblck, or you do the following (see the sketch below):

  • Put net.init_hidden(batch_size) into the inner loop (I still think that’s a must either way)
  • Before calling init_hidden(), first set batch_size = inputs.shape[0], assuming that the shape of inputs is (batch_size, ...)
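A sketch of the change applied to the validation loop above (the training loop would change the same way):

for inputs, labels in valid_loader:
    batch_size = inputs.shape[0]          # derive from the actual batch
    val_h = net.init_hidden(batch_size)   # re-initialize inside the loop

    if train_on_gpu:
        inputs, labels = inputs.cuda(), labels.cuda()

    output, val_h = net(inputs, val_h)
    val_loss = criterion(output.squeeze(), labels)
    val_losses.append(val_loss.item())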

Hi @vdw and @ptrblck,

Thanks for the help. I have refactored the RNN code, moved net.init_hidden inside the inner loop, and changed batch_size to inputs.shape[0]. I had missed that my model had become dependent on my batch size, which is why the expected hidden[0] size changed. Thanks for helping with the debugging. It’s working now.


@ptrblck @vdw I have the same problem. I printed the input shape and the h_0, c_0 shapes to check, and found that the batch size changed.
The full error log is below:

Enum count: 133
input: torch.Size([250, 25, 300]) h_0: torch.Size([1, 25, 256]) c_0: torch.Size([1, 25, 256])
validation - Input shape issue torch.Size([22, 250])
Enum count: 134
input: torch.Size([250, 372, 300]) h_0: torch.Size([1, 25, 256]) c_0: torch.Size([1, 25, 256])
Traceback (most recent call last):
  File "lstm_pytorch.py", line 290, in <module>
    main()
  File "lstm_pytorch.py", line 286, in main
    do_cnn()
  File "lstm_pytorch.py", line 270, in do_cnn
    acc, sentence_vec = train_test_one_split(cv, train_index, test_index)
  File "lstm_pytorch.py", line 256, in train_test_one_split
    eval_acc, sentence_vector = evaluate(model, x_test, y_test)
  File "lstm_pytorch.py", line 139, in evaluate
    preds, vector = model(inputs)
  File "E:\Anaconda3\envs\keras-gpu\lib\site-packages\torch\nn\modules\module.py", line 493, in __call__
    result = self.forward(*input, **kwargs)
  File "lstm_pytorch.py", line 126, in forward
    input, (h_0, c_0))
  File "E:\Anaconda3\envs\keras-gpu\lib\site-packages\torch\nn\modules\module.py", line 493, in __call__
    result = self.forward(*input, **kwargs)
  File "E:\Anaconda3\envs\keras-gpu\lib\site-packages\torch\nn\modules\rnn.py", line 559, in forward
    return self.forward_tensor(input, hx)
  File "E:\Anaconda3\envs\keras-gpu\lib\site-packages\torch\nn\modules\rnn.py", line 539, in forward_tensor
    output, hidden = self.forward_impl(input, hx, batch_sizes, max_batch_size, sorted_indices)
  File "E:\Anaconda3\envs\keras-gpu\lib\site-packages\torch\nn\modules\rnn.py", line 519, in forward_impl
    self.check_forward_args(input, hx, batch_sizes)
  File "E:\Anaconda3\envs\keras-gpu\lib\site-packages\torch\nn\modules\rnn.py", line 494, in check_forward_args
    'Expected hidden[0] size {}, got {}')
  File "E:\Anaconda3\envs\keras-gpu\lib\site-packages\torch\nn\modules\rnn.py", line 172, in check_hidden_size
    raise RuntimeError(msg.format(expected_hidden_size, tuple(hx.size())))
RuntimeError: Expected hidden[0] size (1, 372, 256), got (1, 25, 256)

Here is my model

class LSTMClassifier(nn.Module):
    def __init__(self, batch_size=50, output_size=3, hidden_size=256,
                 vocab_size=len(vocabulary_inv_list), embedding_length=300, weights=None):
        """
        Arguments
        ---------
        batch_size : Size of the batch, same as the batch_size of the data returned by the TorchText BucketIterator
        output_size : 3 = (nor, pos, neg)
        hidden_size : Size of the hidden state of the LSTM
        vocab_size : Size of the vocabulary containing unique words
        embedding_length : Embedding dimension of GloVe word embeddings
        weights : Pre-trained word2vec word embeddings used to create our word-embedding look-up table
        """
        super(LSTMClassifier, self).__init__()

        self.batch_size = batch_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.embedding_length = embedding_length
        #self.num_layers = num_layers

        # Initializing the look-up table.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_length)
        # Assigning the look-up table to the pre-trained GloVe word embedding.
        self.word_embeddings.weight = nn.Parameter(weights, requires_grad=False)
        self.lstm = nn.LSTM(embedding_length, hidden_size=hidden_size, num_layers=1, batch_first=False)
        self.label = nn.Linear(hidden_size, output_size)

    def forward(self, input_sentence, batch_size=None):
        """
        Parameters
        ----------
        input_sentence: input_sentence of shape = (batch_size, num_sequences)
        batch_size : default = None. Used only for prediction on a single sentence after training (batch_size = 1)

        Returns
        -------
        Output of the linear layer containing logits for the classes, which receives its input
        from the final_hidden_state of the LSTM.
        final_output.shape = (batch_size, output_size)
        """

        # Map all the indices in the input sequence to the corresponding word vectors
        # using our pre-trained word embeddings.
        global counter
        counter += 1
        print("Enum count:", counter)
        input = self.word_embeddings(input_sentence)

        # embedded input of shape = (batch_size, num_sequences, embedding_length)
        # after permute: input.size() = (num_sequences, batch_size, embedding_length)
        input = input.permute(1, 0, 2).float()
        if batch_size is None:
            # Initial hidden state of the LSTM
            h_0 = Variable(torch.zeros(1, self.batch_size, self.hidden_size).cuda())
            # Initial cell state of the LSTM
            c_0 = Variable(torch.zeros(1, self.batch_size, self.hidden_size).cuda())
        else:
            h_0 = Variable(torch.zeros(1, batch_size, self.hidden_size).cuda())
            c_0 = Variable(torch.zeros(1, batch_size, self.hidden_size).cuda())

        print("input: ", input.size(), "h_0: ", h_0.size(), "c_0: ", c_0.size())

        output, (final_hidden_state, final_cell_state) = self.lstm(input, (h_0, c_0))

        # final_hidden_state.size() = (1, batch_size, hidden_size) & final_output.size() = (batch_size, output_size)
        final_output = self.label(final_hidden_state[-1])

        return final_output

I hit the problem when the code entered the epoch loop, as shown below:

def train_test_one_split(cv, train_index, test_index):
    x_train, y_train = X[train_index], Y[train_index]
    x_test, y_test = X[test_index], Y[test_index]

    x_train = torch.from_numpy(x_train).long()
    y_train = torch.from_numpy(y_train).long()
    dataset_train = TensorDataset(x_train, y_train)

    #train_loader = DataLoader(dataset_train, batch_size=hidden, shuffle=True, num_workers=4, pin_memory=True)
    train_loader = DataLoader(
        dataset_train, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=False, drop_last=False)

    x_test = torch.from_numpy(x_test).long()
    y_test = torch.from_numpy(y_test).long()
    try:
        if use_cuda:
            x_test = x_test.cuda()
            y_test = y_test.cuda()
    except Exception as ex:
        print(ex)
    #model = CNN(kernel_sizes, num_filters, embedding_dim, pretrained_embeddings)
    model = LSTMClassifier(batch_size, output_size, hidden_size,
                           vocab_size, embedding_length, pretrained_embeddings)
    if cv == 0:
        print("\n{}\n".format(str(model)))

    if use_cuda:
        model = model.cuda()

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=0.0002)

    loss_fn = nn.CrossEntropyLoss()

    for epoch in range(10):
        tic = time.time()
        model.train()
        for i, (inputs, labels) in enumerate(train_loader):
            batch_size = inputs.shape[0]
            inputs, labels = Variable(inputs), Variable(labels)
            if use_cuda:
                inputs, labels = inputs.cuda(), labels.cuda()
                if (inputs.shape[0], inputs.shape[1]) != (batch_size, sentence_len):
                    #(inputs.shape[0], inputs.shape[1]) = (batch_size, sentence_len)
                    print('validation - Input shape issue', inputs.shape)
                    continue

            preds = model(inputs)

            if use_cuda:
                preds = preds.cuda()
            try:
                loss = loss_fn(preds, labels)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            except RuntimeError:
                continue

            # this does not improve the performance (even worse) (it was used in Kim's original paper)
            if 0:
                constrained_norm = 1  # 3 original parameter
                if model.fc.weight.norm().data[0] > constrained_norm:
                    model.fc.weight.data = model.fc.weight.data * \
                        constrained_norm / model.fc.weight.data.norm()

        model.eval()
        eval_acc, sentence_vector = evaluate(model, x_test, y_test)
        #print('[epoch: {:d}] train_loss: {:.3f}   acc: {:.3f}   ({:.1f}s)'.format(epoch, loss.data[0], eval_acc, time.time()-tic))
        print('[epoch: {:d}] train_loss: {:.3f}   acc: {:.3f}   ({:.1f}s)'.format(
            epoch, loss.item(), eval_acc, time.time()-tic))  # pytorch 0.4 and later
    return eval_acc, sentence_vector
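For reference, the same fix discussed earlier in the thread would apply here: h_0 and c_0 are sized from the stored self.batch_size, while the LSTM (batch_first=False) sees the batch size in dimension 1 of its permuted input. A hedged sketch of forward with the batch size derived from the input instead (untested against the full script; it uses torch.zeros(..., device=...) in place of the Variable/.cuda() pattern):

def forward(self, input_sentence):
    input = self.word_embeddings(input_sentence)
    input = input.permute(1, 0, 2).float()  # (num_sequences, batch, embedding_length)
    batch_size = input.size(1)              # batch dim for batch_first=False
    h_0 = torch.zeros(1, batch_size, self.hidden_size, device=input.device)
    c_0 = torch.zeros(1, batch_size, self.hidden_size, device=input.device)
    output, (final_hidden_state, final_cell_state) = self.lstm(input, (h_0, c_0))
    return self.label(final_hidden_state[-1])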