CUDA changes expected LSTM hidden dimensions

Hey,

I’ve been trying to move my LSTM model to CUDA, but doing so changes the expected LSTM hidden dimensions. On the CPU the hidden state I pass in (layers, batch, hidden: 4x2x8) just works, but when I enable CUDA (using model.cuda()) the model expects a 1x2x8 hidden state instead. Why?

RuntimeError: Expected hidden size (1, 2, 8), got (4, 2, 8)
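
For what it’s worth, here’s a stripped-down sketch with the same shapes on the GPU (hidden size 8, batch 2, sequence length 40, embedding dim 300; not my actual model): a 1x2x8 hidden state is accepted, while a 4x2x8 one trips the same check.

# Minimal repro sketch (same shapes as my setup, not the real model).
import torch
import torch.nn as nn
from torch.autograd import Variable

lstm = nn.LSTM(input_size=300, hidden_size=8, batch_first=True).cuda()
inp = Variable(torch.randn(2, 40, 300)).cuda()    # batch x seq_len x emb_dim

h0 = Variable(torch.zeros(1, 2, 8)).cuda()        # (1, 2, 8): accepted
c0 = Variable(torch.zeros(1, 2, 8)).cuda()
out, (hn, cn) = lstm(inp, (h0, c0))

h0 = Variable(torch.zeros(4, 2, 8)).cuda()        # (4, 2, 8): raises
c0 = Variable(torch.zeros(4, 2, 8)).cuda()        # "Expected hidden size (1, 2, 8)"
out, (hn, cn) = lstm(inp, (h0, c0))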

Thanks in advance,
David

Could you share your model as well?

Of course. I experimented with batch_first, but without success. Right now it’s still set to batch_first=True.

import torch
import torch.nn as nn
from torch.autograd import Variable


class SIMLSTM(nn.Module):

    def __init__(self, vocab_size, tagset_size, hidden_dim, lstm_layers, embedding_dim):
        super(SIMLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, tagset_size)

        self.hidden_dim = hidden_dim
        self.lstm_layers = lstm_layers

        # self.init_weights()

    def init_weights(self):
        initrange = 0.1
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.fill_(0)
        self.fc.weight.data.uniform_(-initrange, initrange)

    def forward(self, s1_in, s2_in, hidden_s1, hidden_s2):
        emb_s1 = self.embedding(s1_in)
        emb_s2 = self.embedding(s2_in)

        # Embeddings shaped like: batch_size x padded seq length (40) x embedding_dimension (300)

        out_s1, hidden_s1 = self.lstm(emb_s1, hidden_s1)
        out_s2, hidden_s2 = self.lstm(emb_s2, hidden_s2)

        # No return yet -- just checking that the LSTM calls go through.
        print("Success!")

    def init_hidden(self, bsz):
        weight = next(self.parameters()).data
        return (Variable(weight.new(self.lstm_layers, bsz, self.hidden_dim).zero_()),
                Variable(weight.new(self.lstm_layers, bsz, self.hidden_dim).zero_()))

model = SIMLSTM(VOCAB_SIZE, TAGSET_SIZE, HIDDEN_DIM, LSTM_LAYERS, EMBEDDING_DIM)
if torch.cuda.is_available():
    model.cuda()

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)


hidden_s1 = model.init_hidden(BATCH_SIZE)
hidden_s2 = model.init_hidden(BATCH_SIZE)
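
(With LSTM_LAYERS = 4, BATCH_SIZE = 2 and HIDDEN_DIM = 8, which is what the error suggests I’m using, a quick check confirms the shape init_hidden produces:)

print(hidden_s1[0].size())   # torch.Size([4, 2, 8]) -- the "got (4, 2, 8)" in the error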

Run:

for i in range(1):
    for s1_in, s2_in, labels in generator(BATCH_SIZE, train_s1, train_s2, training_labels):
    
        if torch.cuda.is_available():
            s1_in = s1_in.cuda()
            s2_in = s2_in.cuda()
    
        output, hidden_s1, hidden_s2 = model(s1_in, s2_in, hidden_s1, hidden_s2)
#         hidden_s1 = repackage_hidden(hidden_s1)
#         hidden_s2 = repackage_hidden(hidden_s2)
#         return total_loss[0] / len(data_source)

Without the .cuda() calls (i.e. running on the CPU) the “Success” print shows up, and I’ve already run a few epochs that way.

The whole traceback:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-50-3dcd08b81ba1> in <module>()
      9             s2_in = s2_in.cuda()
     10 
---> 11         output, hidden_s1, hidden_s2 = model(s1_in, s2_in, hidden_s1, hidden_s2)
     12 #         hidden_s1 = repackage_hidden(hidden_s1)
     13 #         hidden_s2 = repackage_hidden(hidden_s2)

~/anaconda3/envs/py36/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    222         for hook in self._forward_pre_hooks.values():
    223             hook(self, input)
--> 224         result = self.forward(*input, **kwargs)
    225         for hook in self._forward_hooks.values():
    226             hook_result = hook(self, input, result)

<ipython-input-49-c51f58f6693e> in forward(self, s1_in, s2_in, hidden_s1, hidden_s2)
     27         # Embeddings shaped like: batch_size x padded seq length (40) x embedding_dimension (300)
     28 
---> 29         out_s1, hidden_s1 = self.lstm(emb_s1, hidden_s1)
     30         out_s2, hidden_s2 = self.lstm(emb_s2, hidden_s2)
     31 

~/anaconda3/envs/py36/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    222         for hook in self._forward_pre_hooks.values():
    223             hook(self, input)
--> 224         result = self.forward(*input, **kwargs)
    225         for hook in self._forward_hooks.values():
    226             hook_result = hook(self, input, result)

~/anaconda3/envs/py36/lib/python3.6/site-packages/torch/nn/modules/rnn.py in forward(self, input, hx)
    160             flat_weight=flat_weight
    161         )
--> 162         output, hidden = func(input, self.all_weights, hx)
    163         if is_packed:
    164             output = PackedSequence(output, batch_sizes)

~/anaconda3/envs/py36/lib/python3.6/site-packages/torch/nn/_functions/rnn.py in forward(input, *fargs, **fkwargs)
    349         else:
    350             func = AutogradRNN(*args, **kwargs)
--> 351         return func(input, *fargs, **fkwargs)
    352 
    353     return forward

~/anaconda3/envs/py36/lib/python3.6/site-packages/torch/autograd/function.py in _do_forward(self, *input)
    282         self._nested_input = input
    283         flat_input = tuple(_iter_variables(input))
--> 284         flat_output = super(NestedIOFunction, self)._do_forward(*flat_input)
    285         nested_output = self._nested_output
    286         nested_variables = _unflatten(flat_output, self._nested_output)

~/anaconda3/envs/py36/lib/python3.6/site-packages/torch/autograd/function.py in forward(self, *args)
    304     def forward(self, *args):
    305         nested_tensors = _map_variable_tensor(self._nested_input)
--> 306         result = self.forward_extended(*nested_tensors)
    307         del self._nested_input
    308         self._nested_output = result

~/anaconda3/envs/py36/lib/python3.6/site-packages/torch/nn/_functions/rnn.py in forward_extended(self, input, weight, hx)
    291             hy = tuple(h.new() for h in hx)
    292 
--> 293         cudnn.rnn.forward(self, input, hx, weight, output, hy)
    294 
    295         self.save_for_backward(input, hx, weight, output)

~/anaconda3/envs/py36/lib/python3.6/site-packages/torch/backends/cudnn/rnn.py in forward(fn, input, hx, weight, output, hy)
    264         if tuple(hx.size()) != hidden_size:
    265             raise RuntimeError('Expected hidden size {}, got {}'.format(
--> 266                 hidden_size, tuple(hx.size())))
    267         if cx is not None and tuple(cx.size()) != hidden_size:
    268             raise RuntimeError('Expected cell size {}, got {}'.format(

RuntimeError: Expected hidden size (1, 2, 8), got (4, 2, 8)

Basically, the problem is that I can only use 1 LSTM layer with CUDA enabled. If I set lstm_layers to 1, it runs fine.

Perhaps I’m not understanding correctly, but shouldn’t num_layers=lstm_layers be added to the LSTM constructor?
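
I.e. something along these lines in __init__ (just a sketch, everything else unchanged):

# Pass num_layers so the LSTM really has lstm_layers stacked layers and
# therefore expects a hidden state of shape (lstm_layers, batch, hidden_dim).
self.lstm = nn.LSTM(embedding_dim, hidden_dim,
                    num_layers=lstm_layers, batch_first=True)

That way the (4, 2, 8) states your init_hidden already builds are exactly what the cuDNN LSTM expects.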

Oh my god, how could I have missed that…

Thank you so much, Simon. I’ve been coding all day (it’s 8 pm where I live); maybe I should have taken a break.

Anyway, you’re right, I forgot that parameter. Thank you so much!

David
