LSTM init_hidden to GPU

This is the model I have defined:

class LSTM(nn.Module):

  # constructor
  def __init__(self, vocab_size, input_length, output_size, hidden_dim, num_layers, drop_prob, init_weight):
    super(LSTM, self).__init__() # call to super constructor
    
    #self.output_size  = output_size
    self.input_length = input_length
    self.num_layers   = num_layers
    self.hidden_dim = hidden_dim
    self.init_weight  = init_weight
    
    # define layers
    self.embedding = nn.Embedding(num_embeddings = vocab_size, 
                         embedding_dim = input_length) 
    self.lstm = nn.LSTM(input_size = input_length, 
                      hidden_size = hidden_dim,
                      num_layers = num_layers,
                      dropout = drop_prob,
                      batch_first = True, # with batch_first=True, input and output tensors are (batch, seq, feature)
                      bidirectional = False)
    self.fc = nn.Linear(hidden_dim, output_size)
    self.dropout = nn.Dropout(p = drop_prob)
    self.LogSoftMax = nn.LogSoftmax(dim = 2) # out is [batch_size, seq_len, output_size], so normalize over the last (output) dimension
    

  # define forward function  
  def forward(self, x, hidden):
    
    batch_size = x.size(0)
    # input x size  :  [batch_size ,seq_len], note that (seq_len can vary between each mini batch)
    print("x size : " + str(x.size()) ) 
    
    # hidden size :[num_layers , batch_size , hidden_dim]
    print("hidden_at_time_0 size : ", str(hidden[0].size()) )

    # just to add protection since the input to the embedding should be long.
    x = x.long() 
     
    embeds = self.embedding(x) 

    # embeds size :[batch_size ,seq_len , input_length]
    # input_length is the size of the LSTM layer input, also called embedding_dim.
    #print("embeds size : ", str(embeds.size()) )

    lstm_out, hidden = self.lstm(embeds, hidden)

    # hidden size :[num_layers , batch_size , hidden_dim]
    #print("hidden size : ", str(hidden[0].size()))

    # lstm_out size :[batch_size , seq_len , hidden_dim] (because batch_first=True)
    #print("lstm_out : ", str(lstm_out.size()))

    out = self.fc(lstm_out)

    # fc out size :[batch_size,seq_length,output_size]
    #print("fc out size : ", str(out.size()))

    out = self.dropout(out)

    # dropout out size :[batch_size,seq_length,output_size]
    #print("dropout out size : ", str(out.size()))

    out=self.LogSoftMax(out)

    # LogSoftMax out size :[batch_size,seq_length,output_size]
    #print("LogSoftMax out size : ", str(out.size()))
    
    #flatten out
    #out = out.view(batch_size, -1)
    #print("out.view : ", str(out.size()))
    #out = out[:,-1]
    #print("out[:,-1] : ", str(out.size()))
    return out, hidden
    
  def init_weights_uniform(self,m):
    #print(m)
    for name, param in m.named_parameters():
      if 'bias' in name:
        nn.init.constant_(param, 0.0)
        #print("bias") 
      elif 'weight' in name:
        #print("weight") 
        param.data.uniform_(- self.init_weight, self.init_weight)
      #print(param) 
        

  def init_hidden(self, batch_size):
    weight = next(self.parameters()).data
    hidden = (weight.new(self.num_layers, batch_size, self.hidden_dim).zero_(),
              weight.new(self.num_layers, batch_size, self.hidden_dim).zero_())
    return hidden

and I use it with:

batch_size = 20
input_length = 20
output_size = vocab_size = 10000
num_layers = 2
hidden_units = 200
dropout = 0
init_weight = 0.1

I found that when I try to train in Colab, I always get runtime crashes.

So I created this small test:

print("Test LSTM model on dummy inputs\n")
# test the LSTM forward pass; this tests the network model
model = LSTM(Myparams.vocab_size, Myparams.input_length, Myparams.output_size, Myparams.hidden_units, Myparams.layers, Myparams.dropout, Myparams.init_weight)

model.to(device)
# test the LSTM forward pass on a dummy input with batch size 2
# input is batch_size x seq_len
Input = torch.LongTensor([[1,2,3,4,5],[6,5,5,4,6]]) 
batch_size=len(Input)

model.apply(model.init_weights_uniform) # init weights
hidden = model.init_hidden(batch_size)  # init first hidden state

print("Input size " + str(Input.size(0)) + " x " + str(Input.size(1)))
out, hidden = model(Input, hidden)
print(out.size()) # has no real value, since this is an untrained network

and I received:

"Expected object of device type cuda but got device type cpu for argument #3 'index' in call to _th_index_select"

on the line out, hidden = model(Input, hidden).
So I guess hidden is still on the CPU and not on the GPU. (How can I check this?)
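
One way to check: every torch.Tensor has a .device attribute and an .is_cuda flag, so you can print them for each element of the tuple:

print(hidden[0].device, hidden[1].device)  # e.g. cpu or cuda:0
print(hidden[0].is_cuda)                   # False if the tensor is still on the CPU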
So I tried:

  def init_hidden(self, batch_size):
    weight = next(self.parameters()).data
    hidden = (weight.new(self.num_layers, batch_size, self.hidden_dim).zero_().to(device),
              weight.new(self.num_layers, batch_size, self.hidden_dim).zero_()).to(device)
    return hidden

but now I get the error:

'tuple' object has no attribute 'to'

how to solve this?
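
For reference, the misplaced parenthesis is the cause: in the snippet above, .to(device) is applied to the whole tuple on the second line rather than to each tensor, and Python tuples have no .to method. A corrected sketch, assuming device is defined in scope:

  def init_hidden(self, batch_size):
    weight = next(self.parameters()).data
    hidden = (weight.new(self.num_layers, batch_size, self.hidden_dim).zero_().to(device),
              weight.new(self.num_layers, batch_size, self.hidden_dim).zero_().to(device))
    return hidden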

Did you try to move hidden's h and c tensors to the GPU?

How? How do I do that?

If you post your complete code here, I can help you more. But for now, I think calling model.to(device) after hidden = model.init_hidden(batch_size) may solve the problem.

But I did post it completely.

I tried calling model.to(device) after hidden = model.init_hidden(batch_size), but got the same error:

# call to PrepareDatasetAsNetworkInput

Myparams = NetworkParams(batch_size = 1,
                   input_length = 20, # input_length to the LSTM, EmbededDim # TrainDatabase.size(1)
                   output_size = 10000, #1
                   layers = 2,
                   decay = 2,
                   hidden_units = 200, # rnn_size
                   dropout = 0.5,
                   init_weight = 0.1,
                   lr = 1,
                   vocab_size = 1000,
                   max_epoch = 4,
                   max_max_epoch = 13,
                   max_grad_norm = 5)

print("Test LSTM model on dummy inputs\n")

# test the LSTM forward pass; this tests the network model
model = LSTM(Myparams.vocab_size, Myparams.input_length, Myparams.output_size, Myparams.hidden_units, Myparams.layers, Myparams.dropout, Myparams.init_weight)

# test the LSTM forward pass on a dummy input with batch size 2
# input is batch_size x seq_len
Input = torch.LongTensor([[1,2,3,4,5],[6,5,5,4,6]])
batch_size = len(Input)

model.apply(model.init_weights_uniform) # init weights
hidden = model.init_hidden(batch_size)  # init first hidden state

print(len(hidden))
for e in hidden:
  print(e.data.size())
print("------")

#hidden = torch.LongTensor([e.data for e in hidden])

Input.to(device)
model.to(device)

print("Input size " + str(Input.size(0)) + " x " + str(Input.size(1)))
out, hidden=model(Input,hidden)
print(out.size()) #has no real value, since this untrained network
print(out[0,0,]) #has no real value, since this untrained network

and the output:

Test LSTM model on dummy inputs

2
torch.Size([2, 2, 200])
torch.Size([2, 2, 200])
------
Input size 2 x 5
hidden_at_time_0 size :  torch.Size([2, 2, 200])
---------------------------------------------------------------------------


RuntimeError                              Traceback (most recent call last)
<ipython-input-14-847ac0cf6c23> in <module>()
     40 
     41 print("Input size " + str(Input.size(0)) + " x " + str(Input.size(1)))
---> 42 out, hidden=model(Input,hidden)
     43 print(out.size()) #has no real value, since this untrain network
     44 print(out[0,0,]) #has no real value, since this untrain network

4 frames
/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    548             result = self._slow_forward(*input, **kwargs)
    549         else:
--> 550             result = self.forward(*input, **kwargs)
    551         for hook in self._forward_hooks.values():
    552             hook_result = hook(self, input, result)

<ipython-input-12-ef9fdf59add0> in forward(self, x, hidden)
     43     x = x.long()
     44 
---> 45     embeds = self.embedding(x)
     46 
     47     # embeds size :[batch_size ,seq_len , input_length]

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    548             result = self._slow_forward(*input, **kwargs)
    549         else:
--> 550             result = self.forward(*input, **kwargs)
    551         for hook in self._forward_hooks.values():
    552             hook_result = hook(self, input, result)

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/sparse.py in forward(self, input)
    112         return F.embedding(
    113             input, self.weight, self.padding_idx, self.max_norm,
--> 114             self.norm_type, self.scale_grad_by_freq, self.sparse)
    115 
    116     def extra_repr(self):

/usr/local/lib/python3.6/dist-packages/torch/nn/functional.py in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
   1722         # remove once script supports set_grad_enabled
   1723         _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 1724     return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
   1725 
   1726 

RuntimeError: Expected object of device type cuda but got device type cpu for argument #3 'index' in call to _th_index_select

I just changed your input tensor like this: Input = torch.LongTensor([[1,2,3,4,5],[6,5,5,4,6]]).to(device) and it works. The traceback shows the failure inside F.embedding: the embedding weight is on the GPU (from model.to(device)) but the index tensor is still on the CPU. Note that Tensor.to() is not in-place; Input.to(device) on its own returns a new tensor and leaves Input unchanged, so you have to keep the result, e.g. Input = Input.to(device).
Here is the complete code:

import torch
import numpy as np
import torch.nn as nn

device = 'cuda:0'

batch_size = 20
input_length = 20
output_size = vocab_size = 10000
num_layers = 2
hidden_units = 200
dropout = 0
init_weight = 0.1

class LSTM(nn.Module):

    # constructor
    def __init__(self, vocab_size, input_length, output_size, hidden_dim, num_layers, drop_prob, init_weight):
        super(LSTM, self).__init__() # call to super constructor

        #self.output_size  = output_size
        self.input_length = input_length
        self.num_layers   = num_layers
        self.hidden_dim = hidden_dim
        self.init_weight  = init_weight

        # define layers
        self.embedding = nn.Embedding(num_embeddings = vocab_size, 
                             embedding_dim = input_length) 
        self.lstm = nn.LSTM(input_size = input_length, 
                          hidden_size = hidden_dim,
                          num_layers = num_layers,
                          dropout = drop_prob,
                          batch_first = True, # with batch_first=True, input and output tensors are (batch, seq, feature)
                          bidirectional = False)
        self.fc = nn.Linear(hidden_dim, output_size)
        self.dropout = nn.Dropout(p = drop_prob)
        self.LogSoftMax = nn.LogSoftmax(dim = 2) # out is [batch_size, seq_len, output_size], so normalize over the last (output) dimension


    # define forward function
    def forward(self, x, hidden):
    
        batch_size = x.size(0)
        # input x size  :  [batch_size ,seq_len], note that (seq_len can vary between each mini batch)
        print("x size : " + str(x.size()) ) 

        # hidden size :[num_layers , batch_size , hidden_dim]
        print("hidden_at_time_0 size : ", str(hidden[0].size()) )

        # just to add protection since the input to the embedding should be long.
        x = x.long() 

        embeds = self.embedding(x) 

        # embeds size :[batch_size ,seq_len , input_length]
        # input_length is the size of the LSTM layer input, also called embedding_dim.
        #print("embeds size : ", str(embeds.size()) )

        lstm_out, hidden = self.lstm(embeds, hidden)

        # hidden size :[num_layers , batch_size , hidden_dim]
        #print("hidden size : ", str(hidden[0].size()))

        # lstm_out size :[batch_size , seq_len , hidden_dim] (because batch_first=True)
        #print("lstm_out : ", str(lstm_out.size()))

        out = self.fc(lstm_out)

        # fc out size :[batch_size,seq_length,output_size]
        #print("fc out size : ", str(out.size()))

        out = self.dropout(out)

        # dropout out size :[batch_size,seq_length,output_size]
        #print("dropout out size : ", str(out.size()))

        out=self.LogSoftMax(out)

        # LogSoftMax out size :[batch_size,seq_length,output_size]
        #print("LogSoftMax out size : ", str(out.size()))

        #flatten out
        #out = out.view(batch_size, -1)
        #print("out.view : ", str(out.size()))
        #out = out[:,-1]
        #print("out[:,-1] : ", str(out.size()))
        return out, hidden

    def init_weights_uniform(self, m):
        #print(m)
        for name, param in m.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0.0)
                #print("bias")
            elif 'weight' in name:
                #print("weight")
                param.data.uniform_(- self.init_weight, self.init_weight)
            #print(param)


    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.num_layers, batch_size, self.hidden_dim).zero_(),
                  weight.new(self.num_layers, batch_size, self.hidden_dim).zero_())
        return hidden

batch_size = 1
input_length = 20 # input_length to the LSTM, EmbededDim # TrainDatabase.size(1)
output_size = 10000 #1
layers = 2
decay = 2
hidden_units = 200 # rnn_size
dropout = 0.5
init_weight = 0.1
lr = 1
vocab_size = 1000
max_epoch = 4
max_max_epoch = 13
max_grad_norm = 5


print("Test LSTM model on dummy inputs\n")
# test the LSTM forward pass; this tests the network model
model = LSTM(vocab_size, input_length, output_size, hidden_units, layers, dropout, init_weight)

model.to(device)
# test the LSTM forward pass on a dummy input with batch size 2
# input is batch_size x seq_len
Input = torch.LongTensor([[1,2,3,4,5],[6,5,5,4,6]]).to(device)
batch_size=len(Input)

model.apply(model.init_weights_uniform) # init weights
hidden = model.init_hidden(batch_size)  # init first hidden state

print("Input size " + str(Input.size(0)) + " x " + str(Input.size(1)))
out, hidden = model(Input, hidden)
print(out.size()) # has no real value, since this is an untrained network
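
For what it's worth, the original init_hidden already does the right thing once the model is on the GPU: weight.new(...) creates tensors of the same dtype and device as the parameter, so calling init_hidden after model.to(device) yields CUDA hidden states. The more current equivalent is Tensor.new_zeros, which likewise inherits the parameter's device and dtype; a minimal sketch:

    def init_hidden(self, batch_size):
        weight = next(self.parameters())
        # new_zeros creates zero tensors on the same device and with the same dtype as weight
        hidden = (weight.new_zeros(self.num_layers, batch_size, self.hidden_dim),
                  weight.new_zeros(self.num_layers, batch_size, self.hidden_dim))
        return hidden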