LSTM init_hidden to GPU

This is the model I have defined:

class LSTM(nn.Module):

  # constructor
  def __init__(self, vocab_size, input_length, output_size, hidden_dim, num_layers, drop_prob, init_weight):
    super(LSTM, self).__init__() # call to super constructor
    
    #self.output_size  = output_size
    self.input_length = input_length
    self.num_layers   = num_layers
    self.hidden_dim = hidden_dim
    self.init_weight  = init_weight
    
    # define layers
    self.embedding = nn.Embedding(num_embeddings = vocab_size, 
                         embedding_dim = input_length) 
    self.lstm = nn.LSTM(input_size = input_length, 
                      hidden_size = hidden_dim,
                      num_layers = num_layers,
                      dropout = drop_prob,
                      batch_first = True, # with batch_first=True, input and output tensors are (batch, seq, feature)
                      bidirectional = False)
    self.fc = nn.Linear(hidden_dim, output_size)
    self.dropout = nn.Dropout(p = drop_prob)
    self.LogSoftMax = nn.LogSoftmax(dim = 2) # out is [batch_size, seq_len, output_size], so normalize over the last (output) dimension
    

  # define forward function  
  def forward(self, x, hidden):
    
    batch_size = x.size(0)
    # input x size  :  [batch_size ,seq_len], note that (seq_len can vary between each mini batch)
    print("x size : " + str(x.size()) ) 
    
    # hidden size :[num_layers , batch_size , hidden_dim]
    print("hidden_at_time_0 size : ", str(hidden[0].size()) )

    # just to add protection since the input to the embedding should be long.
    x = x.long() 
     
    embeds = self.embedding(x) 

    # embeds size :[batch_size ,seq_len , input_length]
    # input_length is the size of the LSTM layer input, also called embedding_dim.
    #print("embeds size : ", str(embeds.size()) )

    lstm_out, hidden = self.lstm(embeds, hidden)

    # hidden size :[num_layers , batch_size , hidden_dim]
    #print("hidden size : ", str(hidden[0].size()))

    # lstm_out size :[batch_size , seq_len , hidden_dim] (because batch_first=True)
    #print("lstm_out : ", str(lstm_out.size()))

    out = self.fc(lstm_out)

    # fc out size :[batch_size,seq_length,output_size]
    #print("fc out size : ", str(out.size()))

    out = self.dropout(out)

    # dropout out size :[batch_size,seq_length,output_size]
    #print("dropout out size : ", str(out.size()))

    out=self.LogSoftMax(out)

    # LogSoftMax out size :[batch_size,seq_length,output_size]
    #print("LogSoftMax out size : ", str(out.size()))
    
    #flatten out
    #out = out.view(batch_size, -1)
    #print("out.view : ", str(out.size()))
    #out = out[:,-1]
    #print("out[:,-1] : ", str(out.size()))
    return out, hidden
    
  def init_weights_uniform(self,m):
    #print(m)
    for name, param in m.named_parameters():
      if 'bias' in name:
        nn.init.constant_(param, 0.0)
        #print("bias") 
      elif 'weight' in name:
        #print("weight") 
        param.data.uniform_(- self.init_weight, self.init_weight)
      #print(param) 
        

  def init_hidden(self, batch_size):
    weight = next(self.parameters()).data
    hidden = (weight.new(self.num_layers, batch_size, self.hidden_dim).zero_(),
              weight.new(self.num_layers, batch_size, self.hidden_dim).zero_())
    return hidden

and I use it with:

batch_size = 20
input_length = 20
output_size = vocab_size = 10000
num_layers = 2
hidden_units = 200
dropout = 0
init_weight = 0.1

I found that when I try to train in Colab, I always get runtime crashes.

So I created this small test:

print("Test LSTM model on dummy inputs\n")
# test the LSTM forward pass; this tests the network model
model = LSTM(Myparams.vocab_size, Myparams.input_length, Myparams.output_size, Myparams.hidden_units, Myparams.layers, Myparams.dropout, Myparams.init_weight)

model.to(device)
# test the LSTM forward pass on a dummy input with batch size 2
# input is batch_size x seq_len
Input = torch.LongTensor([[1,2,3,4,5],[6,5,5,4,6]]) 
batch_size=len(Input)

model.apply(model.init_weights_uniform) # init weights
hidden = model.init_hidden(batch_size)  # init first hidden state

print("Input size " + str(Input.size(0)) + " x " + str(Input.size(1)))
out, hidden = model(Input, hidden)
print(out.size()) # has no real value, since this is an untrained network

and I received:

"Expected object of device type cuda but got device type cpu for argument #3 'index' in call to _th_index_select"

on the line out, hidden = model(Input, hidden).
So I guess hidden is still on the CPU and not on the GPU. (How can I check this?)
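
One way to check: every torch.Tensor has a .device attribute and an .is_cuda flag, so you can print them for each element of the tuple:

print(hidden[0].device, hidden[1].device)  # e.g. cpu or cuda:0
print(hidden[0].is_cuda)                   # False if the tensor is still on the CPU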
So I tried:

  def init_hidden(self, batch_size):
    weight = next(self.parameters()).data
    hidden = (weight.new(self.num_layers, batch_size, self.hidden_dim).zero_().to(device),
              weight.new(self.num_layers, batch_size, self.hidden_dim).zero_()).to(device)
    return hidden

but now I get the error:

'tuple' object has no attribute 'to'

how to solve this?
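
For reference, the misplaced parenthesis is the cause: in the snippet above, .to(device) is applied to the whole tuple on the second line rather than to each tensor, and Python tuples have no .to method. A corrected sketch, assuming device is defined in scope:

  def init_hidden(self, batch_size):
    weight = next(self.parameters()).data
    hidden = (weight.new(self.num_layers, batch_size, self.hidden_dim).zero_().to(device),
              weight.new(self.num_layers, batch_size, self.hidden_dim).zero_().to(device))
    return hidden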

Did you try to move hidden's h and c tensors to the GPU?

How? How do I do that?

If you post your complete code here, I can help you more. But for now, I think calling model.to(device) after hidden = model.init_hidden(batch_size) may solve the problem.

But I did post it completely.

I tried calling model.to(device) after hidden = model.init_hidden(batch_size), but got the same error:

# call to PrepareDatasetAsNetworkInput

Myparams = NetworkParams(batch_size = 1,
                   input_length = 20, # input_length to the LSTM, EmbededDim # TrainDatabase.size(1)
                   output_size = 10000, #1
                   layers = 2,
                   decay = 2,
                   hidden_units = 200, # rnn_size
                   dropout = 0.5,
                   init_weight = 0.1,
                   lr = 1,
                   vocab_size = 1000,
                   max_epoch = 4,
                   max_max_epoch = 13,
                   max_grad_norm = 5)

print("Test LSTM model on dummy inputs\n")

# test the LSTM forward pass; this tests the network model
model = LSTM(Myparams.vocab_size, Myparams.input_length, Myparams.output_size, Myparams.hidden_units, Myparams.layers, Myparams.dropout, Myparams.init_weight)

# test the LSTM forward pass on a dummy input with batch size 2
# input is batch_size x seq_len
Input = torch.LongTensor([[1,2,3,4,5],[6,5,5,4,6]])
batch_size = len(Input)

model.apply(model.init_weights_uniform) # init weights
hidden = model.init_hidden(batch_size)  # init first hidden state

print(len(hidden))
for e in hidden:
  print(e.data.size())
print("------")

#hidden = torch.LongTensor([e.data for e in hidden])

Input.to(device)
model.to(device)

print("Input size " + str(Input.size(0)) + " x " + str(Input.size(1)))
out, hidden=model(Input,hidden)
print(out.size()) #has no real value, since this untrained network
print(out[0,0,]) #has no real value, since this untrained network

and the output:

Test LSTM model on dummy inputs

2
torch.Size([2, 2, 200])
torch.Size([2, 2, 200])
------
Input size 2 x 5
hidden_at_time_0 size :  torch.Size([2, 2, 200])
---------------------------------------------------------------------------


RuntimeError                              Traceback (most recent call last)
<ipython-input-14-847ac0cf6c23> in <module>()
     40 
     41 print("Input size " + str(Input.size(0)) + " x " + str(Input.size(1)))
---> 42 out, hidden=model(Input,hidden)
     43 print(out.size()) #has no real value, since this untrain network
     44 print(out[0,0,]) #has no real value, since this untrain network

4 frames
/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    548             result = self._slow_forward(*input, **kwargs)
    549         else:
--> 550             result = self.forward(*input, **kwargs)
    551         for hook in self._forward_hooks.values():
    552             hook_result = hook(self, input, result)

<ipython-input-12-ef9fdf59add0> in forward(self, x, hidden)
     43     x = x.long()
     44 
---> 45     embeds = self.embedding(x)
     46 
     47     # embeds size :[batch_size ,seq_len , input_length]

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    548             result = self._slow_forward(*input, **kwargs)
    549         else:
--> 550             result = self.forward(*input, **kwargs)
    551         for hook in self._forward_hooks.values():
    552             hook_result = hook(self, input, result)

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/sparse.py in forward(self, input)
    112         return F.embedding(
    113             input, self.weight, self.padding_idx, self.max_norm,
--> 114             self.norm_type, self.scale_grad_by_freq, self.sparse)
    115 
    116     def extra_repr(self):

/usr/local/lib/python3.6/dist-packages/torch/nn/functional.py in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
   1722         # remove once script supports set_grad_enabled
   1723         _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 1724     return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
   1725 
   1726 

RuntimeError: Expected object of device type cuda but got device type cpu for argument #3 'index' in call to _th_index_select

I just changed your input tensor like this: Input = torch.LongTensor([[1,2,3,4,5],[6,5,5,4,6]]).to(device) and it works. The traceback shows the failure inside F.embedding: the embedding weight is on the GPU (from model.to(device)) but the index tensor is still on the CPU. Note that Tensor.to() is not in-place; Input.to(device) on its own returns a new tensor and leaves Input unchanged, so you have to keep the result, e.g. Input = Input.to(device).
Here is the complete code:

import torch
import numpy as np
import torch.nn as nn

device = 'cuda:0'

batch_size = 20
input_length = 20
output_size = vocab_size = 10000
num_layers = 2
hidden_units = 200
dropout = 0
init_weight = 0.1

class LSTM(nn.Module):

    # constructor
    def __init__(self, vocab_size, input_length, output_size, hidden_dim, num_layers, drop_prob, init_weight):
        super(LSTM, self).__init__() # call to super constructor

        #self.output_size  = output_size
        self.input_length = input_length
        self.num_layers   = num_layers
        self.hidden_dim = hidden_dim
        self.init_weight  = init_weight

        # define layers
        self.embedding = nn.Embedding(num_embeddings = vocab_size, 
                             embedding_dim = input_length) 
        self.lstm = nn.LSTM(input_size = input_length, 
                          hidden_size = hidden_dim,
                          num_layers = num_layers,
                          dropout = drop_prob,
                          batch_first = True, # with batch_first=True, input and output tensors are (batch, seq, feature)
                          bidirectional = False)
        self.fc = nn.Linear(hidden_dim, output_size)
        self.dropout = nn.Dropout(p = drop_prob)
        self.LogSoftMax = nn.LogSoftmax(dim = 2) # out is [batch_size, seq_len, output_size], so normalize over the last (output) dimension


    # define forward function
    def forward(self, x, hidden):
    
        batch_size = x.size(0)
        # input x size  :  [batch_size ,seq_len], note that (seq_len can vary between each mini batch)
        print("x size : " + str(x.size()) ) 

        # hidden size :[num_layers , batch_size , hidden_dim]
        print("hidden_at_time_0 size : ", str(hidden[0].size()) )

        # just to add protection since the input to the embedding should be long.
        x = x.long() 

        embeds = self.embedding(x) 

        # embeds size :[batch_size ,seq_len , input_length]
        # input_length is the size of the LSTM layer input, also called embedding_dim.
        #print("embeds size : ", str(embeds.size()) )

        lstm_out, hidden = self.lstm(embeds, hidden)

        # hidden size :[num_layers , batch_size , hidden_dim]
        #print("hidden size : ", str(hidden[0].size()))

        # lstm_out size :[batch_size , seq_len , hidden_dim] (because batch_first=True)
        #print("lstm_out : ", str(lstm_out.size()))

        out = self.fc(lstm_out)

        # fc out size :[batch_size,seq_length,output_size]
        #print("fc out size : ", str(out.size()))

        out = self.dropout(out)

        # dropout out size :[batch_size,seq_length,output_size]
        #print("dropout out size : ", str(out.size()))

        out=self.LogSoftMax(out)

        # LogSoftMax out size :[batch_size,seq_length,output_size]
        #print("LogSoftMax out size : ", str(out.size()))

        #flatten out
        #out = out.view(batch_size, -1)
        #print("out.view : ", str(out.size()))
        #out = out[:,-1]
        #print("out[:,-1] : ", str(out.size()))
        return out, hidden

    def init_weights_uniform(self, m):
        #print(m)
        for name, param in m.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0.0)
                #print("bias")
            elif 'weight' in name:
                #print("weight")
                param.data.uniform_(- self.init_weight, self.init_weight)
            #print(param)


    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.num_layers, batch_size, self.hidden_dim).zero_(),
                  weight.new(self.num_layers, batch_size, self.hidden_dim).zero_())
        return hidden

batch_size = 1
input_length = 20 # input_length to the LSTM, EmbededDim # TrainDatabase.size(1)
output_size = 10000 #1
layers = 2
decay = 2
hidden_units = 200 # rnn_size
dropout = 0.5
init_weight = 0.1
lr = 1
vocab_size = 1000
max_epoch = 4
max_max_epoch = 13
max_grad_norm = 5


print("Test LSTM model on dummy inputs\n")
# test the LSTM forward pass; this tests the network model
model = LSTM(vocab_size, input_length, output_size, hidden_units, layers, dropout, init_weight)

model.to(device)
# test the LSTM forward pass on a dummy input with batch size 2
# input is batch_size x seq_len
Input = torch.LongTensor([[1,2,3,4,5],[6,5,5,4,6]]).to(device)
batch_size=len(Input)

model.apply(model.init_weights_uniform) # init weights
hidden = model.init_hidden(batch_size)  # init first hidden state

print("Input size " + str(Input.size(0)) + " x " + str(Input.size(1)))
out, hidden = model(Input, hidden)
print(out.size()) # has no real value, since this is an untrained network
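
For what it's worth, the original init_hidden already does the right thing once the model is on the GPU: weight.new(...) creates tensors of the same dtype and device as the parameter, so calling init_hidden after model.to(device) yields CUDA hidden states. The more current equivalent is Tensor.new_zeros, which likewise inherits the parameter's device and dtype; a minimal sketch:

    def init_hidden(self, batch_size):
        weight = next(self.parameters())
        # new_zeros creates zero tensors on the same device and with the same dtype as weight
        hidden = (weight.new_zeros(self.num_layers, batch_size, self.hidden_dim),
                  weight.new_zeros(self.num_layers, batch_size, self.hidden_dim))
        return hidden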