This is the model I have defined:
import torch
import torch.nn as nn

class LSTM(nn.Module):
    # constructor
    def __init__(self, vocab_size, input_length, output_size, hidden_dim, num_layers, drop_prob, init_weight):
        super(LSTM, self).__init__()  # call to super constructor
        #self.output_size = output_size
        self.input_length = input_length
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.init_weight = init_weight
        # define layers
        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=input_length)
        self.lstm = nn.LSTM(input_size=input_length,
                            hidden_size=hidden_dim,
                            num_layers=num_layers,
                            dropout=drop_prob,
                            batch_first=True,   # with batch_first=True the input and output tensors are (batch, seq, feature)
                            bidirectional=False)
        self.fc = nn.Linear(hidden_dim, output_size)
        self.dropout = nn.Dropout(p=drop_prob)
        self.LogSoftMax = nn.LogSoftmax(dim=1)  # normalizes over dim 1; the output keeps the same shape as its input
    # define forward function
    def forward(self, x, hidden):
        batch_size = x.size(0)
        # input x size: [batch_size, seq_len] (seq_len can vary between mini batches)
        print("x size : " + str(x.size()))
        # hidden size: [num_layers, batch_size, hidden_dim]
        print("hidden_at_time_0 size : ", str(hidden[0].size()))
        # protection, since the input to the embedding should be long
        x = x.long()
        embeds = self.embedding(x)
        # embeds size: [batch_size, seq_len, input_length]
        # input_length is the size of the LSTM layer input, also called embedding_dim
        #print("embeds size : ", str(embeds.size()))
        lstm_out, hidden = self.lstm(embeds, hidden)
        # hidden size: [num_layers, batch_size, hidden_dim]
        #print("hidden size : ", str(hidden[0].size()))
        # lstm_out size: [batch_size, seq_len, hidden_dim] (because batch_first=True)
        #print("lstm_out : ", str(lstm_out.size()))
        out = self.fc(lstm_out)
        # fc out size: [batch_size, seq_len, output_size]
        #print("fc out size : ", str(out.size()))
        out = self.dropout(out)
        # dropout out size: [batch_size, seq_len, output_size]
        #print("dropout out size : ", str(out.size()))
        out = self.LogSoftMax(out)
        # LogSoftMax out size: [batch_size, seq_len, output_size]
        #print("LogSoftMax out size : ", str(out.size()))
        # flatten out
        #out = out.view(batch_size, -1)
        #print("out.view : ", str(out.size()))
        #out = out[:,-1]
        #print("out[:,-1] : ", str(out.size()))
        return out, hidden
    def init_weights_uniform(self, m):
        #print(m)
        for name, param in m.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0.0)
                #print("bias")
            elif 'weight' in name:
                #print("weight")
                param.data.uniform_(-self.init_weight, self.init_weight)
                #print(param)

    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.num_layers, batch_size, self.hidden_dim).zero_(),
                  weight.new(self.num_layers, batch_size, self.hidden_dim).zero_())
        return hidden
and I use it with:

batch_size = 20
input_length = 20
output_size = vocab_size = 10000
num_layers = 2
hidden_units = 200
dropout = 0
init_weight = 0.1
I found that when I try to train it in Colab I always get runtime crashes, so I created this small test:
print("Test LSTM model on dummy inputs\n")
# test the LSTM forward feed with 1 batch, this test the Network model
model = LSTM(Myparams.vocab_size,Myparams.input_length, Myparams.output_size, Myparams.hidden_units, Myparams.layers, Myparams.dropout,Myparams.init_weight)
model.to(device)
# test the LSTM forward feed with 2 batch, on dummy input
# if input is batch_size x seq_len
Input = torch.LongTensor([[1,2,3,4,5],[6,5,5,4,6]])
batch_size=len(Input)
model.apply(model.init_weights_uniform) # init weights
hidden = model.init_hidden(batch_size) # init first hidden stage
print("Input size " + str(Input.size(0)) + " x " + str(Input.size(1)))
out, hidden=model(Input,hidden)
print(out.size()) #has no real value, since this untrained network
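If I have the shapes right, I would expect out.size() here to be [2, 5, 10000], i.e. batch_size x seq_len x output_size.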
But when I ran it I received:

Expected object of device type cuda but got device type cpu for argument #3 'index' in call to _th_index_select

on the line out, hidden = model(Input, hidden), so I guess that hidden is still on the CPU and not on the GPU (how can I check this?).
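For reference, I was thinking of checking it by printing the .device attribute of each tensor (not sure if that is the right way):

print(Input.device)      # where the dummy input lives
print(hidden[0].device)  # where h_0 lives
print(hidden[1].device)  # where c_0 lives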
So I tried doing:
def init_hidden(self, batch_size):
    weight = next(self.parameters()).data
    hidden = (weight.new(self.num_layers, batch_size, self.hidden_dim).zero_().to(device),
              weight.new(self.num_layers, batch_size, self.hidden_dim).zero_()).to(device)
    return hidden
but now I get the error

'tuple' object has no attribute 'to'

How can I solve this?
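Would moving each tensor of the tuple to the device separately be the right approach? Something like this (untested guess, with device being the same torch.device I use for the model):

def init_hidden(self, batch_size):
    weight = next(self.parameters()).data
    h_0 = weight.new(self.num_layers, batch_size, self.hidden_dim).zero_().to(device)  # move h_0 to the model's device
    c_0 = weight.new(self.num_layers, batch_size, self.hidden_dim).zero_().to(device)  # move c_0 to the same device
    return (h_0, c_0)

Or is there a cleaner way to keep the hidden state on the same device as the model?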