Hi everyone,
I made a model that has:
- an encoder layer (embedding and GRU)
- a linear layer
- a sigmoid activation
The linear layer takes each encoder output concatenated with the last hidden state and makes a binary classification for every encoded token.
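Roughly, I mean something like this (just a minimal sketch to illustrate the setup; the class name, the sizes, and the initHidden() shape here are assumptions, not the exact code):

import torch
import torch.nn as nn
from torch.autograd import Variable

class EncoderRNN(nn.Module):
    # Sketch of the encoder: an embedding layer followed by a single-layer GRU
    def __init__(self, vocab_size, hidden_size, use_cuda=False):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.use_cuda = use_cuda
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        # input: LongTensor of token indices; embed it and run one GRU step
        embedded = self.embedding(input).view(1, -1, self.hidden_size)
        output, hidden = self.gru(embedded, hidden)
        return output[0], hidden

    def initHidden(self):
        hidden = Variable(torch.zeros(1, 1, self.hidden_size))
        return hidden.cuda() if self.use_cuda else hidden

# Classifier: a linear layer over [encoder_output ; last_hidden], then a sigmoid
# self.linear_layer = nn.Linear(hidden_size * 2, 1)
# self.sigmoid = nn.Sigmoid()
# self.criterion = nn.BCELoss()  # assumption: a binary cross-entropy loss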
First I tried this code, which processes one sentence word by word, and it works:
def __one_iter__(self, encoder_input_variable, target_name):
    # Create the initial hidden state of zeros
    encoder_hidden = self.encoder.initHidden()
    # Set the gradients of all model parameters to zero
    self.optimizer.zero_grad()
    input_length = encoder_input_variable.size()[0]
    encoder_outputs = Variable(torch.zeros(input_length, self.encoder.hidden_size))
    encoder_outputs = encoder_outputs.cuda() if self.use_cuda else encoder_outputs
    start_encoder = time.time()
    targets = []
    for ei in range(input_length):
        # Feed the "ei"-th word to the encoder (encoder.forward(., .));
        # it returns the output and the new hidden state
        encoder_output, encoder_hidden = self.encoder.forward(encoder_input_variable[ei], encoder_hidden)
        if ei == 0:
            print('Encoder input size: ', encoder_input_variable[ei].size())
        # Save the output of every word to use it later as input to the classifier
        encoder_outputs[ei] = encoder_output[0]
        token = encoder_input_variable[ei]
        # Wrong named entities get target 0, correct ones get target 1
        if token.equal(target_name):
            targets.append(Variable(torch.FloatTensor([1]), requires_grad=False))
        else:
            targets.append(Variable(torch.FloatTensor([0]), requires_grad=False))
    print("[INFO] Encoder time: {}".format(time.time() - start_encoder))
    last_hidden = encoder_hidden[0]
    classifier_inputs = [torch.cat((x.view(1, -1), last_hidden), dim=1) for x in encoder_outputs]
    loss = None
    ei = 0
    print('Len classifier inputs: ', len(classifier_inputs))
    for input_, target in zip(classifier_inputs, targets):
        # target = Variable(torch.FloatTensor([target]))
        target = target.cuda() if self.use_cuda else target
        output = self.linear_layer(input_)
        output = self.sigmoid(output)
        if ei == 0:
            print('Linear input size: ', input_.size())
            print('Output size: ', output.size())
            print('OUTPUT: ', output)
        ei += 1
        # Accumulate the loss over every token of the sentence
        if loss is None:
            loss = self.criterion(output[0], target)
        else:
            loss += self.criterion(output[0], target)
    loss = loss.cuda() if self.use_cuda else loss
    start = time.time()
    loss.backward()
    print("[INFO] Backward time: {}".format(time.time() - start))
    self.optimizer.step()
    return loss.data[0] / len(targets)
Then, since I wanted to use batches, I edited the above code a little:
def __one_iter__(self, encoder_input_variable, target_name):
    # Create the initial hidden state of zeros
    encoder_hidden = self.encoder.initHidden()
    # Set the gradients of all model parameters to zero
    self.encoder_optimizer.zero_grad()
    input_length = encoder_input_variable.size()[0]
    encoder_outputs = Variable(torch.zeros(input_length, self.encoder.hidden_size))
    encoder_outputs = encoder_outputs.cuda() if self.use_cuda else encoder_outputs
    targets = Variable(torch.zeros(input_length, 1))
    targets = targets.cuda() if self.use_cuda else targets
    start_encoder = time.time()
    for ei in range(0, input_length, self.batch_size):
        # Feed a batch of words to the encoder (encoder.forward(., .));
        # it returns the outputs and the new hidden state
        batch = encoder_input_variable[ei:min(ei + self.batch_size, input_length)]
        # Last batch: pad it so that every batch has the same size
        if ei + self.batch_size > input_length:
            batch = torch.cat([batch, Variable(torch.LongTensor([self.padding_value] * (self.batch_size - input_length % self.batch_size)))], 0)
        encoder_output, encoder_hidden = self.encoder.forward(batch.view(self.batch_size, 1), encoder_hidden)
        if ei == 0:
            print('Encoder input size: ', batch.size())
        # Save the output of every word to use it later as input to the classifier
        # for i in range(len(encoder_output)):
        for i in range(self.batch_size):
            if ei + i < input_length:
                encoder_outputs[ei + i] = encoder_output[i]
                token = encoder_input_variable[ei + i]
                # Wrong named entities get target 0, correct ones get target 1
                if token.equal(target_name):
                    targets[ei + i] = Variable(torch.FloatTensor([1]), requires_grad=False)
                else:
                    targets[ei + i] = Variable(torch.FloatTensor([0]), requires_grad=False)
            else:
                break
print("[INFO] Encoder time: {}".format(time.time() - start_encoder))
dim_encoder_output = self.encoder.hidden_size * 2
last_hidden = encoder_hidden[0]
classifier_inputs = torch.stack([torch.cat((x.view(1, -1), last_hidden), dim=1) for x in encoder_outputs])
classifier_inputs = classifier_inputs.view(input_length, dim_encoder_output)
print('Len classifier inputs: ', len(classifier_inputs))
loss = None
    for ei in range(0, input_length, self.batch_size):
        batch_input = classifier_inputs[ei:min(ei + self.batch_size, input_length)]
        batch_target = targets[ei:min(ei + self.batch_size, input_length)]
        # Only to fix the size of the last batch
        if ei + self.batch_size > input_length:
            batch_input = torch.cat([batch_input, Variable(torch.FloatTensor([[0] * dim_encoder_output] * (self.batch_size - input_length % self.batch_size)))], 0)
            batch_target = torch.cat([batch_target, Variable(torch.FloatTensor([[0]] * (self.batch_size - input_length % self.batch_size)))], 0)
        output = self.linear_layer(batch_input)
        output = self.sigmoid(output)
        if ei == 0:
            print('Linear input size: ', batch_input.size())
            print('Output size: ', output.size())
            print(output)
        # Accumulate the loss over every batch
        if loss is None:
            loss = self.criterion(output, batch_target)
        else:
            loss += self.criterion(output, batch_target)
    loss = loss.cuda() if self.use_cuda else loss
    start = time.time()
    loss.backward()
    print("[INFO] Backward time: {}".format(time.time() - start))
    self.encoder_optimizer.step()
    return loss.data[0] / len(targets)
Now the results are the same, but with the second piece of code the time spent in loss.backward() is almost doubled, and I can't explain why (nor do I know where to look for an answer).
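In case the measurement itself matters: the numbers above come from plain time.time() around loss.backward(). I assume that, when running on the GPU, a torch.cuda.synchronize() before and after is needed to get an accurate number (not completely sure this is the right way), roughly like:

# Sketch of how I would time backward on the GPU (assuming synchronize is needed)
if self.use_cuda:
    torch.cuda.synchronize()  # wait for pending forward kernels before starting the timer
start = time.time()
loss.backward()
if self.use_cuda:
    torch.cuda.synchronize()  # wait for the backward kernels to finish
print("[INFO] Backward time: {}".format(time.time() - start))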
Any help is appreciated! Thanks in advance.