loss.backward() slower when using batches

Hi everyone,

I made a model that has:

  • an encoder layer (embedding and GRU)
  • a linear layer
  • sigmoid function

The linear layer takes each encoder output concatenated with the last hidden state and makes a binary classification per encoded output (a rough sketch of this setup is just below).
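
For reference, this is roughly how the pieces are wired up (a minimal sketch with placeholder sizes, using the same old-style Variable API as the code below; my real encoder has a few more details):

    import torch
    import torch.nn as nn
    from torch.autograd import Variable

    class EncoderRNN(nn.Module):
        def __init__(self, vocab_size, hidden_size):
            super(EncoderRNN, self).__init__()
            self.hidden_size = hidden_size
            self.embedding = nn.Embedding(vocab_size, hidden_size)
            self.gru = nn.GRU(hidden_size, hidden_size)

        def forward(self, word_input, hidden):
            # Embed the token indices and run them through the GRU as one time step
            embedded = self.embedding(word_input).view(1, -1, self.hidden_size)
            output, hidden = self.gru(embedded, hidden)
            return output, hidden

        def initHidden(self):
            # One sentence at a time: hidden state is (num_layers, batch, hidden_size)
            return Variable(torch.zeros(1, 1, self.hidden_size))

    # Classifier: each encoder output, concatenated with the last hidden state
    # (2 * hidden_size values in total), goes through a linear layer and a sigmoid
    hidden_size = 256                             # placeholder value
    linear_layer = nn.Linear(hidden_size * 2, 1)
    sigmoid = nn.Sigmoid()
    criterion = nn.BCELoss()                      # assumption: a BCE-style criterion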

First I tried this code for a single sentence, and it works:

    def __one_iter__(self, encoder_input_variable, target_name):

        # Create the initial hidden state (zeros)
        encoder_hidden = self.encoder.initHidden()

        # Sets the gradient of all parameters of the model to zero
        self.optimizer.zero_grad()
        input_length = encoder_input_variable.size()[0]

        encoder_outputs = Variable(torch.zeros(input_length, self.encoder.hidden_size))
        encoder_outputs = encoder_outputs.cuda() if self.use_cuda else encoder_outputs

        start_encoder = time.time()
        targets = []
        for ei in range(input_length):
            # Take the "ei" word and give it to the encoder (encoder.forward(., .)).
            # It gives you the output and the new hidden state
            encoder_output, encoder_hidden = self.encoder.forward(encoder_input_variable[ei], encoder_hidden)
            if ei == 0:
                print('Encoder input size: ', encoder_input_variable[ei].size())

            # In this model I need to save the output of every word to use as input to the classifier
            encoder_outputs[ei] = encoder_output[0]

            token = encoder_input_variable[ei]
            # Wrong named entities get target 0, the right ones get target 1
            if token.equal(target_name):
                targets.append(Variable(torch.FloatTensor([1]), requires_grad=False))
            else:
                targets.append(Variable(torch.FloatTensor([0]), requires_grad=False))

        print("[INFO] Encoder time: {}".format(time.time() - start_encoder))

        last_hidden = encoder_hidden[0]

        classifier_inputs = [torch.cat((x.view(1, -1), last_hidden), dim=1) for x in encoder_outputs]

        loss = None
        ei = 0
        print('Len classifier inputs: ', len(classifier_inputs))
        for input_, target in zip(classifier_inputs, targets):
            # target = Variable(torch.FloatTensor([target]))
            target = target.cuda() if self.use_cuda else target
            output = self.linear_layer(input_)
            output = self.sigmoid(output)
            if ei == 0:
                print('Linear input size: ', input_.size())
                print('Output size: ', output.size())
                print('OUTPUT: ', output)
                ei += 1

            if loss is None:
                loss = self.criterion(output[0], target)

            else:
                loss += self.criterion(output[0], target)

        loss = loss.cuda() if self.use_cuda else loss

        start = time.time()
        loss.backward()
        print("[INFO] Backward time: {}".format(time.time() - start))

        self.optimizer.step()

        return loss.data[0] / len(targets)
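
For context, this is roughly how I call it during training (simplified; n_epochs, training_pairs and trainer are just placeholders for my actual setup):

    # Hypothetical outer training loop around __one_iter__ (sketch only)
    for epoch in range(n_epochs):
        total_loss = 0.0
        for sentence_variable, target_name in training_pairs:
            total_loss += trainer.__one_iter__(sentence_variable, target_name)
        print('Epoch {}: average loss {:.4f}'.format(epoch, total_loss / len(training_pairs)))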

Then, since I wanted to use batches, I edited the above code a little:

    def __one_iter__(self, encoder_input_variable, target_name):

        # Create the initial hidden state (zeros)
        encoder_hidden = self.encoder.initHidden()

        # Sets the gradient of all parameters of the model to zero
        self.encoder_optimizer.zero_grad()
        input_length = encoder_input_variable.size()[0]

        encoder_outputs = Variable(torch.zeros(input_length, self.encoder.hidden_size))
        encoder_outputs = encoder_outputs.cuda() if self.use_cuda else encoder_outputs

        targets = Variable(torch.zeros(input_length, 1))
        targets = targets.cuda() if self.use_cuda else targets

        start_encoder = time.time()
        for ei in range(0, input_length, self.batch_size):
            # Take the current batch of words and give it to the encoder.
            # It returns the batch of outputs and the new hidden state

            batch = encoder_input_variable[ei:min(ei + self.batch_size, input_length)]
            # Last batch: pad with padding_value so it still has exactly batch_size tokens
            if ei + self.batch_size > input_length:
                pad = Variable(torch.LongTensor([self.padding_value] * (self.batch_size - input_length % self.batch_size)))
                pad = pad.cuda() if self.use_cuda else pad
                batch = torch.cat([batch, pad], 0)

            encoder_output, encoder_hidden = self.encoder.forward(batch.view(self.batch_size, 1), encoder_hidden)
            if ei == 0:
                print('Encoder input size: ', batch.size())

            # In this model I need to save the output of every word to use as input to the classifier
            # for i in range(len(encoder_output)):
            for i in range(self.batch_size):
                if ei + i < input_length:
                    encoder_outputs[ei + i] = encoder_output[i]

                    token = encoder_input_variable[ei + i]
                    # Wrong named entities get target 0, the right ones get target 1
                    if token.equal(target_name):
                        targets[ei + i] = Variable(torch.FloatTensor([1]), requires_grad=False)
                    else:
                        targets[ei + i] = Variable(torch.FloatTensor([0]), requires_grad=False)

                else:
                    break

        print("[INFO] Encoder time: {}".format(time.time() - start_encoder))

        dim_encoder_output = self.encoder.hidden_size * 2

        last_hidden = encoder_hidden[0]
        classifier_inputs = torch.stack([torch.cat((x.view(1, -1), last_hidden), dim=1) for x in encoder_outputs])
        classifier_inputs = classifier_inputs.view(input_length, dim_encoder_output)

        print('Len classifier inputs: ', len(classifier_inputs))
        loss = None
        for ei in range(0, input_length, self.batch_size):
            batch_input = classifier_inputs[ei:min(ei+self.batch_size, input_length)]
            batch_target = targets[ei:min(ei+self.batch_size, input_length)]
            
            # Pad the last batch up to batch_size with zero rows
            if ei + self.batch_size > input_length:
                pad_len = self.batch_size - input_length % self.batch_size
                input_pad = Variable(torch.zeros(pad_len, dim_encoder_output))
                target_pad = Variable(torch.zeros(pad_len, 1))
                if self.use_cuda:
                    input_pad, target_pad = input_pad.cuda(), target_pad.cuda()
                batch_input = torch.cat([batch_input, input_pad], 0)
                batch_target = torch.cat([batch_target, target_pad], 0)

            output = self.linear_layer(batch_input)
            output = self.sigmoid(output)
            if ei == 0:
                print('Linear input size: ', batch_input.size())
                print('Output size: ', output.size())
                print(output)

            if loss is None:
                loss = self.criterion(output, batch_target)

            else:
                loss += self.criterion(output, batch_target)

        loss = loss.cuda() if self.use_cuda else loss

        start = time.time()
        loss.backward()
        print("[INFO] Backward time: {}".format(time.time() - start))

        self.encoder_optimizer.step()

        return loss.data[0] / len(targets)
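
(For completeness, the whole classifier loop could presumably be collapsed into a single call over all positions, something like the sketch below, assuming self.criterion is a BCE-style loss applied to the whole batch at once; the code above keeps the explicit mini-batch loop.)

    # Sketch only: one linear/sigmoid/criterion call over every position.
    # classifier_inputs is (input_length, 2 * hidden_size), targets is (input_length, 1).
    outputs = self.sigmoid(self.linear_layer(classifier_inputs))
    loss = self.criterion(outputs, targets)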

Now the results are the same, but with the second piece of code loss.backward() takes almost twice as long, and I can't explain why (nor do I know where to look for an answer).
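
One thing I am not sure about is the timing itself: I only wrap loss.backward() with time.time(). If CUDA asynchrony could be skewing the numbers, the measurement could be made stricter along these lines (a sketch, assuming use_cuda is set):

    # Synchronize before and after so the timer only measures backward()
    if self.use_cuda:
        torch.cuda.synchronize()
    start = time.time()
    loss.backward()
    if self.use_cuda:
        torch.cuda.synchronize()
    print("[INFO] Backward time: {}".format(time.time() - start))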

A little help is appreciated! Thanks in advance