Difference in model performance when using pack_padded_sequence

Here is my code for some toy data:

# packed_sequence = nn.utils.rnn.pack_padded_sequence(inputs, [10, 9, 8, 7, 5], batch_first=True)
# print(packed_sequence)
# unpacked_tensor, lengths = nn.utils.rnn.pad_packed_sequence(packed_sequence, batch_first=True)
# print(unpacked_tensor)

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
from random import randint
import sys

use_cuda = False


class SentenceEncoderRNN(nn.Module):
    def __init__(self, input_dim, embed_dim, hidden_dim, embeddings=None, fine_tune_embeddings=True, variable_lengths=True, bidirectional=True):
        super(SentenceEncoderRNN, self).__init__()
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.variable_lengths = variable_lengths
        self.bidirectional = bidirectional
        self.embedding = nn.Embedding(input_dim, embed_dim, padding_idx=0)
        # self.embedding.weight = nn.Parameter(torch.from_numpy(embeddings).float())
        # self.embedding.weight.requires_grad = fine_tune_embeddings
        self.gru = nn.GRU(embed_dim, hidden_dim, bidirectional=bidirectional, batch_first=True)

    def forward(self, inputs, lengths):
        embedded = self.embedding(inputs)
        if self.variable_lengths:
            embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths, batch_first=True)
        all_outs, hidden = self.gru(embedded)
        return all_outs, hidden


class Classifier(nn.Module):
    """docstring for Classifier"""

    def __init__(self, encoder, hidden_dim):
        super(Classifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.encoder = encoder
        self.Wc = nn.Linear(hidden_dim, 1)

    def forward(self, inputs, lengths):
        outputs, hidden = self.encoder(inputs, lengths)
        if self.encoder.variable_lengths:
            outputs, lengths = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True)
        last_outputs = outputs[:, -1, :]
        temp = self.Wc(last_outputs)
        prob = F.sigmoid(temp)
        return prob

    def predict(self, inputs, lengths):
        return self.forward(inputs, lengths).data


def train(encoder, clfr, optimizer, inputs, lengths, targets, batch_size):
    epochs = 1000
    copy_epochs = epochs
    loss_function = nn.MSELoss()
    while copy_epochs > 0:
        optimizer.zero_grad()
        outputs = clfr(inputs, lengths)
        loss = loss_function(outputs, targets)
        loss.backward()
        optimizer.step()
        copy_epochs -= 1
        if copy_epochs % 100 == 0:
            print(loss.data[0])


def prepare_data(batch_size, max_timesteps, vocab_size):
    inputs = torch.LongTensor(batch_size, max_timesteps).zero_()
    lengths = [10, 9, 7, 7, 5]
    targets = Variable(torch.Tensor([1, 0, 0, 0, 1]))
    for i in range(batch_size):
        for j in range(max_timesteps):
            if j == lengths[i]:
                break
            inputs[i, j] = randint(1, vocab_size - 1)
    inputs = Variable(inputs)
    return inputs, lengths, targets


if __name__ == '__main__':
    variable_lengths = True # or False
    print('Variable lengths:', variable_lengths)
    batch_size, max_timesteps, vocab_size, embed_dim, hidden_dim = 5, 10, 20, 15, 7
    bidirectional = False
    if bidirectional:
        num_dir = 2
    else:
        num_dir = 1

    inputs, lengths, targets = prepare_data(batch_size, max_timesteps, vocab_size)
    print('Inputs:')
    print(inputs)
    encoder = SentenceEncoderRNN(vocab_size, embed_dim, hidden_dim, variable_lengths=variable_lengths, bidirectional=bidirectional)
    clfr = Classifier(encoder, hidden_dim * num_dir)
    optimizer = torch.optim.SGD(clfr.parameters(), lr=0.01, momentum=0.9)
    print('Loss:')
    train(encoder, clfr, optimizer, inputs, lengths, targets, batch_size)
    print('Predictions:')
    print(clfr.predict(inputs, lengths))

Here are the outputs.

Without using pack_padded_sequence:

Variable lengths: False
Inputs:
Variable containing:
4 11 6 10 5 17 1 1 3 9
18 4 8 19 19 14 1 18 11 0
19 3 8 3 10 17 12 0 0 0
5 19 9 6 13 2 17 0 0 0
6 14 2 18 1 0 0 0 0 0
[torch.LongTensor of size 5x10]

Loss:
0.19118347764015198
0.15429019927978516
0.1406058967113495
0.11726896464824677
0.05257026106119156
0.013342835009098053
0.005831681191921234
0.003512600902467966
0.0024605693761259317
0.001874120207503438
Predictions:

0.9772
0.0425
0.0303
0.0413
0.9337
[torch.FloatTensor of size 5x1]

Using pack_padded_sequence:

Variable lengths: True
Inputs:
Variable containing:
9 10 12 11 5 1 7 5 16 16
16 18 15 13 12 17 16 16 16 0
5 2 17 3 19 2 5 0 0 0
18 10 16 16 15 7 3 0 0 0
17 15 2 5 11 0 0 0 0 0
[torch.LongTensor of size 5x10]

Loss:
0.1822376549243927
0.16268190741539001
0.1562534123659134
0.15366964042186737
0.1524355411529541
0.1517626792192459
0.15135648846626282
0.15109115839004517
0.15090689063072205
0.15077264606952667
Predictions:

0.9393
0.2565
0.2565
0.2565
0.2565
[torch.FloatTensor of size 5x1]

The input to my model is a batch of sequences of variable lengths.
I am getting significantly lower performance when I use pack_padded_sequence (to handle the variable-length input automatically).
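
For reference, here is a minimal standalone sketch (separate from the script above, with a made-up toy tensor and lengths) of how I understand pack_padded_sequence / pad_packed_sequence to behave; please point out if this understanding is wrong:

import torch
import torch.nn as nn
from torch.autograd import Variable

# Toy batch: 3 sequences padded to length 4, sorted by decreasing length
# (pack_padded_sequence expects descending lengths), feature size 1.
padded = Variable(torch.Tensor([[1, 2, 3, 4],
                                [5, 6, 7, 0],
                                [8, 9, 0, 0]]).unsqueeze(2))   # shape (3, 4, 1)
lengths = [4, 3, 2]

packed = nn.utils.rnn.pack_padded_sequence(padded, lengths, batch_first=True)
unpacked, out_lengths = nn.utils.rnn.pad_packed_sequence(packed, batch_first=True)

print(unpacked)       # same shape as `padded`
print(out_lengths)    # the original lengths, [4, 3, 2]

This is the same kind of round trip as the commented-out lines at the top of my script; as far as I can tell, positions past each sequence's length come back as zeros after unpacking (padding_value defaults to 0).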

Is there any bug in my implementation? Can anyone explain why this may be happening?

Thanks!