Unable to reproduce saved model

I’ve forked the official word-level language model example from PyTorch and managed to train the network. Have a look at the code.

import math
import time

import torch
import torch.nn as nn
from torch.nn.utils import clip_grad_norm_

# Use the GPU when available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def batchify(data, bsz):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)

def get_batch(source, i):
    seq_len = min(seq_length, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].view(-1)
    return data, target

def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)

class RNNModel(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder."""

    def __init__(self, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False):
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.LSTM = nn.LSTM(ninp, nhid, nlayers, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        # Optionally tie the input and output embeddings (requires nhid == ninp).
        if tie_weights:
            if nhid != ninp:
                raise ValueError('When using the tied flag, nhid must be equal to ninp')
            self.decoder.weight = self.encoder.weight

        self.init_weights()

        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden):
        emb = self.drop(self.encoder(input))
        output, hidden = self.LSTM(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden

    def init_hidden(self, bsz):
        weight = next(self.parameters())

        return (weight.new_zeros(self.nlayers, bsz, self.nhid),
                weight.new_zeros(self.nlayers, bsz, self.nhid))

def evaluate(data_source, eval_batch_size):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    ntokens = vocab_size
    hidden = model.init_hidden(eval_batch_size)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, seq_length):
            data, targets = get_batch(data_source, i)
            output, hidden = model(data, hidden)
            output_flat = output.view(-1, ntokens)
            total_loss += len(data) * criterion(output_flat, targets).item()
            hidden = repackage_hidden(hidden)
    return total_loss / len(data_source)

def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = vocab_size
    hidden = model.init_hidden(batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, seq_length)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        clip_grad_norm_(model.parameters(), 0.25)
        # Manual SGD step (no optimizer object is used here).
        for p in model.parameters():
            p.data.add_(p.grad.data, alpha=-learning_rate)

        total_loss += loss.item()

        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:5.4f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f} |'.format(
                epoch, batch, len(train_data) // seq_length, learning_rate,
                elapsed * 1000 /log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

best_val_loss = None

training_loss = []
validation_loss = []

# At any point you can hit Ctrl + C to break out of training early.
try:
    for epoch in range(1, num_epochs+1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data, eval_batch_size)
        tr_loss = evaluate(train_data, batch_size)
        validation_loss.append(val_loss)
        training_loss.append(tr_loss)
        print('-' * 122)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
              'valid ppl {:8.2f} | training loss {:5.2f} | training ppl {:8.2f} |'.format(
                  epoch, (time.time() - epoch_start_time),
                  val_loss, math.exp(val_loss), tr_loss, math.exp(tr_loss)))
        print('-' * 122)
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            torch.save(model.state_dict(), "model.pt")
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            learning_rate = learning_rate / 1.5
except KeyboardInterrupt:
    print('-' * 122)
    print('Exiting from training early')

However, when I tried to reproduce my results with the saved model, I got the following output.

End of training | test loss 17.28 | test ppl 32104905.14

The code I used is:

model = RNNModel(ntoken=vocab_size, ninp=embed_size, nhid=hidden_size, nlayers=num_layers, dropout=dropout_value).to(device)
criterion = nn.CrossEntropyLoss()
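
For completeness, the checkpoint is restored roughly like this before calling evaluate (a minimal sketch, assuming model.pt holds the state_dict saved by the training loop above):

# Restore the trained weights into the freshly constructed model.
model.load_state_dict(torch.load("model.pt"))
model.eval()  # disable dropout before evaluating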

What is wrong here? I’m starting this thread as an extension of the discussion that happened here, as directed by ptrblck.

Did you just call evaluate after loading the model and get this result?

That’s right.

# Run on test data.
test_loss = evaluate(test_data, eval_batch_size)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

Do I need to keep the same word2idx and idx2word?
Is it a problem if I call model.eval() globally before evaluate, even though it is called inside the function?

I think you should use the same transformation. Could you check it?
The eval() call should be alright, as long as you don’t call .train() afterwards.
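
As a minimal sketch, you could persist the mappings next to the checkpoint and restore them before evaluating, so the indices match the trained embedding rows (vocab.pkl is just an assumed file name):

import pickle

# After training: persist the mappings that were used to numericalize the text.
with open("vocab.pkl", "wb") as f:
    pickle.dump({"word2idx": word2idx, "idx2word": idx2word}, f)

# Before evaluating a reloaded model: restore the very same mappings
# instead of rebuilding them from the corpus.
with open("vocab.pkl", "rb") as f:
    vocab = pickle.load(f)
word2idx, idx2word = vocab["word2idx"], vocab["idx2word"]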


I think I made a mistake by not setting a random seed. Now that it’s set, I’m retraining the network. Hopefully the problem will be solved.
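
The seeding I added is along these lines (roughly; the CUDA call only matters when training on a GPU):

import random
import numpy as np
import torch

seed = 1111
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)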

I’m interested to hear whether you solved the problem. I tried setting all the parameters as suggested here (during both training and validation), but accuracies still differ substantially between training+saving and loading+validating the model.