Unable to reproduce saved model

sleebapaul · September 4, 2018, 2:31pm

I’ve forked the official word level language model of PyTorch. I was managed to train the network and arrived at the following result.

Have a look at the code.

def batchify(data, bsz):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)


def get_batch(source, i):
    seq_len = min(seq_length, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].view(-1)
    return data, target


def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)



class RNNModel(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder."""

    def __init__(self, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False):
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)

        self.LSTM = nn.LSTM(ninp, nhid, nlayers, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        self.init_weights()

        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden):
        emb = self.drop(self.encoder(input))
        output, hidden = self.LSTM(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden

    def init_hidden(self, bsz):
        weight = next(self.parameters())

        return (weight.new_zeros(self.nlayers, bsz, self.nhid),
                weight.new_zeros(self.nlayers, bsz, self.nhid))
    

def evaluate(data_source, eval_batch_size):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    ntokens = vocab_size
    hidden = model.init_hidden(eval_batch_size)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, seq_length):
            data, targets = get_batch(data_source, i)
            output, hidden = model(data, hidden)
            output_flat = output.view(-1, ntokens)
            total_loss += len(data) * criterion(output_flat, targets).item()
            hidden = repackage_hidden(hidden)
    return total_loss / len(data_source)


def train():
    # Turn on training mode which enables dropout.
    
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = vocab_size
    hidden = model.init_hidden(batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, seq_length)):
        data, targets = get_batch(train_data, i)
        
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        clip_grad_norm_(model.parameters(), 0.25)
        for p in model.parameters():
            p.data.add_(-learning_rate, p.grad.data)

        total_loss += loss.item()

        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:5.4f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f} |'.format(
                epoch, batch, len(train_data) // seq_length, learning_rate,
                elapsed * 1000 /log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

best_val_loss = None

training_loss = []
validation_loss = []

# At any point you can hit Ctrl + C to break out of training early.
try:
    for epoch in range(1, num_epochs+1):
        epoch_start_time = time.time()
        
        train()
        
        val_loss = evaluate(val_data, eval_batch_size)
        tr_loss = evaluate(train_data, batch_size)
        
        
        training_loss.append(tr_loss)
        validation_loss.append(val_loss)
        
        print('-' * 122)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f} | training loss {:5.2f} | training ppl {:8.2f} |'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss), tr_loss, math.exp(tr_loss)))
        print('-' * 122)
        
        
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            torch.save(model.state_dict(), "model.pt")
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            learning_rate = learning_rate /1.5
except KeyboardInterrupt:
    print('-' * 122)
    print('Exiting from training early')

When I tried to get the saved model to reproduce my results, I got the following result.

End of training | test loss 17.28 | test ppl 32104905.14

The code I used is,

model = RNNModel(ntoken=vocab_size, ninp=embed_size, nhid=hidden_size, nlayers=num_layers, dropout=dropout_value).to(device)
model.load_state_dict(torch.load("model.ckpt"))
criterion = nn.CrossEntropyLoss()

What is wrong here? I’m starting this thread as an extension of the discussion happened here directed by ptrblck

ptrblck · September 4, 2018, 2:36pm

Did you just call evaluate after loading the model and got this result?

sleebapaul · September 4, 2018, 2:37pm

That’s right.

# Run on test data.
test_loss = evaluate(test_data, eval_batch_size)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

sleebapaul · September 4, 2018, 2:41pm

Do I need to keep the same word2idx and idx2word ?
Is it a problem if I call model.eval() globally before evaluate though it is called inside the function?

ptrblck · September 4, 2018, 2:49pm

I think you should use the same transformation. Could you check it?
The eval() call should be alright, as long as you don’t call .train() afterwards.

sleebapaul · September 5, 2018, 6:17am

I think I’ve committed a mistake by setting no random seed. Now it’s set, I’m training the network. Hopefully the problem will be solved.

domantasjurkus · March 20, 2019, 11:54am

Interested to hear if you solved the problem. Tried setting all parameters as suggested here (during training and during validation), but accuracies still differ substantially between training+saving and loading+validating the model.