How to save checkpoints correctly

I’m training a GRU model in PyTorch for time series forecasting.

This is my model:

import numpy as np
import torch
import torch.nn as nn


class GRU(nn.Module):

    def __init__(self, args, input_dim):
        super(GRU, self).__init__()
        self.args = args
        self.hidden_dim = args.hidden_units1
        self.input_dim = input_dim
        self.output_dim = args.pred_len
        self.layer_dim = args.num_layers
        # GRU layers (bidirectional, so the linear head sees 2 * hidden_dim features)
        self.gru = nn.GRU(
            self.input_dim, self.hidden_dim, self.layer_dim,
            batch_first=True, dropout=args.dropout, bidirectional=True)
        self.fc = nn.Linear(self.hidden_dim * 2, self.output_dim)

    def forward(self, x, eva=False): ....
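
For reference, args is a namespace carrying the hyperparameters used above; a minimal sketch of how the model gets built (the values here are just placeholders):

from argparse import Namespace

# Placeholder hyperparameters; the real values come from my argument parser.
args = Namespace(hidden_units1=64, pred_len=24, num_layers=2, dropout=0.2)
model = GRU(args, input_dim=2)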

My trainer class:

# device, MSE and save_checkpoint are defined elsewhere in my code.
class TorchTrainer:
    def __init__(self, model, loss_fn, optimizer):
        self.model = model
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.train_losses = []
        self.val_losses = []

    def train_step(self, x, y):
        self.model.train()
        yhat = self.model(x)
        loss = self.loss_fn(y, yhat)
        loss.backward()
        self.optimizer.step()
        self.optimizer.zero_grad()
        return loss.item()

    def train(self, train_loader, val_loader, batch_size, n_epochs, n_features, result_path, best_loss=5):
        for epoch in range(1, n_epochs + 1):
            self.optimizer.zero_grad()
            batch_losses = []
            for x_batch, y_batch in train_loader:
                x_batch = x_batch.view([batch_size, -1, n_features]).to(device)
                y_batch = y_batch.to(device)
                loss = self.train_step(x_batch, y_batch)
                batch_losses.append(loss)
            training_loss = np.mean(batch_losses)
            self.train_losses.append(training_loss)

            with torch.no_grad():
                batch_val_losses = []
                for x_val, y_val in val_loader:
                    x_val = x_val.view([batch_size, -1, n_features]).to(device)
                    y_val = y_val.to(device)
                    self.model.eval()
                    yhat = self.model(x_val)
                    predictions = yhat.detach().cpu().numpy()
                    val_loss = self.loss_fn(y_val, yhat).item()
                    batch_val_losses.append(val_loss)

                validation_loss = np.mean(batch_val_losses)
                self.val_losses.append(validation_loss)

            mse = MSE(y_val.detach().cpu().numpy(), predictions)

            if mse <= best_loss:
                best_loss = mse
                save_checkpoint(self.model)

            print("epoch : {} , t_loss : {} , v_loss : {} , best loss : {}".format(
                epoch, training_loss, validation_loss, best_loss))


    def evaluate(self, test_loader, batch_size=1, n_features=2):

        with torch.no_grad():
            preds = []
            trues = []
            for x_test, y_test in test_loader:
                x_test = x_test.view([batch_size, -1, n_features]).to(device)
                y_test = y_test.to(device)
                self.model.eval()
                yhat = self.model(x_test, True)
                preds.append(yhat.detach().cpu().numpy())
                trues.append(y_test.detach().cpu().numpy())

        preds = np.array(preds)
        trues = np.array(trues)
        return preds, trues
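
For context, the trainer is wired up roughly like this (the loss, optimizer and concrete values below are placeholders; the data loaders come from my preprocessing code, which is not shown):

loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

trainer = TorchTrainer(model.to(device), loss_fn, optimizer)
trainer.train(train_loader, val_loader, batch_size=64, n_epochs=50,
              n_features=2, result_path="results/")
preds, trues = trainer.evaluate(test_loader, batch_size=1, n_features=2)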

I want to save a model checkpoint every time the model achieves a new best performance, to ensure that I will have the best-performing model even if training is interrupted or if overfitting occurs later in the training process.

To do so, I added these lines:

mse = MSE(y_val.detach().cpu().numpy(), predictions)

if mse <= best_loss:
    best_loss = mse
    save_checkpoint(self.model)
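
Here save_checkpoint is a small helper (not shown above); a minimal sketch of what it could look like, where the file path and the extra stored fields are just placeholders:

import torch

def save_checkpoint(model, optimizer=None, epoch=None, best_loss=None, path="best_model.pt"):
    # Persist the learnable parameters (plus optional optimizer state and bookkeeping
    # needed to resume training); the architecture itself is rebuilt from code.
    state = {"model_state_dict": model.state_dict()}
    if optimizer is not None:
        state["optimizer_state_dict"] = optimizer.state_dict()
    if epoch is not None:
        state["epoch"] = epoch
    if best_loss is not None:
        state["best_loss"] = best_loss
    torch.save(state, path)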

But I’m not sure if I’m doing this correctly, because the best_valid values I get in the training output are not stable.

Yes, this looks syntactically correct.


@Mohammed_Alruqimi I guess @Aniruth_Sundararaja1 has already checked it syntactically.

What do you mean by

"the best_valid values I get in the training output are not stable"?

You can do the following to get a better answer:

  • From your subsequent model snapshots, re-generate the MSE values and confirm whether they are declining (see the sketch after this list).

  • Are you using some other metric later on to find out whether the output values are good?
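
For the first point, a rough sketch of re-evaluating a snapshot on the full validation set (this assumes each snapshot was saved as a dict with a "model_state_dict" entry; adjust the loading line if you saved the raw state_dict instead):

import numpy as np
import torch

def evaluate_snapshot(path, model, val_loader, loss_fn, batch_size, n_features, device):
    # Reload the saved weights and recompute the validation loss over the whole loader.
    checkpoint = torch.load(path, map_location=device)
    model.load_state_dict(checkpoint["model_state_dict"])
    model.eval()
    losses = []
    with torch.no_grad():
        for x_val, y_val in val_loader:
            x_val = x_val.view([batch_size, -1, n_features]).to(device)
            y_val = y_val.to(device)
            yhat = model(x_val)
            losses.append(loss_fn(y_val, yhat).item())
    return float(np.mean(losses))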
