Results for simple ANN not reproducible between runs despite seeds set

Hi everybody,

unfortunately, I am not able to reproduce results between different runs of the same script on the same machine. The results are identical within the for loop, but when I start the script again, they differ. All seeds are set (multiple times), I am not shuffling in the data loader, and I have spent a lot of time debugging and inspecting the inputs, weights, etc., so it would be great if you could help me.

Here is my code. Unfortunately, I am not able to share the data, so I cannot provide a complete working example.

import configparser
import os
import sys
import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import sklearn.preprocessing
import datetime
import pandas as pd

from training import TrainHelper, ModelsANN, ModelsBaseClass

class ANN(torch.nn.Module):

    def __init__(self, n_feature: int, n_hidden: int, num_hidden_layer: int, n_output: int = 1,
                 dropout_rate: float = 0.0):
        super(ANN, self).__init__()
        TrainHelper.init_pytorch_seeds()
        self.hidden_layer = nn.ModuleList()
        hidden_in = n_feature
        hidden_out = n_hidden
        for layer_num in range(num_hidden_layer):
            self.hidden_layer.append(nn.Linear(in_features=hidden_in, out_features=hidden_out))
            hidden_in = hidden_out
            hidden_out = int(hidden_in / 2)
        self.output_layer = nn.Linear(in_features=hidden_in, out_features=n_output)
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward(self, x):
        TrainHelper.init_pytorch_seeds()
        for layer in self.hidden_layer:
            x = F.relu(layer(x))
            x = self.dropout(x)
        out = self.output_layer(x)
        return out

# get optim parameters
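# note: config, company, target_column, split_perc and period come from the surrounding setup (not shown here)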
base_dir, seasonal_periods, split_perc, init_train_len, test_len, resample_weekly = \
    TrainHelper.get_optimization_run_parameters(config=config, company=company, target_column=target_column,
                                                split_perc=split_perc, period=period)

# load datasets
datasets = TrainHelper.load_datasets(config=config, company=company, target_column=target_column, period=period)

dataset = datasets[0]

train_test_list = TrainHelper.get_ready_train_test_lst(dataset=dataset, config=config,
                                                       init_train_len=init_train_len,
                                                       test_len=test_len, split_perc=split_perc,
                                                       imputation='mean',
                                                       target_column='CutFlowers',
                                                       dimensionality_reduction=None,
                                                       featureset='full')

pred_list = []
inst_list = []
models_list = []
for diff_run in range(0, 3):
    TrainHelper.init_pytorch_seeds()
    for train, test in train_test_list:
        model = ANN(n_feature=train.shape[1] - 1, n_hidden=10, num_hidden_layer=1, dropout_rate=0)
        batch_size = 4
        learning_rate = 1e-1
        epochs = 10000
        min_val_loss_improvement = 100
        max_epochs_wo_improvement = 20
        x_scaler = sklearn.preprocessing.StandardScaler()
        y_scaler = sklearn.preprocessing.StandardScaler()
        valid_size = 0.2
        split_ind = int(train.shape[0] * (1 - valid_size))
        train_data = train.iloc[:split_ind]
        valid_data = train.iloc[split_ind:]
        # scale input data
        x_train = x_scaler.fit_transform(train_data.drop(target_column, axis=1))
        x_valid = x_scaler.transform(valid_data.drop(target_column, axis=1))
        # create train ready data
        x_train = torch.tensor(x_train.astype(np.float32))
        x_valid = torch.tensor(x_valid.astype(np.float32))
        y_train = torch.tensor(data=train_data[target_column].values.reshape(-1, 1).astype(np.float32))
        y_valid = torch.tensor(data=valid_data[target_column].values.reshape(-1, 1).astype(np.float32))
        # worker_init_fn must be a callable; passing np.random.seed(0) would call it once and hand over None
        train_loader = torch.utils.data.DataLoader(dataset=torch.utils.data.TensorDataset(x_train, y_train),
                                                   batch_size=batch_size, shuffle=False, drop_last=False,
                                                   worker_init_fn=lambda worker_id: np.random.seed(0))
        loss = nn.MSELoss()
        # unique checkpoint name to prevent loading checkpoints of parallel runs
        checkpoint_name = '_' + datetime.datetime.now().strftime("%d-%b-%Y_%H-%M-%S-%f")
        os.makedirs('Checkpoints', exist_ok=True)
        min_valid_loss = float('inf')
        epochs_wo_improvement_threshold = 0
        epochs_wo_improvement_total = 0
        # instantiate a new optimizer to ensure independence from previous runs;
        # Adam is the standard optimizer here, swap via an if-elif block if another one should be used
        optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
        # get device and shift model and data to it
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        model.to(device)
        x_valid, y_valid = x_valid.to(device), y_valid.to(device)
        for e in range(200):
            model.train()
            for (batch_x, batch_y) in train_loader:
                # copy data to device
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                # gradients are summed up so they need to be zeroed for new run
                optimizer.zero_grad()
                y_pred = model(batch_x)
                loss_train = loss(y_pred, batch_y)
                loss_train.backward()
                optimizer.step()
            model.eval()
            with torch.no_grad():
                y_pred_valid = model(x_valid)
                loss_valid = loss(y_pred_valid, y_valid).item()
            if loss_valid < min_valid_loss:
                min_valid_loss = loss_valid
                epochs_wo_improvement_threshold = 0
                epochs_wo_improvement_total = 0
                torch.save(model.state_dict(), 'Checkpoints/checkpoint_' + checkpoint_name + '.pt')
            if e % 100 == 0:
                print('Epoch ' + str(e) + ': valid loss = ' + str(loss_valid)
                      + ', min_valid_loss = ' + str(min_valid_loss))
        model.load_state_dict(state_dict=torch.load('Checkpoints/checkpoint_' + checkpoint_name + '.pt'))
        os.remove('Checkpoints/checkpoint_' + checkpoint_name + '.pt')
        model.eval()
        # predict on cpu
        model.to(torch.device("cpu"))
        x_train = torch.tensor(data=x_scaler.transform(train.drop(target_column, axis=1)).astype(np.float32))
        insample = pd.DataFrame(data=model(x=x_train).data.numpy(),
                                index=train.index, columns=['Insample'])
        x_test = torch.tensor(data=x_scaler.transform(test.drop(target_column, axis=1)).astype(np.float32))
        predict = model(x=x_test).data.numpy().flatten()
        predictions = pd.DataFrame({'Prediction': predict}, index=test.index)
    pred_list.append(predictions)
    inst_list.append(insample)
    models_list.append(model)

And here is the helper function that sets all the seeds:

def init_pytorch_seeds():
    seed = 0
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.enabled = False
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
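
For completeness, a slightly fuller version of such a helper would also seed the CUDA RNGs explicitly. This is only a sketch of a common pattern (same fixed seed, relying on the imports above), not part of my actual script:

def init_all_seeds(seed: int = 0):
    # sketch of a fuller seeding helper; same fixed seed as above
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)           # seeds the CPU RNG (and CUDA RNGs on recent versions)
    torch.cuda.manual_seed_all(seed)  # seed all GPU RNGs explicitly
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True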

Not all operations are deterministic even if all seeds are properly set, as described in the reproducibility docs.

The new torch.set_deterministic() call is currently in beta, but it will raise an error if such a non-deterministic operation is used in your script.
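
A minimal sketch of how to enable this check (assuming PyTorch 1.7, where the call is still in beta; later releases rename it to torch.use_deterministic_algorithms(True), and on CUDA >= 10.2 the CUBLAS_WORKSPACE_CONFIG environment variable also needs to be set):

import os
import torch

# required for deterministic cuBLAS on CUDA >= 10.2; must be set before CUDA initializes
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'

torch.set_deterministic(True)  # beta API; raises a RuntimeError on non-deterministic ops

x = torch.randn(4, 4)
y = x @ x  # matmul has a deterministic implementation, so this runs fine
# an op without a deterministic implementation would now raise a RuntimeError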