Hi everybody,
unfortunately I am not able to reproduce the results between different runs of the same script on the same machine. The results are identical within the for-loop, but if I start the script again, the results differ. All seeds are set (multiple times), I am not shuffling in the data loader, and I have spent a lot of time debugging and inspecting the inputs, weights, etc., so it would be great if you could help me.
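To illustrate what I mean by "different results": after the loop below has finished, I compare the stored predictions roughly like this (a simplified sketch, not my exact logging code; printing only the first three values is just for illustration):
for run_idx, preds in enumerate(pred_list):
    print('run ' + str(run_idx) + ': first predictions = ' + str(preds['Prediction'].values[:3]))
# within one execution of the script all runs print the same values,
# but after restarting the script the printed values are different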
Here you can find my code. Unfortunately I am not able to share the data, so I also cannot provide a complete working example.
import configparser
import os
import sys
import numpy as np
import random
import torch
import torch.utils.data
import torch.nn as nn
import torch.nn.functional as F
import sklearn.preprocessing
import datetime
import pandas as pd
from training import TrainHelper, ModelsANN, ModelsBaseClass
class ANN(torch.nn.Module):
    def __init__(self, n_feature: int, n_hidden: int, num_hidden_layer: int, n_output: int = 1,
                 dropout_rate: float = 0.0):
        super(ANN, self).__init__()
        TrainHelper.init_pytorch_seeds()
        self.hidden_layer = nn.ModuleList()
        hidden_in = n_feature
        hidden_out = n_hidden
        for layer_num in range(num_hidden_layer):
            self.hidden_layer.append(nn.Linear(in_features=hidden_in, out_features=hidden_out))
            hidden_in = hidden_out
            hidden_out = int(hidden_in / 2)
        self.output_layer = nn.Linear(in_features=hidden_in, out_features=n_output)
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward(self, x):
        TrainHelper.init_pytorch_seeds()
        for layer in self.hidden_layer:
            x = F.relu(layer(x))
            x = self.dropout(x)
        out = self.output_layer(x)
        return out
# get optim parameters
base_dir, seasonal_periods, split_perc, init_train_len, test_len, resample_weekly = \
    TrainHelper.get_optimization_run_parameters(config=config, company=company, target_column=target_column,
                                                split_perc=split_perc, period=period)
# load datasets
datasets = TrainHelper.load_datasets(config=config, company=company, target_column=target_column, period=period)
dataset = datasets[0]
train_test_list = TrainHelper.get_ready_train_test_lst(dataset=dataset, config=config,
                                                        init_train_len=init_train_len,
                                                        test_len=test_len, split_perc=split_perc,
                                                        imputation='mean',
                                                        target_column='CutFlowers',
                                                        dimensionality_reduction=None,
                                                        featureset='full')
pred_list = []
inst_list = []
models_list = []
for diff_run in range(0, 3):
    TrainHelper.init_pytorch_seeds()
    # noinspection PyUnboundLocalVariable
    for train, test in train_test_list:
        # noinspection PyTypeChecker
        model = ANN(n_feature=train.shape[1] - 1, n_hidden=10, num_hidden_layer=1, dropout_rate=0)
        batch_size = 4
        learning_rate = 1e-1
        epochs = 10000
        min_val_loss_improvement = 100
        max_epochs_wo_improvement = 20
        x_scaler = sklearn.preprocessing.StandardScaler()
        y_scaler = sklearn.preprocessing.StandardScaler()
        valid_size = 0.2
        split_ind = int(train.shape[0] * (1 - valid_size))
        train_data = train.iloc[:split_ind]
        valid_data = train.iloc[split_ind:]
        # scale input data
        x_train = x_scaler.fit_transform(train_data.drop(target_column, axis=1))
        x_valid = x_scaler.transform(valid_data.drop(target_column, axis=1))
        # create train ready data
        x_train = torch.tensor(x_train.astype(np.float32))
        x_valid = torch.tensor(x_valid.astype(np.float32))
        y_train = torch.tensor(data=train_data[target_column].values.reshape(-1, 1).astype(np.float32))
        y_valid = torch.tensor(data=valid_data[target_column].values.reshape(-1, 1).astype(np.float32))
        # noinspection PyUnresolvedReferences
        train_loader = torch.utils.data.DataLoader(dataset=torch.utils.data.TensorDataset(x_train, y_train),
                                                   batch_size=batch_size, shuffle=False, drop_last=False,
                                                   worker_init_fn=np.random.seed(0))
        loss = nn.MSELoss()
        # unique, timestamp-based checkpoint name to prevent loading checkpoints of parallel runs
        checkpoint_name = '_' + datetime.datetime.now().strftime("%d-%b-%Y_%H-%M-%S-%f")
        min_valid_loss = 99999999
        epochs_wo_improvement_threshold = 0
        epochs_wo_improvement_total = 0
        # instantiate a new optimizer to ensure independence from previous runs
        optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
        # Adam as default optimizer; extend with an if-elif block if another optimizer should be used
        # get device and shift model and data to it
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        model.to(device)
        x_valid, y_valid = x_valid.to(device), y_valid.to(device)
        for e in range(200):
            model.train()
            for (batch_x, batch_y) in train_loader:
                # copy data to device
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                # gradients are accumulated, so they need to be zeroed for the new step
                optimizer.zero_grad()
                y_pred = model(batch_x)
                loss_train = loss(y_pred, batch_y)
                loss_train.backward()
                optimizer.step()
            model.eval()
            y_pred_valid = model(x_valid)
            loss_valid = loss(y_pred_valid, y_valid).item()
            if loss_valid < min_valid_loss:
                min_valid_loss = loss_valid
                epochs_wo_improvement_threshold = 0
                epochs_wo_improvement_total = 0
                torch.save(model.state_dict(), 'Checkpoints/checkpoint_' + checkpoint_name + '.pt')
            if e % 100 == 0:
                print('Epoch ' + str(e) + ': valid loss = ' + str(loss_valid)
                      + ', min_valid_loss = ' + str(min_valid_loss))
        model.load_state_dict(state_dict=torch.load('Checkpoints/checkpoint_' + checkpoint_name + '.pt'))
        os.remove('Checkpoints/checkpoint_' + checkpoint_name + '.pt')
        model.eval()
        # predict on cpu
        model.to(torch.device("cpu"))
        x_train = torch.tensor(data=x_scaler.transform(train.drop(target_column, axis=1)).astype(np.float32))
        insample = pd.DataFrame(data=model(x=x_train).data.numpy(),
                                index=train.index, columns=['Insample'])
        x_test = torch.tensor(data=x_scaler.transform(test.drop(target_column, axis=1)).astype(np.float32))
        model.eval()
        # predict on cpu
        model.to(torch.device("cpu"))
        predict = model(x=x_test).data.numpy().flatten()
        predictions = pd.DataFrame({'Prediction': predict}, index=test.index)
        pred_list.append(predictions)
        inst_list.append(insample)
        models_list.append(model)
And here is the helper function that sets all the seeds:
def init_pytorch_seeds():
    seed = 0
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.enabled = False
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
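For completeness, this is roughly the kind of check I use while debugging the initial weights (a simplified sketch, not my exact debugging code; the feature count of 10 is just a placeholder value):
# simplified sketch of the weight check used during debugging (feature count is a placeholder)
TrainHelper.init_pytorch_seeds()
check_model = ANN(n_feature=10, n_hidden=10, num_hidden_layer=1, dropout_rate=0)
# compare these printed values between two executions of the script
print(check_model.hidden_layer[0].weight.data.flatten()[:5])
print(check_model.output_layer.weight.data.flatten()[:5])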