My main.py runs several training experiments one after another.
I set a seed, but even with identical training setups the results differ slightly.
How can I fix this? Please help me.
I run multiple experiments inside main.py so that I can compare them.
Below is some of the output. Here debug1 and debug2 are experiments with completely identical configs.
The data loader output seems to be constant across experiments (the train_dl and eval_dl sizes do not change; in my case this should mean the data splits are exactly the same).
But the AUC is slightly different.
(base) $ python main.py config/debug.json
INFO : 4547 : 22:12 : __main__ | Starting Experiment: debug1
INFO : 4547 : 22:12 : ksdkt/debug1 | PyTorch: 1.3.0
INFO : 4547 : 22:12 : ksdkt/debug1 | Using Device: cuda
INFO : 4547 : 22:12 : ksdkt/debug1 | train_dl.dataset size: 14312
INFO : 4547 : 22:12 : ksdkt/debug1 | eval_dl.dataset size: 3644
INFO : 4547 : 22:12 : ksdkt/debug1 | The model has 514,424 trainable parameters
INFO : 4547 : 22:12 : ksdkt/debug1 | Starting train
INFO : 4547 : 22:13 : ksdkt/debug1 | Epoch 100 Train Loss: 0.61461 AUC: 0.846741
INFO : 4547 : 22:13 : ksdkt/debug1 | Epoch 100 Valid Loss: 0.691517 AUC: 0.528118
INFO : 4547 : 22:13 : ksdkt/debug1 | Best AUC 0.528118 refreshed and saved!
INFO : 4547 : 22:13 : ksdkt/debug1 | At 22:13 0m 0s passed ( - 0m 0s til 22:13) (100epoch 100.0%)
INFO : 4547 : 22:13 : ksdkt/debug1 | Starting evaluation
INFO : 4547 : 22:13 : ksdkt/debug1 | Valid Loss: 0.691517 AUC: 0.528118
INFO : 4547 : 22:13 : ksdkt/debug1 | At 22:13 0m 0s passed ( - 0m 0s til 22:13)
INFO : 4547 : 22:13 : __main__ | Starting Experiment: debug2
INFO : 4547 : 22:13 : ksdkt/debug2 | PyTorch: 1.3.0
INFO : 4547 : 22:13 : ksdkt/debug2 | Using Device: cuda
INFO : 4547 : 22:13 : ksdkt/debug2 | train_dl.dataset size: 14312
INFO : 4547 : 22:13 : ksdkt/debug2 | eval_dl.dataset size: 3644
INFO : 4547 : 22:13 : ksdkt/debug2 | The model has 514,424 trainable parameters
INFO : 4547 : 22:13 : ksdkt/debug2 | Starting train
INFO : 4547 : 22:13 : ksdkt/debug2 | Epoch 100 Train Loss: 0.614401 AUC: 0.850618
INFO : 4547 : 22:13 : ksdkt/debug2 | Epoch 100 Valid Loss: 0.691517 AUC: 0.52813
INFO : 4547 : 22:13 : ksdkt/debug2 | Best AUC 0.52813 refreshed and saved!
INFO : 4547 : 22:13 : ksdkt/debug2 | At 22:13 0m 0s passed ( - 0m 0s til 22:13) (100epoch 100.0%)
INFO : 4547 : 22:13 : ksdkt/debug2 | Starting evaluation
INFO : 4547 : 22:13 : ksdkt/debug2 | Valid Loss: 0.691517 AUC: 0.52813
INFO : 4547 : 22:13 : ksdkt/debug2 | At 22:13 0m 0s passed ( - 0m 0s til 22:13)
INFO : 4547 : 22:13 : __main__ | All experiments done!
I attach a simplified version of the code below.
I split train.py out of main.py, but even when I also call seed_everything
inside train.py, the problem is not fixed.
I deleted a lot of logger.info calls to simplify the code in this question.
# main.py
from src.train import Trainer
def seed_everything(seed: int = 42):
    """Seed every random number generator used here and pin cuDNN behaviour.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU and all CUDA devices),
    and configures cuDNN to prefer deterministic kernels.

    Args:
        seed: Seed value applied to every generator.
    """
    random.seed(seed)
    # PYTHONHASHSEED is read at interpreter start-up, so this assignment only
    # affects child processes spawned later, not the hashing of the current
    # process. Export it in the shell before launching to cover this process.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Cover every visible GPU, not only the current device.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # With benchmark mode enabled cuDNN may auto-tune and select a different
    # convolution algorithm on each run, which changes floating-point results
    # slightly even when all seeds are identical.
    torch.backends.cudnn.benchmark = False
def main(configpath: Path):
    """Run every experiment listed in a JSON config file.

    The file at ``configpath`` must contain a top-level ``'experiments'`` key
    holding a list; each entry is wrapped in ``Config`` and handed to ``run``.

    Args:
        configpath: Path of the JSON config file to read.
    """
    # Open the file the caller asked for (it was previously ignored in favour
    # of a hard-coded name) and pass the mode as a string literal.
    with open(configpath, 'r', encoding='utf-8') as f:
        cfg = json.load(f)
    experiments = cfg['experiments']
    for exp_dict in experiments:
        config = Config(exp_dict)
        run(config)
    logger.info('All experiments done!')
def run(config):
    """Execute a single experiment described by ``config``.

    Re-seeds the random number generators, builds a ``Trainer``, trains it
    unless ``config.load_model`` is truthy, and finally evaluates the model.
    """
    # NOTE(review): the pasted source lost its indentation; the nesting below
    # (report dumped after a training attempt, evaluation always executed) is
    # the presumed original layout - confirm against the real file.
    seed_everything()
    trainer = Trainer(config)
    needs_training = not config.load_model
    if needs_training:
        try:
            trainer.train_model()
        except KeyboardInterrupt as interrupt:
            # A manual interrupt stops training but still falls through to
            # the report dump and the evaluation.
            print(interrupt)
        finally:
            trainer.dump_report()
    trainer.evaluate_model()
# src/train.py
class Trainer(object):
    """Bundle a model, its optimizer and data loaders for one experiment."""

    def __init__(self, config):
        """Build the device, data loaders, model, optimizer and report dict.

        Args:
            config: Experiment configuration. The attributes read here are
                ``load_model``, ``load_model_path`` and ``as_dict()``; the
                helper methods read further attributes.
        """
        self.config = config
        self.device = self.get_device(self.config)
        self.train_dl, self.eval_dl = self.get_dataloader(
            self.config, self.device)
        model = self.get_model(self.config, self.device)
        if self.config.load_model:
            # map_location lets a checkpoint saved on a GPU be restored when
            # the selected device is the CPU (or a different GPU).
            state = torch.load(
                str(self.config.load_model_path), map_location=self.device)
            model.load_state_dict(state)
        model = model.to(self.device)
        self.model = model
        self.opt = self.get_opt(self.model)
        self._report = {
            'config': self.config.as_dict(),
            'indicator': defaultdict(list)
        }

    def get_device(self, config):
        """Return the CUDA device if requested and available, else the CPU."""
        device = torch.device(
            'cuda' if config.cuda and torch.cuda.is_available() else 'cpu')
        return device

    def get_model(self, config, device):
        """Instantiate the model named by ``config.model_name`` on ``device``.

        Raises:
            ValueError: If ``config.model_name`` is not a known model name.
        """
        if config.model_name == 'MyModel':
            model = MyModel(config, device).to(device)
        else:
            raise ValueError(f'model_name {config.model_name} is wrong')
        return model

    def get_dataloader(self, config, device):
        """Return the ``(train_dl, eval_dl)`` pair from ``prepare_dataloader``."""
        train_dl, eval_dl = prepare_dataloader(
            config, device=device, pad=config.pad)
        return train_dl, eval_dl

    def get_opt(self, model):
        """Return an SGD optimizer over ``model``'s parameters, lr from config."""
        opt = torch.optim.SGD(model.parameters(), lr=self.config.lr)
        return opt

    def train_model(self, validate=True):
        """Train for ``config.epoch_size`` epochs, validating every 10th epoch.

        Args:
            validate: When false, the periodic validation pass is skipped.
        """
        for epoch in range(1, self.config.epoch_size + 1):
            self.model.train()
            t_idc = self.exec_core(self.train_dl, self.opt)
            t_loss, t_auc = t_idc['loss'], t_idc['auc']

            if epoch % 10 == 0 and validate:
                with torch.no_grad():
                    self.model.eval()
                    v_idc = self.exec_core(dl=self.eval_dl, opt=None)
                    v_loss, v_auc = v_idc['loss'], v_idc['auc']

    def exec_core(self, dl, opt, only_eval=False):
        """Run one pass over ``dl`` and return its mean loss and AUC.

        Args:
            dl: Iterable with a length that yields ``(xseq, yseq, mask)``.
            opt: Optimizer forwarded to ``model.loss_batch``; callers pass
                ``None`` for validation and evaluation passes.
            only_eval: When true, extra per-question counters are created.

        Returns:
            dict with keys ``'loss'`` (mean of the per-batch losses) and
            ``'auc'``.
        """
        # In debug mode the buffers are sized for a single batch.
        arr_len = len(dl) if not self.config.debug else 1
        # NOTE(review): the buffers and counters below are never read or
        # filled in the code shown; presumably the omitted code uses them.
        pred_mx = np.zeros([arr_len, self.config.batch_size])
        actu_mx = np.zeros([arr_len, self.config.batch_size])
        pred_ls = []
        actu_ls = []
        pred_v_mx = np.zeros(
            [arr_len, self.config.batch_size * self.config.n_skills])
        actu_v_mx = np.zeros(
            [arr_len, self.config.batch_size * self.config.n_skills])
        loss_ar = np.zeros(arr_len)
        wvn1_ar = np.zeros(arr_len)
        wvn2_ar = np.zeros(arr_len)
        ksv1_ar = np.zeros(arr_len)
        if only_eval:
            q_all_count = defaultdict(int)
            q_cor_count = defaultdict(int)
            q_pred_list = defaultdict(list)

        for i, (xseq, yseq, mask) in enumerate(dl):
            if i >= arr_len:
                # Only reachable in debug mode: stop once the one-batch
                # buffers are full instead of indexing past their end.
                break
            out = self.model.loss_batch(xseq, yseq, mask, opt=opt)
            loss_ar[i] = out['loss'].item()

        # NOTE(review): pred_ls / actu_ls are never appended to in the code
        # shown, so torch.cat would fail on the empty lists; presumably the
        # omitted code collects per-batch predictions and labels - confirm.
        fpr, tpr, _thresholds = metrics.roc_curve(
            torch.cat(actu_ls).detach().cpu().numpy().reshape(-1),
            torch.cat(pred_ls).detach().cpu().numpy().reshape(-1), pos_label=1)
        auc = metrics.auc(fpr, tpr)
        indicators = {
            'loss': loss_ar.mean(),
            'auc': auc,
        }
        return indicators

    def _train_model_simple(self):
        """Simplest training skeleton: one ``loss_batch`` call per batch."""
        for epoch in range(1, self.config.epoch_size + 1):
            self.model.train()
            # Batches are unpacked and forwarded the same way as in
            # exec_core: three items, with the mask passed to loss_batch.
            for i, (xseq, yseq, mask) in enumerate(self.train_dl):
                out = self.model.loss_batch(xseq, yseq, mask, opt=self.opt)

    def evaluate_model(self):
        """Run one no-grad evaluation pass over ``eval_dl``."""
        with torch.no_grad():
            self.model.eval()
            indicators = self.exec_core(
                dl=self.eval_dl, opt=None, only_eval=True)
            v_loss, v_auc = indicators['loss'], indicators['auc']