Reproducibility problem with multiple training iterations

My main.py runs multiple training experiments in a single process.
I set a seed, but even with identical training setups the results differ slightly.
How can I fix this? Please help me :pray:

I run multiple experiments inside main.py in order to compare them.
Below is some of the output. Here debug1 and debug2 are experiments with completely identical configs.
The data loaders seem to be constant across experiments (the train_dl and eval_dl sizes do not change, which in my case should mean the data split is exactly the same).
But the AUC is slightly different.

(base) $ python main.py config/debug.json 
INFO : 4547 : 22:12 : __main__  | Starting Experiment: debug1
INFO : 4547 : 22:12 : ksdkt/debug1      | PyTorch: 1.3.0
INFO : 4547 : 22:12 : ksdkt/debug1      | Using Device: cuda
INFO : 4547 : 22:12 : ksdkt/debug1      | train_dl.dataset size: 14312
INFO : 4547 : 22:12 : ksdkt/debug1      | eval_dl.dataset size: 3644
INFO : 4547 : 22:12 : ksdkt/debug1      | The model has 514,424 trainable parameters
INFO : 4547 : 22:12 : ksdkt/debug1      | Starting train
INFO : 4547 : 22:13 : ksdkt/debug1      |       Epoch 100       Train Loss: 0.61461     AUC: 0.846741
INFO : 4547 : 22:13 : ksdkt/debug1      |       Epoch 100       Valid Loss: 0.691517    AUC: 0.528118
INFO : 4547 : 22:13 : ksdkt/debug1      | Best AUC 0.528118 refreshed and saved!
INFO : 4547 : 22:13 : ksdkt/debug1      | At 22:13 0m 0s passed ( - 0m 0s til 22:13) (100epoch 100.0%)
INFO : 4547 : 22:13 : ksdkt/debug1      | Starting evaluation
INFO : 4547 : 22:13 : ksdkt/debug1      |       Valid Loss: 0.691517    AUC: 0.528118
INFO : 4547 : 22:13 : ksdkt/debug1      | At 22:13 0m 0s passed ( - 0m 0s til 22:13)
INFO : 4547 : 22:13 : __main__  | Starting Experiment: debug2
INFO : 4547 : 22:13 : ksdkt/debug2      | PyTorch: 1.3.0
INFO : 4547 : 22:13 : ksdkt/debug2      | Using Device: cuda
INFO : 4547 : 22:13 : ksdkt/debug2      | train_dl.dataset size: 14312
INFO : 4547 : 22:13 : ksdkt/debug2      | eval_dl.dataset size: 3644
INFO : 4547 : 22:13 : ksdkt/debug2      | The model has 514,424 trainable parameters
INFO : 4547 : 22:13 : ksdkt/debug2      | Starting train
INFO : 4547 : 22:13 : ksdkt/debug2      |       Epoch 100       Train Loss: 0.614401    AUC: 0.850618
INFO : 4547 : 22:13 : ksdkt/debug2      |       Epoch 100       Valid Loss: 0.691517    AUC: 0.52813
INFO : 4547 : 22:13 : ksdkt/debug2      | Best AUC 0.52813 refreshed and saved!
INFO : 4547 : 22:13 : ksdkt/debug2      | At 22:13 0m 0s passed ( - 0m 0s til 22:13) (100epoch 100.0%)
INFO : 4547 : 22:13 : ksdkt/debug2      | Starting evaluation
INFO : 4547 : 22:13 : ksdkt/debug2      |       Valid Loss: 0.691517    AUC: 0.52813
INFO : 4547 : 22:13 : ksdkt/debug2      | At 22:13 0m 0s passed ( - 0m 0s til 22:13)
INFO : 4547 : 22:13 : __main__  | All experiments done!

I attach the abbreviated code below.
train.py is split out of main.py, but even when I call seed_everything inside train.py as well, the problem is not fixed.
I removed a lot of logger.info calls to simplify the code for this question.

# main.py
import json
import logging
import os
import random
from pathlib import Path

import numpy as np
import torch

from src.train import Trainer
# Config comes from elsewhere in the project (omitted here)

logger = logging.getLogger(__name__)

def seed_everything(seed: int = 42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

def main(configpath: Path):
    with open(configpath, 'r') as f:
        cfg = json.load(f)
    experiments = cfg['experiments']
    for exp_dict in experiments:
        config = Config(exp_dict)
        run(config)
    logger.info('All experiments done!')

def run(config):
    # re-seed before every experiment so each run starts from the same RNG state
    seed_everything()
    trainer = Trainer(config)
    if not config.load_model:
        try:
            trainer.train_model()
        except KeyboardInterrupt as e:
            print(e)
        finally:
            trainer.dump_report()

    trainer.evaluate_model()

# src/train.py
from collections import defaultdict

import numpy as np
import torch
from sklearn import metrics

# MyModel and prepare_dataloader come from elsewhere in the project (omitted here)

class Trainer(object):

    def __init__(self, config):
        self.config = config
        self.device = self.get_device(self.config)
        self.train_dl, self.eval_dl = self.get_dataloader(
            self.config, self.device)
        model = self.get_model(self.config, self.device)
        if self.config.load_model:
            model.load_state_dict(torch.load(str(self.config.load_model_path)))
            model = model.to(self.device)
        self.model = model
        self.opt = self.get_opt(self.model)

        self._report = {
            'config': self.config.as_dict(),
            'indicator': defaultdict(list)
        }

    def get_device(self, config):
        device = torch.device(
            'cuda' if config.cuda and torch.cuda.is_available() else 'cpu')
        return device

    def get_model(self, config, device):
        if config.model_name == 'MyModel':
            model = MyModel(config, device).to(device)
        else:
            raise ValueError(f'model_name {config.model_name} is wrong')
        return model

    def get_dataloader(self, config, device):
        train_dl, eval_dl = prepare_dataloader(
            config, device=device, pad=config.pad)
        return train_dl, eval_dl

    def get_opt(self, model):
        opt = torch.optim.SGD(model.parameters(), lr=self.config.lr)
        return opt

    def train_model(self, validate=True):
        for epoch in range(1, self.config.epoch_size + 1):
            self.model.train()
            t_idc = self.exec_core(self.train_dl, self.opt)
            t_loss, t_auc = t_idc['loss'], t_idc['auc']

            if epoch % 10 == 0 and validate:
                with torch.no_grad():
                    self.model.eval()
                    v_idc = self.exec_core(dl=self.eval_dl, opt=None)
                    v_loss, v_auc = v_idc['loss'], v_idc['auc']


    def exec_core(self, dl, opt, only_eval=False):
        arr_len = len(dl) if not self.config.debug else 1
        pred_mx = np.zeros([arr_len, self.config.batch_size])
        actu_mx = np.zeros([arr_len, self.config.batch_size])
        pred_ls = []
        actu_ls = []
        pred_v_mx = np.zeros(
            [arr_len, self.config.batch_size * self.config.n_skills])
        actu_v_mx = np.zeros(
            [arr_len, self.config.batch_size * self.config.n_skills])
        loss_ar = np.zeros(arr_len)
        wvn1_ar = np.zeros(arr_len)
        wvn2_ar = np.zeros(arr_len)
        ksv1_ar = np.zeros(arr_len)
        if only_eval:
            q_all_count = defaultdict(int)
            q_cor_count = defaultdict(int)
            q_pred_list = defaultdict(list)
        for i, (xseq, yseq, mask) in enumerate(dl):
            out = self.model.loss_batch(xseq, yseq, mask, opt=opt)
            loss_ar[i] = out['loss'].item()
            # (abbreviated for this question) the full code also appends the
            # batch predictions and targets to pred_ls / actu_ls here, which
            # the AUC computation below relies on

        fpr, tpr, _thresholds = metrics.roc_curve(
            torch.cat(actu_ls).detach().cpu().numpy().reshape(-1),
            torch.cat(pred_ls).detach().cpu().numpy().reshape(-1), pos_label=1)
        auc = metrics.auc(fpr, tpr)
        indicators = {
            'loss': loss_ar.mean(),
            'auc': auc,
        }
        return indicators

    def _train_model_simple(self):
        '''simplest skeleton'''
        for epoch in range(1, self.config.epoch_size + 1):
            self.model.train()
            for i, (xseq, yseq) in enumerate(self.train_dl):
                out = self.model.loss_batch(xseq, yseq, opt=self.opt)

    def evaluate_model(self):
        with torch.no_grad():
            self.model.eval()
            indicators = self.exec_core(
                dl=self.eval_dl, opt=None, only_eval=True)
            v_loss, v_auc = indicators['loss'], indicators['auc']

Some operations on the GPU are not deterministic due to calls into atomicAdd, as described here. Could you check if your model might be using one of these methods?
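
If it helps to narrow it down, a rough check like the sketch below could be used (model_fn and batch are placeholders for your own model constructor and a single batch from your loader, not anything taken from your code): it runs one identical forward/backward pass twice under the same seed and compares the results bitwise.

# Hypothetical determinism check; model_fn and batch are placeholders.
import torch

def run_once(seed, model_fn, batch):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    model = model_fn().cuda()          # same seed -> same initial weights
    out = model(batch.cuda())
    out.sum().backward()
    grads = [p.grad.detach().cpu() for p in model.parameters() if p.grad is not None]
    return out.detach().cpu(), grads

out1, grads1 = run_once(42, model_fn, batch)
out2, grads2 = run_once(42, model_fn, batch)
print('outputs identical :', torch.equal(out1, out2))
print('gradients identical:', all(torch.equal(a, b) for a, b in zip(grads1, grads2)))

If the outputs match but the gradients differ, that points at a non-deterministic backward kernel (typically one of the atomicAdd-based ones).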

Hi, @ptrblck,
Thanks for the quick response.

MyModel does not directly call torch.Tensor.index_add_(), torch.Tensor.scatter_add_(), torch.bincount(), torch.nn.functional.embedding_bag(), or torch.nn.functional.ctc_loss().

However, MyModel does use functions such as nn.LSTM, nn.Linear, nn.Embedding, torch.sigmoid, torch.max, torch.sum, and torch.pow.
Do you mean that some of those functions could use atomicAdd internally, and that this is what causes the problem?

If so, do you mean there is no fix so far?

Yes, some of these functions might yield non-deterministic results.
One workaround would be to isolate the layers which produce non-deterministic outputs and run those operations on the CPU.
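
As a rough illustration of that workaround (only a sketch; SketchModel, the layer sizes, and the choice of nn.Embedding as the suspect layer are my assumptions, not your actual model), the suspected layer can be pinned to the CPU while the rest of the model stays on the GPU:

import torch
import torch.nn as nn

class SketchModel(nn.Module):
    def __init__(self, n_skills, emb_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(n_skills, emb_dim)             # suspect layer
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)
        self.out = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        # x arrives on the GPU; run the embedding on the CPU, then move
        # the activations back to the GPU for the remaining layers
        emb = self.embedding(x.cpu())
        h, _ = self.lstm(emb.to(x.device))
        return torch.sigmoid(self.out(h))

model = SketchModel(n_skills=100, emb_dim=64, hidden_dim=128).to('cuda')
model.embedding.cpu()   # .to('cuda') moved every submodule, so pin the suspect layer back

The extra device transfers cost speed, so this is mainly useful for debugging or when bitwise reproducibility really matters. On newer PyTorch releases (1.8+) there is also torch.use_deterministic_algorithms(True), which either selects deterministic kernels or raises an error for ops that have none, but on 1.3 the CPU fallback is the more direct option.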

What is your current use case that needs bitwise determinism?