Does running evaluation influence the training?

I am seeing unexpected behaviour while training a model, so I ran two experiments. In the first, I evaluate every 1000 steps during training and additionally evaluate the model at the end of each epoch (without saving the end-of-epoch model). In the second, I evaluate every 1000 steps during training but do not evaluate at the end of each epoch. The two experiments do not give the same results, and I don't know why.
The train and evaluate functions are here:
Train:

def train_and_evaluate(args, model, tokenizer, optimizer, scheduler, train_dataloader, val_loader, epoch, max_f1):
    """Train for one pass over ``train_dataloader``, evaluating periodically.

    Training uses mixed precision (``autocast`` + ``GradScaler``). After every
    ``args.evaluate_steps`` optimizer steps, and again after the final batch,
    the model is scored with ``evaluate``. Whenever the returned ``f1`` exceeds
    ``max_f1``, the tokenizer and the model's ``state_dict`` (``qa-best.bin``)
    are written to ``<args.output_dir>/<args.model_type>``.

    Args:
        args: Run configuration. Reads ``train_batch_size``, ``device``,
            ``model_type``, ``evaluate_steps`` and ``output_dir``.
        model: Model called as ``model(**inputs)``; its first output is used
            as the loss.
        tokenizer: Saved next to the best checkpoint; also passed to
            ``evaluate``.
        optimizer: Optimizer stepped through the ``GradScaler``.
        scheduler: Learning-rate scheduler, stepped once per batch.
        train_dataloader: Yields tuples of tensors. Indices 0-5 are read as
            input_ids, attention_mask, token_type_ids, start_positions,
            end_positions and answerable_label; for xlnet/xlm, indices 6 and 9
            are read as cls_index and p_mask.
        val_loader: Validation loader forwarded to ``evaluate``.
        epoch: Zero-based epoch index (currently unused in this function).
        max_f1: Best F1 observed before this call.

    Returns:
        The best F1 observed so far, i.e. ``max_f1`` possibly raised by an
        evaluation performed during this call.
    """
    logger.info("***** Running training *****")
    # Approximate: counts a partial final batch as a full one.
    logger.info("  Num examples = %d", len(train_dataloader)*args.train_batch_size)

    # Loop invariants, computed once rather than on every step.
    drops_token_type_ids = args.model_type in ["xlm", "roberta", "distilbert", "camembert", "bart", "longformer"]
    needs_cls_and_p_mask = args.model_type in ['xlnet', 'xlm']
    last_step = len(train_dataloader) - 1

    epoch_step = 0
    model.zero_grad()

    # Adapt the batch unpacking below to your own dataset script.
    epoch_iterator = tqdm(train_dataloader, desc="Training")
    scaler = GradScaler()

    for step, batch in enumerate(epoch_iterator):
        # evaluate() switches the model to eval mode, so re-enable train mode
        # at the start of every step.
        model.train()
        batch = tuple(t.to(args.device) for t in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'token_type_ids': batch[2],
            'start_positions': batch[3],
            'end_positions': batch[4],
            'answerable_label': batch[5],
        }

        if drops_token_type_ids:
            del inputs["token_type_ids"]

        if needs_cls_and_p_mask:
            inputs.update({'cls_index': batch[6], 'p_mask': batch[9]})

        with autocast():
            outputs = model(**inputs)
            loss = outputs[0]

        scaler.scale(loss).backward()

        # (Optional adversarial training such as FGM/PGD would add its extra
        # backward passes here, before the optimizer step.)

        scaler.step(optimizer)
        scaler.update()
        # NOTE(review): when GradScaler skips the optimizer step because of
        # inf/NaN gradients, the scheduler below still advances.
        scheduler.step()  # Update learning rate schedule
        optimizer.zero_grad()
        epoch_step += 1

        # Evaluate every `evaluate_steps` steps and once more after the last batch.
        if (epoch_step % args.evaluate_steps == 0) or (step == last_step):
            val_results = evaluate(args, model, tokenizer, val_loader)
            current_f1 = val_results.get('f1')

            if max_f1 < current_f1:
                max_f1 = current_f1

                logger.info("***** Eval results %s *****", "")
                info = "-".join([f' {key}: {value:.4f} ' for key, value in val_results.items()])
                logger.info(info)

                # Save best model checkpoint
                output_dir = os.path.join(args.output_dir, args.model_type)
                # exist_ok avoids the check-then-create race of exists() + makedirs().
                os.makedirs(output_dir, exist_ok=True)
                # Unwrap DataParallel/DistributedDataParallel before saving weights.
                model_to_save = model.module if hasattr(model, "module") else model
                tokenizer.save_pretrained(output_dir)
                model_file_path = os.path.join(output_dir, 'qa-best.bin')
                torch.save(model_to_save.state_dict(), model_file_path)
                logger.info("Saving best model checkpoint to %s", output_dir)

    return max_f1

Evaluate:

def evaluate(args, model, tokenizer, val_loader, prefix=""):
    """Run ``model`` over ``val_loader`` and return the SQuAD-style metrics.

    The model is put in eval mode (and left in eval mode on return), each
    batch is run under ``torch.no_grad()``, the first two model outputs are
    collected as start/end logits per example, and the predictions are scored
    with ``squad_evaluate``. No prediction files are written (all output file
    paths are passed as ``None``).

    NOTE(review): creating a DataLoader iterator can draw a seed from the
    global torch RNG, so an additional call to this function may change the
    random numbers (shuffling, dropout) seen by subsequent training steps -
    confirm against the installed PyTorch version.

    Args:
        args: Run configuration. Reads ``eval_batch_size``, ``device``,
            ``model_type``, ``n_best_size``, ``max_answer_length``,
            ``do_lower_case``, ``verbose_logging``,
            ``version_2_with_negative``, ``null_score_diff_threshold`` and,
            for language-aware XLM models, ``lang_id``.
        model: Model called as ``model(**inputs)``.
        tokenizer: Passed through to the prediction and scoring helpers.
        val_loader: Loader whose ``dataset`` exposes ``features`` and
            ``examples``. Batch indices 0-2 are read as input_ids,
            attention_mask and token_type_ids, index 6 as the unique ids;
            for xlnet/xlm, indices 4 and 7 are read as cls_index and p_mask.
        prefix: Label inserted into the "Running evaluation" log line.

    Returns:
        Whatever ``squad_evaluate`` returns for the computed predictions.
    """
    dataset = val_loader.dataset
    features = dataset.features
    examples = dataset.examples

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    # Approximate: counts a partial final batch as a full one.
    logger.info("  Num examples = %d", len(val_loader)*args.eval_batch_size)

    # Loop invariants, computed once rather than on every batch.
    drops_token_type_ids = args.model_type in ["xlm", "roberta", "distilbert", "camembert", "bart", "longformer"]
    is_xlnet_or_xlm = args.model_type in ["xlnet", "xlm"]

    all_results = []
    model.eval()
    for batch in val_loader:
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2],
            }

            if drops_token_type_ids:
                del inputs["token_type_ids"]

            batch_unique_id = batch[6]

            # XLNet and XLM use more arguments for their predictions
            if is_xlnet_or_xlm:
                inputs.update({"cls_index": batch[4], "p_mask": batch[7]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(model.config, "lang2id"):
                    inputs.update(
                        {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)}
                    )
            outputs = model(**inputs)

        # Move each logits tensor to the CPU once per batch instead of once
        # per example. Assumes the first two outputs are tensors batched
        # along dim 0 - TODO confirm for custom models.
        batch_start_logits, batch_end_logits = (
            logits.detach().to('cpu').tolist() for logits in outputs[:2]
        )

        for i, unique_id in enumerate(batch_unique_id.tolist()):
            unique_id = int(unique_id)
            start_logits = batch_start_logits[i]
            end_logits = batch_end_logits[i]

            if is_xlnet_or_xlm:
                # Top-k indices and cls logits are not collected here.
                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    cls_logits=None,
                )
            else:
                result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    # Predictions are only kept in memory; nothing is written to disk.
    output_prediction_file = None
    output_nbest_file = None
    output_null_log_odds_file = None

    # XLNet and XLM use a more complex post-processing procedure
    if is_xlnet_or_xlm:
        # A wrapped (e.g. DataParallel) model exposes its config via `.module`.
        config = model.config if hasattr(model, "config") else model.module.config
        start_n_top = config.start_n_top
        end_n_top = config.end_n_top

        predictions = compute_predictions_extended(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            start_n_top,
            end_n_top,
            args.version_2_with_negative,
            tokenizer,
            args.verbose_logging,
        )
    else:
        # predictions is a dict:
        # {qid: [pred_text, start_logits, end_logits, start_index, end_index]}
        predictions, _ = compute_predictions(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            args.do_lower_case,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            args.verbose_logging,
            args.version_2_with_negative,
            args.null_score_diff_threshold,
            tokenizer,
        )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions, tokenizer)
    return results

The training loop function:

def train_loop(args, model, tokenizer, optimizer, scheduler, train_dataloader, val_dataloader):
    """Seed the run, then train and validate for ``args.num_train_epochs`` epochs.

    Each epoch is delegated to ``train_and_evaluate``, which returns the best
    F1 seen so far; that value is carried into the next epoch and logged.

    Args:
        args: Run configuration. Reads ``seed``, ``num_train_epochs`` and
            ``device``.
        model: Model to train.
        tokenizer: Tokenizer forwarded to ``train_and_evaluate``.
        optimizer: Optimizer forwarded to ``train_and_evaluate``.
        scheduler: Learning-rate scheduler forwarded to ``train_and_evaluate``.
        train_dataloader: Training data loader.
        val_dataloader: Validation data loader.
    """
    # Training and validation are both driven from here.
    seed_everything(args.seed)
    max_f1 = 0.0
    for epoch in range(int(args.num_train_epochs)):
        logger.info('******************** Epoch %d Running Start! ********************', epoch + 1)
        max_f1 = train_and_evaluate(
            args, model, tokenizer, optimizer, scheduler,
            train_dataloader, val_dataloader, epoch, max_f1,
        )

        # NOTE: this is the difference between the two experiments. The first
        # experiment runs the extra end-of-epoch evaluation below; the second
        # leaves it disabled, as shown here.
        # last_evaluate_results = evaluate(args, model, tokenizer, val_dataloader)
        # logger.info('The last step evaluate f1 is {:.4f}'.format(last_evaluate_results.get('f1')))

        logger.info('The best F1-score is %.4f', max_f1)
        logger.info('******************** Epoch %d Running End! ********************', epoch + 1)

        if 'cuda' in str(args.device):
            torch.cuda.empty_cache()

I still think the reason could be the different order of calls into the pseudorandom number generator, as explained in your double post. Did you take a look at it and, e.g., re-seed the training for debugging purposes?

@ptrblck Thanks for your reply. Do you mean that I need to choose a different seed to train the model? Thanks!

@ptrblck I used the following code to check the random state:

{
            "python": random.getstate(),
            "numpy": np.random.get_state(),
            "cpu": torch.random.get_rng_state(),
            "gpu": torch.cuda.random.get_rng_state()
        }

The result is that the state does not change between the start and the end of the evaluate function.