I have a bug in my model training and I ran two experiments. In the first, I evaluate every 1000 steps during training AND also evaluate at the end of each epoch (without saving the epoch-end model). In the second, I evaluate every 1000 steps during training but do NOT evaluate at the end of each epoch. The two experiments produced different results, and I don't understand why.
The train and evaluate functions are shown here:
Train:
def train_and_evaluate(args, model, tokenizer, optimizer, scheduler, train_dataloader, val_loader, epoch, max_f1):
    """Run one training epoch with mixed precision (AMP) and periodic evaluation.

    Every ``args.evaluate_steps`` optimizer steps — and again on the last step
    of the epoch — the model is evaluated on ``val_loader``.  Whenever the
    validation F1 beats ``max_f1``, the tokenizer and the model weights are
    saved under ``args.output_dir/<args.model_type>``.

    Returns:
        The best validation F1 observed so far (the updated ``max_f1``).
    """
    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataloader)*args.train_batch_size)
    epoch_step = 0    # number of optimizer steps taken within this epoch
    epoch_loss = 0.0  # running sum of per-step losses (for an average training loss)
    model.zero_grad()
    # The batch unpacking below must be adapted to your own data script.
    epoch_iterator = tqdm(train_dataloader, desc="Training")
    # model.train()
    # NOTE(review): a fresh GradScaler is created on every call — i.e. every
    # epoch — which resets the AMP loss scale at each epoch boundary; confirm
    # this is intended (it can make runs subtly non-comparable).
    scaler = GradScaler()
    # Adversarial-training code (disabled)
    # fgm = FGM(model, epsilon=1, emb_name='word_embeddings.weight')
    # pgd = PGD(model, emb_name='word_embeddings.weight', epsilon=1.0, alpha=0.3)
    # k=3
    for step, batch in enumerate(epoch_iterator):
        # Re-enable train mode every step, because evaluate() below switches
        # the model to eval mode in the middle of the epoch.
        model.train()
        batch = tuple(t.to(args.device) for t in batch)
        # NOTE(review): assumes the dataloader yields tensors in exactly this
        # order (ids, mask, segment ids, start, end, answerable, ...) — verify
        # against the dataset script.
        inputs = {'input_ids':batch[0], 'attention_mask':batch[1],
                  'token_type_ids':batch[2],
                  'start_positions':batch[3],
                  'end_positions':batch[4],
                  'answerable_label':batch[5]}
        if args.model_type in ["xlm", "roberta", "distilbert", "camembert", "bart", "longformer"]:
            del inputs["token_type_ids"]
        if args.model_type in ['xlnet', 'xlm']:
            inputs.update({'cls_index': batch[6],
                           'p_mask': batch[9]})
        with autocast():
            outputs = model(**inputs)
            loss = outputs[0]
        # if args.n_gpu > 1:
        #     loss = loss.mean() # mean() to average on multi-gpu parallel training
        epoch_loss += loss.item()
        scaler.scale(loss).backward()
        # if args.fp16:
        #     with amp.scale_loss(loss, optimizer) as scaled_loss:
        #         scaled_loss.backward()
        # else:
        #     loss.backward()
        # PGD adversarial training (disabled)
        # pgd.backup_grad()
        # for t in range(k):
        #     pgd.attack(is_first_attack=(t==0)) # add adversarial perturbation on the embeddings; back up param.data on the first attack
        #     if t != k-1:
        #         model.zero_grad()
        #     else:
        #         pgd.restore_grad()
        #     with autocast():
        #         loss_adv = model(**inputs)[0]
        #     scaler.scale(loss_adv).backward() # backprop, accumulating the adversarial gradients on top of the normal ones
        # pgd.restore() # restore the embedding parameters
        # FGM adversarial-training code (disabled)
        # fgm.attack()
        # with autocast():
        #     adv_outputs = model(**inputs)
        #     loss_adv = adv_outputs[0]
        # if args.n_gpu > 1:
        #     loss_adv = loss_adv.mean() # mean() to average on multi-gpu parallel training
        # scaler.scale(loss_adv).backward()
        # if args.fp16:
        #     with amp.scale_loss(loss_adv, optimizer) as adv_scaled_loss:
        #         adv_scaled_loss.backward()
        # else:
        #     loss_adv.backward()
        # fgm.restore()
        # if args.fp16:
        #     torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
        # else:
        #     torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        # optimizer.step()
        scaler.step(optimizer)
        # scaler.step(aux_opt)
        scaler.update()
        # optimizer.step()
        # NOTE(review): scheduler.step() runs even on steps where GradScaler
        # skipped the optimizer update (inf/NaN gradients) — confirm acceptable.
        scheduler.step() # Update learning rate schedule
        optimizer.zero_grad()
        epoch_step += 1
        # optimizer.step()
        # scheduler.step() # Update learning rate schedule
        # model.zero_grad()
        # epoch_step += 1
        # evaluate model in some steps
        if (epoch_step % args.evaluate_steps == 0) or (step == len(train_dataloader) - 1):
            val_results = evaluate(args, model, tokenizer, val_loader)
            # logger.info('evaluate f1 is {:.4f}'.format(val_results.get('f1')))
            # logger.info('***** Epoch {} Running result *****'.format(epoch+1))
            # logger.info('Training loss is {:.4f}'.format(epoch_loss/epoch_step))
            # logger.info("***** Eval results %s *****", "")
            # info = "-".join([f' {key}: {value:.4f} ' for key, value in val_results.items()])
            # logger.info(info)
            # Keep only the best checkpoint (by validation F1).
            if max_f1 < val_results.get('f1'):
                max_f1 = val_results.get('f1')
                # logger.info('Epoch {} Training loss is {:.4f}'.format(epoch+1, epoch_loss/epoch_step))
                logger.info("***** Eval results %s *****", "")
                info = "-".join([f' {key}: {value:.4f} ' for key, value in val_results.items()])
                logger.info(info)
                # Save best model checkpoint
                output_dir = os.path.join(args.output_dir, args.model_type)
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                # Save weights of the network
                model_to_save = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
                # model_checkpoint = {'epoch': epoch + 1,
                #                     'state_dict': model_to_save.state_dict(),
                #                     'optim_state_dict': optimizer.state_dict(),
                #                     'scheduler_dict': scheduler.state_dict(),
                #                     }
                # model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
                model_file_path = os.path.join(output_dir, 'qa-best.bin')
                torch.save(model_to_save.state_dict(), model_file_path)
                logger.info("Saving best model checkpoint to %s", output_dir)
    # if 'cuda' in str(args.device):
    #     torch.cuda.empty_cache()
    return max_f1
Evaluate:
def evaluate(args, model, tokenizer, val_loader, prefix=""):
    """Evaluate the QA model on ``val_loader`` and return SQuAD-style metrics.

    Runs a forward pass (no gradients) over the validation set, collects
    per-feature start/end logits into ``SquadResult`` objects, converts them
    into text predictions via ``compute_predictions`` (or the extended variant
    for XLNet/XLM), and scores them with ``squad_evaluate``.

    Returns:
        The dict produced by ``squad_evaluate`` (expected to contain 'f1').
    """
    # Dataset is expected to expose the pre-tokenized features and the raw examples.
    features = val_loader.dataset.features
    examples = val_loader.dataset.examples
    # args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    # eval_sampler = SequentialSampler(dataset)
    # eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
    # multi-gpu evaluate
    # if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
    #     model = torch.nn.DataParallel(model)
    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info(" Num examples = %d", len(val_loader)*args.eval_batch_size)
    # logger.info(" Batch size = %d", args.eval_batch_size)
    all_results = []
    # start_time = timeit.default_timer()
    model.eval()
    # for batch in tqdm(val_loader, desc="Evaluating"):
    for batch in val_loader:
        # Redundant with the model.eval() above, but harmless.
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {'input_ids':batch[0], 'attention_mask':batch[1],
                      'token_type_ids':batch[2], }
                      # 'start_positions':batch[3],
                      # 'end_positions':batch[4],}
                      # 'answerable_label':batch[5]}
            if args.model_type in ["xlm", "roberta", "distilbert", "camembert", "bart", "longformer"]:
                del inputs["token_type_ids"]
            # NOTE(review): assumes batch[6] holds the per-feature unique_ids —
            # verify against the dataset script (training uses batch[6] as
            # cls_index for xlnet/xlm, so eval batches must be laid out differently).
            batch_unique_id = batch[6]
            # XLNet and XLM use more arguments for their predictions
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[4], "p_mask": batch[7]})
                # for lang_id-sensitive xlm models
                if hasattr(model, "config") and hasattr(model.config, "lang2id"):
                    inputs.update(
                        {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)}
                    )
            outputs = model(**inputs)
        # Collect one SquadResult per feature in the batch.
        for i, unique_id in enumerate(batch_unique_id):
            # eval_feature = features[example_indice]
            # unique_id = int(eval_feature.unique_id)
            unique_id = int(unique_id.item())
            # Only the first two outputs (start/end logits) are used.
            output = [output[i].detach().to('cpu').tolist() for output in outputs[:2]]
            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
            # models only use two.
            if args.model_type in ["xlnet", "xlm"]:
                start_logits = output[0]
                # start_top_index = output[1]
                end_logits = output[1]
                # end_top_index = output[3]
                # cls_logits = output[2]
                result = SquadResult(
                    unique_id,
                    start_logits,
                    end_logits,
                    # start_top_index=start_top_index,
                    # end_top_index=end_top_index,
                    cls_logits=None,
                )
            else:
                start_logits = output[0]
                end_logits = output[1]
                # cls_logits = output[2]
                result = SquadResult(unique_id, start_logits, end_logits)
            all_results.append(result)
    # evalTime = timeit.default_timer() - start_time
    # logger.info(" Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))
    # Compute predictions.  File outputs are disabled (None) — predictions are
    # kept in memory only.
    output_prediction_file=None
    output_nbest_file=None
    # output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
    # output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = None
        # output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None
    # XLNet and XLM use a more complex post-processing procedure
    if args.model_type in ["xlnet", "xlm"]:
        start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
        end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top
        predictions = compute_predictions_extended(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            start_n_top,
            end_n_top,
            args.version_2_with_negative,
            tokenizer,
            args.verbose_logging
        )
    else:
        # predictions is a dict: {qid: [pred_text, start_logits, end_logits, start_index, end_index]}
        predictions, nbest_predictions = compute_predictions(
            examples,
            features,
            all_results,
            args.n_best_size,
            args.max_answer_length,
            args.do_lower_case,
            output_prediction_file,
            output_nbest_file,
            output_null_log_odds_file,
            args.verbose_logging,
            args.version_2_with_negative,
            args.null_score_diff_threshold,
            tokenizer
        )
    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions, tokenizer)
    return results
The training loop function:
def train_loop(args, model, tokenizer, optimizer, scheduler, train_dataloader, val_dataloader):
    """Run training and validation for ``args.num_train_epochs`` epochs.

    Seeds all RNGs once, then delegates each epoch to ``train_and_evaluate``,
    which tracks and returns the best validation F1 seen so far and saves the
    best checkpoint.  Frees cached GPU memory after every epoch when running
    on CUDA.
    """
    seed_everything(args.seed)
    max_f1 = 0.0
    # global_steps = 0
    for epoch in range(int(args.num_train_epochs)):
        logger.info('******************** Epoch {} Running Start! ********************'.format(epoch+1))
        max_f1 = train_and_evaluate(args, model, tokenizer, optimizer, scheduler,
                                    train_dataloader, val_dataloader, epoch, max_f1)
        # Experiment difference: experiment 1 additionally ran this epoch-end
        # evaluation; experiment 2 did not.
        # last_evaluate_results = evaluate(args, model, tokenizer, val_dataloader)
        # logger.info('The last step evaluate f1 is {:.4f}'.format(last_evaluate_results.get('f1')))
        logger.info('The best F1-score is {:.4f}'.format(max_f1))
        logger.info('******************** Epoch {} Running End! ********************'.format(epoch+1))
        if 'cuda' in str(args.device):
            torch.cuda.empty_cache()