Hi there, I've been using the Trainer class from Hugging Face to train my BERT models. I recently started asking myself what the point of setting per_device_eval_batch_size
in TrainingArguments is, and what the results returned by compute_metrics
actually mean.
Here’s a concrete example of what I mean:
args = TrainingArguments(
    output_dir=args_parsed.output_name,
    evaluation_strategy="epoch",
    logging_dir="../logs",
    logging_strategy="steps",
    logging_steps=10,
    per_device_train_batch_size=args_parsed.batch_size,
    per_device_eval_batch_size=args_parsed.batch_size,
    warmup_ratio=0.1,
    learning_rate=lr,
    weight_decay=1e-2,
    adam_beta1=0.9,
    adam_beta2=0.98,
    adam_epsilon=1e-6,
    max_grad_norm=0.0,
    num_train_epochs=epoch,
    disable_tqdm=True,
    report_to="none",
    seed=seed,
)
import numpy as np
from seqeval.metrics import precision_score, recall_score, f1_score
from seqeval.scheme import IOBES

def compute_metrics(p):
    predictions, labels = p
    # pick the highest-scoring label id for each token
    predictions = np.argmax(predictions, axis=2)
    # map label ids back to tag names, sorted by id so positions line up with the ids
    label_list = [tag for tag, _ in sorted(tag2idx.items(), key=lambda kv: kv[1])]
    # skip positions labelled -100 (special tokens / padding)
    true_predictions = [
        [label_list[pred_id] for (pred_id, label_id) in zip(prediction, label) if label_id != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[label_id] for (pred_id, label_id) in zip(prediction, label) if label_id != -100]
        for prediction, label in zip(predictions, labels)
    ]
    # strict entity-level scores under the IOBES scheme, micro- and macro-averaged
    results = {}
    for avg in ("micro", "macro"):
        results[f"{avg}_precision"] = precision_score(true_labels, true_predictions, average=avg, mode="strict", scheme=IOBES)
        results[f"{avg}_recall"] = recall_score(true_labels, true_predictions, average=avg, mode="strict", scheme=IOBES)
        results[f"{avg}_f1"] = f1_score(true_labels, true_predictions, average=avg, mode="strict", scheme=IOBES)
    return results
trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=train_tokenized_datasets,
    eval_dataset=eval_tokenized_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[LogCallback],
)
Note that I only pasted the important parts of the code so as not to clutter the whole page with it.
Does the Trainer report evaluation results only for the final batch of size args_parsed.batch_size,
or does compute_metrics receive the predictions for the whole evaluation dataset, so that the reported metrics cover the entire eval set?
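One way I could imagine checking this myself would be to print the shape of the predictions that compute_metrics receives and compare it to the size of the eval set. Just a hypothetical debugging sketch (compute_metrics_debug is not part of my actual code, and I'm assuming the predictions arrive as a single NumPy array):
def compute_metrics_debug(p):
    predictions, labels = p
    # if predictions.shape[0] equals len(eval_tokenized_datasets), the metrics are
    # computed over the whole eval set rather than over just the last batch
    print("predictions:", predictions.shape, "eval set size:", len(eval_tokenized_datasets))
    return {}
Still, I'd like to understand what the intended behaviour is.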