Hi there,
I am using my customized BERT script to train a model. However, even when I keep the same settings for the learning rate, AdamW weight decay, and number of epochs, and run on the same platform (CUDA on SageMaker) with the same torch (1.5.0) and transformers (2.11.0) versions, the loss still varies a lot between runs. This makes my different experiments not comparable.
Can someone who has run into this before, or has any ideas, please advise me on what I should do? I really want to resolve this reproducibility issue so that I can continue my experiments. Any help is greatly appreciated!
Details are below:
For example, I set epochs = 4, lr = 1e-5, and AdamW weight decay = 0.01.
For one run I got this result for the first epoch (showing only the last complete 100 batches):
2020-10-19 03:45:29,032 - utils - INFO - | epoch 1 | 1300/ 1320 batches | lr 2.261e-05 | loss 0.267 | Elapsed 0:12:29
2020-10-19 03:45:40,550 - utils - INFO - Training epoch took: 0:12:41
2020-10-19 03:45:40,550 - utils - INFO - Validating...
2020-10-19 03:46:14,588 - utils - INFO - | loss 0.019 | Elapsed 0:00:34
precision recall f1-score support
False 0.906472 0.979875 0.941745 2087.000000
True 0.475000 0.152610 0.231003 249.000000
accuracy 0.891695 0.891695 0.891695 0.891695
macro avg 0.690736 0.566243 0.586374 2336.000000
weighted avg 0.860480 0.891695 0.865986 2336.000000
2020-10-19 03:46:15,403 - utils - INFO - Testing...
2020-10-19 03:46:55,182 - utils - INFO - use model: 1 batch / 1319 step
precision recall f1-score support
False 0.906 0.984 0.944 2344.000
True 0.413 0.098 0.159 265.000
accuracy 0.894 0.894 0.894 0.894
macro avg 0.659 0.541 0.551 2609.000
weighted avg 0.856 0.894 0.864 2609.000
2020-10-19 03:46:55,188 - utils - INFO - best test F1 score: 0.8638224640164368
And on the second attempt I got this for the first epoch:
2020-11-07 17:08:08,821 - utils - INFO - | epoch 1 | 1300/ 1320 batches | lr 2.261e-05 | loss 0.286 | Elapsed 0:12:25
2020-11-07 17:08:20,487 - utils - INFO - Training epoch took: 0:12:37
2020-11-07 17:08:20,487 - utils - INFO - Validating...
2020-11-07 17:08:54,609 - utils - INFO - | loss 0.018 | Elapsed 0:00:34
precision recall f1-score support
False 0.893408 1.000000 0.943703 2087.000000
True 0.000000 0.000000 0.000000 249.000000
accuracy 0.893408 0.893408 0.893408 0.893408
macro avg 0.446704 0.500000 0.471852 2336.000000
weighted avg 0.798177 0.893408 0.843112 2336.000000
2020-11-07 17:08:55,313 - utils - INFO - Testing...
2020-11-07 17:09:34,934 - utils - INFO - use model: 1 batch / 1319 step
precision recall f1-score support
False 0.898 1.000 0.946 2344.000
True 0.000 0.000 0.000 265.000
accuracy 0.898 0.898 0.898 0.898
macro avg 0.449 0.500 0.473 2609.000
weighted avg 0.807 0.898 0.850 2609.000
2020-11-07 17:09:34,938 - utils - INFO - best test F1 score: 0.8503599608647853
Note that the last logged learning rate per 100 batches is the same across the two runs, while the average loss per 100 batches differs only slightly. But this results in very different predictions on the validation and test sets.
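(To illustrate what I mean, this is the kind of minimal check I could run to see whether even a single forward/backward pass repeats exactly. It uses a toy linear layer on CUDA, not my actual model, so run_once here is just a sketch:)
import torch
import torch.nn as nn

def run_once(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    model = nn.Linear(10, 1).cuda()  # toy stand-in for the real classifier
    x = torch.randn(4, 10).cuda()
    y = torch.ones(4, 1).cuda()
    loss = nn.BCEWithLogitsLoss()(model(x), y)
    loss.backward()
    return loss.item()

print(run_once(42) == run_once(42))  # False would point at non-deterministic CUDA ops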
I already set the seed in my script with the function below:
def set_seed(seed):
    """Set all seeds to make results reproducible (deterministic mode)."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
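(Side note: since my DataLoader may use worker processes, I am wondering whether each worker also needs seeding. The snippet below is only a sketch I am considering, not part of my current script; seed_worker and the loader arguments are hypothetical:)
def seed_worker(worker_id):
    # each worker process derives its seed from torch's initial seed for that process
    worker_seed = torch.initial_seed() % 2 ** 32
    np.random.seed(worker_seed)
    random.seed(worker_seed)
# hypothetical usage:
# train_data_loader = DataLoader(train_dataset, batch_size=32, shuffle=True,
#                                num_workers=4, worker_init_fn=seed_worker)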
And my model script is like this:
class ReviewClassification(BertPreTrainedModel):
def __init__(self, config,
add_agent_text, agent_text_heads):
"""
        :param config: BERT configuration; can set parameters such as output_attentions, output_hidden_states
        :param add_agent_text: whether and how to use the agent text feature.
            It can take three values: None, "concat", and "attention"
        :param agent_text_heads: number of heads in the agent attention mechanism.
            Only used if add_agent_text is set to "attention"
"""
super().__init__(config)
# self.num_labels = 2
self.add_agent_text = add_agent_text
self.bert = BertModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
embedding_size = config.hidden_size
if self.add_agent_text == "concat":
embedding_size = 2 * embedding_size
elif self.add_agent_text == "attention":
self.agent_attention = nn.MultiheadAttention(embedding_size, num_heads=agent_text_heads)
else:
# don't use the information in Agent text
pass
        # single output logit for binary classification; alternatively nn.Linear(embedding_size, len(LABEL_NAME))
        # (bias=False would make the layer not learn an additive bias)
        self.classifier = nn.Linear(embedding_size, 1)
self.init_weights()
print(
"""
add agent text :{}
agent text multi-head :{}
""".format(self.add_agent_text, agent_text_heads)
)
def forward(
self,
review_input_ids=None,
review_attention_mask=None,
review_token_type_ids=None,
agent_input_ids=None,
agent_attention_mask=None,
agent_token_type_ids=None,
labels=None,
):
review_outputs = self.bert(
review_input_ids,
attention_mask=review_attention_mask,
token_type_ids=review_token_type_ids,
position_ids=None,
head_mask=None,
inputs_embeds=None,
)
if self.add_agent_text is not None:
# means that self.add_agent_text is "concat" or "attention"
            # TODO: we could try a separate (non-shared) BERT encoder for agent_outputs
agent_outputs = self.bert(
agent_input_ids,
attention_mask=agent_attention_mask,
token_type_ids=agent_token_type_ids,
position_ids=None,
head_mask=None,
inputs_embeds=None,
)
if self.add_agent_text == "attention":
review_hidden_states = review_outputs[0].transpose(0, 1) # before trans: (bs, seq_len, hidden_size)
# want to take it as query, we need the it has the shape (#target_seq_len, batch_size, embedding_size)
agent_hidden_states = agent_outputs[0].mean(axis=1).unsqueeze(dim=0) # (1, batch_size, hidden_size)
attn_output, _ = self.agent_attention(agent_hidden_states, review_hidden_states, review_hidden_states)
feature = attn_output.squeeze() # (batch_size, seq_len)
else:
feature = review_outputs[1] # (batch_size, seq_len) -? Should it be (batch_size, hidden_size)
if self.add_agent_text == "concat":
feature = torch.cat([feature, agent_outputs[1]], axis=1)
        logits = self.classifier(feature).squeeze(-1)  # (batch_size,); squeeze only the label dim so batch_size=1 still works
outputs = (logits,) # + outputs[2:] # add hidden states and attention if they are here
if labels is not None:
            loss_fct = nn.BCEWithLogitsLoss()  # pos_weight=... could be passed here to reweight the positive class
loss = loss_fct(logits, labels)
outputs = (loss,) + outputs
return outputs # (loss, logits, hidden_states, attentions)
The loss is calculated using BCEWithLogitsLoss() from torch.nn.
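(As a quick sanity check of what that loss computes, here is a minimal, self-contained example with made-up tensors showing that BCEWithLogitsLoss is just a sigmoid fused with binary cross-entropy:)
import torch
import torch.nn as nn

logits = torch.tensor([0.3, -1.2, 2.0])  # raw scores, shape (batch_size,)
labels = torch.tensor([1.0, 0.0, 1.0])   # float targets, same shape
loss = nn.BCEWithLogitsLoss()(logits, labels)
manual = nn.BCELoss()(torch.sigmoid(logits), labels)
assert torch.isclose(loss, manual)  # numerically equivalent, but the fused version is more stable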
The train, validation, and test part of the script is below:
import time
import pickle
from path import Path
import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support, classification_report, confusion_matrix
import torch
import torch.nn as nn
from utils import LABEL_NAME, isnotebook, set_seed, format_time
if isnotebook():
from tqdm.notebook import tqdm
else:
from tqdm import tqdm
def model_train(model, train_data_loader, valid_data_loader, test_data_loader,
logger, optimizer, scheduler, num_epochs, seed, out_dir):
# move model to gpu
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
    # count GPUs outside the if-branch: num_gpus is referenced later even on a single-GPU machine
    num_gpus = torch.cuda.device_count()
    if num_gpus > 1:
        model = nn.DataParallel(model)
    logger.info("Let's use {} GPUs!".format(num_gpus))
# Set the seed value all over the place to make this reproducible.
set_seed(seed=seed)
# We'll store a number of quantities such as training and validation loss,
# validation accuracy, and timings.
training_stats = []
print_interval = 100
# Measure the total training time for the whole run.
total_t0 = time.time()
batch_size = train_data_loader.batch_size
num_batch = len(train_data_loader)
best_f1_score = {
"weighted": 0,
"averaged": 0
}
best_test_f1_score = 0
# For each epoch...
for epoch_i in range(0, num_epochs):
# ========================================
# Training
# ========================================
# Perform one full pass over the training set.
logger.info("")
logger.info('======== Epoch {:} / {:} ========'.format(epoch_i + 1, num_epochs))
logger.info('Training...')
# Reset the total loss for this epoch.
total_train_loss = 0
# Measure how long the training epoch takes.
t_train = time.time()
model.train()
# For each batch of training data...
for step, batch in tqdm(enumerate(train_data_loader), desc="Training Iteration", total=num_batch):
# Progress update every 100 batches.
            if step % print_interval == 0 and step != 0:
# Calculate elapsed time in minutes.
elapsed = format_time(time.time() - t_train)
avg_train_loss = total_train_loss / print_interval
# Report progress.
logger.info('| epoch {:3d} | {:5d}/{:5d} batches | lr {:.3e} | loss {:5.3f} | Elapsed {:s}'.format(
epoch_i+1, step, num_batch, scheduler.get_last_lr()[0], avg_train_loss, elapsed)
)
total_train_loss = 0
training_stats.append(
{
'epoch': epoch_i + 1,
'step': step,
'train loss': avg_train_loss,
}
)
# Unpack this training batch from our dataloader.
#
# As we unpack the batch, we'll also copy each tensor to the GPU using the
# `to` method.
#
            # `batch` contains the review and agent encodings plus the label:
            #   "review_input_ids", "review_attention_mask", "review_token_type_ids",
            #   "agent_input_ids", "agent_attention_mask", "agent_token_type_ids",
            #   "binarized_label"
b_review_input_ids = batch["review_input_ids"].to(device)
b_review_attention_mask = batch["review_attention_mask"].to(device)
b_review_token_type_ids = batch["review_token_type_ids"].to(device)
b_agent_input_ids = batch["agent_input_ids"].to(device)
b_agent_attention_mask = batch["agent_attention_mask"].to(device)
b_agent_token_type_ids = batch["agent_token_type_ids"].to(device)
b_binarized_label = batch["binarized_label"].to(device)
model.zero_grad()
(loss, _) = model(review_input_ids=b_review_input_ids,
review_attention_mask=b_review_attention_mask,
review_token_type_ids=b_review_token_type_ids,
agent_input_ids=b_agent_input_ids,
agent_attention_mask=b_agent_attention_mask,
agent_token_type_ids=b_agent_token_type_ids,
labels=b_binarized_label
)
# Accumulate the training loss over all of the batches so that we can
# calculate the average loss at the end. `loss` is a Tensor containing a
# single value; the `.item()` function just returns the Python value
# from the tensor.
if num_gpus > 1:
total_train_loss += loss.mean().item()
loss.mean().backward() # use loss.mean().backward() instead of loss.backward() for multiple gpu trainings
else:
total_train_loss += loss.item()
loss.backward()
# Clip the norm of the gradients to 1.0.
# This is to help prevent the "exploding gradients" problem.
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
# Update parameters and take a step using the computed gradient.
# The optimizer dictates the "update rule"--how the parameters are
# modified based on their gradients, the learning rate, etc.
optimizer.step()
scheduler.step()
# End of training epoch
# Measure how long this epoch took.
training_time = format_time(time.time() - t_train)
logger.info("")
logger.info(" Training epoch took: {:s}".format(training_time))
# evaluate the model after one epoch.
# ========================================
# Validation
# ========================================
# After the completion of each training epoch, measure our performance on
# our validation set.
logger.info("")
logger.info("Validating...")
t_valid = time.time()
model.eval()
ave_valid_loss, valid_f1_table, cm_table, f1_score = model_validate(model=model, data_loader=valid_data_loader)
# Measure how long this epoch took.
validation_time = format_time(time.time() - t_valid)
logger.info("")
logger.info('| loss {:5.3f} | Elapsed {:s}'.format(ave_valid_loss, validation_time))
logger.info(" \n{:s}".format(valid_f1_table.to_string()))
logger.info("")
logger.info(" \n{:s}".format(cm_table.to_string()))
# need to store the best model
for key in best_f1_score.keys():
if best_f1_score[key] < f1_score[key]:
# remove the old model:
file_list = [f for f in out_dir.files() if f.name.endswith(".pt") and f.name.startswith(key)]
for f in file_list:
Path.remove(f)
model_file = out_dir.joinpath('{:s}_epoch_{:02d}-f1_{:.3f}.pt'.format(
key, epoch_i + 1, f1_score[key])
)
best_f1_score[key] = f1_score[key]
if num_gpus > 1:
torch.save(model.module.state_dict(), model_file)
else:
torch.save(model.state_dict(), model_file)
# ========================================
# Test
# ========================================
logger.info("")
logger.info("Testing...")
result_df = model_test(model=model, data_loader=test_data_loader)
y_true = np.array(result_df["review_label"], dtype=np.bool) # This part may need double check
y_pred = result_df["Probability"] > 0.5
report = classification_report(y_true, y_pred, output_dict=True)
metrics_df = pd.DataFrame(report).transpose()
metrics_df = metrics_df.sort_index()
weighted_f1_score = metrics_df.loc['weighted avg', 'f1-score']
averaged_f1_score = metrics_df.loc['macro avg', 'f1-score']
best_test_f1_score = metrics_df.loc['weighted avg', 'f1-score'] \
if best_test_f1_score < metrics_df.loc['weighted avg', 'f1-score'] else best_test_f1_score
metrics_df = metrics_df.astype(float).round(3)
# Calculate confusion matrix
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        cm_df = pd.DataFrame(columns=['Predicted No', 'Predicted Yes'],
                             index=['Actual No', 'Actual Yes'])
        # fill in the confusion-matrix counts row by row
        cm_df.loc['Actual No'] = [tn, fp]
        cm_df.loc['Actual Yes'] = [fn, tp]
logger.info("use model: {} batch / {} step".format(epoch_i + 1, step))
logger.info("\n" + "=" * 50)
logger.info("\n" + metrics_df.to_string())
logger.info("\n" + "=" * 50)
logger.info("\n" + cm_df.to_string())
logger.info("best test F1 score: {}".format(best_test_f1_score))
logger.info("\n" + "=" * 50)
# Below is to save the result files
result_filename = "result_df_epoch_" + str(epoch_i + 1) + ".xlsx"
result_df.to_excel(out_dir.joinpath(result_filename), index=False)
logger.info("")
logger.info("Training complete!")
logger.info("Total training took {:} (h:mm:ss)".format(format_time(time.time() - total_t0)))
# Save training_stats to csv file
pd.DataFrame(training_stats).to_csv(out_dir.joinpath("model_train.log"), index=False)
return model, optimizer, scheduler
def model_validate(model, data_loader):
# Put the model in evaluation mode--the dropout layers behave differently
# during evaluation.
model.eval()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
    if torch.cuda.device_count() > 1 and not isinstance(model, nn.DataParallel):
        model = nn.DataParallel(model)  # avoid double-wrapping when called from model_train
label_prop = data_loader.dataset.dataset.label_prop()
total_valid_loss = 0
batch_size = data_loader.batch_size
num_batch = len(data_loader)
y_pred, y_true = [], []
# Evaluate data
for step, batch in tqdm(enumerate(data_loader), desc="Validation...", total=num_batch):
b_review_input_ids = batch["review_input_ids"].to(device)
b_review_attention_mask = batch["review_attention_mask"].to(device)
b_review_token_type_ids = batch["review_token_type_ids"].to(device)
b_agent_input_ids = batch["agent_input_ids"].to(device)
b_agent_attention_mask = batch["agent_attention_mask"].to(device)
b_agent_token_type_ids = batch["agent_token_type_ids"].to(device)
b_binarized_label = batch["binarized_label"].to(device)
# Tell pytorch not to bother with constructing the compute graph during
# the forward pass, since this is only needed for backprop (training).
with torch.no_grad():
(loss, logits,) = model(review_input_ids=b_review_input_ids,
review_attention_mask=b_review_attention_mask,
review_token_type_ids=b_review_token_type_ids,
agent_input_ids=b_agent_input_ids,
agent_attention_mask=b_agent_attention_mask,
agent_token_type_ids=b_agent_token_type_ids,
labels=b_binarized_label)
total_valid_loss += loss.item()
            ### The sigmoid is used for two-class (binary) logistic regression,
            ### whereas the softmax is used for multiclass logistic regression.
# Version 1
# numpy_probas = logits.detach().cpu().numpy()
# y_pred.extend(np.argmax(numpy_probas, axis=1).flatten())
# y_true.extend(b_binarized_label.cpu().numpy())
# Version 2
# transfored_logits = F.log_softmax(logits,dim=1)
# numpy_probas = transfored_logits.detach().cpu().numpy()
# y_pred.extend(np.argmax(numpy_probas, axis=1).flatten())
# y_true.extend(b_binarized_label.cpu().numpy())
# Version 3
# transfored_logits = torch.sigmoid(logits)
# numpy_probas = transfored_logits.detach().cpu().numpy()
# y_pred.extend(np.argmax(numpy_probas, axis=1).flatten())
# y_true.extend(b_binarized_label.cpu().numpy())
            # New version - for num_labels = 1
            transformed_logits = torch.sigmoid(logits)
            numpy_probas = transformed_logits.detach().cpu().numpy()
            y_pred.extend(numpy_probas)
            y_true.extend(b_binarized_label.cpu().numpy())
# End of an epoch of validation
# put model to train mode again.
model.train()
    # note: `loss` is already a per-batch mean, so this also divides by batch_size;
    # that is why the logged validation loss is on a much smaller scale than the training loss
    ave_loss = total_valid_loss / (num_batch * batch_size)
y_pred = np.array(y_pred)
y_pred[y_pred < 0.5] = 0
y_pred[y_pred >= 0.5] = 1
# Below is in case the input and target are not the same data format
y_pred = np.array(y_pred, dtype=np.bool)
y_true = np.array(y_true, dtype=np.bool)
# compute the various f1 score for each label
report = classification_report(y_true, y_pred, output_dict=True)
metrics_df = pd.DataFrame(report).transpose()
# metrics_df = pd.DataFrame(0, index=LABEL_NAME, columns=["Precision", "Recall", "F1","support"])
# metrics_df.Precision = precision_recall_fscore_support(y_true, y_pred)[0]
# metrics_df.Recall = precision_recall_fscore_support(y_true, y_pred)[1]
# metrics_df.F1 = precision_recall_fscore_support(y_true, y_pred)[2]
# metrics_df.support = precision_recall_fscore_support(y_true, y_pred)[3]
# y_pred = np.array(y_pred)
# y_pred[y_pred < 0] = 0
# y_pred[y_pred > 0] = 1
# y_pred = np.array(y_pred, dtype=np.bool)
# y_true = np.array(y_true, dtype=np.bool)
# metrics_df = pd.DataFrame(0, index=LABEL_NAME, columns=["Precision", "Recall", "F1"], dtype=np.float)
# # or_y_pred = np.zeros(y_pred.shape[0], dtype=np.bool)
# # or_y_true = np.zeros(y_true.shape[0], dtype=np.bool)
# for i in range(len(LABEL_NAME)):
# metrics_df.iloc[i] = precision_recall_fscore_support(
# y_true=y_true[:, i], y_pred=y_pred[:, i], average='binary', zero_division=0)[0:3]
# or_y_pred = or_y_pred | y_pred[:, i]
# or_y_true = or_y_true | y_true[:, i]
metrics_df = metrics_df.sort_index()
# metrics_df.loc['Weighted Average'] = metrics_df.transpose().dot(label_prop)
# metrics_df.loc['Average'] = metrics_df.mean()
# metrics_df.loc['Weighted Average', 'F1'] = 2 / (1/metrics_df.loc['Weighted Average', "Recall"] +
# 1/metrics_df.loc['Weighted Average', "Precision"])
# metrics_df.loc['Average', 'F1'] = 2 / (1/metrics_df.loc['Average', "Recall"] +
# 1/metrics_df.loc['Average', "Precision"])
weighted_f1_score = metrics_df.loc['weighted avg', 'f1-score']
averaged_f1_score = metrics_df.loc['macro avg', 'f1-score']
# Calculate confusion matrix
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    cm_df = pd.DataFrame(columns=['Predicted No', 'Predicted Yes'],
                         index=['Actual No', 'Actual Yes'])
    # fill in the confusion-matrix counts row by row
    cm_df.loc['Actual No'] = [tn, fp]
    cm_df.loc['Actual Yes'] = [fn, tp]
# pooled_f1_score = f1_score(y_pred=or_y_pred, y_true=or_y_true)
    return ave_loss, metrics_df, cm_df, {
        "weighted": weighted_f1_score,
        "averaged": averaged_f1_score,
    }
def model_test(model, data_loader):
# Put the model in evaluation mode--the dropout layers behave differently
# during evaluation.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.eval()
model.to(device)
    if torch.cuda.device_count() > 1 and not isinstance(model, nn.DataParallel):
        model = nn.DataParallel(model)  # avoid double-wrapping when called from model_train
num_batch = len(data_loader)
    # the lists below need to be modified if the model inputs change
review_id, review_label, hmd_text, head_cust_text = [], [], [], []
agent = []
pred_logits = []
# Evaluate data
for step, batch in tqdm(enumerate(data_loader), desc="Inference...", total=num_batch):
if "anecdote_lead_final" in batch.keys():
review_label.extend(batch["anecdote_lead_final"])
review_id.extend(batch["_id"].tolist())
hmd_text.extend(batch["hmd_comments"])
head_cust_text.extend(batch["head_cust"])
agent.extend(batch["new_transcript_agent"])
b_review_input_ids = batch["review_input_ids"].to(device)
b_review_attention_mask = batch["review_attention_mask"].to(device)
b_review_token_type_ids = batch["review_token_type_ids"].to(device)
b_agent_input_ids = batch["agent_input_ids"].to(device)
b_agent_attention_mask = batch["agent_attention_mask"].to(device)
b_agent_token_type_ids = batch["agent_token_type_ids"].to(device)
# Tell pytorch not to bother with constructing the compute graph during
# the forward pass, since this is only needed for backprop (training).
with torch.no_grad():
(logits,) = model(review_input_ids=b_review_input_ids,
review_token_type_ids=b_review_token_type_ids,
review_attention_mask=b_review_attention_mask,
agent_input_ids=b_agent_input_ids,
agent_token_type_ids=b_agent_token_type_ids,
agent_attention_mask=b_agent_attention_mask
)
if logits.detach().cpu().numpy().size == 1:
pred_logits.extend(logits.detach().cpu().numpy().reshape(1,))
else:
pred_logits.extend(logits.detach().cpu().numpy())
    # End of inference; put model back into train mode.
model.train()
    pred_logits = np.array(pred_logits)
    # sigmoid in the numerically safer form: exp(x) / (1 + exp(x)) overflows to nan for large positive logits
    pred_prob = 1.0 / (1.0 + np.exp(-pred_logits))
    pred_label = pred_prob.copy()
    pred_label[pred_label < 0.5] = 0
    pred_label[pred_label >= 0.5] = 1
    # assemble the probability and predicted-label outputs
    d = {'Probability': pred_prob, 'Anecdotes Prediction': pred_label}
    pred_df = pd.DataFrame(d, columns=['Probability', 'Anecdotes Prediction'])
result_df = pd.DataFrame(
{
"review_id": review_id,
"hmd_text": hmd_text,
"head_cust_text": head_cust_text,
"agent": agent
}
)
if len(review_label) != 0:
result_df["review_label"] = [x.item() for x in review_label]
return pd.concat([result_df, pred_df], axis=1).set_index("review_id")
The optimizer and scheduler are defined as below:
if args.full_finetuning:
param_optimizer = list(model.named_parameters())
    # note: in HF transformers the LayerNorm parameters are named "LayerNorm.weight"/"LayerNorm.bias"
    # (not the TF-style "gamma"/"beta"), and the AdamW param-group key is "weight_decay",
    # not "weight_decay_rate" (an unknown key is silently ignored)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': args.decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:
param_optimizer = list(model.classifier.named_parameters())
optimizer_grouped_parameters = [
{"params": [p for n, p in param_optimizer]}
]
# Note: AdamW is a class from the huggingface library (as opposed to pytorch)
optimizer = AdamW(optimizer_grouped_parameters, # or param_optimizer
lr=args.lr, # args.learning_rate - default is 5e-5, our notebook had 1e-5
eps=1e-8) # args.adam_epsilon - default is 1e-8.
# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
optimizer,
num_warmup_steps=0,
num_training_steps=len(data_loader["train"]) * args.num_epochs
)
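(For reference, my understanding of what get_linear_schedule_with_warmup does with num_warmup_steps=0 is that the learning rate simply decays linearly from args.lr to 0 over the total number of steps. A small sketch of the multiplier it applies at each optimizer step, written from the documented behavior rather than the library source:)
def linear_schedule_multiplier(step, num_warmup_steps, num_training_steps):
    # factor multiplied onto the base lr at a given optimizer step
    if step < num_warmup_steps:
        return step / max(1, num_warmup_steps)
    return max(0.0, (num_training_steps - step) / max(1, num_training_steps - num_warmup_steps))
# e.g. with 1320 batches/epoch and 4 epochs (5280 steps), the multiplier is 1.0 at step 0
# and 0.75 after the first epoch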
And to run the model, I use the script below:
model_train(model=model, train_data_loader=data_loader["train"], valid_data_loader=data_loader["valid"],
test_data_loader=data_loader["test"], optimizer=optimizer, scheduler=scheduler,
num_epochs=args.num_epochs, seed=args.seed, logger=logger, out_dir=out_dir)