# This is the code for re-training the model.
# Call the function:
# retrain_and_fine_tune_and_save_model(pretrain_dataset_name='multi_news',
#                                      num_train_docs=5, num_val_docs=5)
def retrain_and_fine_tune_and_save_model(pretrain_dataset_name, num_train_docs, num_val_docs):
    """Fine-tune ``google/pegasus-large`` on a small slice of a dataset and save it.

    The dataset is loaded with ``load_dataset`` and must provide ``'train'``
    and ``'validation'`` splits, each with ``'document'`` and ``'summary'``
    columns. The first ``num_train_docs`` training rows and the first
    ``num_val_docs`` validation rows are tokenized, wrapped in
    ``Sum_Dataset`` and handed to a ``Trainer``.

    Args:
        pretrain_dataset_name: Dataset identifier passed to ``load_dataset``
            (e.g. ``'multi_news'``).
        num_train_docs: Number of rows taken from the start of the train split.
        num_val_docs: Number of rows taken from the start of the validation
            split.

    Returns:
        The directory the fine-tuned model was saved to (``"./pegasus_model"``).

    Raises:
        ValueError: If ``num_train_docs`` or ``num_val_docs`` is negative.
    """
    # A negative count would silently turn the slices below into
    # "everything except the last n rows", which is never what is wanted.
    if num_train_docs < 0 or num_val_docs < 0:
        raise ValueError(
            "num_train_docs and num_val_docs must be non-negative, got "
            f"{num_train_docs} and {num_val_docs}"
        )

    model_name = 'google/pegasus-large'
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)

    dataset = load_dataset(pretrain_dataset_name)
    # Slice rows first, then pick columns, so only the requested rows are
    # read instead of whole columns.
    train_rows = dataset['train'][:num_train_docs]
    # The validation split is independent of the train split, so it is read
    # from its own start rather than offset by num_train_docs.
    val_rows = dataset['validation'][:num_val_docs]
    train_texts, train_labels = train_rows['document'], train_rows['summary']
    val_texts, val_labels = val_rows['document'], val_rows['summary']

    # Source documents and target summaries are tokenized separately;
    # Sum_Dataset receives the (inputs, targets) pair for each split.
    train_encodings = tokenizer(train_texts, truncation=True, padding=True)
    train_decodings = tokenizer(train_labels, truncation=True, padding=True)
    val_encodings = tokenizer(val_texts, truncation=True, padding=True)
    val_decodings = tokenizer(val_labels, truncation=True, padding=True)
    train_dataset = Sum_Dataset(train_encodings, train_decodings)
    val_dataset = Sum_Dataset(val_encodings, val_decodings)

    # NOTE(review): save_steps / eval_steps / warmup_steps are sized for long
    # runs. With only a handful of documents and batch size 1 the run may
    # finish before any of them is reached - confirm this is intended.
    training_args = TrainingArguments(
        output_dir='./results',          # output directory for checkpoints
        num_train_epochs=2,              # total number of training epochs
        per_device_train_batch_size=1,   # can increase if memory allows
        per_device_eval_batch_size=1,    # can increase if memory allows
        save_steps=500,                  # update steps between checkpoint saves
        save_total_limit=5,              # keep at most this many checkpoints
        evaluation_strategy='steps',     # evaluate every `eval_steps` updates
        eval_steps=100,                  # update steps between evaluations
        warmup_steps=500,                # warmup steps for the LR scheduler
        weight_decay=0.01,               # strength of weight decay
        logging_dir='./logs',            # directory for storing logs
        logging_steps=10,
    )
    trainer = Trainer(
        model=model,                     # the instantiated model to be trained
        args=training_args,              # training arguments, defined above
        train_dataset=train_dataset,     # training dataset
        eval_dataset=val_dataset,        # evaluation dataset
    )
    trainer.train()

    # Save the fine-tuned weights; a separate name keeps the checkpoint id in
    # `model_name` from being overwritten by the output path.
    saved_model_dir = "./pegasus_model"
    trainer.save_model(saved_model_dir)
    return saved_model_dir