Hi @mrshenli, @TT_YY, I hope you are well. I am using the code below to fine-tune GPT-2, and I use DDP for both the training and the validation part. I have two questions. First, I need to plot the training and validation loss curves: how can I aggregate the losses (with all_gather or all_reduce) so that I get a single value per epoch for each curve? Second, the size of the model that I reload is different from the initial one; why does this happen, and is there an issue with how I save the model? Many thanks for your help.
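For the first question, this is roughly what I have in mind (a minimal sketch; `reduce_mean` is just a helper name I made up, and it assumes the process group from the script below is already initialised and that `avg_train_loss` / `avg_val_loss` are the per-rank means computed in the epoch loop):

import torch
import torch.distributed as dist

def reduce_mean(value, device):
    # Sum the per-rank scalar across all processes, then divide by the world size.
    t = torch.tensor([value], dtype=torch.float32, device=device)
    dist.all_reduce(t, op=dist.ReduceOp.SUM)
    return (t / dist.get_world_size()).item()

# intended use at the end of each epoch, e.g.:
#   avg_train_loss = reduce_mean(avg_train_loss, gpu_id)
#   avg_val_loss = reduce_mean(avg_val_loss, gpu_id)

Is something like this the right way to get one training value and one validation value per epoch for the graph, or should I use all_gather instead? The full training script is below.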
#!/usr/bin/env python
# coding: utf-8

import datetime
import gc
import math
import os
import random
import sys
import time

import numpy as np
import pandas as pd
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.distributed import destroy_process_group, init_process_group
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.distributed import DistributedSampler
from transformers import (AdamW, GPT2LMHeadModel, GPT2Tokenizer,
                          get_linear_schedule_with_warmup)
######################
weight_decay = 0
learning_rate = 5e-5
adam_epsilon = 1e-8
warmup_steps = 1e2
lr = 5e-5
Max_length = 400

PathData = '/home//NLP_Projects/CaseSummary_resolutionProject/Results_GPT_2/model_v4200_k_bs=16_lr=5e-05_epochs=20/'
pretrained_model = '/home///GPT_2/'

########################################
def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round(elapsed))))

################################################
class GPT2Dataset(Dataset):

    def __init__(self, txt_list, tokenizer, gpt2_type=pretrained_model, max_length=400):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []
        for txt in txt_list:
            encodings_dict = tokenizer('<|startoftext|>' + txt + '<|endoftext|>',
                                       truncation=True, max_length=max_length,
                                       padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

######################################################
def ddp_setup(rank, world_size):
    """
    Args:
        rank: Unique identifier of each process
        world_size: Total number of processes
    """
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "12355"
    os.environ['CUDA_VISIBLE_DEVICES'] = "0,1,2"
    init_process_group(backend="nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

#########################################################
def main(rank: int, world_size: int, save_every: int, total_epochs: int, batch_size: int):
    gpu_id = rank
    Path = '/home//NLP_Projects/CaseSummary_resolutionProject/Results_GPT_2/multipleGPU/model_v4'\
           'data_' + str(200) + '_k' + '_' + 'bs=' + str(batch_size) + '_lr=' + str(learning_rate) + '_epochs=' + str(total_epochs)
    print(Path)
    Results_Path = Path + '/Results/'
    if not os.path.isdir(Results_Path):
        os.makedirs(Results_Path)

    ### set the random seeds ###############
    seed_val = 42
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    ###############################
    ddp_setup(rank, world_size)

    ###############################
    tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model, bos_token='<|startoftext|>',
                                              eos_token='<|endoftext|>', pad_token='<|pad|>')  # gpt2-small
    model = GPT2LMHeadModel.from_pretrained(pretrained_model)
    model.resize_token_embeddings(len(tokenizer))

    ## loading the train and validation datasets
    print(PathData)
    trains_titles = pd.read_csv(PathData + '/' + 'traindata.csv')
    valid_titles = pd.read_csv(PathData + '/' + 'validdata.csv')
    trains_titles = trains_titles.drop(columns=['Unnamed: 0'])['0'].iloc[:200]
    valid_titles = valid_titles.drop(columns=['Unnamed: 0'])['0'].iloc[:30]
    print(trains_titles.head(2))

    train_dataset = GPT2Dataset(trains_titles, tokenizer, max_length=Max_length)
    Val_dataset = GPT2Dataset(valid_titles, tokenizer, max_length=Max_length)

    ############################################################################
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               pin_memory=True,
                                               shuffle=False,
                                               sampler=DistributedSampler(train_dataset))

    validation_loader = torch.utils.data.DataLoader(dataset=Val_dataset,
                                                    batch_size=batch_size,
                                                    pin_memory=True,
                                                    shuffle=False,
                                                    sampler=DistributedSampler(Val_dataset))

    total_steps = len(train_loader) * total_epochs

    ################# define optimizer and scheduler #########################
    # Note: AdamW is a class from the huggingface library (as opposed to pytorch).
    optimizer = AdamW(model.parameters(), lr=learning_rate, eps=adam_epsilon)

    # Create the learning rate scheduler.
    # This changes the learning rate as the training loop progresses.
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_steps,
                                                num_training_steps=total_steps)

    ############################## train_loader and validation_loader ######################
    training_steps_per_epoch = len(train_loader)
    total_num_training_steps = int(training_steps_per_epoch * total_epochs)

    ######################## applying DDP on the model for training ############################
    model = model.to(gpu_id)
    model = DDP(model, device_ids=[gpu_id])
    print("gpu_id", gpu_id)

    # ========================================
    #               Training
    # ========================================
    training_stats = []
    for epoch_i in range(0, total_epochs):

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, total_epochs))
        print('Training...')

        ##########################################
        # set_epoch so the DistributedSampler reshuffles differently each epoch
        train_loader.sampler.set_epoch(epoch_i)
        b_sz = len(next(iter(train_loader))[0])
        print(f"[GPU{gpu_id}] Epoch {epoch_i} | Batchsize: {b_sz} | Steps: {len(train_loader)}")
        ##########################################

        t0 = time.time()
        total_train_loss = 0
        model.train()

        for step, batch in enumerate(train_loader):

            #################################
            b_input_ids = batch[0].to(gpu_id, non_blocking=True)
            b_labels = batch[0].to(gpu_id, non_blocking=True)
            b_masks = batch[1].to(gpu_id, non_blocking=True)
            #################################

            optimizer.zero_grad()
            outputs = model(b_input_ids,
                            labels=b_labels,
                            attention_mask=b_masks,
                            token_type_ids=None)
            loss = outputs[0]
            batch_loss = loss.item()
            total_train_loss += batch_loss
            # print("total_train_loss", total_train_loss)

            loss.backward()
            optimizer.step()
            scheduler.step()

        # Calculate the average loss over all of the batches (per rank, not yet reduced across GPUs).
        avg_train_loss = total_train_loss / len(train_loader)
        del total_train_loss
        del batch_loss

        # Measure how long this epoch took.
        training_time = format_time(time.time() - t0)
        # ========================================
        #               Validation
        # ========================================

        print("")
        print("Running Validation...")

        avg_val_loss_1 = []
        t0 = time.time()

        #################### is this section correct for validation? #############
        model.eval()
        # The model is already wrapped in DDP above, so it is not wrapped again here.
        ########################################

        total_eval_loss = 0

        ########################################
        validation_loader.sampler.set_epoch(epoch_i)
        b_sz = len(next(iter(validation_loader))[0])
        print("bz", b_sz)
        print(f"[GPU{gpu_id}] Epoch {epoch_i} | Batchsize: {b_sz} | Steps: {len(validation_loader)}")
        ###########################################

        # Evaluate data for one epoch
        for batch in validation_loader:

            b_input_ids = batch[0].to(gpu_id, non_blocking=True)
            b_labels = batch[0].to(gpu_id, non_blocking=True)
            b_masks = batch[1].to(gpu_id, non_blocking=True)

            with torch.no_grad():
                outputs = model.module(b_input_ids, attention_mask=b_masks, labels=b_labels)
                loss = outputs[0]

            batch_loss = loss.item()
            # print("here batch loss", batch_loss)
            total_eval_loss += batch_loss

        # Per-rank average validation loss (not yet reduced across GPUs).
        avg_val_loss = total_eval_loss / len(validation_loader)
        # print("here total_eval_loss=", total_eval_loss)

        perplexity = math.exp(avg_val_loss)
        avg_val_loss_1.append(avg_val_loss)

        validation_time = format_time(time.time() - t0)
        del total_eval_loss

        print("  Validation Loss: {0:.2f}".format(avg_val_loss))
        print("  Validation took: {:}".format(validation_time))

        # Record all statistics from this epoch.
        training_stats.append(
            {
                'epoch': epoch_i + 1,
                'Training Loss': avg_train_loss,
                'Valid. Loss': avg_val_loss,
                'Training Time': training_time,
                'Validation Time': validation_time,
                'perplexity': perplexity
            }
        )

        gc.collect()
        ################### saving the model ########################
        if gpu_id == 0:
            Path2 = Results_Path + '/' + 'savemodel_epoch==' + str(epoch_i)
            if not os.path.isdir(Path2):
                os.makedirs(Path2)

            # Save only the underlying (unwrapped) model's weights, without the DDP 'module.' prefix.
            ckp = model.module.state_dict()
            torch.save(ckp, Path2 + "/checkpoint.pt")

            ############ save the results #####################
            pt_save_directory = Results_Path + '/' + 'analyticsnumber'
            if not os.path.isdir(pt_save_directory):
                os.makedirs(pt_save_directory)

            print("here", training_stats)
            Path_3 = pt_save_directory + '/' + 'training_stats=' + str(42) + ".csv"
            torch.save(training_stats, Path_3)

    #### is this a good place to put the destroy_process_group? ###########
    destroy_process_group()

#############################
if __name__ == '__main__':

    total_epochs = int(sys.argv[1])
    save_every = int(sys.argv[2])
    batch_size = int(sys.argv[3])

    # use one process per GPU, leaving one device free
    world_size = torch.cuda.device_count() - 1
    print(world_size)
    mp.spawn(main, args=(world_size, save_every, total_epochs, batch_size), nprocs=world_size, join=True)
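For the second question, this is roughly how I reload the saved checkpoint afterwards (a sketch; the checkpoint directory below is only illustrative, and `pretrained_model` and the special tokens are the same as in the script above):

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

pretrained_model = '/home///GPT_2/'
ckpt_dir = 'Results/savemodel_epoch==19'  # illustrative checkpoint folder

tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model, bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>', pad_token='<|pad|>')
model = GPT2LMHeadModel.from_pretrained(pretrained_model)
model.resize_token_embeddings(len(tokenizer))  # same resize as during training
state_dict = torch.load(ckpt_dir + '/checkpoint.pt', map_location='cpu')
model.load_state_dict(state_dict)

Is there anything in the way the checkpoint is saved above (model.module.state_dict() on rank 0 only) that could explain why the size of the reloaded model is different from the initial one?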