Hi @ptrblck, I hope you are well. I tried the code we discussed before for fine-tuning GPT-2 with multiple GPUs, but when I load the trained model to generate sentences the results are very strange: it just gives me back the input as is, and the expected generated continuation is only padding tokens. I would appreciate your thoughts on where the code could be going wrong: in saving and loading the model, or in training it? I really appreciate your help.
# imports used by the script below
import os
import gc
import time
import datetime
import random
import copy

import numpy as np
import pandas as pd

import torch
import torch.multiprocessing as mp
from torch.utils.data import Dataset
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group

from transformers import (GPT2Tokenizer, GPTNeoForCausalLM,
                          AdamW, get_linear_schedule_with_warmup)
######################
weight_decay=0
learning_rate=7e-5
adam_epsilon=1e-8
warmup_steps = 1e2
lr=5e-5
Max_length=400
PathData = '/home//NLP_Projects/'
pretrained_model = '/home//GPT_NEO_1.3B/'
#########################
def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round(elapsed))))
################################################
class GPT2Dataset(Dataset):

    def __init__(self, txt_list, tokenizer, gpt2_type=pretrained_model, max_length=400):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []
        for txt in txt_list:
            encodings_dict = tokenizer('<|startoftext|>' + txt + '<|endoftext|>',
                                       truncation=True,
                                       max_length=max_length,
                                       padding="max_length")
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]
######################################################
def ddp_setup(rank, world_size):
    """
    Args:
        rank: Unique identifier of each process
        world_size: Total number of processes
    """
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "12355"
    os.environ['CUDA_VISIBLE_DEVICES'] = "0,1,2"
    init_process_group(backend="nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)
#########################################################
def main(rank: int, world_size: int, save_every: int, total_epochs: int, batch_size: int):
    gpu_id = rank

    ### seed everything for reproducibility ###
    seed_val = 42
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)
    ###############################
    ddp_setup(rank, world_size)
    ###############################
    tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model,
                                              bos_token='<|startoftext|>',
                                              eos_token='<|endoftext|>',
                                              pad_token='<|pad|>')  # gpt2-small
    model_or = GPTNeoForCausalLM.from_pretrained(pretrained_model)
    model_or.resize_token_embeddings(len(tokenizer))
    ## load the train and validation datasets
    print(PathData)
    trains_titles = pd.read_csv(PathData + '/' + 'traindata.csv')
    valid_titles = pd.read_csv(PathData + '/' + 'validdata.csv')
    trains_titles = trains_titles.drop(columns=['Unnamed: 0'])['0']
    valid_titles = valid_titles.drop(columns=['Unnamed: 0'])['0']
    print(trains_titles.head(2))

    train_dataset = GPT2Dataset(trains_titles, tokenizer, max_length=Max_length)
    Val_dataset = GPT2Dataset(valid_titles, tokenizer, max_length=Max_length)
    ############################################################################
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               pin_memory=True,
                                               shuffle=False,
                                               sampler=DistributedSampler(train_dataset))
    validation_loader = torch.utils.data.DataLoader(dataset=Val_dataset,
                                                    batch_size=batch_size,
                                                    pin_memory=True,
                                                    shuffle=False,
                                                    sampler=DistributedSampler(Val_dataset))
    total_steps = len(train_loader) * total_epochs

    ################# define optimizer and scheduler #########################
    # Note: AdamW is a class from the huggingface library (as opposed to pytorch).
    optimizer = AdamW(model_or.parameters(), lr=learning_rate, eps=adam_epsilon)
    # Create the learning rate scheduler.
    # This changes the learning rate as the training loop progresses.
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_steps,
                                                num_training_steps=total_steps)

    ############################## train_loader and validation_loader #########
    training_steps_per_epoch = len(train_loader)
    total_num_training_steps = int(training_steps_per_epoch * total_epochs)

    model = copy.deepcopy(model_or)
    model = model.to(gpu_id)
    model = DDP(model, device_ids=[gpu_id])
    print("gpu_id", gpu_id)
    # ========================================
    #               Training
    # ========================================
    training_stats = []

    for epoch_i in range(0, total_epochs):

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, total_epochs))
        print('Training...')

        ##########################################
        train_loader.sampler.set_epoch(epoch_i)
        b_sz = len(next(iter(train_loader))[0])
        print(f"[GPU{gpu_id}] Epoch {epoch_i} | Batchsize: {b_sz} | Steps: {len(train_loader)}")
        ##########################################
        t0 = time.time()
        total_train_loss = 0
        model.train()
        for step, batch in enumerate(train_loader):
            # print("len(train_loader)", len(train_loader))
            #################################
            b_input_ids = batch[0].to(gpu_id, non_blocking=True)
            b_labels = batch[0].to(gpu_id, non_blocking=True)
            b_masks = batch[1].to(gpu_id, non_blocking=True)
            #################################
            optimizer.zero_grad()
            outputs = model(b_input_ids,
                            labels=b_labels,
                            attention_mask=b_masks,
                            token_type_ids=None)
            loss = outputs[0]
            batch_loss = loss.item()
            total_train_loss += batch_loss

            loss.backward()
            optimizer.step()
            scheduler.step()

        # Calculate the average loss over all of the batches.
        avg_train_loss = total_train_loss / len(train_loader)
        print("avg_train_loss", avg_train_loss)
        Path_3 = pt_save_directory + '/' + 'trainingloss=' + str(gpu_id) + str(epoch_i) + ".csv"
        torch.save(avg_train_loss, Path_3)
        del total_train_loss
        del batch_loss

        # Measure how long this epoch took.
        training_time = format_time(time.time() - t0)
        print("  Training epoch took: {:}".format(training_time))
        gc.collect()
        ################### saving the model ########################
        if gpu_id == 0:
            Path2 = Results_Path + '/' + 'savemodel_epoch==' + str(epoch_i)
            if not os.path.isdir(Path2):
                os.makedirs(Path2)
            ckp = model.module.state_dict()
            torch.save(ckp, Path2 + "/checkpoint.pt")

    destroy_process_group()
#############################
if __name__ == '__main__':
    import sys

    total_epochs = int(sys.argv[1])
    save_every = int(sys.argv[2])
    batch_size = int(sys.argv[3])

    world_size = torch.cuda.device_count() - 1
    print(world_size)

    mp.spawn(main, args=(world_size, save_every, total_epochs, batch_size), nprocs=world_size, join=True)
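For reference, the script reads total_epochs, save_every, and batch_size as positional command-line arguments, so it is launched along the lines of the following (the script name here is just a placeholder):

python train_gpt_ddp.py <total_epochs> <save_every> <batch_size>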
I load the model in this way:
pretrained_model = '/home/momenisa//GPT_2/'
model = GPTNeoForCausalLM.from_pretrained(pretrained_model)
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model) #gpt2-small
model.resize_token_embeddings(len(tokenizer))
CHECKPOINT_PATH='/home//checkpoint.pt'
model.load_state_dict(torch.load(CHECKPOINT_PATH,map_location='cpu'),strict=False)
model.eval()
and the generated results just repeat the input prompt followed by padding tokens.
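For context, generation is invoked roughly like the sketch below; the prompt text and the decoding settings are placeholders rather than my exact call, and it assumes the pad token was saved with the tokenizer:

# minimal generation sketch -- prompt and decoding settings are placeholders
prompt = '<|startoftext|>' + 'an example prompt'
encoded = tokenizer(prompt, return_tensors='pt')

with torch.no_grad():
    output_ids = model.generate(input_ids=encoded['input_ids'],
                                attention_mask=encoded['attention_mask'],
                                max_length=400,   # same Max_length as in training
                                do_sample=True,
                                top_k=50,
                                top_p=0.95,
                                pad_token_id=tokenizer.pad_token_id)  # assumes the pad token is set on the tokenizer

print(tokenizer.decode(output_ids[0], skip_special_tokens=True))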