Help CUDA error: out of memory

Hi @ptrblck, I hope you are well. I tried the code we discussed before for fine-tuning GPT-2 with multiple GPUs, but when I load the model to generate sentences the results are very strange: it just gives me back the input as-is, and the expected generated part is only padding tokens. I would appreciate your thoughts on where the code could be wrong: in saving and loading the model, or in training it? I really appreciate your help.
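
For reference, the script relies on roughly these imports (I reconstructed the list from the calls below, so it may differ slightly from my actual file):

import os
import gc
import time
import copy
import random
import datetime

import numpy as np
import pandas as pd
import torch
import torch.multiprocessing as mp
from torch.utils.data import Dataset
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group
from transformers import (GPT2Tokenizer, GPTNeoForCausalLM, AdamW,
                          get_linear_schedule_with_warmup)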

######################
weight_decay=0
learning_rate=7e-5
adam_epsilon=1e-8
warmup_steps = 1e2
lr=5e-5
Max_length=400

PathData='/home//NLP_Projects/'
pretrained_model = '/home//GPT_NEO_1.3B/'

#########################

def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round(elapsed))))

################################################
class GPT2Dataset(Dataset):

    def __init__(self, txt_list, tokenizer, gpt2_type=pretrained_model, max_length=400):

        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []

        for txt in txt_list:

            encodings_dict = tokenizer('<|startoftext|>'+ txt + '<|endoftext|>', truncation=True, max_length=max_length, padding="max_length")

            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx] 
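
To sanity-check the encoding, I decode one example after building the dataset (an illustrative snippet only; train_dataset and tokenizer are the ones created inside main() below):

# decode one encoded example to verify the special tokens and the padding
sample_ids, sample_mask = train_dataset[0]
print(tokenizer.decode(sample_ids[:20]))                      # should start with <|startoftext|>
print(int(sample_mask.sum()), "real tokens out of", len(sample_ids))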

######################################################
def ddp_setup(rank, world_size):
    """
    Args:
        rank: Unique identifier of each process
        world_size: Total number of processes
    """
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "12355"
    os.environ['CUDA_VISIBLE_DEVICES'] = "0,1,2"

    init_process_group(backend="nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)
#########################################################

def main(rank: int, world_size: int, save_every: int, total_epochs: int, batch_size: int):
    
    gpu_id=rank
    
    ### defined variable ###############
    seed_val = 42
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    ###############################
    
    ddp_setup(rank, world_size)
    
    ###############################
    
    tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model, bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>') #gpt2-small
  
    model_or = GPTNeoForCausalLM.from_pretrained(pretrained_model)

    model_or.resize_token_embeddings(len(tokenizer))

    ## load the train and test datasets
    print(PathData)
    trains_titles=pd.read_csv(PathData+'/'+'traindata.csv')
    valid_titles=pd.read_csv(PathData+'/'+'validdata.csv')
    
    trains_titles=trains_titles.drop(columns=['Unnamed: 0'])['0']
    valid_titles=valid_titles.drop(columns=['Unnamed: 0'])['0']

    print(trains_titles.head(2))
    
    train_dataset = GPT2Dataset(trains_titles, tokenizer, max_length=Max_length)

    Val_dataset = GPT2Dataset(valid_titles, tokenizer, max_length=Max_length)
    
    ############################################################################
    
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               pin_memory=True,
                                               shuffle=False,
                                               sampler=DistributedSampler(train_dataset))

    validation_loader = torch.utils.data.DataLoader(dataset=Val_dataset,
                                                    batch_size=batch_size,
                                                    pin_memory=True,
                                                    shuffle=False,
                                                    sampler=DistributedSampler(Val_dataset))
    
    
    total_steps = len(train_loader) * total_epochs


    ################# define optimizer and scheduler#########################

    # Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
    optimizer = AdamW(model_or.parameters(), lr=learning_rate, eps=adam_epsilon)


    # Create the learning rate scheduler.
    # This changes the learning rate as the training loop progresses
    scheduler = get_linear_schedule_with_warmup(optimizer, 
                                                num_warmup_steps = warmup_steps, 
                                                num_training_steps = total_steps)

    ############################## train_loader and validation_loader #########
 
    training_steps_per_epoch=len(train_loader)
    total_num_training_steps = int(training_steps_per_epoch*total_epochs)

    model = copy.deepcopy(model_or)

    model=model.to(gpu_id)
    model = DDP(model, device_ids=[gpu_id])
    print("gpu_id",gpu_id)
    # ========================================
    #               Training
    # ========================================

    training_stats = []
           

    for epoch_i in range(0, total_epochs):
        
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, total_epochs))
        print('Training...')

        ##########################################
        train_loader.sampler.set_epoch(epoch_i)
        b_sz = len(next(iter(train_loader))[0])
        print(f"[GPU{gpu_id}] Epoch {epoch_i} | Batchsize: {b_sz} | Steps: {len(train_loader)}")
        ##########################################

        t0 = time.time()

        total_train_loss = 0

        model.train()

        for step, batch in enumerate(train_loader):
        #    print("len(train_loader)",len(train_loader))
            #################################
            b_input_ids = batch[0].to(gpu_id,non_blocking=True)
            b_labels = batch[0].to(gpu_id,non_blocking=True)
            b_masks = batch[1].to(gpu_id,non_blocking=True)
            #################################

            optimizer.zero_grad()        

            outputs = model(b_input_ids,
                            labels=b_labels,
                            attention_mask=b_masks,
                            token_type_ids=None)

            loss = outputs[0]  
            batch_loss = loss.item()
            total_train_loss += batch_loss
            loss.backward()
            optimizer.step()
            scheduler.step()

        # Calculate the average loss over all of the batches.
        avg_train_loss = total_train_loss / len(train_loader)  
        print("avg_train_loss",avg_train_loss)
        Path_3=pt_save_directory+'/'+'trainingloss='+str(gpu_id)+str(epoch_i)+".csv"
        torch.save(avg_train_loss,Path_3)
        
        del total_train_loss
        del batch_loss
        
        # Measure how long this epoch took.
        training_time = format_time(time.time() - t0)
        print("  Training epoch took: {:}".format(training_time))

        gc.collect()
        
        ################### saving the model ########################

        if gpu_id == 0:
    
            Path2=Results_Path+'/'+'savemodel_epoch=='+str(epoch_i)
    
            os.makedirs(Path2, exist_ok=True)

            ckp = model.module.state_dict()
            torch.save(ckp, Path2+"/checkpoint.pt")

    
    destroy_process_group()
    #############################
if __name__ == '__main__':
    import sys
    total_epochs=int(sys.argv[1])
    save_every=int(sys.argv[2])
    batch_size=int(sys.argv[3])
    world_size = (torch.cuda.device_count())-1
    print(world_size)
    mp.spawn(main, args=(world_size, save_every, total_epochs, batch_size), nprocs=world_size, join=True)
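
I launch the script like this (the file name and the numbers are placeholders; the three positional arguments are total_epochs, save_every and batch_size, as parsed above):

python train_ddp.py 10 1 8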

I load the model in this way:


pretrained_model = '/home/momenisa//GPT_2/'

model = GPTNeoForCausalLM.from_pretrained(pretrained_model)

tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model) #gpt2-small

model.resize_token_embeddings(len(tokenizer))

CHECKPOINT_PATH='/home//checkpoint.pt'
model.load_state_dict(torch.load(CHECKPOINT_PATH, map_location='cpu'), strict=False)
model.eval()
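
Since strict=False silently ignores any mismatched keys, the load can also be checked via the value that load_state_dict returns (a small sketch; if many keys appear here, the fine-tuned weights are not really being loaded):

# load_state_dict returns the missing/unexpected keys when strict=False
result = model.load_state_dict(torch.load(CHECKPOINT_PATH, map_location='cpu'), strict=False)
print("missing keys:", result.missing_keys)
print("unexpected keys:", result.unexpected_keys)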

and the results are: