How to fully use multiple GPUs when training multiple networks

Hi,

I am trying to train multiple neural networks on a machine with multiple GPUs. I have already used the DataParallel module to parallelize this process. My understanding of DataParallel is that it only parallelizes a single model across GPUs, so the models still have to be trained one after another. However, the GPUs are not fully utilized if I train these networks one by one.

I am wondering if there is any way to train multiple networks at the same time and fully use all of my GPUs. I thought about using torch.multiprocessing, but it seems pretty painful to use in my case.

Any suggestions would be really appreciated. Thanks!

Are these networks somehow connected to each other, e.g. model1 feeds its output to model2?
If so, could you post an explanation of your workflow?

If the networks are completely standalone models, you could run multiple scripts, specifying the GPU which should be used with: CUDA_VISIBLE_DEVICES=device_id python script.py, where device_id has to be set to the appropriate GPU id.
You could also set the device in your script with:

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
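
If you would rather launch everything from a single driver, something like this sketch should also work (assuming script.py is your training script and you want one run per GPU):

import os
import subprocess

num_gpus = 4  # set to the number of GPUs in your machine
processes = []
for device_id in range(num_gpus):
    env = os.environ.copy()
    env['CUDA_VISIBLE_DEVICES'] = str(device_id)  # pin this run to one GPU
    processes.append(subprocess.Popen(['python', 'script.py'], env=env))

for p in processes:
    p.wait()  # wait for all training runs to finish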

Thanks a lot for your response!

These networks are not connected to each other. They are completely standalone models. However, every once in a while, I need to manage these models somehow. Basically, I am trying to use different types of optimizers to train multiple neural networks with the same architecture, say, ResNet-50. After every 5 epochs, I need to observe the training accuracies for these models and then modify the optimizers, or even delete some of them if they perform too badly.

Hence, I need to train all of them and keep track of the performance as well as the weights of these networks so that I can continue the same process every 5 epochs. Based on what you are saying, should I specify the GPU for each of them in my script and then collect the performance after every 5 epochs? For example, if I have 50 such networks and 10 GPUs, I would need to loop through them 50 / 10 = 5 times to gather all the results?
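
Concretely, the loop I have in mind would look something like this (just a sketch; train_for_5_epochs is a hypothetical helper that trains one model on one GPU for 5 epochs and reports its accuracy):

import torch
import torch.multiprocessing as mp

def train_for_5_epochs(model_idx, gpu_id, results):
    # hypothetical worker: rebuild model/optimizer state for model_idx,
    # train 5 epochs on cuda:gpu_id, then report the accuracy back
    device = torch.device(f'cuda:{gpu_id}')
    # ... build the model, load its checkpoint, train 5 epochs on `device` ...
    results[model_idx] = 0.0  # placeholder for the measured accuracy

if __name__ == '__main__':
    mp.set_start_method('spawn')  # required when using CUDA in subprocesses
    num_models, num_gpus = 50, 10
    manager = mp.Manager()
    results = manager.dict()
    for start in range(0, num_models, num_gpus):  # 50 / 10 = 5 rounds
        procs = [mp.Process(target=train_for_5_epochs,
                            args=(start + i, i, results))
                 for i in range(num_gpus)]
        for p in procs:
            p.start()
        for p in procs:
            p.join()
    # `results` now maps model index -> accuracy; adjust or drop optimizers here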

However, this method seems a little dirty and hard to manage. I am wondering if there is a better method. Thanks again!

Hi @ptrblck, I came up with the code below for DDP training, which works. For the validation part, however, I want to validate on one GPU only. As I understand it, rank 0 is the main GPU by default. Does the code look correct to you? Many thanks for your help.

#!/usr/bin/env python
# coding: utf-8

# In[1]:


from torch.utils.data import Dataset, DataLoader, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group
from transformers import TextDataset, DataCollatorForLanguageModeling
#from transformers import AutoModelWithLMHead
from transformers import AutoModelForCausalLM
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import numpy as np
import pandas as pd
import random
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import gc
import math
import os
import sys
import time
import datetime
## the directory that includes the Apex package from NVIDIA
#sys.path.append('/home/momenisa/GPU_ZIP_Apex/apex-master/apex/')
#from apex import amp
#from apex.parallel import DistributedDataParallel as DDP
######################
weight_decay=0
learning_rate=5e-5
adam_epsilon=1e-8
warmup_steps = 1e2
lr=5e-5
Max_length=400

PathData='/home//NLP_Projects/CaseSummary_resolutionProject/Results_GPT_2/model_v4200_k_bs=16_lr=5e-05_epochs=20/'
pretrained_model='/home///GPT_2/'

########################################
def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed)))))

################################################
class GPT2Dataset(Dataset):

    def __init__(self, txt_list, tokenizer, gpt2_type=pretrained_model, max_length=400):

        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []

        for txt in txt_list:

            encodings_dict = tokenizer('<|startoftext|>'+ txt + '<|endoftext|>', truncation=True, max_length=max_length, padding="max_length")

            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx] 

######################################################
def ddp_setup(rank, world_size):
    """
    Args:
        rank: Unique identifier of each process
        world_size: Total number of processes
    """
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "12355"
    init_process_group(backend="nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)
#########################################################

def main(rank: int, world_size: int, save_every: int, total_epochs: int, batch_size: int):
    
    gpu_id=rank
    
    Path='/home//NLP_Projects/CaseSummary_resolutionProject/Results_GPT_2/multipleGPU/model_v4'\
    'data_'+str(200)+'_k'+'_'+'bs='+str(batch_size)+'_lr='+str(learning_rate)+'_epochs='+str(total_epochs)
    
    print(Path)
    
    Results_Path = Path + '/Results/'
    os.makedirs(Results_Path, exist_ok=True)

    print(Results_Path)
        
    print(PathData)
    
    print(rank)

    ### set the random seeds ###############
    seed_val = 42
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    ###############################
    
    ddp_setup(rank, world_size)
    
    ###############################
    
    tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model, bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>') #gpt2-small

    model = GPT2LMHeadModel.from_pretrained(pretrained_model)

    model.resize_token_embeddings(len(tokenizer))

    ## load the train and validation datasets
    print(PathData)
    trains_titles=pd.read_csv(PathData+'/'+'traindata.csv')
    valid_titles=pd.read_csv(PathData+'/'+'validdata.csv')
    
    trains_titles=trains_titles.drop(columns=['Unnamed: 0'])['0']
    valid_titles=valid_titles.drop(columns=['Unnamed: 0'])['0']

    print(trains_titles.head(2))
    
    train_dataset = GPT2Dataset(trains_titles, tokenizer, max_length=Max_length)

    Val_dataset = GPT2Dataset(valid_titles, tokenizer, max_length=Max_length)
    
    ############################################################################
    
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                       batch_size=batch_size,
                                                        pin_memory=True,
                                                        shuffle=False,
                                                       sampler=DistributedSampler(train_dataset))
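    # shuffle=False above because the DistributedSampler itself shuffles and shards the data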

    # For validation the order doesn't matter, so we'll just read them sequentially.
    validation_loader = DataLoader(
            Val_dataset, # The validation samples.
            sampler = SequentialSampler(Val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )
    
    
    total_steps = len(train_loader) * total_epochs


    ################# define optimizer and scheduler#########################

    # Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
    optimizer = AdamW(model.parameters(), lr = learning_rate,eps = adam_epsilon)


    # Create the learning rate scheduler.
    # This changes the learning rate as the training loop progresses
    scheduler = get_linear_schedule_with_warmup(optimizer, 
                                                num_warmup_steps = warmup_steps, 
                                                num_training_steps = total_steps)

    ############################## train_loader and validation_loader ######################
 
    training_steps_per_epoch=len(train_loader)
    total_num_training_steps = int(training_steps_per_epoch*total_epochs)

  ######################## applying DDP on the model for training ############################
    model=model.to(gpu_id)
    model = DDP(model, device_ids=[gpu_id])
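    # DDP synchronizes gradients across all processes automatically during backward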
    print("gpu_id",gpu_id)
    # ========================================
    #               Training
    # ========================================


        
    training_stats = []
           

    for epoch_i in range(0, total_epochs):
        
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, total_epochs))
        print('Training...')

        ##########################################
        train_loader.sampler.set_epoch(epoch_i)  # make the DistributedSampler reshuffle differently each epoch
        b_sz = len(next(iter(train_loader))[0])
        print(f"[GPU{gpu_id}] Epoch {epoch_i} | Batchsize: {b_sz} | Steps: {len(train_loader)}")
        ##########################################

        t0 = time.time()

        total_train_loss = 0

        model.train()

        for step, batch in enumerate(train_loader):

            #################################
            b_input_ids = batch[0].to(gpu_id,non_blocking=True)
            b_labels = batch[0].to(gpu_id,non_blocking=True)
            b_masks = batch[1].to(gpu_id,non_blocking=True)
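            # for causal LM the labels are just the input ids; the model shifts them internally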
            #################################

            optimizer.zero_grad()        

            outputs = model(  b_input_ids,
                             labels=b_labels, 
                              attention_mask = b_masks,
                              token_type_ids=None
                            )

            loss = outputs[0]  
            batch_loss = loss.item()
            total_train_loss += batch_loss
            loss.backward()
            optimizer.step()
            scheduler.step()

        # Calculate the average loss over all of the batches.
        avg_train_loss = total_train_loss / len(train_loader)  

        del total_train_loss
        del batch_loss
        
        # Measure how long this epoch took.
        training_time = format_time(time.time() - t0)
        # ========================================
        #               Validation
        # ========================================

        print("")
        print("Running Validation...")

        avg_val_loss_1=[]
        t0 = time.time()
        #################### is this section correct for validation? #############
        if gpu_id == 0:
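            # (my understanding: only rank 0 runs validation here, while the
            #  other ranks fall through and may start the next epoch early;
            #  perhaps a dist.barrier() after this block would keep them in sync?)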

            model.eval()
            ########################################
            total_eval_loss = 0
            nb_eval_steps = 0

            # Evaluate data for one epoch
            for batch in validation_loader:

                b_input_ids = batch[0].to(gpu_id)
                b_labels = batch[0].to(gpu_id)
                b_masks = batch[1].to(gpu_id)

                with torch.no_grad():        
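                    # model.module bypasses the DDP wrapper, so this forward pass
                    # does not trigger any cross-process synchronization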
                    outputs  = model.module(b_input_ids,attention_mask = b_masks,labels=b_labels)
                    loss = outputs[0]  
                batch_loss = loss.item()
                total_eval_loss += batch_loss        

            avg_val_loss = total_eval_loss / len(validation_loader)

            perplexity=math.exp(avg_val_loss)

            avg_val_loss_1.append(avg_val_loss)

            validation_time = format_time(time.time() - t0)    

            del total_eval_loss 


            print("  Validation Loss: {0:.2f}".format(avg_val_loss))
            print("  Validation took: {:}".format(validation_time))

            # Record all statistics from this epoch.
            training_stats.append(
                {
                    'epoch': epoch_i + 1,
                    'Training Loss': avg_train_loss,
                    'Valid. Loss': avg_val_loss,
                    'Training Time': training_time,
                    'Validation Time': validation_time,
                    'perplexity': perplexity
                }
            )
            gc.collect()
        
        ################### saving the model ########################

            Path2 = Results_Path + '/' + 'savemodel_epoch=' + str(epoch_i)

            os.makedirs(Path2, exist_ok=True)

            ckp = model.module.state_dict()  # unwrap DDP so the checkpoint has no 'module.' prefix
            torch.save(ckp, Path2 + '/checkpoint.pt')
    ############ save the results #####################
    if gpu_id == 0:
        Path_2 = Results_Path + '/' + 'training_stats=' + str(0) + ".csv"
        torch.save(training_stats, Path_2)

    #### is this a good place to put the destroy_process_group()? ###########
    destroy_process_group()
    #############################
if __name__ == '__main__':
    total_epochs = int(sys.argv[1])
    save_every = int(sys.argv[2])
    batch_size = int(sys.argv[3])
    world_size = torch.cuda.device_count()  # one process per GPU
    print(world_size)
    # mp.spawn passes the process rank as the first argument to main
    mp.spawn(main, args=(world_size, save_every, total_epochs, batch_size), nprocs=world_size, join=True)