How do I copy data to GPU in parallel?

How about writing your own data loader in a separate thread? It can run in parallel with your training loop.

The data loading can look like this (I wrote the caching into RAM, but I think it should work for parallel caching into GPU memory as well):
(cache_size is the number of images or data items to be cached in parallel)

import os
import queue
import time
from threading import Thread


trfqueue = queue.Queue(maxsize=20)   # cached training items
tsfqueue = queue.Queue(maxsize=20)   # cached test items

cache_size = 10   # number of items kept ready in each queue
flist = []        # filenames of the cached training items
trfcount = 0
tsfcount = 0

def LoadinThread(dirPath1, runtimeLength):

    fcount = 0
    global n_trainingFiles
    global n_testFiles
    n_testFiles = 0
    n_trainingFiles = 0

    trfqueue.queue.clear()
    tsfqueue.queue.clear()

    filelist = os.listdir(dirPath1)
    for file in filelist:

        # only .jpg files; cap at 12 files unless runtimeLength == 2 (full run)
        if file[-3:] == 'jpg' and (fcount < 12 or runtimeLength == 2):
            fcount = fcount + 1

            if dirPath1[-5:] == "Test/":
                # GetData() is your own function that loads one image and its annotation
                jpg, anno = GetData(file)
                # HERE you can copy the data to GPU or keep it in RAM
                # (see the sketch after this code block)
                templist = [jpg, anno, file]

                n_testFiles = int(len(filelist) / 2)
                # wait while the cache is full
                while tsfqueue.qsize() > cache_size:
                    time.sleep(1)
                tsfqueue.put(templist)

            else:
                jpg, anno = GetData(file)
                # HERE you can copy the data to GPU or keep it in RAM
                # (see the sketch after this code block)
                flist.append(file)
                templist = [jpg, anno, file]
                n_trainingFiles = int(len(filelist) / 2)
                # wait while the cache is full
                while trfqueue.qsize() > cache_size:
                    time.sleep(2)
                trfqueue.put(templist)

    print("thread stopped !!")
 


def StartDataCaching(dirpath1, runtimeLength):
    thread1 = Thread(target=LoadinThread, args=(dirpath1, runtimeLength,))
    thread1.start()
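
If you want the loader thread to put the data on the GPU directly, the places marked "HERE" above are where you would do it. Below is a minimal sketch assuming PyTorch and a single CUDA device; to_gpu is just an illustrative helper name, while pin_memory() and non_blocking=True are the standard PyTorch way to overlap the host-to-device copy with computation:

import torch

def to_gpu(jpg, anno):
    # pinned (page-locked) host memory is required for asynchronous copies
    jpg = torch.as_tensor(jpg).pin_memory()
    anno = torch.as_tensor(anno).pin_memory()
    # non_blocking=True starts the copy without blocking the loader thread
    jpg = jpg.to('cuda', non_blocking=True)
    anno = anno.to('cuda', non_blocking=True)
    return jpg, anno

With this, templist would hold GPU tensors and the training loop could use them without another copy.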
 

The training loop can look like this:

def trainmodel(model, BATCH_SIZE, lr_base, lr_max, runtimeLength):
    # dirpath, criterion and optim are assumed to be defined elsewhere
    StartDataCaching(dirpath, runtimeLength)
    lr = lr_base
    fcount = 0
    running_loss = 0.0
    current_loss = 0.0

    # wait until the loader thread has cached a few items
    while trfqueue.qsize() < 3 or n_trainingFiles == 0:
        time.sleep(2)

    lr_incr = (lr_max - lr) / n_trainingFiles

    outputlist = []

    for t in range(0, int(n_trainingFiles)):

        # wait if training is faster than the loader thread
        while trfqueue.empty():
            time.sleep(1)

        dataitem = trfqueue.get()
        inputs_reshaped, labels = dataitem[0], dataitem[1]
        filename = dataitem[2]
        fcount = fcount + 1

        SZ, _, _, _ = inputs_reshaped.shape

        for i in range(0, SZ, BATCH_SIZE):
            # the optimizer is re-created so the updated lr takes effect
            optimizer = optim.Adam(model.parameters(), lr=lr)
            inputs1, labels1 = inputs_reshaped[i:i + BATCH_SIZE], labels[i:i + BATCH_SIZE]
            optimizer.zero_grad()

            loss = criterion(model(inputs1), labels1)
            loss.backward()
            optimizer.step()
            lr = lr + lr_incr
            current_loss += loss.item()

        outputlist.append([lr, current_loss, filename])

        print(filename + " - TRN .. " + repr(fcount) + "/" + repr(n_trainingFiles)
              + " loss: " + repr(round(current_loss, 2)))
        running_loss += current_loss
        current_loss = 0.0

    lr = lr_base

    print('Current Training: avg loss: %.3f' % (running_loss / fcount))
    running_loss = 0.0
    return outputlist
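
A call could then look something like this (the values are just examples; model, GetData, dirpath, criterion and optim must already exist in your own code):

import torch.nn as nn
import torch.optim as optim

criterion = nn.MSELoss()    # whatever loss fits your task
dirpath = "Train/"          # example path, must match your directory layout

outputs = trainmodel(model, BATCH_SIZE=4, lr_base=1e-5, lr_max=1e-3, runtimeLength=1)

With runtimeLength set to anything other than 2, the loader only caches the first 12 files, which is handy for a quick test.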

However, keep in mind that if your GPU memory is 8 GB and your batch size already fills it, then you cannot additionally cache another 8 GB on the GPU; it will throw an out-of-memory error.
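
If you are not sure how much headroom you have, PyTorch's CUDA memory counters give a rough picture before you decide whether to cache on the GPU or in RAM (a sketch; the threshold is just an example, and memory used by other processes on the GPU is not accounted for):

import torch

total = torch.cuda.get_device_properties(0).total_memory
reserved = torch.cuda.memory_reserved(0)    # held by PyTorch's caching allocator
allocated = torch.cuda.memory_allocated(0)  # actually used by tensors
print("total %.1f GB | reserved %.1f GB | allocated %.1f GB"
      % (total / 1e9, reserved / 1e9, allocated / 1e9))

# cache on the GPU only if there is clearly enough room left
cache_on_gpu = (total - reserved) > 2e9   # e.g. require about 2 GB of headroom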
