Custom dataloader gpu not used correctly

Hi,
I am loading my data using the following piece of code, for the first few epochs, I am using 85% of my GPU’s VRAM and everything’s fine, but then it drops to maximum 10% and starts being sooo slow. Any idea why ?

def train_model(model, criterion, optimizer,num_epochs=25,lre=0.001):
nb_config = 5
liste_dir = sorted(glob.glob(’./dataset/train//’))
liste_images = []
for i in liste_dir:
images = sorted(glob.glob(str(i)+"
.png"))
liste_images.append(images)
x = []
resize = transforms.Resize((224,224))
for fname in sorted(liste_images):
x.append(np.array([np.array(resize(Image.open(fname2))) for fname2 in fname]))

for i,_ in  enumerate(x):
    x[i][0], x[i][-1] = x[i][-1], x[i][0]

inputs = np.expand_dims(x[0][0], axis=0)
for i in range(1,len(x)):
    inputs= np.concatenate((inputs,np.expand_dims(x[i][0], axis=0)),0)

#batch_5
batch_5 = []
for i in range(0,nb_config):
    batch = np.expand_dims(inputs[i],0)
    for a in range(0,4):
        batch = np.concatenate((batch,np.expand_dims(inputs[i],0)),0)
    batch = torch.from_numpy(batch)
    batch = batch.permute(0,3,1,2)
    batch = Variable(batch[:,:3,:,:].float().cuda())
    batch_5.append(batch)


#batch_4
batch_4 = []
for i in range(0,nb_config):
    batch = np.expand_dims(inputs[i],0)
    for a in range(0,3):
        batch = np.concatenate((batch,np.expand_dims(inputs[i],0)),0)
    batch = torch.from_numpy(batch)
    batch = batch.permute(0,3,1,2)
    batch = Variable(batch[:,:3,:,:].float().cuda())
    batch_4.append(batch)

#batch_3
batch_3 = []
for i in range(0,nb_config):
    batch = np.expand_dims(inputs[i],0)
    for a in range(0,2):
        batch = np.concatenate((batch,np.expand_dims(inputs[i],0)),0)
    batch = torch.from_numpy(batch)
    batch = batch.permute(0,3,1,2)
    batch = Variable(batch[:,:3,:,:].float().cuda())
    batch_3.append(batch)

#batch_2
batch_2 = []
for i in range(0,nb_config):
    batch = np.expand_dims(inputs[i],0)
    batch = np.concatenate((batch,np.expand_dims(inputs[i],0)),0)
    batch = torch.from_numpy(batch)
    batch = batch.permute(0,3,1,2)
    batch = Variable(batch[:,:3,:,:].float().cuda())
    batch_2.append(batch)



targets_arrays = []
for j in x: #directory
    target_direct = []
    for y in range(1,len(j),5):#images 
        if(y+5<=len(j)-1):#if we there is still more than 5 images in the folder we do a 5 images batch
            targets = np.expand_dims(np.expand_dims(np.array(resize(Image.fromarray(j[y]).convert('L')))/255,axis=0),0)
            for h in range(y+1,y+5):
                targets= np.concatenate((targets,
                                np.expand_dims(
                                    np.expand_dims(np.array(resize(Image.fromarray(j[h]).convert('L')))/255,axis=0),0)),0)

            targets = Variable(torch.from_numpy(targets).float().cuda())
            target_direct.append(targets)
        else:#we do a bach of the remaining size
            targets = np.expand_dims(np.expand_dims(np.array(resize(Image.fromarray(j[y]).convert('L')))/255,axis=0),0)
            for h in range(y+1,y+(len(j)-y)):
                targets= np.concatenate((targets,
                                np.expand_dims(
                                        np.expand_dims(np.array(resize(Image.fromarray(j[h]).convert('L')))/255,axis=0),0)),0)

            targets = Variable(torch.from_numpy(targets).float().cuda())
            target_direct.append(targets)
    targets_arrays.append(target_direct)


lost=[]
#input batch size depends on the size of the output,batch can be of size 5,4,3,2
for i in range(0,num_epochs):#epochs
    print("epochs : "+str(i))
    for dire,hm in enumerate(targets_arrays):#for each directories
        for ar in hm:#for each batch
            if(len(ar)==5):
                optimizer.zero_grad()
                outputs = model(batch_5[dire])#output  = [x,1,224,244] tensor 
                loss = criterion(outputs,ar)
                lost.append(loss)
                loss.backward()
                optimizer.step()
            elif(len(ar)==4):
                optimizer.zero_grad()
                outputs = model(batch_4[dire])#output  = [x,1,224,244] tensor 
                loss = criterion(outputs,ar)
                lost.append(loss)
                loss.backward()
                optimizer.step()
            elif(len(ar)==3):
                optimizer.zero_grad()
                outputs = model(batch_3[dire])#output  = [x,1,224,244] tensor 
                loss = criterion(outputs,ar)
                lost.append(loss)
                loss.backward()
                optimizer.step()                    
            elif(len(ar)==2):
                optimizer.zero_grad()
                outputs = model(batch_2[dire])#output  = [x,1,224,244] tensor 
                loss = criterion(outputs,ar)
                lost.append(loss)
                loss.backward()
                optimizer.step()                   
            else:
                print("fuck off")
    print(np.mean(lost).data.cpu())
    lost=[]
    if(i!=0 and i%50==0):
        torch.save(model.cpu(),"./model/model_epochs_"+str(i)+".ckpt")
        model.cuda()
        for param_group in optimizer.param_groups:
            param_group['lr'] = lre/4
return outputs,model

Are you holding a reference to outputs outside of this function?
Do you see an increase in GPU memory usage?
Here is a nice explanation of why accumulating the history of Variables is problematic.

I don’t hold any reference to outputs outside of the function.
What I would like is to use as much GPU memory as possible and not just only 10%. As I have access to a gtx 1080.
when my memory usage is around 90% my training goes quite fast, and when it drops it lslow down so much the process.

Thanks for the link I will take a look :slight_smile:

I suppose the GPU utilization is dropping from 90% to 10% or is it really the memory?

Could you elaborate a bit more about the batching code in front of the training?
I don’t really understand it, but maybe you could just just a Dataset and DataLoader to speed thing up?

to sum it up I create batchs of size 5,4,3 or 2 depending on the number of target values I can batch. For one input I need to forward it through the network, compare it to a bunch of targets and backpropagate the loss.
Instead of doing it for one target at a time I am doing it for 5 at a time ( 4,3,2 if there are not enough left to do a 5 target batch). So if I have 5 targets I use 5 inputs (seems to be the maximum i can use in my gpu memory.
It worked well as it sped up my training, but after some epochs my GPU memory utilization drops from 90% to 10% and then stays around this value, slowing the training a lot.

Sorry for the explanation which is a bit confusing.

What is the condition of the number of targets you can batch?
Sufficient data left for the batch_size?
If so, you could try the following code snippet and extend it to your use case:

class MyDataset(Dataset):
    def __init__(self, X, y):
        self.data = X
        self.target = y
        
    def __getitem__(self, index):
        x = self.data[index]
        y = self.target[index]
        
        return x, y
    
    def __len__(self):
        return len(self.data)


# Create fake data
X = torch.randn(100, 3, 16, 16)
y = torch.Tensor(100).random_(0, 10).long()

dataset = MyDataset(X, y)

loader = DataLoader(dataset,
                    batch_size=6,
                    num_workers=1,
                    shuffle=False,
                    drop_last=False)


for data, target in loader:
    # Your training procedure...
    print data.shape
    print target.shape

If you set batch_size=5, all batches will have 5 samples in them, since 100 is dividable by 5 without a remainder.
Keeping batch_size=6 will give you batches of size 6 apart from the last one, which will have 4 samples, if drop_last is set to False.
Try to play around with num_workers so that your DataLoader will use multi-processing.

Thanks for the snippets, I managed to have my gpu working full capacity by simply putting the

Variable(tensor.cuda()) , in my main loop. 1000 epochs per hours instead of 50 previously.

Nice you figured it out!

However, it seems you are accumulating the loss in lost in your training loop.
If you just need the value, try to use lost.append(lost.cpu().data[0]).

1 Like

Yeah I am accumulating it over the different data folders to compute its mean and display it for each epochs.
Are Cuda Variables discarded after the backward if I just use their data ?

As long as you don’t hold a reference to the Variable, PyTorch will take care of freeing the memory for your.
Note that nvidia-smi does not necessarily give the correct amount of used memory, since PyTorch uses a caching memory allocator.

ok so because i was using a reference to the loss it wasn’t freed before the next epochs gotcha.
Thanks for the help :slight_smile: