Hi,
I am loading my data using the following piece of code. For the first few epochs I am using 85% of my GPU's VRAM and everything is fine, but then utilization drops to a maximum of 10% and training becomes extremely slow. Any idea why?
def _gray_frame(resize, frame):
    """Resize one frame, convert it to single-channel grayscale, scale to [0, 1]."""
    return np.array(resize(Image.fromarray(frame).convert('L'))) / 255


def _make_input_batch(inputs, idx, batch_size):
    """Repeat frame ``inputs[idx]`` ``batch_size`` times and move it to the GPU.

    Returns a ``(batch_size, 3, H, W)`` float CUDA tensor: NHWC -> NCHW, and
    any alpha channel is dropped (``[:, :3]``), exactly as the original code did.
    (The deprecated ``Variable`` wrapper is gone — it is a no-op on modern torch.)
    """
    batch = np.repeat(np.expand_dims(inputs[idx], 0), batch_size, axis=0)
    batch = torch.from_numpy(batch).permute(0, 3, 1, 2)
    return batch[:, :3, :, :].float().cuda()


def _make_target_batch(resize, frames, start, count):
    """Stack ``count`` grayscale target frames beginning at ``frames[start]``
    into a ``(count, 1, H, W)`` float CUDA tensor."""
    stacked = np.stack(
        [np.expand_dims(_gray_frame(resize, frames[h]), 0)
         for h in range(start, start + count)],
        axis=0,
    )
    return torch.from_numpy(stacked).float().cuda()


def train_model(model, criterion, optimizer, num_epochs=25, lre=0.001):
    """Train ``model`` on the image sequences under ``./dataset/train/``.

    Each sub-directory is one "configuration": its first frame (after the
    first/last swap) is the network input, and the remaining frames are
    grouped into grayscale target batches of up to 5.

    Args:
        model:      network on the GPU; called as ``model(batch)`` with a
                    ``(B, 3, 224, 224)`` input.
        criterion:  loss called as ``criterion(outputs, targets)``.
        optimizer:  torch optimizer over ``model``'s parameters.
        num_epochs: number of passes over all directories.
        lre:        base learning rate; every 50 epochs the lr is RESET to
                    ``lre / 4`` (not decayed cumulatively — original behavior).

    Returns:
        ``(outputs, model)`` — the last forward output (``None`` if no batch
        was ever run) and the trained model.
    """
    nb_config = 5
    resize = transforms.Resize((224, 224))

    # NOTE(review): the posted code read './dataset/train//' and '.png' — the
    # markdown formatting ate the '*' wildcards; restored here. Verify paths.
    liste_dir = sorted(glob.glob('./dataset/train/*/'))
    liste_images = [sorted(glob.glob(str(d) + "*.png")) for d in liste_dir]

    # x[k]: (n_frames, H, W, C) uint8 array of the resized frames of dir k.
    x = [np.array([np.array(resize(Image.open(f))) for f in files])
         for files in sorted(liste_images)]

    # Swap first and last frame of each sequence.
    # BUG FIX: the original tuple swap `x[k][0], x[k][-1] = x[k][-1], x[k][0]`
    # aliases numpy views — the second assignment reads the row just
    # overwritten, leaving BOTH rows equal to the old last frame. Fancy
    # indexing copies, so this is a true swap.
    for seq in x:
        seq[[0, -1]] = seq[[-1, 0]]

    # First frame of every directory -> (n_dirs, H, W, C).
    inputs = np.stack([seq[0] for seq in x], axis=0)

    # Pre-built input batches for every batch size the targets can have.
    # Replaces four copy-pasted batch_5/batch_4/batch_3/batch_2 loops.
    batches = {size: [_make_input_batch(inputs, i, size) for i in range(nb_config)]
               for size in (5, 4, 3, 2)}

    # Targets: frames [1:] of each directory, chunked in groups of 5 with a
    # shorter final chunk (same boundary condition as the original code).
    targets_arrays = []
    for frames in x:
        per_dir = []
        for y in range(1, len(frames), 5):
            count = 5 if y + 5 <= len(frames) - 1 else len(frames) - y
            per_dir.append(_make_target_batch(resize, frames, y, count))
        targets_arrays.append(per_dir)

    lost = []
    outputs = None  # returned even if no batch matched
    for epoch in range(num_epochs):
        print("epochs : " + str(epoch))
        for dire, dir_targets in enumerate(targets_arrays):
            for ar in dir_targets:
                size = len(ar)  # leading dim of the (size, 1, 224, 224) target
                if size not in batches:
                    print("fuck off")  # original's warning for unexpected sizes
                    continue
                optimizer.zero_grad()
                outputs = model(batches[size][dire])  # (size, 1, 224, 224)
                loss = criterion(outputs, ar)
                # BUG FIX (the reported slowdown): append the Python scalar,
                # not the loss tensor. Keeping the tensor retains every
                # iteration's autograd graph and its GPU activations, so VRAM
                # fills up and training grinds to a crawl after a few epochs.
                lost.append(loss.item())
                loss.backward()
                optimizer.step()
        print(np.mean(lost))  # lost now holds floats, no .data.cpu() needed
        lost = []
        if epoch != 0 and epoch % 50 == 0:
            torch.save(model.cpu(), "./model/model_epochs_" + str(epoch) + ".ckpt")
            model.cuda()  # torch.save moved it to CPU; move it back
            for param_group in optimizer.param_groups:
                param_group['lr'] = lre / 4
    return outputs, model