CUDA memory error for no good reason

Why do I get a CUDA out-of-memory error, as shown below, even though I have a sufficient amount of GPU memory available?
predict_on_slides_set( fnames, num_workers,aug,test_dir)

<ipython-input-58-d10dc427c7b6> in predict_on_slide(fn, aug)
     83             for model in models:
     84                 model.eval()
---> 85                 model.cuda()
     86 
     87                 #print(len(aug))

~/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in cuda(self, device)
    302             Module: self
    303         """
--> 304         return self._apply(lambda t: t.cuda(device))
    305 
    306     def cpu(self):

~/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in _apply(self, fn)
    199     def _apply(self, fn):
    200         for module in self.children():
--> 201             module._apply(fn)
    202 
    203         def compute_should_use_set_data(tensor, tensor_applied):

~/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in _apply(self, fn)
    199     def _apply(self, fn):
    200         for module in self.children():
--> 201             module._apply(fn)
    202 
    203         def compute_should_use_set_data(tensor, tensor_applied):

~/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in _apply(self, fn)
    221                 # `with torch.no_grad():`
    222                 with torch.no_grad():
--> 223                     param_applied = fn(param)
    224                 should_use_set_data = compute_should_use_set_data(param, param_applied)
    225                 if should_use_set_data:

~/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py in <lambda>(t)
    302             Module: self
    303         """
--> 304         return self._apply(lambda t: t.cuda(device))
    305 
    306     def cpu(self):

RuntimeError: CUDA error: out of memory
def _target_tile_count(hw_product, n_crops):
    """Pick how many tiles to request from ``select_best_images``.

    The baseline depends on the slide area ``hw_product`` (36, 49 or 81).
    When more crops than the baseline are available, the count becomes
    ``nearest_square(n_crops)`` instead.
    """
    if hw_product < 5000 * 5000:
        baseline = 36
    elif hw_product < 9000 * 9000:
        # Also covers hw_product == 5000*5000, which used to fall through to
        # the largest tier because both comparisons were strict.
        baseline = 49
    else:
        baseline = 81
    return nearest_square(n_crops) if n_crops > baseline else baseline


def _assemble_mosaic(tiles, grid):
    """Paste ``tiles`` row by row into a ``grid`` x ``grid`` RGB image.

    Each cell is ``sz`` x ``sz`` pixels (``sz`` is a module-level setting).
    """
    mosaic = PIL.Image.new('RGB', (sz * grid, sz * grid))
    for idx, tile in enumerate(tiles):
        row, col = divmod(idx, grid)
        # PIL's paste takes (left, upper), i.e. (column offset, row offset).
        mosaic.paste(PIL.Image.fromarray(tile), (col * sz, row * sz))
    return mosaic


def predict_on_slide(fn, aug):
    """Predict on a single slide by tiling it into one square mosaic image.

    Args:
        fn: Slide path without extension; ``fn + '.tiff'`` is opened with
            openslide and ``fn`` itself is handed to ``read_image``.
        aug: Forwarded unchanged to ``pred_aug``.

    Returns:
        The mean over all entries of the module-level ``models`` of the
        ``pred_aug`` output, moved to the CPU.
    """
    with torch.no_grad():
        image = read_image(fn)
        coords = compute(image)

        slide = openslide.OpenSlide(fn + '.tiff')
        crops, hw_product = split_image_coord(slide, coords)

        sel_n = _target_tile_count(hw_product, len(crops))
        non_empty_crops = select_best_images(crops, sel_n)

        # The mosaic is square, so the tile count is snapped to a square number.
        cnt = nearest_square(len(non_empty_crops))
        Ng = int(math.sqrt(cnt))
        print(Ng, cnt)

        tiles = [non_empty_crops[i] for i in range(cnt)]
        im_out = _assemble_mosaic(tiles, Ng)
        im_out = final_aug(image=np.asarray(im_out))['image']  # albumentations

        img = pil2tensor(PIL.Image.fromarray(im_out), np.float32).div_(255)
        img = normalize_transform(img)

        preds = []
        for model in models:
            model.eval()
            # NOTE(review): .cuda() without an argument targets the current
            # default CUDA device. If that GPU is already full this raises
            # "CUDA error: out of memory"; select the device explicitly
            # (e.g. model.to(device)) when several GPUs are present.
            model.cuda()
            preds.append(pred_aug(img.unsqueeze(0), model, aug).detach())

        return torch.stack(preds).mean(0).cpu()
print('x')
from concurrent.futures import ThreadPoolExecutor

def predict_on_slides_set(fnames, num_workers, aug, test_dir):
    """Run ``predict_on_slide`` on every file in ``fnames`` and collect results.

    Args:
        fnames: Sequence of slide file names, relative to ``test_dir``.
        num_workers: Currently unused; the pool is fixed at one worker.
        aug: Forwarded to ``predict_on_slide`` as its ``aug`` argument.
        test_dir: Directory joined in front of each file name.

    Returns:
        A list with one entry per file name, in input order. Each entry is
        the count of prediction values above 0.5 along the last axis, minus
        one. An empty ``fnames`` yields an empty list.
    """
    def process_file(filename):
        y_pred = predict_on_slide(os.path.join(test_dir, filename), aug=aug)
        return (y_pred > 0.5).sum(-1) - 1

    if len(fnames) == 0:
        return []

    with ThreadPoolExecutor(max_workers=1) as ex:
        print(os.path.join(test_dir, fnames[0]))
        # Executor.map returns a one-shot iterator. Materialise it exactly
        # once: calling list() on it a second time (once for printing, once
        # for returning) yields an empty list.
        predictions = list(ex.map(process_file, fnames))
        print('pred', predictions)
    return predictions
print('x')

Try reducing the batch size; maybe your model is occupying a good amount of space.
The `nvidia-smi` command can be used to keep an eye on the usable CUDA memory.

My batch size is just 1.
The image tensor size is also at most 3 x 1024 x 1024.

I don't get this issue with the same model when I use it for inference through a DataLoader. Since a DataLoader requires all images in a batch to have the same size, I thought of building this pipeline instead.

Also, when I initiate the model and call model.cuda() there, I don't get this error.

Looks like I had to use `.to(device)`; otherwise it was picking a GPU that was already full.

I was also thinking the same thing :grin:

Could you help me with this last part?
I do get a non-empty tensor in the print statement, but when returning, it says the list is empty.
Why would that be?

with ThreadPoolExecutor(max_workers=2) as ex:
        print(os.path.join(test_dir, fnames[0]))
        predictions =  ex.map(process_file, range(len(fnames)))
        print('pred',torch.cat(list(predictions)))
    return torch.cat(list(predictions))

I'm not very familiar with ThreadPoolExecutor; I will try to replicate this part.