Multi-GPU Inference on PyTorch UNet Segmentation Model Not Using Two GPUs

I am using two Nvidia Quadro 1200 (4 GB) GPUs to run inference on an image of size 1024×1792 with a UNet segmentation model, using PyTorch's DataParallel. Although inference starts, it runs on only one GPU while the other stays idle. Since everything ends up on a single GPU, I get a CUDA OOM error every time. I cannot compromise on the image size because that would alter our objective. I will add the code below.

import torch
import cv2
import numpy as np
from model import build_unet
from torch.nn.parallel import DataParallel
import os
from tqdm import tqdm
torch.cuda.empty_cache()
checkpoint_path = "Weights/best_model.pth"

def mask_parse(mask):
    mask = np.expand_dims(mask, axis=-1)
    mask = np.concatenate([mask, mask, mask], axis=-1)
    return mask

def inference_with_dataparallel(model, cv_img, imname):
    image = cv_img
    imageCopy = image.copy()
    x = np.transpose(image, (2, 0, 1))
    x = x / 255.0
    x = np.expand_dims(x, axis=0)
    x = x.astype(np.float32)
    x = torch.from_numpy(x)
    x = x.to(device)
    
    with torch.no_grad():
        pred_y = model(x)
        pred_y = torch.sigmoid(pred_y)
    
        pred_y = pred_y[0].cpu().numpy()
        pred_y = np.squeeze(pred_y, axis=0)
        pred_y = pred_y > 0.1  # 0.15
        pred_y = np.array(pred_y, dtype=np.uint8)
    
        pred_y = mask_parse(pred_y)
        out = pred_y * 255
    
        imageCopy = cv2.resize(imageCopy, (mWidth, mHeight))
        out = cv2.resize(out, (mWidth, mHeight))
        finalOut = cv2.addWeighted(imageCopy, 0.6, out, 0.4, 0)
    
        cv2.imwrite('Output/' + imname, finalOut)

if __name__ == "__main__":
    model = build_unet()
    device = torch.device("cuda")  
    model = model.to(device)
    model = DataParallel(model)
    model.load_state_dict(torch.load(checkpoint_path, map_location="cuda"))
    dirListCSV = os.listdir('TestImages')

    for allCSV in tqdm(dirListCSV):
        imgName = 'TestImages/'+ allCSV
        cv_img = cv2.imread(imgName)
        mHeight, mWidth = cv_img.shape[0], cv_img.shape[1]
        cv_img = cv2.resize(cv_img, (1024, 1792))
        inference_with_dataparallel(model, cv_img, allCSV)
        print('Gpu0:', torch.cuda.max_memory_allocated(device=0))
        print('Gpu1:', torch.cuda.max_memory_allocated(device=1))
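
For reference, a quick sanity check that both GPUs are visible to PyTorch and picked up by the DataParallel wrapper (a minimal diagnostic sketch, separate from the script above):

import torch
from torch.nn.parallel import DataParallel
from model import build_unet  # same model definition as above

# how many devices PyTorch can see in this process
print("visible CUDA devices:", torch.cuda.device_count())
for i in range(torch.cuda.device_count()):
    print(i, torch.cuda.get_device_name(i))

# DataParallel records the devices it will replicate onto
model = DataParallel(build_unet().to("cuda"))
print("DataParallel device_ids:", model.device_ids)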

nn.DataParallel is deprecated and known to introduce overhead as well as imbalanced memory usage. Use DistributedDataParallel instead or reduce your batch size further.

Thanks @ptrblck for the reply.
I have already tried the DistributedDataParallel approach as well, and since I am processing only a single image at a time, I cannot reduce the batch size any further. As I said, I cannot reduce the image size either, as that would alter the objective. Below is the code I used with DistributedDataParallel:

import torch
import cv2
import numpy as np
from model import build_unet
import os
from tqdm import tqdm
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group


torch.cuda.empty_cache()
checkpoint_path = "Weights/Sealant_2897i.pth"
device_ids = [0, 1]

def ddp_setup(rank, world_size):
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "37814"
    init_process_group(backend="nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

def inference_with_dataparallel(model, cv_img, imname):
    # wrap the model for this rank; DDP expects a single device per process
    model = DDP(model, device_ids=[device])
    image = cv_img
    imageCopy = image.copy()
    x = np.transpose(image, (2, 0, 1))
    x = x / 255.0
    x = np.expand_dims(x, axis=0)
    x = x.astype(np.float32)
    x = torch.from_numpy(x)
    x = x.to(device)

    with torch.no_grad():
        pred_y = model(x)
        pred_y = torch.sigmoid(pred_y)

        pred_y = pred_y[0].cpu().numpy()
        pred_y = np.squeeze(pred_y, axis=0)
        pred_y = pred_y > 0.1  # 0.15
        pred_y = np.array(pred_y, dtype=np.uint8)
        mask = np.expand_dims(pred_y, axis=-1)
        mask = np.concatenate([mask, mask, mask], axis=-1)  

        out = mask * 255

        imageCopy = cv2.resize(imageCopy, (mWidth, mHeight))
        out = cv2.resize(out, (mWidth, mHeight))
        finalOut = cv2.addWeighted(imageCopy, 0.6, out, 0.4, 0)

        cv2.imwrite('Output/' + imname, finalOut)


def mainfunc(rank, world_size):
    global device, mHeight, mWidth
    device = torch.device(f"cuda:{device_ids[rank]}")  # one GPU per spawned process
    ddp_setup(rank, world_size)
    model = build_unet()
    model = model.to(device)
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    dirListCSV = os.listdir('TestImages')
    for allCSV in tqdm(dirListCSV):
        imgName = 'TestImages/' + allCSV
        cv_img = cv2.imread(imgName)
        mHeight, mWidth = cv_img.shape[0], cv_img.shape[1]
        cv_img = cv2.resize(cv_img, (1024, 1792))
        inference_with_dataparallel(model, cv_img, allCSV)
        print('Gpu0:', torch.cuda.max_memory_allocated(device=0))
        print('Gpu1:', torch.cuda.max_memory_allocated(device=1))
    destroy_process_group()  # clean up this rank's process group when done

if __name__ == "__main__":
    world_size = 2  # Number of GPUs to use
    mp.spawn(mainfunc, args=(world_size,), nprocs=world_size, join=True)

This also gives the same result… If the code needs any changes or modifications, please let me know.

If a single image is already causing the OOM issue, you would need to further reduce the memory usage, e.g. via checkpointing, amp, a smaller model etc. Data parallel approaches won’t help in this case.
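
For illustration, here is roughly what the amp suggestion could look like for this inference path (a minimal sketch, reusing build_unet and the checkpoint path from the question, and assuming the checkpoint keys match the bare model and that the UNet is numerically stable in float16):

import torch
from model import build_unet  # same model definition as in the question

device = torch.device("cuda")
model = build_unet().to(device).eval()
# assumes the checkpoint keys match the unwrapped model
model.load_state_dict(torch.load("Weights/best_model.pth", map_location=device))

# stand-in for the preprocessed 1792x1024 image tensor built in the question
x = torch.rand(1, 3, 1792, 1024, device=device)

with torch.no_grad():
    # autocast runs the convolutions in float16, roughly halving activation memory
    with torch.autocast(device_type="cuda", dtype=torch.float16):
        pred_y = torch.sigmoid(model(x))
    pred_y = pred_y.float()  # back to float32 before the NumPy post-processing

Whether the segmentation quality survives the reduced precision would need to be checked against the float32 output.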