Run inference on multiple GPUs

Working on Ubuntu 20.04 with Python 3.9, PyTorch 1.12.0, and NVIDIA GPUs.

I trained an encoder and I want to use it to encode each image in my dataset. Because my dataset is huge, I’d like to leverage multiple GPUs to do this.

Below is a snippet of the code I use.

Inference code snippet
import os
import sys
import tqdm
import wandb
import torch
import hydra
import shutil
import datetime
import pandas as pd
import torch.nn as nn
import torch.backends.cudnn as cudnn

from pathlib import Path
from omegaconf import DictConfig

from source.dataset import RegionFilepathsDataset
from source.utils import initialize_df, is_main_process


@hydra.main(
    version_base="1.2.0", config_path="config/feature_extraction", config_name="default"
)
def main(cfg: DictConfig):

    distributed = torch.cuda.device_count() > 1
    if distributed:
        torch.distributed.init_process_group(backend="nccl")
        gpu_id = int(os.environ["LOCAL_RANK"])
        if gpu_id == 0:
            print(f"Distributed session successfully initialized")
    else:
        gpu_id = -1

    if is_main_process():
        print(f"torch.cuda.device_count(): {torch.cuda.device_count()}")
        run_id = datetime.datetime.now().strftime("%Y-%m-%d_%H_%M")
        output_dir = Path(cfg.output_dir, cfg.experiment_name, run_id)
        if not cfg.resume:
            if output_dir.exists():
                print(f"{output_dir} already exists! deleting it...")
                shutil.rmtree(output_dir)
                print("done")
            # recreate a clean output directory
            output_dir.mkdir(parents=True, exist_ok=True)

    cudnn.benchmark = True

    # preparing data
    df = pd.read_csv(cfg.tiles_csv)
    dataset = RegionFilepathsDataset(df)

    if distributed:
        sampler = torch.utils.data.DistributedSampler(dataset, shuffle=True)
    else:
        sampler = torch.utils.data.RandomSampler(dataset)

    loader = torch.utils.data.DataLoader(
        dataset,
        sampler=sampler,
        batch_size=cfg.batch_size_per_gpu,
        num_workers=cfg.num_workers,
        pin_memory=True,
        shuffle=False,
    )

    model = ...
  
    # move network to gpu
    if distributed:
        model = model.to(gpu_id)
        model = nn.parallel.DistributedDataParallel(model, device_ids=[gpu_id], output_device=gpu_id)
    else:
        model = model.cuda()

    with tqdm.tqdm(
        loader,
        desc="Image Encoding",
        unit=" img",
        unit_scale=cfg.batch_size_per_gpu,
        ncols=80,
        position=0,
        leave=True,
        disable=gpu_id not in (-1, 0),
    ) as t:

        with torch.no_grad():

            for batch in t:

                idx, img = batch
                if gpu_id == -1:
                    img = img.cuda(non_blocking=True)
                else:
                    device = torch.device(f"cuda:{gpu_id}")
                    img = img.to(device, non_blocking=True)
                feature = model(img)
                (...)

if __name__ == "__main__":

    main()

I kick off the script via:

python3 -m torch.distributed.run --standalone --nproc_per_node=gpu main.py
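
(For what it’s worth, on recent PyTorch versions this is equivalent to torchrun --standalone --nproc_per_node=gpu main.py; the gpu value tells the launcher to start one process per visible GPU.)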

Running the previous command results in the following error:

RuntimeError: DistributedDataParallel is not needed when a module doesn't have any parameter that requires a gradient.

It seems one cannot use DDP to run inference. Hence my question boils down to: what’s the easiest way to run inference using multiple GPUs?
Looking for help!

NB: I’ve also tried creating as many subsets of my dataset as there are GPUs available and spawning one process per GPU using multiprocessing.Pool and starmap, but ran into another error.
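
For reference, here is roughly the pattern I was aiming for: a minimal sketch using torch.multiprocessing.spawn instead of multiprocessing.Pool (fork-based pools cannot re-initialize CUDA in child processes), with a dummy encoder and dataset standing in for mine.

Per-GPU spawning sketch
import torch
import torch.multiprocessing as mp
from torch.utils.data import DataLoader, Subset, TensorDataset


def encode_shard(rank, world_size, dataset, batch_size):
    # each spawned worker pins itself to one GPU and encodes its slice of the data
    device = torch.device(f"cuda:{rank}")
    model = torch.nn.Identity().to(device).eval()  # stand-in for the trained encoder
    shard = Subset(dataset, list(range(rank, len(dataset), world_size)))
    loader = DataLoader(shard, batch_size=batch_size, pin_memory=True)
    with torch.no_grad():
        for (img,) in loader:
            feature = model(img.to(device, non_blocking=True))
            # ... write this rank's features to disk ...


if __name__ == "__main__":
    world_size = torch.cuda.device_count()
    dataset = TensorDataset(torch.randn(64, 3, 224, 224))  # dummy data
    # "spawn" matters here: CUDA cannot be re-initialized in forked workers,
    # and fork is the default start method multiprocessing.Pool uses on Linux
    mp.spawn(encode_shard, args=(world_size, dataset, 8), nprocs=world_size)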

Why do you need to wrap your model with DDP when you are running inference? DDP is used to synchronize gradients (and buffers such as normalization statistics) across processes during training, which wouldn’t be needed during inference.

You would likely only need something like torch.utils.data.distributed.DistributedSampler to split the dataset across the GPUs, but DDP should not be needed.
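
Something along these lines should be all you need. The sampler shards the indices across ranks, so each process encodes a distinct slice of the dataset, and the model stays a plain module. A minimal sketch, where dataset, model, and local_rank are whatever you already have:

# each rank draws a distinct subset of indices from the same dataset
sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=False)
loader = torch.utils.data.DataLoader(dataset, sampler=sampler, batch_size=64)

model = model.to(local_rank).eval()  # no DDP wrapper needed for inference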


Indeed, I didn’t quite get what DDP should be used for.
I managed to get things working using torch.utils.data.distributed.DistributedSampler and sending inputs to the right GPUs. I’ve added a snippet of the working code below, in case someone is interested.

Distributed inference code snippet
import os
import sys
import tqdm
import wandb
import torch
import hydra
import shutil
import datetime
import pandas as pd
import torch.nn as nn
import torch.backends.cudnn as cudnn

from pathlib import Path
from omegaconf import DictConfig

from source.dataset import RegionFilepathsDataset
from source.utils import initialize_df, is_main_process


@hydra.main(
    version_base="1.2.0", config_path="config/feature_extraction", config_name="default"
)
def main(cfg: DictConfig):

    distributed = torch.cuda.device_count() > 1
    if distributed:
        torch.distributed.init_process_group(backend="nccl")
        gpu_id = int(os.environ["LOCAL_RANK"])
        if gpu_id == 0:
            print(f"Distributed session successfully initialized")
    else:
        gpu_id = -1

    if is_main_process():
        print(f"torch.cuda.device_count(): {torch.cuda.device_count()}")

    cudnn.benchmark = True

    # preparing data
    df = pd.read_csv(cfg.tiles_csv)
    dataset = RegionFilepathsDataset(df)

    if distributed:
        sampler = torch.utils.data.DistributedSampler(dataset, shuffle=True)
    else:
        sampler = torch.utils.data.RandomSampler(dataset)

    loader = torch.utils.data.DataLoader(
        dataset,
        sampler=sampler,
        batch_size=cfg.batch_size_per_gpu,
        num_workers=cfg.num_workers,
        pin_memory=True,
        shuffle=False,
    )

    model = ...
  
    # move network to gpu
    if distributed:
        model = model.to(gpu_id)
    else:
        model = model.cuda()

    with tqdm.tqdm(
        loader,
        desc="Image Encoding",
        unit=" img",
        unit_scale=cfg.batch_size_per_gpu,
        ncols=80,
        position=0,
        leave=True,
        disable=gpu_id not in (-1, 0),
    ) as t:

        with torch.no_grad():

            for batch in t:

                idx, img = batch
                if gpu_id == -1:
                    device = torch.device("cuda")
                else:
                    device = torch.device(f"cuda:{gpu_id}")
                img = img.to(device, non_blocking=True)
                feature = model(img)
                (...)

if __name__ == "__main__":

    main()
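
One caveat worth noting with DistributedSampler: it pads its index list with repeated samples so that every rank receives the same number of items, so a few images can be encoded twice when the dataset size isn’t divisible by the number of GPUs. A sketch of one way to handle this (names are illustrative): key the features by dataset index so the padded duplicates simply overwrite themselves, and have each rank write its own file.

import torch


def encode_and_save(model, loader, device, rank, out_dir):
    # key features by dataset index: the samples DistributedSampler repeats
    # as padding overwrite themselves, leaving exactly one feature per image
    features = {}
    model.eval()
    with torch.no_grad():
        for idx, img in loader:
            out = model(img.to(device, non_blocking=True))
            for i, f in zip(idx.tolist(), out.cpu()):
                features[i] = f
    torch.save(features, f"{out_dir}/features_rank{rank}.pt")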