Working on Ubuntu 20.04, Python 3.9, PyTorch 1.12.0, and with nvidia gpus .
I trained an encoder and I want to use it to encode each image in my dataset. Because my dataset is huge, I’d like to leverage multiple gpus to do this.
Below is a snippet of the code I use.
Inference code snippet
import os
import sys
import tqdm
import wandb
import torch
import hydra
import shutil
import datetime
import pandas as pd
import torch.nn as nn
import torch.backends.cudnn as cudnn
from pathlib import Path
from omegaconf import DictConfig
from source.dataset import RegionFilepathsDataset
from source.utils import initialize_df, is_main_process
@hydra.main(
version_base="1.2.0", config_path="config/feature_extraction", config_name="default"
)
def main(cfg: DictConfig):
distributed = torch.cuda.device_count() > 1
if distributed:
torch.distributed.init_process_group(backend="nccl")
gpu_id = int(os.environ["LOCAL_RANK"])
if gpu_id == 0:
print(f"Distributed session successfully initialized")
else:
gpu_id = -1
if is_main_process():
print(f"torch.cuda.device_count(): {torch.cuda.device_count()}")
run_id = datetime.datetime.now().strftime("%Y-%m-%d_%H_%M")
output_dir = Path(cfg.output_dir, cfg.experiment_name, run_id)
if not cfg.resume and is_main_process():
if output_dir.exists():
print(f"{output_dir} already exists! deleting it...")
shutil.rmtree(output_dir)
print("done")
output_dir.mkdir(parents=False)
else:
output_dir.mkdir(parents=True, exist_ok=True)
cudnn.benchmark = True
# preparing data
df = pd.read_csv(cfg.tiles_csv)
dataset = RegionFilepathsDataset(df)
if distributed:
sampler = torch.utils.data.DistributedSampler(dataset, shuffle=True)
else:
sampler = torch.utils.data.RandomSampler(dataset)
loader = torch.utils.data.DataLoader(
subset,
sampler=sampler,
batch_size=cfg.batch_size_per_gpu,
num_workers=cfg.num_workers,
pin_memory=True,
shuffle=False,
)
model = ...
# move network to gpu
if distributed:
model = nn.parallel.DistributedDataParallel(model, device_ids=[gpu_id], output_device=gpu_id)
else:
model = model.cuda()
with tqdm.tqdm(
loader,
desc="Image Encoding",
unit=" img",
unit_scale=cfg.batch_size_per_gpu,
ncols=80,
position=0,
leave=True,
disable=not (gpu_id in [-1, 0]),
) as t:
with torch.no_grad():
for batch in t:
idx, img = batch
if gpu_id == -1:
img = img.cuda(non_blocking=True)
else:
device = torch.device(f"cuda:{gpu_id}")
img = img.to(device, non_blocking=True)
feature = model(img)
(...)
if __name__ == "__main__":
main()
I kick off the script via:
python3 -m torch.distributed.run --standalone --nproc_per_node=gpu main.py
Running the previous command results in the following error:
RuntimeError: DistributedDataParallel is not needed when a module doesn't have any parameter that requires a gradient.
It seems one cannot use DDP
to run inference. Hence my question boils down to: what’s the easiest way to run inference using multiple GPUs?
Looking for help!
NB: I’ve also tried to create as many subset of my dataset as the number of GPUs available, and spawn one process per GPU using and multiprocessing.Pool
and starmap
, but ran into another error.