Hi!
I tried to replace Gloo with NCCL in my code because PyTorch 1.11 added support for send/recv with the NCCL backend.
Since the official documentation recommends NCCL for GPU training, I expected NCCL to be faster than Gloo. However, I found that send/recv is slower with NCCL than with Gloo.
Is this result to be expected?
Moreover, is there any way to make send/recv for NCCL faster?
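(For reference, I also looked at the non-blocking variants dist.isend / dist.irecv, but I have not benchmarked them. The snippet below is only a rough sketch of what I had in mind, reusing the same ranks, devices, and imports as the code further down:)

def p2p_nonblocking(rank):
    # Hypothetical sketch (not what I measured): non-blocking point-to-point transfer.
    if rank == 0:
        tensor = torch.randn((100, 100), device="cuda:1")
        work = dist.isend(tensor=tensor, dst=1)  # returns a Work handle immediately
    else:
        tensor = torch.empty((100, 100), device="cuda:2")
        work = dist.irecv(tensor=tensor, src=0)
    work.wait()  # block until the transfer has completed
    return tensor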
Code:
import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import time
def run(rank, size, backend):
    total_time = 0.
    for _ in range(10):
        if rank == 0:
            tensor = torch.randn((100, 100), device="cuda:1")
            time.sleep(5)  # wait until process 1 has initialized its 'tensor'
            t1_time = time.time()
            if backend == "nccl":
                dist.send(tensor=tensor, dst=1)  # GPU-to-GPU send
            else:
                dist.send(tensor=tensor.to("cpu"), dst=1)  # gloo: send from a CPU copy
            total_time += time.time() - t1_time
        else:
            if backend == "nccl":
                tensor = torch.randn((100, 100), device="cuda:2")
                dist.recv(tensor=tensor, src=0)
            else:
                tensor = torch.randn((100, 100), device="cpu")
                dist.recv(tensor=tensor, src=0)
    if rank == 0:
        print(f"{backend}: {total_time} sec")
def init_process(rank, size, fn, backend='nccl'):
    """ Initialize the distributed environment. """
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29501'
    dist.init_process_group(backend, rank=rank, world_size=size)
    fn(rank, size, backend)
if __name__ == "__main__":
    mp.set_start_method("spawn")
    for backend in ["nccl", "gloo"]:
        size = 2
        processes = []
        for rank in range(size):
            p = mp.Process(target=init_process, args=(rank, size, run, backend))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()
Results:
nccl: 3.5865402221679688 sec
gloo: 0.00415349006652832 sec
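I suspect the NCCL number might mostly reflect the one-time setup that happens on the first send/recv, so I also considered adding a warm-up transfer and explicit torch.cuda.synchronize() calls around the timed region, roughly as in the sketch below (only the sending rank is shown; rank 1 would need one warm-up recv plus the same number of recvs). I am not sure whether this is the right way to benchmark NCCL:

def timed_nccl_send(iters=10):
    # Hypothetical rank-0 timing loop (same device as in my code above).
    tensor = torch.randn((100, 100), device="cuda:1")
    dist.send(tensor=tensor, dst=1)  # warm-up send, excluded from the measurement
    torch.cuda.synchronize("cuda:1")
    t1 = time.time()
    for _ in range(iters):
        dist.send(tensor=tensor, dst=1)
    torch.cuda.synchronize("cuda:1")  # make sure the sends have really finished
    return time.time() - t1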
Environment:
- Python 3.10.4
- cuda 11.4
- PyTorch 1.11.0
- GPUs: NVIDIA Quadro A6000 * 2
Best regards!