I write a simple program where two process send a tensor to each other using torch.isend and torch.irecv. I use different stream for sending and receiving but it seems to be dead-lock. I use torchrun --nproc_per_node=2 test.py
to launch this program. But the program (the both two ranks) hangs on req2.wait()
.
import torch
import torch.distributed as dist
def run():
rank = dist.get_rank()
size = dist.get_world_size()
tensor = torch.zeros(1, device=f'cuda:{rank}')
send_stream = torch.cuda.Stream(device=f'cuda:{rank}')
recv_stream = torch.cuda.Stream(device=f'cuda:{rank}')
if rank == 0:
tensor += 1
with torch.cuda.stream(send_stream):
req1 = dist.isend(tensor=tensor, dst=1)
with torch.cuda.stream(recv_stream):
req2 = dist.irecv(tensor=tensor, src=1)
req1.wait()
print("req1 finished")
req2.wait() # will hang here
print('Rank 0 received:', tensor.item())
elif rank == 1:
tensor += 2
with torch.cuda.stream(send_stream):
req1 = dist.isend(tensor=tensor, dst=0)
with torch.cuda.stream(recv_stream):
req2 = dist.irecv(tensor=tensor, src=0)
req1.wait()
print("req1 finished")
req2.wait() # will hang here
print('Rank 1 received:', tensor.item())
if __name__ == "__main__":
dist.init_process_group("nccl", init_method='env://', world_size=2) # init nccl
run()