It seems that the solution proposed in "Getting Gloo error when connecting server and client over VPN from different systems" (post #2, by selineni) fixes the hanging…
There is nothing like asking for help, after fighting with the problem for a few hours, to find a solution shortly afterwards.
EDIT —
For completeness:
# rpc_test.py
# https://github.com/pytorch/pytorch/issues/85607
import os
import random
import numpy as np
import torch
import torch.distributed.rpc as rpc
def worker_init():
    """Seed the Python, NumPy and PyTorch RNGs with this process's rank.

    The rank is read from the ``RANK`` environment variable, so each worker
    gets its own deterministic seed. The rank is printed once seeding is
    done.

    Raises:
        KeyError: if ``RANK`` is not set in the environment.
        ValueError: if ``RANK`` is not an integer string.
    """
    rank = int(os.environ['RANK'])

    # Use the same seed for all three generators so a worker's random
    # streams are reproducible from its rank alone.
    random.seed(rank)
    np.random.seed(rank)
    torch.manual_seed(rank)

    print(f'Rank {rank}')
def main():
    """Initialise RPC for this process, run ``worker_init``, then shut down.

    ``RANK`` and ``WORLD_SIZE`` are read from the environment; they are
    expected to be provided by the launcher (the commands in this post use
    ``torchrun``). The worker registers itself under the name
    ``worker<rank>``.

    Raises:
        KeyError: if ``RANK`` or ``WORLD_SIZE`` is not set.
        ValueError: if either variable is not an integer string.
    """
    rank = int(os.environ['RANK'])
    world_size = int(os.environ['WORLD_SIZE'])

    print("initing rpc")
    rpc.init_rpc(name=f'worker{rank}', rank=rank, world_size=world_size)
    try:
        print("rpc inited - worker init")
        worker_init()
        print("worker inited")
        # no-op: this minimal script issues no RPC calls of its own
    finally:
        # Always reach shutdown once init_rpc has succeeded, even if
        # worker_init() raised, so the RPC agent is not left running.
        rpc.shutdown()
# Run the RPC smoke test only when executed as a script, not on import.
if __name__ == '__main__':
    main()
And the execution calls, one per node (they differ only in `--node_rank`):
TP_SOCKET_IFNAME=<interface> GLOO_SOCKET_IFNAME=<interface> torchrun --nnodes=2 --nproc_per_node=1 --node_rank=0 --rdzv_id=0 --rdzv_endpoint=compute-2-2:53554 minimal-rpc.py
TP_SOCKET_IFNAME=<interface> GLOO_SOCKET_IFNAME=<interface> torchrun --nnodes=2 --nproc_per_node=1 --node_rank=1 --rdzv_id=0 --rdzv_endpoint=compute-2-2:53554 minimal-rpc.py