RPC + Torchrun hangs in ProcessGroupGloo

It seems that the solution proposed in "Getting Gloo error when connecting server and client over VPN from different systems" (reply #2 by selineni) fixes the hang.

Nothing like asking after fighting with the problem for a few hours, only to find a solution shortly afterwards.

EDIT —

For completeness:

# rpc_test.py
# https://github.com/pytorch/pytorch/issues/85607

import os
import random

import numpy as np
import torch

import torch.distributed.rpc as rpc


def worker_init():
    """Seed the Python, NumPy and PyTorch RNGs with this process's rank.

    The rank is taken from the ``RANK`` environment variable, which must
    be set and hold an integer. After seeding, the rank is printed.
    """
    worker_rank = int(os.environ['RANK'])

    # Every generator gets the same seed: the rank itself.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(worker_rank)

    print(f'Rank {worker_rank}')


def main():
    """Bring up RPC for this process, seed its RNGs, then shut RPC down.

    Reads ``RANK`` and ``WORLD_SIZE`` from the environment, registers the
    process with the RPC framework under the name ``worker<rank>``, calls
    ``worker_init()`` and finally calls ``rpc.shutdown()``. Progress
    messages are printed around each step.
    """
    env = os.environ
    this_rank = int(env['RANK'])
    n_workers = int(env['WORLD_SIZE'])
    worker_name = f'worker{this_rank}'

    print("initing rpc")
    rpc.init_rpc(name=worker_name, rank=this_rank, world_size=n_workers)
    print("rpc inited - worker init")
    worker_init()
    print("worker inited")

    # No RPC calls are issued in this minimal example; shut down right away.
    rpc.shutdown()


# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

And the execution calls, one per node (with the script above saved as minimal-rpc.py):

TP_SOCKET_IFNAME=<interface> GLOO_SOCKET_IFNAME=<interface> torchrun --nnodes=2 --nproc_per_node=1 --node_rank=0 --rdzv_id=0 --rdzv_endpoint=compute-2-2:53554 minimal-rpc.py

TP_SOCKET_IFNAME=<interface> GLOO_SOCKET_IFNAME=<interface> torchrun --nnodes=2 --nproc_per_node=1 --node_rank=1 --rdzv_id=0 --rdzv_endpoint=compute-2-2:53554 minimal-rpc.py