# On worker 0:
import torch
import torch.distributed.rpc as rpc
import os
from torch.distributed.rpc import BackendType
os.environ['MASTER_ADDR'] = '10.8.9.106'
os.environ['MASTER_PORT'] = '10000'
os.environ['GLOO_SOCKET_IFNAME'] = 'br0'
os.environ['TF_SOCKET_IFNAME'] = 'br0'
options = rpc.TensorPipeRpcBackendOptions(
    num_worker_threads=4,
    rpc_timeout=0,
    init_method='tcp://127.0.0.1:10000'
)
# On both workers:
@torch.jit.script
def my_script_add(t1, t2):
    return torch.add(t1, t2)
rpc.init_rpc("worker0", backend=BackendType.TENSORPIPE, rank=0, world_size=2, rpc_backend_options=options)
# fails here with: RuntimeError: ECONNREFUSED: connection refused
ret = rpc.rpc_sync("worker1", torch.add, args=(torch.ones(2), 3))
rpc.shutdown()
# On worker 1:
import torch
import torch.distributed.rpc as rpc
import os
from torch.distributed.rpc import BackendType
os.environ['MASTER_ADDR'] = '10.8.9.106'
os.environ['MASTER_PORT'] = '10000'
os.environ['GLOO_SOCKET_IFNAME'] = 'enp6s0'
os.environ['TF_SOCKET_IFNAME'] = 'enp6s0'
os.environ['NCCL_SOCKET_IFNAME'] = 'enp6s0'
options = rpc.TensorPipeRpcBackendOptions(
    num_worker_threads=4,
    rpc_timeout=0,
    init_method='tcp://10.8.9.106:10000'
)
# On both workers:
@torch.jit.script
def my_script_add(t1, t2):
    return torch.add(t1, t2)
rpc.init_rpc("worker1", backend=BackendType.TENSORPIPE, rank=1, world_size=2, rpc_backend_options=options)
rpc.shutdown()
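Side-by-side, the two snippets differ only in the rank/name, the socket-interface variables (br0 vs enp6s0), and the init_method host: worker 0 uses tcp://127.0.0.1:10000 while worker 1 uses tcp://10.8.9.106:10000. For comparison, a fully symmetric variant would look like this (a sketch only, assuming 10.8.9.106 is worker 0's address on the network both machines share; also, the documented TensorPipe interface variable is TP_SOCKET_IFNAME, in case TF_SOCKET_IFNAME above was meant for it):

# Sketch: identical rendezvous on both workers; only the rank/name and the local
# interface names would differ between the two machines.
options = rpc.TensorPipeRpcBackendOptions(
    num_worker_threads=4,
    rpc_timeout=0,
    init_method='tcp://10.8.9.106:10000'  # same master address on both workers
)
# worker 0: rpc.init_rpc("worker0", backend=BackendType.TENSORPIPE, rank=0, world_size=2, rpc_backend_options=options)
# worker 1: rpc.init_rpc("worker1", backend=BackendType.TENSORPIPE, rank=1, world_size=2, rpc_backend_options=options)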
Both workers connect successfully when run on the same machine, but they fail to connect over the simplest intranet setup (two machines on the same LAN).
On worker0, it fails with “RuntimeError: ECONNREFUSED: connection refused”.
On worker1, it fails with “RuntimeError: [/pytorch/third_party/gloo/gloo/transport/tcp/pair.cc:575] Connection closed by peer [10.8.9.106]:21667”. What confuses me is that the peer is reported on a seemingly random port: 21667 changes on every run.
The port I set (10000) is above 1023, so it shouldn't be a privileged-port permission issue.
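For reference, a bare TCP connect from worker 1 to the master port is a quick way to confirm basic reachability (a hypothetical check, not part of my scripts; it assumes worker 0 is currently listening on 10.8.9.106:10000, e.g. while it is blocked inside init_rpc):

# Hypothetical reachability check, run on worker 1 while worker 0 waits in init_rpc.
import socket

s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.settimeout(5)
try:
    s.connect(('10.8.9.106', 10000))  # MASTER_ADDR / MASTER_PORT from the snippets above
    print('master port reachable')
except OSError as e:
    print('connect failed:', e)       # refused / timed out would point at a binding or firewall issue
finally:
    s.close()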
I also tried gRPC in a small Go program, which is not directly relevant, but it did connect between the two machines successfully.
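Since the Go test suggests the two machines can reach each other in general, the remaining difference is PyTorch's own rendezvous and transport layers. One way to exercise just the rendezvous, with no RPC on top, is a bare TCPStore exchange (a sketch; run it on its own with the RPC scripts stopped so port 10000 is free; positional arguments are host, port, world_size, is_master, timeout):

# Hypothetical rendezvous-only test with torch.distributed.TCPStore.
from datetime import timedelta
from torch.distributed import TCPStore

# On worker 0 (store server):
store = TCPStore('10.8.9.106', 10000, 2, True, timedelta(seconds=30))
store.set('ping', 'from_worker0')

# On worker 1 (store client):
store = TCPStore('10.8.9.106', 10000, 2, False, timedelta(seconds=30))
print(store.get('ping'))  # b'from_worker0' if the rendezvous works across the two machines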