I encountered the following problem when I set a larger world_size:
File "/root/anaconda3/envs/final/lib/python3.9/site-packages/torch/distributed/rpc/__init__.py", line 195, in init_rpc
_init_rpc_backend(backend, store, name, rank, world_size, rpc_backend_options)
File "/root/anaconda3/envs/final/lib/python3.9/site-packages/torch/distributed/rpc/__init__.py", line 229, in _init_rpc_backend
rpc_agent = backend_registry.init_backend(
File "/root/anaconda3/envs/final/lib/python3.9/site-packages/torch/distributed/rpc/backend_registry.py", line 106, in init_backend
return backend.value.init_backend_handler(*args, **kwargs)
File "/root/anaconda3/envs/final/lib/python3.9/site-packages/torch/distributed/rpc/backend_registry.py", line 315, in _tensorpipe_init_backend_handler
api._all_gather(None, timeout=rpc_constants.DEFAULT_RPC_TIMEOUT_SEC)
File "/root/anaconda3/envs/final/lib/python3.9/site-packages/torch/distributed/rpc/api.py", line 77, in wrapper
return func(*args, **kwargs)
File "/root/anaconda3/envs/final/lib/python3.9/site-packages/torch/distributed/rpc/api.py", line 204, in _all_gather
rpc_sync(
File "/root/anaconda3/envs/final/lib/python3.9/site-packages/torch/distributed/rpc/api.py", line 77, in wrapper
return func(*args, **kwargs)
File "/root/anaconda3/envs/final/lib/python3.9/site-packages/torch/distributed/rpc/api.py", line 765, in rpc_sync
return fut.wait()
RuntimeError: connect: Resource temporarily unavailable (this error originated at tensorpipe/common/socket.cc:114)
And this is my code:
import os
import time
from entity.Server import Server
import torch.multiprocessing as mp
from torch.distributed import rpc
from utils.options import args_parser
import torch.distributed as dist
# Naming scheme for the RPC workers: rank 0 is the single server,
# every other rank is formatted as "Client1", "Client2", ...
SERVER_NAME = "Server"
CLIENT_NAME = "Client{}"
# Rendezvous endpoint consumed by init_rpc's default env:// init method.
# All spawned processes inherit these environment variables.
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '29500'
def run(rank, args):
    """Entry point for one spawned process: join the RPC world, then shut down.

    Rank 0 registers as the server; every other rank registers as a client.

    Args:
        rank: 0-based process index assigned by ``mp.spawn``.
        args: Parsed CLI options; must expose ``world_size``.
    """
    # With a large world_size (reported to fail above ~25) every rank dials
    # the master at the same instant; the connect backlog / default worker
    # threads overflow and TensorPipe raises
    # "connect: Resource temporarily unavailable" (EAGAIN).
    # Mitigation here: more worker threads and a longer init timeout.
    # NOTE(review): at the OS level, raising net.core.somaxconn and the
    # open-file limit (ulimit -n) also helps — confirm on the target host.
    options = rpc.TensorPipeRpcBackendOptions(
        num_worker_threads=max(16, args.world_size),
        rpc_timeout=120,  # seconds; default is 60
    )
    if rank == 0:
        rpc.init_rpc(
            SERVER_NAME,
            rank=rank,
            world_size=args.world_size,
            rpc_backend_options=options,
        )
        # server = Server(args)
    else:
        rpc.init_rpc(
            CLIENT_NAME.format(rank),
            rank=rank,
            world_size=args.world_size,
            rpc_backend_options=options,
        )
    # Blocks until all processes have reached shutdown, then tears down RPC.
    rpc.shutdown()
if __name__ == "__main__":
    args = args_parser()
    # Launch one process per rank (0 .. world_size-1); each process calls
    # run(rank, args). join=True blocks until every process has exited.
    mp.spawn(
        run,
        args=(args, ),
        nprocs=args.world_size,
        join=True,
    )
When I set the world_size value above 25, the code fails.
I think this may be a problem with the Linux system configuration. Does anyone know how to configure the system, or otherwise fix this?