RuntimeError: connect: Resource temporarily unavailable (this error originated at tensorpipe/common/socket.cc:114)

I ran into the following error when I set a larger world_size:

File "/root/anaconda3/envs/final/lib/python3.9/site-packages/torch/distributed/rpc/__init__.py", line 195, in init_rpc

    _init_rpc_backend(backend, store, name, rank, world_size, rpc_backend_options)

File "/root/anaconda3/envs/final/lib/python3.9/site-packages/torch/distributed/rpc/__init__.py", line 229, in _init_rpc_backend

    rpc_agent = backend_registry.init_backend(

File "/root/anaconda3/envs/final/lib/python3.9/site-packages/torch/distributed/rpc/backend_registry.py", line 106, in init_backend

    return backend.value.init_backend_handler(*args, **kwargs)

File "/root/anaconda3/envs/final/lib/python3.9/site-packages/torch/distributed/rpc/backend_registry.py", line 315, in _tensorpipe_init_backend_handler

    api._all_gather(None, timeout=rpc_constants.DEFAULT_RPC_TIMEOUT_SEC)

File "/root/anaconda3/envs/final/lib/python3.9/site-packages/torch/distributed/rpc/api.py", line 77, in wrapper

    return func(*args, **kwargs)

File "/root/anaconda3/envs/final/lib/python3.9/site-packages/torch/distributed/rpc/api.py", line 204, in _all_gather

    rpc_sync(

File "/root/anaconda3/envs/final/lib/python3.9/site-packages/torch/distributed/rpc/api.py", line 77, in wrapper

    return func(*args, **kwargs)

File "/root/anaconda3/envs/final/lib/python3.9/site-packages/torch/distributed/rpc/api.py", line 765, in rpc_sync

    return fut.wait()

RuntimeError: connect: Resource temporarily unavailable (this error originated at tensorpipe/common/socket.cc:114)

And this is my code:

import os
import time
from entity.Server import Server
import torch.multiprocessing as mp
from torch.distributed import rpc
from utils.options import args_parser
import torch.distributed as dist

SERVER_NAME = "Server"
CLIENT_NAME = "Client{}"

os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '29500'

def run(rank, args):
    if rank == 0:
        # Rank 0 acts as the server
        rpc.init_rpc(SERVER_NAME, rank=rank, world_size=args.world_size)
        # server = Server(args)
    else:
        # All other ranks are clients
        rpc.init_rpc(CLIENT_NAME.format(rank), rank=rank, world_size=args.world_size)
    rpc.shutdown()

if __name__ == "__main__":
    args = args_parser()
    mp.spawn(
        run,
        args=(args, ),
        nprocs=args.world_size,
        join=True
    )
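One thing I have been considering is passing explicit backend options to init_rpc, in case the default 60-second RPC timeout and default thread pool are simply too small once that many processes try to connect at the same time. The sketch below only uses the standard TensorPipeRpcBackendOptions arguments, but it is a guess on my part and I have not confirmed that it avoids the error:

def run(rank, args):
    # Larger timeout / more worker threads; the values here are arbitrary guesses
    options = rpc.TensorPipeRpcBackendOptions(
        num_worker_threads=32,  # default is 16
        rpc_timeout=300,        # seconds, default is 60
    )
    if rank == 0:
        rpc.init_rpc(SERVER_NAME, rank=rank, world_size=args.world_size,
                     rpc_backend_options=options)
    else:
        rpc.init_rpc(CLIENT_NAME.format(rank), rank=rank, world_size=args.world_size,
                     rpc_backend_options=options)
    rpc.shutdown()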

As soon as I set world_size above 25, the code fails with the error above.

I think this may be a problem with the Linux system configuration. Does anyone know how to configure or fix this?
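My guess is that with N processes every process ends up connecting to every other process, so the number of open sockets grows roughly with world_size squared and some per-process or system limit gets hit. To at least see the per-process limit, I can check (and try to raise) the open-file-descriptor limit at the start of each worker with the standard resource module; this is only a diagnostic idea, not a confirmed fix:

import resource

def log_fd_limit(rank):
    # Current soft/hard limit on open file descriptors for this process
    soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
    print(f"rank {rank}: RLIMIT_NOFILE soft={soft} hard={hard}")
    # Raise the soft limit to the hard limit (only root can raise the hard limit)
    resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))

If it really is a system-level setting, it might instead be something like ulimit -n or net.core.somaxconn, but I am not sure which one applies here.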

cc @pbelevich for configuring RPC