Hello,
I have a use case where I create one process per available GPU along with several (e.g. 15) processes that only run on the CPU. Below is a minimal working example that works in PyTorch 1.7.0 but fails in 1.9.0. However, if I use 3 or fewer GPUs instead of 4, while keeping the number of CPU processes the same, it works on 1.9.0 as well.
Could you please point me towards why this is happening and how it can be resolved?
Thanks!
Code:
import os
import time
import torch
torch.multiprocessing.set_sharing_strategy('file_system')
import torch.multiprocessing as mp
import torch.distributed.rpc as rpc
no_of_saver_processes = 15
world_size = torch.cuda.device_count()
def cpu_process_initialization(rank):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '9867'
    rpc.init_rpc(f"{rank}",
                 rank=rank,
                 world_size=world_size + no_of_saver_processes,
                 backend=rpc.BackendType.TENSORPIPE,
                 rpc_backend_options=rpc.TensorPipeRpcBackendOptions(rpc_timeout=0,
                                                                     init_method='env://')
                 )
    print(f"Started CPU process {rank}")
    print(f"Process {rank}: available device {torch.cuda.current_device()}")
    # Do something rather than sleeping, e.g. disk- or CPU-bound operations
    time.sleep(30)
    rpc.shutdown()
    return
def cuda_process_initialization(rank):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '9867'
    rpc.init_rpc(f"{rank}",
                 rank=rank,
                 world_size=world_size + no_of_saver_processes,
                 backend=rpc.BackendType.TENSORPIPE,
                 rpc_backend_options=rpc.TensorPipeRpcBackendOptions(  # num_send_recv_threads=args.world_size*3,
                     rpc_timeout=0,
                     init_method='env://')
                 )
    torch.cuda.set_device(rank)
    os.environ["CUDA_VISIBLE_DEVICES"] = f"{rank}"
    print(f"Started CUDA process on gpu {rank}")
    # Do some cuda operations
    print(f"Process {rank}: available device {torch.cuda.current_device()}")
    time.sleep(30)
    rpc.shutdown()
    return
if __name__ == "__main__":
    mp.set_start_method('forkserver', force=True)
    trainer_processes = mp.spawn(cuda_process_initialization,
                                 nprocs=world_size,
                                 join=False)
    cpu_processes = []
    for rank in range(world_size, world_size + no_of_saver_processes):
        p = mp.Process(target=cpu_process_initialization,
                       args=(rank,))
        p.start()
        cpu_processes.append(p)
    for p in cpu_processes: p.join()
    trainer_processes.join()
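For reference, the script is saved as test_mp.py (the file name that appears in the traceback below). A minimal sketch of how the working "3 or fewer GPUs" case can be reproduced, assuming the restriction is done by limiting the visible devices before torch is imported (the device indices here are just for illustration):

# Sketch: limit the visible GPUs before importing torch, so that
# torch.cuda.device_count() (and hence world_size) becomes 3 instead of 4.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2"

import torch
print(torch.cuda.device_count())  # 3 -> the example above runs fine on 1.9.0
# With all 4 GPUs visible (world_size = 4), 1.9.0 fails with the error below.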
Error:
terminate called after throwing an instance of 'std::runtime_error'
what(): In handleEventInFromLoop at tensorpipe/transport/shm/connection_impl.cc:235 "errCouldn't access ringbuffer of connection outbox: fstat: Bad file descriptor (this error originated at tensorpipe/common/shm_segment.cc:153)"
[W tensorpipe_agent.cpp:653] RPC agent for 4 encountered error when accepting incoming pipe: async error on socket: Connection reset by peer (this error originated at tensorpipe/transport/shm/connection_impl.cc:187)
[W tensorpipe_agent.cpp:843] RPC agent for 4 encountered error when reading incoming request from 0: async error on socket: Connection reset by peer (this error originated at tensorpipe/transport/shm/connection_impl.cc:187)
[W tensorpipe_agent.cpp:653] RPC agent for 2 encountered error when accepting incoming pipe: async error on socket: Connection reset by peer (this error originated at tensorpipe/transport/shm/connection_impl.cc:187)
[W tensorpipe_agent.cpp:843] RPC agent for 2 encountered error when reading incoming request from 0: async error on socket: Connection reset by peer (this error originated at tensorpipe/transport/shm/connection_impl.cc:187)
[W tensorpipe_agent.cpp:843] RPC agent for 1 encountered error when reading incoming request from 0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259)
[W tensorpipe_agent.cpp:843] RPC agent for 6 encountered error when reading incoming request from 0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259)
[W tensorpipe_agent.cpp:843] RPC agent for 8 encountered error when reading incoming request from 0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259)
[W tensorpipe_agent.cpp:843] RPC agent for 3 encountered error when reading incoming request from 0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259)
[W tensorpipe_agent.cpp:843] RPC agent for 7 encountered error when reading incoming request from 0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259)
[W tensorpipe_agent.cpp:843] RPC agent for 5 encountered error when reading incoming request from 0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259)
[W tensorpipe_agent.cpp:1049] RPC agent for 7 encountered error when reading incoming response from 0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259)
[W tensorpipe_agent.cpp:1049] RPC agent for 1 encountered error when reading incoming response from 0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259)
[W tensorpipe_agent.cpp:1049] RPC agent for 8 encountered error when reading incoming response from 0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259)
Failed to respond to 'Shutdown Proceed' in time, got error eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259)
Failed to respond to 'Shutdown Proceed' in time, got error eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259)
Failed to respond to 'Shutdown Proceed' in time, got error eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259)
Process Process-9:
Traceback (most recent call last):
File "/home/user/anaconda3/envs/pytorch1.9/lib/python3.9/site-packages/torch/distributed/rpc/backend_registry.py", line 317, in _tensorpipe_init_backend_handler
agent.join()
RuntimeError: [/opt/conda/conda-bld/pytorch_1623448255797/work/third_party/gloo/gloo/transport/tcp/pair.cc:589] Read error [127.0.0.1]:39602: Connection reset by peer
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/user/anaconda3/envs/pytorch1.9/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/home/user/anaconda3/envs/pytorch1.9/lib/python3.9/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/net/home/store/home/user/The/Feature_Distribution/test_mp.py", line 14, in cpu_process_initialization
rpc.init_rpc(f"{rank}",
File "/home/user/anaconda3/envs/pytorch1.9/lib/python3.9/site-packages/torch/distributed/rpc/__init__.py", line 203, in init_rpc
_init_rpc_backend(backend, store, name, rank, world_size, rpc_backend_options)
File "/home/user/anaconda3/envs/pytorch1.9/lib/python3.9/site-packages/torch/distributed/rpc/__init__.py", line 237, in _init_rpc_backend
rpc_agent = backend_registry.init_backend(
File "/home/user/anaconda3/envs/pytorch1.9/lib/python3.9/site-packages/torch/distributed/rpc/backend_registry.py", line 99, in init_backend
return backend.value.init_backend_handler(*args, **kwargs)
File "/home/user/anaconda3/envs/pytorch1.9/lib/python3.9/site-packages/torch/distributed/rpc/backend_registry.py", line 319, in _tensorpipe_init_backend_handler
api.shutdown()
File "/home/user/anaconda3/envs/pytorch1.9/lib/python3.9/site-packages/torch/distributed/rpc/api.py", line 79, in wrapper
return func(*args, **kwargs)
File "/home/user/anaconda3/envs/pytorch1.9/lib/python3.9/site-packages/torch/distributed/rpc/api.py", line 313, in shutdown
_get_current_rpc_agent().join(shutdown=True)
RuntimeError: [/opt/conda/conda-bld/pytorch_1623448255797/work/third_party/gloo/gloo/transport/tcp/pair.cc:589] Read error [127.0.0.1]:39602: Connection reset by peer
[W tensorpipe_agent.cpp:1025] RPC agent for 6 encountered error when sending outgoing request #1 to 0: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259)
Failed to respond to 'Shutdown Proceed' in time, got error eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259)
Process Process-8:
Traceback (most recent call last):
File "/home/user/anaconda3/envs/pytorch1.9/lib/python3.9/site-packages/torch/distributed/rpc/backend_registry.py", line 317, in _tensorpipe_init_backend_handler
agent.join()
RuntimeError: [/opt/conda/conda-bld/pytorch_1623448255797/work/third_party/gloo/gloo/transport/tcp/pair.cc:589] Read error [127.0.0.1]:6868: Connection reset by peer
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/user/anaconda3/envs/pytorch1.9/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/home/user/anaconda3/envs/pytorch1.9/lib/python3.9/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/net/home/store/home/user/The/Feature_Distribution/test_mp.py", line 14, in cpu_process_initialization
rpc.init_rpc(f"{rank}",
File "/home/user/anaconda3/envs/pytorch1.9/lib/python3.9/site-packages/torch/distributed/rpc/__init__.py", line 203, in init_rpc
_init_rpc_backend(backend, store, name, rank, world_size, rpc_backend_options)
File "/home/user/anaconda3/envs/pytorch1.9/lib/python3.9/site-packages/torch/distributed/rpc/__init__.py", line 237, in _init_rpc_backend
rpc_agent = backend_registry.init_backend(
File "/home/user/anaconda3/envs/pytorch1.9/lib/python3.9/site-packages/torch/distributed/rpc/backend_registry.py", line 99, in init_backend
return backend.value.init_backend_handler(*args, **kwargs)
File "/home/user/anaconda3/envs/pytorch1.9/lib/python3.9/site-packages/torch/distributed/rpc/backend_registry.py", line 319, in _tensorpipe_init_backend_handler
api.shutdown()
File "/home/user/anaconda3/envs/pytorch1.9/lib/python3.9/site-packages/torch/distributed/rpc/api.py", line 79, in wrapper
return func(*args, **kwargs)
File "/home/user/anaconda3/envs/pytorch1.9/lib/python3.9/site-packages/torch/distributed/rpc/api.py", line 313, in shutdown
_get_current_rpc_agent().join(shutdown=True)
RuntimeError: [/opt/conda/conda-bld/pytorch_1623448255797/work/third_party/gloo/gloo/transport/tcp/pair.cc:589] Read error [127.0.0.1]:6868: Connection reset by peer