Hi, I’m trying to run a minimal `torch.distributed.rpc` demo, but it fails with the error shown below:
# test.py
class Actor:
    """Tiny stateful object used as the RPC target in this demo.

    It stores one value at construction time and adds it to whatever is
    passed to :meth:`compute`.
    """

    def __init__(self, value):
        """Remember ``value`` as the offset applied by :meth:`compute`."""
        self.value = value

    def compute(self, x):
        """Return ``x + self.value``."""
        return x + self.value
# Per-process Actor instance; stays None until init_singleton() has run in
# this process.
_singleton = None


def init_singleton(value):
    """Create this process's ``Actor`` on the first call.

    Later calls are no-ops: once ``_singleton`` is set, the ``value`` passed
    to subsequent calls is ignored and the existing instance is kept.

    Args:
        value: Forwarded to ``Actor(value)`` when the instance is created.
    """
    global _singleton
    if _singleton is None:
        _singleton = Actor(value)
def compute(x):
    """Return ``_singleton.compute(x)`` for this process's ``Actor``.

    ``init_singleton`` must already have run in the same process; while
    ``_singleton`` is still ``None`` this call raises ``AttributeError``.
    """
    return _singleton.compute(x)
if __name__ == "__main__":
    import torch.distributed as dist
    import torch.distributed.rpc as rpc

    # Called without arguments, so rank / world size / rendezvous address are
    # expected to come from the launcher's environment (torchrun here).
    dist.init_process_group()
    rank = dist.get_rank()
    world_size = dist.get_world_size()

    # Every process joins the RPC group, using its rank as its worker name.
    rpc.init_rpc(f"{rank}", rank=rank, world_size=world_size)

    if rank == 0:
        # Fan out the initialisation to all other ranks, then wait for every
        # one of them before issuing any compute() call.
        pending = [
            rpc.rpc_async(f"{i}", init_singleton, args=(i,))
            for i in range(1, world_size)
        ]
        for fut in pending:
            fut.wait()

        for i in range(1, world_size):
            ret = rpc.rpc_async(f"{i}", compute, args=(i,))
            print(f"Rank {i}: {ret.wait()}")

    # This must run on EVERY rank, not only rank 0. The default graceful
    # shutdown blocks until all RPC processes have reached it, which keeps
    # ranks 1..N-1 alive to serve rank 0's requests. If those ranks fall off
    # the end of the script instead, their interpreters are torn down while
    # requests are still in flight and rank 0 sees "eof" from its peers.
    rpc.shutdown()
Launch command: `torchrun --nproc-per-node 4 test.py`
Error:
[2024-04-12 12:35:24,656] torch.distributed.run: [WARNING]
[2024-04-12 12:35:24,656] torch.distributed.run: [WARNING] *****************************************
[2024-04-12 12:35:24,656] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
[2024-04-12 12:35:24,656] torch.distributed.run: [WARNING] *****************************************
/data/youkaichao/miniconda/envs/vllm/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:602: UserWarning: You are using a Backend <class 'torch.distributed.distributed_c10d.ProcessGroupGloo'> as a ProcessGroup. This usage is deprecated since PyTorch 2.0. Please use a public API of PyTorch Distributed instead.
warnings.warn(
/data/youkaichao/miniconda/envs/vllm/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:602: UserWarning: You are using a Backend <class 'torch.distributed.distributed_c10d.ProcessGroupGloo'> as a ProcessGroup. This usage is deprecated since PyTorch 2.0. Please use a public API of PyTorch Distributed instead.
warnings.warn(
/data/youkaichao/miniconda/envs/vllm/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:602: UserWarning: You are using a Backend <class 'torch.distributed.distributed_c10d.ProcessGroupGloo'> as a ProcessGroup. This usage is deprecated since PyTorch 2.0. Please use a public API of PyTorch Distributed instead.
warnings.warn(
/data/youkaichao/miniconda/envs/vllm/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py:602: UserWarning: You are using a Backend <class 'torch.distributed.distributed_c10d.ProcessGroupGloo'> as a ProcessGroup. This usage is deprecated since PyTorch 2.0. Please use a public API of PyTorch Distributed instead.
warnings.warn(
[E thread_pool.cpp:112] Exception in thread pool task: unknown
[E thread_pool.cpp:112] Exception in thread pool task: unknown
FATAL: exception not rethrown
FATAL: exception not rethrown
[E thread_pool.cpp:112] Exception in thread pool task: unknown
FATAL: exception not rethrown
[W tensorpipe_agent.cpp:939] RPC agent for 0 encountered error when reading incoming response from 1: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259)
[W tensorpipe_agent.cpp:725] RPC agent for 0 encountered error when reading incoming request from 1: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259)
Traceback (most recent call last):
File "/data/youkaichao/vllm/test7.py", line 30, in <module>
worker.wait()
RuntimeError: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259)
[W tensorpipe_agent.cpp:939] RPC agent for 0 encountered error when reading incoming response from 2: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259)
[W tensorpipe_agent.cpp:725] RPC agent for 0 encountered error when reading incoming request from 2: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259)
[W tensorpipe_agent.cpp:939] RPC agent for 0 encountered error when reading incoming response from 3: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259)
[W tensorpipe_agent.cpp:725] RPC agent for 0 encountered error when reading incoming request from 3: eof (this error originated at tensorpipe/transport/shm/connection_impl.cc:259)
[2024-04-12 12:35:29,705] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 302921) of binary: /data/youkaichao/miniconda/envs/vllm/bin/python