I’m trying to implement an ensemble model, as there are some independent models, I want to traing the models in parallel using torch.multiprocessing
, However, I always get Too many open files
error.
Here is a minimal example that reproduce the error:
import torch
import torch.nn as nn
from torch.multiprocessing import Pool
class MyModel:
def __init__(self):
self.nn = nn.Sequential(
nn.Linear(10, 10), nn.ReLU(),
nn.Linear(10, 10), nn.ReLU(),
nn.Linear(10, 10), nn.ReLU(),
nn.Linear(10, 10), nn.ReLU(),
nn.Linear(10, 10), nn.ReLU(),
nn.Linear(10, 10), nn.ReLU(),
nn.Linear(10, 10), nn.ReLU(),
nn.Linear(10, 10), nn.ReLU()
)
def train(self):
pass
class EnsembleModel:
def __init__(self, K):
self.K = K;
self.models = [MyModel() for i in range(self.K)]
def f(self, i):
return i
def train(self):
pool = Pool(processes = 3)
ret = pool.map(self.f, range(self.K))
print(ret)
md = EnsembleModel(15);
md.train()
And this is the error message:
/home/alaya/anaconda3/lib/python3.6/multiprocessing/reduction.py:153: RuntimeWarning: received malformed or improperly-truncated ancillary data
msg, ancdata, flags, addr = sock.recvmsg(1, socket.CMSG_LEN(bytes_size))
Process ForkPoolWorker-3:
Traceback (most recent call last):
File "/home/alaya/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/home/alaya/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/home/alaya/anaconda3/lib/python3.6/multiprocessing/pool.py", line 108, in worker
task = get()
File "/home/alaya/anaconda3/lib/python3.6/multiprocessing/queues.py", line 337, in get
return _ForkingPickler.loads(res)
File "/home/alaya/anaconda3/lib/python3.6/site-packages/torch/multiprocessing/reductions.py", line 151, in rebuild_storage_fd
fd = df.detach()
File "/home/alaya/anaconda3/lib/python3.6/multiprocessing/resource_sharer.py", line 58, in detach
return reduction.recv_handle(conn)
File "/home/alaya/anaconda3/lib/python3.6/multiprocessing/reduction.py", line 182, in recv_handle
return recvfds(s, 1)[0]
File "/home/alaya/anaconda3/lib/python3.6/multiprocessing/reduction.py", line 172, in recvfds
raise RuntimeError('Invalid data received')
RuntimeError: Invalid data received
Traceback (most recent call last):
File "/home/alaya/anaconda3/lib/python3.6/multiprocessing/resource_sharer.py", line 149, in _serve
File "/home/alaya/anaconda3/lib/python3.6/multiprocessing/resource_sharer.py", line 50, in send
File "/home/alaya/anaconda3/lib/python3.6/multiprocessing/reduction.py", line 176, in send_handle
File "/home/alaya/anaconda3/lib/python3.6/socket.py", line 460, in fromfd
OSError: [Errno 24] Too many open files
Traceback (most recent call last):
File "/home/alaya/anaconda3/lib/python3.6/multiprocessing/resource_sharer.py", line 142, in _serve
File "/home/alaya/anaconda3/lib/python3.6/multiprocessing/connection.py", line 453, in accept
File "/home/alaya/anaconda3/lib/python3.6/multiprocessing/connection.py", line 593, in accept
File "/home/alaya/anaconda3/lib/python3.6/socket.py", line 205, in accept
OSError: [Errno 24] Too many open files
Traceback (most recent call last):
File "/home/alaya/anaconda3/lib/python3.6/multiprocessing/resource_sharer.py", line 142, in _serve
File "/home/alaya/anaconda3/lib/python3.6/multiprocessing/connection.py", line 453, in accept
File "/home/alaya/anaconda3/lib/python3.6/multiprocessing/connection.py", line 593, in accept
File "/home/alaya/anaconda3/lib/python3.6/socket.py", line 205, in accept
OSError: [Errno 24] Too many open files
Exception in thread Thread-1:
Traceback (most recent call last):
File "/home/alaya/anaconda3/lib/python3.6/threading.py", line 916, in _bootstrap_inner
self.run()
File "/home/alaya/anaconda3/lib/python3.6/threading.py", line 864, in run
self._target(*self._args, **self._kwargs)
File "/home/alaya/anaconda3/lib/python3.6/multiprocessing/pool.py", line 405, in _handle_workers
pool._maintain_pool()
File "/home/alaya/anaconda3/lib/python3.6/multiprocessing/pool.py", line 246, in _maintain_pool
self._repopulate_pool()
File "/home/alaya/anaconda3/lib/python3.6/multiprocessing/pool.py", line 239, in _repopulate_pool
w.start()
File "/home/alaya/anaconda3/lib/python3.6/multiprocessing/process.py", line 105, in start
self._popen = self._Popen(self)
File "/home/alaya/anaconda3/lib/python3.6/multiprocessing/context.py", line 277, in _Popen
return Popen(process_obj)
File "/home/alaya/anaconda3/lib/python3.6/multiprocessing/popen_fork.py", line 19, in __init__
self._launch(process_obj)
File "/home/alaya/anaconda3/lib/python3.6/multiprocessing/popen_fork.py", line 65, in _launch
parent_r, child_w = os.pipe()
OSError: [Errno 24] Too many open files