Hi, I ran into the following problem when trying to read batches of relatively large data samples with a multi-worker DataLoader (with num_workers=4, for example).
I have tried increasing the shared memory on Ubuntu, but that did not work.
It runs without the num_workers argument, but training on a large data set is too slow with a single thread.
Does anyone know what’s going wrong? Thank you in advance.
Exception Details:
~/.pyenv/versions/3.6.4/lib/python3.6/site-packages/torch/utils/data/dataloader.py in __next__(self)
273 while True:
274 assert (not self.shutdown and self.batches_outstanding > 0)
--> 275 idx, batch = self._get_batch()
276 self.batches_outstanding -= 1
277 if idx != self.rcvd_idx:
~/.pyenv/versions/3.6.4/lib/python3.6/site-packages/torch/utils/data/dataloader.py in _get_batch(self)
252 raise RuntimeError('DataLoader timed out after {} seconds'.format(self.timeout))
253 else:
--> 254 return self.data_queue.get()
255
256 def __next__(self):
~/.pyenv/versions/3.6.4/lib/python3.6/queue.py in get(self, block, timeout)
162 elif timeout is None:
163 while not self._qsize():
--> 164 self.not_empty.wait()
165 elif timeout < 0:
166 raise ValueError("'timeout' must be a non-negative number")
~/.pyenv/versions/3.6.4/lib/python3.6/threading.py in wait(self, timeout)
293 try: # restore state no matter what (e.g., KeyboardInterrupt)
294 if timeout is None:
--> 295 waiter.acquire()
296 gotit = True
297 else:
~/.pyenv/versions/3.6.4/lib/python3.6/site-packages/torch/utils/data/dataloader.py in handler(signum, frame)
173 # This following call uses `waitid` with WNOHANG from C side. Therefore,
174 # Python can still get and update the process status successfully.
--> 175 _error_if_any_worker_fails()
176 if previous_handler is not None:
177 previous_handler(signum, frame)
RuntimeError: DataLoader worker (pid 26317) is killed by signal: Aborted.