I train my model, the next day I found it stucks in the 12th epoch while I want to train 100 epochs.
My colleague tell my the machine is very slow, and the load is 500! So I have to use ctrl+c
to kill the script. The log is below:
./train.py
^CProcess Process-197:
Process Process-195:
Process Process-198:
Process Process-199:
Process Process-200:
Process Process-193:
Process Process-196:
Process Process-194:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
self.run()
File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
self.run()
File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
self.run()
File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
self.run()
File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
self.run()
File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/mapbar/data/home/acgtyrant/Projects/drn/.env/lib/python3.5/site-packages/torch/utils/data/dataloader.py", line 52, in _worker_loop
r = index_queue.get()
File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/mapbar/data/home/acgtyrant/Projects/drn/.env/lib/python3.5/site-packages/torch/utils/data/dataloader.py", line 52, in _worker_loop
r = index_queue.get()
File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/mapbar/data/home/acgtyrant/Projects/drn/.env/lib/python3.5/site-packages/torch/utils/data/dataloader.py", line 52, in _worker_loop
r = index_queue.get()
File "/usr/lib/python3.5/multiprocessing/queues.py", line 343, in get
res = self._reader.recv_bytes()
File "/mapbar/data/home/acgtyrant/Projects/drn/.env/lib/python3.5/site-packages/torch/utils/data/dataloader.py", line 52, in _worker_loop
r = index_queue.get()
File "/usr/lib/python3.5/multiprocessing/queues.py", line 343, in get
res = self._reader.recv_bytes()
File "/mapbar/data/home/acgtyrant/Projects/drn/.env/lib/python3.5/site-packages/torch/utils/data/dataloader.py", line 52, in _worker_loop
r = index_queue.get()
File "/usr/lib/python3.5/multiprocessing/queues.py", line 343, in get
res = self._reader.recv_bytes()
File "/mapbar/data/home/acgtyrant/Projects/drn/.env/lib/python3.5/site-packages/torch/utils/data/dataloader.py", line 52, in _worker_loop
r = index_queue.get()
File "/usr/lib/python3.5/multiprocessing/queues.py", line 343, in get
res = self._reader.recv_bytes()
File "/usr/lib/python3.5/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/usr/lib/python3.5/multiprocessing/queues.py", line 343, in get
res = self._reader.recv_bytes()
File "/usr/lib/python3.5/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/usr/lib/python3.5/multiprocessing/queues.py", line 343, in get
res = self._reader.recv_bytes()
File "/usr/lib/python3.5/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/usr/lib/python3.5/multiprocessing/connection.py", line 407, in _recv_bytes
buf = self._recv(4)
File "/usr/lib/python3.5/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/usr/lib/python3.5/multiprocessing/connection.py", line 407, in _recv_bytes
buf = self._recv(4)
File "/usr/lib/python3.5/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/usr/lib/python3.5/multiprocessing/connection.py", line 407, in _recv_bytes
buf = self._recv(4)
File "/usr/lib/python3.5/multiprocessing/connection.py", line 379, in _recv
chunk = read(handle, remaining)
File "/usr/lib/python3.5/multiprocessing/connection.py", line 407, in _recv_bytes
buf = self._recv(4)
File "/usr/lib/python3.5/multiprocessing/connection.py", line 379, in _recv
chunk = read(handle, remaining)
File "/usr/lib/python3.5/multiprocessing/connection.py", line 407, in _recv_bytes
buf = self._recv(4)
File "/usr/lib/python3.5/multiprocessing/connection.py", line 379, in _recv
chunk = read(handle, remaining)
File "/usr/lib/python3.5/multiprocessing/connection.py", line 379, in _recv
chunk = read(handle, remaining)
File "/usr/lib/python3.5/multiprocessing/connection.py", line 379, in _recv
chunk = read(handle, remaining)
File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
self.run()
KeyboardInterrupt
KeyboardInterrupt
File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
self.run()
KeyboardInterrupt
File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
self.run()
File "/mapbar/data/home/acgtyrant/Projects/drn/.env/lib/python3.5/site-packages/torch/utils/data/dataloader.py", line 52, in _worker_loop
r = index_queue.get()
File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.5/multiprocessing/queues.py", line 343, in get
res = self._reader.recv_bytes()
File "/usr/lib/python3.5/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/usr/lib/python3.5/multiprocessing/connection.py", line 407, in _recv_bytes
buf = self._recv(4)
File "/usr/lib/python3.5/multiprocessing/connection.py", line 379, in _recv
chunk = read(handle, remaining)
KeyboardInterrupt
KeyboardInterrupt
KeyboardInterrupt
File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/mapbar/data/home/acgtyrant/Projects/drn/.env/lib/python3.5/site-packages/torch/utils/data/dataloader.py", line 52, in _worker_loop
r = index_queue.get()
File "/usr/lib/python3.5/multiprocessing/queues.py", line 343, in get
res = self._reader.recv_bytes()
File "/usr/lib/python3.5/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/usr/lib/python3.5/multiprocessing/connection.py", line 407, in _recv_bytes
buf = self._recv(4)
File "/usr/lib/python3.5/multiprocessing/connection.py", line 379, in _recv
chunk = read(handle, remaining)
KeyboardInterrupt
File "/mapbar/data/home/acgtyrant/Projects/drn/.env/lib/python3.5/site-packages/torch/utils/data/dataloader.py", line 52, in _worker_loop
r = index_queue.get()
File "/usr/lib/python3.5/multiprocessing/queues.py", line 343, in get
res = self._reader.recv_bytes()
File "/usr/lib/python3.5/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/usr/lib/python3.5/multiprocessing/connection.py", line 407, in _recv_bytes
buf = self._recv(4)
File "/usr/lib/python3.5/multiprocessing/connection.py", line 379, in _recv
chunk = read(handle, remaining)
KeyboardInterrupt
It seems that the DataLoader stucks in multiprocessing, any idea? Thank you.