mp.spawn breaks testing?

I am training Pointcept, a well-known repo, with one of their examples:

The model runs and trains on 4 A100 GPUs, but when the evaluation starts there is a weird bug (full log below).
This is hard to debug because running on a single GPU gives an out-of-memory error. Evaluation happens inside the same distributed process, so it's not clear why it's breaking.
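For context, the call chain that crashes is roughly the following (a simplified sketch reconstructed from the traceback below, not Pointcept's actual code): launch() calls mp.spawn, each spawned worker builds a trainer and trains, and an evaluation hook iterates the validation DataLoader at the end of every epoch.

import torch.multiprocessing as mp

def _distributed_worker(local_rank, main_func, cfg):
    # each spawned process sets up the distributed backend and runs the worker function
    main_func(cfg)

def main_worker(cfg):
    trainer = Trainer(cfg)  # assumed name; builds the model, train_loader and val_loader
    trainer.train()         # after every epoch a hook calls eval(), which iterates
                            # trainer.val_loader -- that is where the crash below happens

def launch(main_func, num_gpus, cfg):
    # this mp.spawn call is the one that ends up raising ProcessRaisedException
    mp.spawn(_distributed_worker, nprocs=num_gpus, args=(main_func, cfg))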


[2023-10-25 17:48:30,221 INFO evaluator.py line 112 31130] >>>>>>>>>>>>>>>> Start Evaluation >>>>>>>>>>>>>>>>
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/multiprocessing/__init__.py", line 16, in <module>
from . import context
ImportError: cannot import name 'context' from partially initialized module 'multiprocessing' (most likely due to a circular import) (/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/multiprocessing/__init__.py)
Exception in thread Thread-3:
Traceback (most recent call last):
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/threading.py", line 932, in _bootstrap_inner
self.run()
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/site-packages/tensorboardX/event_file_writer.py", line 202, in run
data = self._queue.get(True, queue_wait_duration)
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/multiprocessing/queues.py", line 111, in get
res = self._recv_bytes()
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/multiprocessing/connection.py", line 414, in _recv_bytes
buf = self._recv(4)
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/multiprocessing/connection.py", line 383, in _recv
raise EOFError
EOFError
wandb: Waiting for W&B process to finish… (failed 1). Press Control-C to abort syncing.
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main
exitcode = _main(fd, parent_sentinel)
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/multiprocessing/spawn.py", line 125, in _main
prepare(preparation_data)
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/multiprocessing/spawn.py", line 236, in prepare
_fixup_main_from_path(data['init_main_from_path'])
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/multiprocessing/spawn.py", line 287, in _fixup_main_from_path
main_content = runpy.run_path(main_path,
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/runpy.py", line 265, in run_path
return _run_module_code(code, init_globals, run_name,
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/runpy.py", line 97, in _run_module_code
_run_code(code, mod_globals, init_globals,
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/afs/cern.ch/work/m/mgarciam/private/UD_pointcept/exp/s3dis/semseg-pt-v2m2-0-base_a2/code/tools/train.py", line 8, in <module>
from pointcept.engines.defaults import (
File "/afs/cern.ch/work/m/mgarciam/private/UD_pointcept/exp/s3dis/semseg-pt-v2m2-0-base_a2/code/pointcept/engines/defaults.py", line 14, in <module>
from torch.nn.parallel import DistributedDataParallel
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/site-packages/torch/__init__.py", line 16, in <module>
import ctypes
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/ctypes/__init__.py", line 545, in <module>
from ctypes._endian import BigEndianStructure, LittleEndianStructure
ModuleNotFoundError: No module named 'ctypes._endian'
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main
exitcode = _main(fd, parent_sentinel)
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/multiprocessing/spawn.py", line 125, in _main
prepare(preparation_data)
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/multiprocessing/spawn.py", line 236, in prepare
_fixup_main_from_path(data['init_main_from_path'])
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/multiprocessing/spawn.py", line 287, in _fixup_main_from_path
main_content = runpy.run_path(main_path,
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/runpy.py", line 265, in run_path
return _run_module_code(code, init_globals, run_name,
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/runpy.py", line 97, in _run_module_code
_run_code(code, mod_globals, init_globals,
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/afs/cern.ch/work/m/mgarciam/private/UD_pointcept/exp/s3dis/semseg-pt-v2m2-0-base_a2/code/tools/train.py", line 8, in <module>
from pointcept.engines.defaults import (
File "/afs/cern.ch/work/m/mgarciam/private/UD_pointcept/exp/s3dis/semseg-pt-v2m2-0-base_a2/code/pointcept/engines/defaults.py", line 14, in <module>
from torch.nn.parallel import DistributedDataParallel
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/site-packages/torch/__init__.py", line 811, in <module>
from .functional import * # noqa: F403
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/site-packages/torch/functional.py", line 7, in <module>
import torch.nn.functional as F
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/site-packages/torch/nn/__init__.py", line 3, in <module>
from .parallel import DataParallel
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/site-packages/torch/nn/parallel/__init__.py", line 1, in <module>
from .parallel_apply import parallel_apply
ModuleNotFoundError: No module named 'torch.nn.parallel.parallel_apply'
Traceback (most recent call last):
File "exp/s3dis/semseg-pt-v2m2-0-base_a2/code/tools/train.py", line 39, in <module>
main()
File "exp/s3dis/semseg-pt-v2m2-0-base_a2/code/tools/train.py", line 27, in main
launch(
File "/afs/cern.ch/work/m/mgarciam/private/UD_pointcept/exp/s3dis/semseg-pt-v2m2-0-base_a2/code/pointcept/engines/launch.py", line 75, in launch
mp.spawn(
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 240, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 198, in start_processes
while not context.join():
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 160, in join
raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:

-- Process 1 terminated with the following error:
Traceback (most recent call last):
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
fn(i, *args)
File "/afs/cern.ch/work/m/mgarciam/private/UD_pointcept/exp/s3dis/semseg-pt-v2m2-0-base_a2/code/pointcept/engines/launch.py", line 145, in _distributed_worker
main_func(*cfg)
File "/afs/cern.ch/work/m/mgarciam/private/UD_pointcept/exp/s3dis/semseg-pt-v2m2-0-base_a2/code/tools/train.py", line 20, in main_worker
trainer.train()
File "/afs/cern.ch/work/m/mgarciam/private/UD_pointcept/exp/s3dis/semseg-pt-v2m2-0-base_a2/code/pointcept/engines/train.py", line 168, in train
self.after_epoch()
File "/afs/cern.ch/work/m/mgarciam/private/UD_pointcept/exp/s3dis/semseg-pt-v2m2-0-base_a2/code/pointcept/engines/train.py", line 101, in after_epoch
h.after_epoch()
File "/afs/cern.ch/work/m/mgarciam/private/UD_pointcept/exp/s3dis/semseg-pt-v2m2-0-base_a2/code/pointcept/engines/hooks/evaluator.py", line 109, in after_epoch
self.eval()
File "/afs/cern.ch/work/m/mgarciam/private/UD_pointcept/exp/s3dis/semseg-pt-v2m2-0-base_a2/code/pointcept/engines/hooks/evaluator.py", line 114, in eval
for i, input_dict in enumerate(self.trainer.val_loader):
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 681, in __next__
data = self._next_data()
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1348, in _next_data
self._shutdown_workers()
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1474, in _shutdown_workers
w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/multiprocessing/process.py", line 149, in join
res = self._popen.wait(timeout)
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/multiprocessing/popen_fork.py", line 44, in wait
if not wait([self.sentinel], timeout):
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/multiprocessing/connection.py", line 931, in wait
ready = selector.select(timeout)
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/selectors.py", line 415, in select
fd_event_list = self._selector.poll(timeout)
File "/eos/user/m/mgarciam/envs/pointcept1/lib/python3.8/site-packages/torch/utils/data/_utils/signal_handling.py", line 66, in handler
_error_if_any_worker_fails()
RuntimeError: DataLoader worker (pid 32636) exited unexpectedly with exit code 1. Details are lost due to multiprocessing. Rerunning with num_workers=0 may give better error trace.
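
The last RuntimeError suggests rerunning with num_workers=0 to get a better trace, so the next thing I will try is building the validation loader without worker processes (a minimal sketch, not Pointcept's actual code; val_dataset and collate_fn stand in for whatever the repo builds):

from torch.utils.data import DataLoader

# with num_workers=0 the batches are loaded in the main process, so any error in the
# dataset or transforms shows up directly instead of "DataLoader worker exited unexpectedly"
val_loader = DataLoader(
    val_dataset,            # assumed: the same validation dataset the trainer uses
    batch_size=1,
    shuffle=False,
    num_workers=0,          # no spawned worker processes
    collate_fn=collate_fn,  # assumed: Pointcept's point-cloud collate function
)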