Trying the `mpi` backend is probably a really bad idea; see my error:
$ python playground/multiprocessing_playground/ddp_basic_example.py
starting __main__
running main()
current process: <_MainProcess name='MainProcess' parent=None started>
pid: 4060
world_size=1
/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py:404: UserWarning: For MPI backend, world_size (1) and rank (0) are ignored since they are assigned by the MPI runtime.
warnings.warn(
Traceback (most recent call last):
File "playground/multiprocessing_playground/ddp_basic_example.py", line 153, in <module>
main()
File "playground/multiprocessing_playground/ddp_basic_example.py", line 148, in main
mp.spawn(run_parallel_training_loop, args=(world_size,), nprocs=world_size)
File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 199, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 157, in start_processes
while not context.join():
File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 118, in join
raise Exception(msg)
Exception:
-- Process 0 terminated with the following error:
Traceback (most recent call last):
File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 19, in _wrap
fn(i, *args)
File "/home/miranda9/ML4Coq/playground/multiprocessing_playground/ddp_basic_example.py", line 107, in run_parallel_training_loop
setup_process(rank, world_size)
File "/home/miranda9/ML4Coq/playground/multiprocessing_playground/ddp_basic_example.py", line 58, in setup_process
dist.init_process_group(backend, rank=rank, world_size=world_size)
File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 409, in init_process_group
_default_pg = _new_process_group_helper(
File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 482, in _new_process_group_helper
raise RuntimeError(
RuntimeError: Distributed package doesn't have MPI built in. MPI is only included if you build PyTorch from source on a host that has MPI installed.
I also tried both the `gloo` and `nccl` backends; both fail, but with a different error, e.g.:
$ python playground/multiprocessing_playground/ddp_basic_example.py
starting __main__
running main()
current process: <_MainProcess name='MainProcess' parent=None started>
pid: 4175
world_size=1
Start running DDP with model parallel example on rank: 0.
current process: <SpawnProcess name='SpawnProcess-1' parent=4175 started>
pid: 4198
End running DDP with model parallel example on rank: 0.
End current process: <SpawnProcess name='SpawnProcess-1' parent=4175 started>
End pid: 4198
*** Error in `/home/miranda9/miniconda3/envs/automl-meta-learning/bin/python': free(): invalid size: 0x000055b4ffbb8800 ***
======= Backtrace: =========
/lib64/libc.so.6(+0x81299)[0x2ac3ca566299]
/usr/local/cuda/lib64/libcublasLt.so.11(free_gemm_select+0x4d)[0x2ac3ee03de7d]
/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/lib/../../../../libcublas.so.11(cublasDestroy_v2+0x165)[0x2ac3e7929af5]
/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/lib/libtorch_cuda.so(+0xc13a3d)[0x2ac3ff678a3d]
/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/lib/libtorch_cuda.so(+0xc13b41)[0x2ac3ff678b41]
/lib64/libc.so.6(+0x39ce9)[0x2ac3ca51ece9]
/lib64/libc.so.6(+0x39d37)[0x2ac3ca51ed37]
/home/miranda9/miniconda3/envs/automl-meta-learning/bin/python(+0x25fe29)[0x55b4ab5b4e29]
/home/miranda9/miniconda3/envs/automl-meta-learning/bin/python(+0x25fe5d)[0x55b4ab5b4e5d]
/home/miranda9/miniconda3/envs/automl-meta-learning/bin/python(+0x25feb4)[0x55b4ab5b4eb4]
/home/miranda9/miniconda3/envs/automl-meta-learning/bin/python(PyRun_SimpleStringFlags+0x66)[0x55b4ab5b7d66]
/home/miranda9/miniconda3/envs/automl-meta-learning/bin/python(Py_RunMain+0x165)[0x55b4ab5b7ed5]
/home/miranda9/miniconda3/envs/automl-meta-learning/bin/python(Py_BytesMain+0x39)[0x55b4ab5b82d9]
/lib64/libc.so.6(__libc_start_main+0xf5)[0x2ac3ca507555]
/home/miranda9/miniconda3/envs/automl-meta-learning/bin/python(+0x203493)[0x55b4ab558493]
======= Memory map: ========
200000000-200200000 ---p 00000000 00:00 0
200200000-200400000 rw-s 00000000 00:05 2897 /dev/nvidiactl
200400000-202400000 rw-s 00000000 00:05 2897 /dev/nvidiactl
202400000-205400000 rw-s 00000000 00:05 2897 /dev/nvidiactl
...
0000 r--p 00534000 00:31 9522865797 /home/miranda9/miniconda3/envs/automl-meta-learning/lib/libmkl_rt.so
2ac46c6e0000-2ac46c6e2000 rw-p 0053a000 00:31 9522865797 /home/miranda9/miniconda3/envs/automl-meta-learning/lib/libmkl_rt.so
2ac46c6e2000-2ac46c6f6000 rw-p 00000000 00:00 0
2ac46c6f6000-2ac46c6fc000 r--p 00000000 00:31 17557268491 /home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/mkl/_py_mkl_service.cpython-38-x86_64-linux-gnu.so
2ac46c6fc000-2ac46c712000 r-xp 00006000 00:31 17557268491 /home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/mkl/_py_mkl_service.cpython-38-x86_64-linux-gnu.so
2ac46c712000-2ac46c715000 r--p 0001c000 00:31 17557268491 /home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/mkl/_py_mkl_service.cpython-38-x86_64-linux-gnu.so
2ac46c715000-2ac46c716000 ---p 0001f000 00:31 17557268491 /home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/mkl/_py_mkl_service.cpython-38-x86_64-linux-gnu.so
2ac46c716000-2ac46c717000 r--p 0001f000 00:31 17557268491 /home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/mkl/_py_mkl_service.cpython-38-x86_64-linux-gnu.so
2ac46c717000-2ac46c71a000 rw-p 00020000 00:31 17557268491 /home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/mkl/_py_mkl_service.cpython-38-x86_64-linux-gnu.so
2ac46c71a000-2ac46c71b000 rw-p 00000000 00:00 0
2ac46c71b000-2ac46c745000 r--p 00000000 00:31 25797916 /home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/numpy/core/_multiarray_umath.cpython-38-x86_64-linux-gnu.so
2ac46c745000-2ac46c9b7000 r-xp 0002a000 00:31 25797916 /home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/numpy/core/_multiarray_umath.cpython-38-x86_64-linux-gnu.so
2ac46c9b7000-2ac46ca3f000 r--p 0029c000 00:31 25797916 /home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/numpy/core/_multiarray_umath.cpython-38-x86_64-linux-gnu.so
2ac46ca3f000-2ac46ca42000 r--p 00323000 00:31 25797916 /home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/numpy/core/_multiarray_umath.cpython-38-x86_64-linux-gnu.so
2ac46ca42000-2ac46ca5e000 rw-p 00326000 00:31 25797916 /home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/numpy/core/_multiarray_umath.cpython-38-x86_64-linux-gnu.so
2ac46ca5e000-2ac46cabf000 rw-p 00000000 00:00 0
2ac46cabf000-2ac46cac4000 r--p 00000000 00:31 30757369024 /home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/lib-dynload/_datetime.cpython-38-x86_64-linux-gnu.so
...
0000 00:05 2897 /dev/nvidiactl
2ac46cc64000-2ac46cc65000 rw-s 00000000 00:05 2897 /dev/nvidiactl
2ac46cc65000-2ac46cc66000 rw-s 00000000 00:05 2897 /dev/nvidiactl
2ac46cc6f000-2ac46cc78000 r--p 00000000 00:31 25797914 /home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/numpy/core/_multiarray_tests.cpython-38-x86_64-linux-gnu.so
2ac46cc78000-2ac46cc8b000 r-xp 00009000 00:31 25797914 /home/miranda9/miniconda3/envs/automl-meta-learnin
Traceback (most recent call last):
File "playground/multiprocessing_playground/ddp_basic_example.py", line 153, in <module>
main()
File "playground/multiprocessing_playground/ddp_basic_example.py", line 148, in main
mp.spawn(run_parallel_training_loop, args=(world_size,), nprocs=world_size)
File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 199, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 157, in start_processes
while not context.join():
File "/home/miranda9/miniconda3/envs/automl-meta-learning/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 105, in join
raise Exception(
Exception: process 0 terminated with signal SIGABRT