Hi @ptrblck, sometimes the processes keep running even after I get the segmentation fault; that is why I had to stop them manually for the previous output.
However, the `bt` output below was captured when the segfault appeared and I did not interrupt the processes:
I would really appreciate it if you could check it out and let me know your thoughts.
(gdb) bt
#0 0x00007f4c03e84d1f in __GI___select (nfds=0, readfds=0x0, writefds=0x0, exceptfds=0x0, timeout=0x7ffee3ed8c90) at ../sysdeps/unix/sysv/linux/select.c:41
#1 0x00000000005b97c4 in pysleep (secs=<optimized out>) at ../Modules/timemodule.c:1467
#2 time_sleep () at ../Modules/timemodule.c:235
#3 0x00000000005075bc in _PyCFunction_FastCallDict (kwargs=<optimized out>, nargs=<optimized out>, args=0x4c63070, func_obj=<built-in method sleep of module object at remote 0x7f4c02ab8ef8>) at ../Objects/methodobject.c:209
#4 _PyCFunction_FastCallKeywords (kwnames=<optimized out>, nargs=<optimized out>, stack=<optimized out>, func=<optimized out>) at ../Objects/methodobject.c:294
#5 call_function.lto_priv () at ../Python/ceval.c:4851
#6 0x00000000005092a4 in _PyEval_EvalFrameDefault () at ../Python/ceval.c:3335
#7 0x0000000000504fd4 in PyEval_EvalFrameEx (throwflag=0,
f=Frame 0x4c62ea8, for file /usr/local/lib/python3.6/dist-packages/torch/distributed/elastic/agent/server/api.py, line 843, in _invoke_run (self=<LocalElasticAgent(_worker_group=<WorkerGroup at remote 0x7f4b32fd28e0>, _remaining_restarts=0, _store=<torch._C._distributed_c10d.PrefixStore at remote 0x7f4b32f82688>, _exit_barrier_timeout=300, _total_execution_time=0, _start_method='spawn', _pcontext=<SubprocessContext(name='default', entrypoint='/usr/bin/python3', args={0: ('-u', 'prostateai/gleasonai/model_2/main.py'), 1: ('-u', 'prostateai/gleasonai/model_2/main.py')}, envs={0: {'LOCAL_RANK': '0', 'RANK': '0', 'GROUP_RANK': '0', 'ROLE_RANK': '0', 'ROLE_NAME': 'default', 'LOCAL_WORLD_SIZE': '2', 'WORLD_SIZE': '2', 'GROUP_WORLD_SIZE': '1', 'ROLE_WORLD_SIZE': '2', 'MASTER_ADDR': '127.0.0.1', 'MASTER_PORT': '29500', 'TORCHELASTIC_RESTART_COUNT': '0', 'TORCHELASTIC_MAX_RESTARTS': '0', 'TORCHELASTIC_RUN_ID': 'none', 'TORCHELASTIC_USE_AGENT_STORE': 'True', 'NCCL_ASYNC_ERROR_HANDLING': '1', 'OMP_NUM_THREADS': '1', 'TORC...(truncated)) at ../Python/ceval.c:754
#8 _PyEval_EvalCodeWithName.lto_priv.1836 () at ../Python/ceval.c:4166
#9 0x0000000000506d00 in fast_function.lto_priv () at ../Python/ceval.c:4992
#10 0x00000000005076ed in call_function.lto_priv () at ../Python/ceval.c:4872
#11 0x00000000005092a4 in _PyEval_EvalFrameDefault () at ../Python/ceval.c:3335
#12 0x0000000000504fd4 in PyEval_EvalFrameEx (throwflag=0,
f=Frame 0x4c3d568, for file /usr/local/lib/python3.6/dist-packages/torch/distributed/elastic/agent/server/api.py, line 709, in run (self=<LocalElasticAgent(_worker_group=<WorkerGroup at remote 0x7f4b32fd28e0>, _remaining_restarts=0, _store=<torch._C._distributed_c10d.PrefixStore at remote 0x7f4b32f82688>, _exit_barrier_timeout=300, _total_execution_time=0, _start_method='spawn', _pcontext=<SubprocessContext(name='default', entrypoint='/usr/bin/python3', args={0: ('-u', 'prostateai/gleasonai/model_2/main.py'), 1: ('-u', 'prostateai/gleasonai/model_2/main.py')}, envs={0: {'LOCAL_RANK': '0', 'RANK': '0', 'GROUP_RANK': '0', 'ROLE_RANK': '0', 'ROLE_NAME': 'default', 'LOCAL_WORLD_SIZE': '2', 'WORLD_SIZE': '2', 'GROUP_WORLD_SIZE': '1', 'ROLE_WORLD_SIZE': '2', 'MASTER_ADDR': '127.0.0.1', 'MASTER_PORT': '29500', 'TORCHELASTIC_RESTART_COUNT': '0', 'TORCHELASTIC_MAX_RESTARTS': '0', 'TORCHELASTIC_RUN_ID': 'none', 'TORCHELASTIC_USE_AGENT_STORE': 'True', 'NCCL_ASYNC_ERROR_HANDLING': '1', 'OMP_NUM_THREADS': '1', 'TORCHELASTIC...(truncated)) at ../Python/ceval.c:754
#13 _PyEval_EvalCodeWithName.lto_priv.1836 () at ../Python/ceval.c:4166
#14 0x0000000000586641 in PyEval_EvalCodeEx (closure=<optimized out>, kwdefs=<optimized out>, defcount=1, defs=0x7f4b32fc5958, kwcount=0, kws=0x7f4c0418c060, argcount=<optimized out>, args=0x7f4ac72c1760, locals=0x0, globals=<optimized out>, _co=<optimized out>) at ../Python/ceval.c:4187
#15 function_call.lto_priv () at ../Objects/funcobject.c:604
#16 0x000000000059d65e in PyObject_Call () at ../Objects/abstract.c:2261
#17 0x000000000050a752 in do_call_core (kwdict={},
callargs=(<LocalElasticAgent(_worker_group=<WorkerGroup at remote 0x7f4b32fd28e0>, _remaining_restarts=0, _store=<torch._C._distributed_c10d.PrefixStore at remote 0x7f4b32f82688>, _exit_barrier_timeout=300, _total_execution_time=0, _start_method='spawn', _pcontext=<SubprocessContext(name='default', entrypoint='/usr/bin/python3', args={0: ('-u', 'prostateai/gleasonai/model_2/main.py'), 1: ('-u', 'prostateai/gleasonai/model_2/main.py')}, envs={0: {'LOCAL_RANK': '0', 'RANK': '0', 'GROUP_RANK': '0', 'ROLE_RANK': '0', 'ROLE_NAME': 'default', 'LOCAL_WORLD_SIZE': '2', 'WORLD_SIZE': '2', 'GROUP_WORLD_SIZE': '1', 'ROLE_WORLD_SIZE': '2', 'MASTER_ADDR': '127.0.0.1', 'MASTER_PORT': '29500', 'TORCHELASTIC_RESTART_COUNT': '0', 'TORCHELASTIC_MAX_RESTARTS': '0', 'TORCHELASTIC_RUN_ID': 'none', 'TORCHELASTIC_USE_AGENT_STORE': 'True', 'NCCL_ASYNC_ERROR_HANDLING': '1', 'OMP_NUM_THREADS': '1', 'TORCHELASTIC_ERROR_FILE': '/tmp/torchelastic_sd2v1fbe/none_k3qyzzyt/attempt_0/0/error.json'}, 1: {'LOCAL_RANK': '1', 'RANK': '1', 'GROUP_RANK': '0...(truncated), func=<function at remote 0x7f4b32f69d08>) at ../Python/ceval.c:5120
#18 _PyEval_EvalFrameDefault () at ../Python/ceval.c:3404
#19 0x0000000000504fd4 in PyEval_EvalFrameEx (throwflag=0,
f=Frame 0x4bcd358, for file /usr/local/lib/python3.6/dist-packages/torch/distributed/elastic/metrics/api.py, line 125, in wrapper (args=(<LocalElasticAgent(_worker_group=<WorkerGroup at remote 0x7f4b32fd28e0>, _remaining_restarts=0, _store=<torch._C._distributed_c10d.PrefixStore at remote 0x7f4b32f82688>, _exit_barrier_timeout=300, _total_execution_time=0, _start_method='spawn', _pcontext=<SubprocessContext(name='default', entrypoint='/usr/bin/python3', args={0: ('-u', 'prostateai/gleasonai/model_2/main.py'), 1: ('-u', 'prostateai/gleasonai/model_2/main.py')}, envs={0: {'LOCAL_RANK': '0', 'RANK': '0', 'GROUP_RANK': '0', 'ROLE_RANK': '0', 'ROLE_NAME': 'default', 'LOCAL_WORLD_SIZE': '2', 'WORLD_SIZE': '2', 'GROUP_WORLD_SIZE': '1', 'ROLE_WORLD_SIZE': '2', 'MASTER_ADDR': '127.0.0.1', 'MASTER_PORT': '29500', 'TORCHELASTIC_RESTART_COUNT': '0', 'TORCHELASTIC_MAX_RESTARTS': '0', 'TORCHELASTIC_RUN_ID': 'none', 'TORCHELASTIC_USE---Type <return> to continue, or q <return> to quit---
_AGENT_STORE': 'True', 'NCCL_ASYNC_ERROR_HANDLING': '1', 'OMP_NUM_THREADS': '1', 'TORCHELASTIC...(truncated)) at ../Python/ceval.c:754
#20 _PyEval_EvalCodeWithName.lto_priv.1836 () at ../Python/ceval.c:4166
#21 0x0000000000506d00 in fast_function.lto_priv () at ../Python/ceval.c:4992
#22 0x00000000005076ed in call_function.lto_priv () at ../Python/ceval.c:4872
#23 0x00000000005092a4 in _PyEval_EvalFrameDefault () at ../Python/ceval.c:3335
#24 0x00000000005069c8 in PyEval_EvalFrameEx (throwflag=0,
f=Frame 0x4c49e28, for file /usr/local/lib/python3.6/dist-packages/torch/distributed/launcher/api.py, line 252, in launch_agent (config=<LaunchConfig(min_nodes=1, max_nodes=1, nproc_per_node=2, run_id='none', role='default', rdzv_endpoint='127.0.0.1:29500', rdzv_backend='static', rdzv_configs={'rank': 0, 'timeout': 900}, rdzv_timeout=-1, max_restarts=0, monitor_interval=5, start_method='spawn', log_dir=None, redirects=<Std(_value_=0, _name_='NONE', __objclass__=<EnumMeta(_generate_next_value_=<function at remote 0x7f4c02d67488>, __module__='torch.distributed.elastic.multiprocessing.api', from_str=<classmethod at remote 0x7f4b33051eb8>, __doc__='An enumeration.', _member_names_=['NONE', 'OUT', 'ERR', 'ALL'], _member_map_={'NONE': <...>, 'OUT': <Std(_value_=1, _name_='OUT', __objclass__=<...>) at remote 0x7f4b32fd0bc8>, 'ERR': <Std(_value_=2, _name_='ERR', __objclass__=<...>) at remote 0x7f4b32fd0c08>, 'ALL': <Std(_value_=3, _name_='ALL', __objclass__=<...>) at remote 0x7f4b32fd0c48>}, _member_type_=<type at remo...(truncated)) at ../Python/ceval.c:754
#25 _PyFunction_FastCall (globals=<optimized out>, nargs=79994408, args=<optimized out>, co=<optimized out>) at ../Python/ceval.c:4933
#26 fast_function.lto_priv () at ../Python/ceval.c:4968
#27 0x00000000005076ed in call_function.lto_priv () at ../Python/ceval.c:4872
#28 0x00000000005092a4 in _PyEval_EvalFrameDefault () at ../Python/ceval.c:3335
#29 0x0000000000504fd4 in PyEval_EvalFrameEx (throwflag=0,
f=Frame 0x7f4b32fe15b8, for file /usr/local/lib/python3.6/dist-packages/torch/distributed/launcher/api.py, line 131, in __call__ (self=<elastic_launch(_config=<LaunchConfig(min_nodes=1, max_nodes=1, nproc_per_node=2, run_id='none', role='default', rdzv_endpoint='127.0.0.1:29500', rdzv_backend='static', rdzv_configs={'rank': 0, 'timeout': 900}, rdzv_timeout=-1, max_restarts=0, monitor_interval=5, start_method='spawn', log_dir=None, redirects=<Std(_value_=0, _name_='NONE', __objclass__=<EnumMeta(_generate_next_value_=<function at remote 0x7f4c02d67488>, __module__='torch.distributed.elastic.multiprocessing.api', from_str=<classmethod at remote 0x7f4b33051eb8>, __doc__='An enumeration.', _member_names_=['NONE', 'OUT', 'ERR', 'ALL'], _member_map_={'NONE': <...>, 'OUT': <Std(_value_=1, _name_='OUT', __objclass__=<...>) at remote 0x7f4b32fd0bc8>, 'ERR': <Std(_value_=2, _name_='ERR', __objclass__=<...>) at remote 0x7f4b32fd0c08>, 'ALL': <Std(_value_=3, _name_='ALL', __objclass__=<...>) at remote 0x7f4b32fd0c48>}, _mem...(truncated)) at ../Python/ceval.c:754
#30 _PyEval_EvalCodeWithName.lto_priv.1836 () at ../Python/ceval.c:4166
#31 0x00000000005062b2 in _PyFunction_FastCallDict () at ../Python/ceval.c:5075
#32 0x0000000000592461 in _PyObject_FastCallDict (kwargs=0x0, nargs=3, args=0x7ffee3ed9b30, func=<function at remote 0x7f4b32f7a0d0>) at ../Objects/abstract.c:2310
#33 _PyObject_Call_Prepend (kwargs=0x0, args=<optimized out>, obj=<optimized out>, func=<function at remote 0x7f4b32f7a0d0>) at ../Objects/abstract.c:2373
#34 method_call.lto_priv () at ../Objects/classobject.c:314
#35 0x00000000005479ef in PyObject_Call (kwargs=0x0, args=('-u', 'prostateai/gleasonai/model_2/main.py'), func=<method at remote 0x7f4c019d12c8>) at ../Objects/abstract.c:2261
#36 slot_tp_call () at ../Objects/typeobject.c:6207
#37 0x000000000059d65e in PyObject_Call () at ../Objects/abstract.c:2261
#38 0x000000000050a752 in do_call_core (kwdict=0x0, callargs=('-u', 'prostateai/gleasonai/model_2/main.py'),
func=<elastic_launch(_config=<LaunchConfig(min_nodes=1, max_nodes=1, nproc_per_node=2, run_id='none', role='default', rdzv_endpoint='127.0.0.1:29500', rdzv_backend='static', rdzv_configs={'rank': 0, 'timeout': 900}, rdzv_timeout=-1, max_restarts=0, monitor_interval=5, start_method='spawn', log_dir=None, redirects=<Std(_value_=0, _name_='NONE', __objclass__=<EnumMeta(_generate_next_value_=<function at remote 0x7f4c02d67488>, __module__='torch.distributed.elastic.multiprocessing.api', from_str=<classmethod at remote 0x7f4b33051eb8>, __doc__='An enumeration.', _member_names_=['NONE', 'OUT', 'ERR', 'ALL'], _member_map_={'NONE': <...>, 'OUT': <Std(_value_=1, _name_='OUT', __objclass__=<...>) at remote 0x7f4b32fd0bc8>, 'ERR': <Std(_value_=2, _name_='ERR', __objclass__=<...>) at remote 0x7f4b32fd0c08>, 'ALL': <Std(_value_=3, _name_='ALL', __objclass__=<...>) at remote 0x7f4b32fd0c48>}, _member_type_=<type at remote 0x9cd180>, _value2member_map_={0: <...>, 1: <...>, 2: <...>, 3: <...>}, NONE=<...>, OUT=<...>, ERR=<...>, A...(truncated)) at ../Python/ceval.c:5120
#39 _PyEval_EvalFrameDefault () at ../Python/ceval.c:3404
#40 0x00000000005069c8 in PyEval_EvalFrameEx (throwflag=0,
f=Frame 0x7f4b32f7f048, for file /usr/local/lib/python3.6/dist-packages/torch/distributed/run.py, line 713, in run (args=<Namespace(nnodes='1', nproc_per_node='2', rdzv_backend='static', rdzv_endpoint='', rdzv_id='none', rdzv_conf='', standalone=False, max_restarts=0, monitor_interval=5, start_method='spawn---Type <return> to continue, or q <return> to quit---
', role='default', module=False, no_python=False, run_path=False, log_dir=None, redirects='0', tee='0', node_rank=0, master_addr='127.0.0.1', master_port=29500, training_script='prostateai/gleasonai/model_2/main.py', training_script_args=[]) at remote 0x7f4b32f82470>, config=<LaunchConfig(min_nodes=1, max_nodes=1, nproc_per_node=2, run_id='none', role='default', rdzv_endpoint='127.0.0.1:29500', rdzv_backend='static', rdzv_configs={'rank': 0, 'timeout': 900}, rdzv_timeout=-1, max_restarts=0, monitor_interval=5, start_method='spawn', log_dir=None, redirects=<Std(_value_=0, _name_='NONE', __objclass__=<EnumMeta(_generate_next_value_=<function at remote 0x7f4c02d67488>, __module__='torch.distributed.elastic.multi...(truncated)) at ../Python/ceval.c:754
#41 _PyFunction_FastCall (globals=<optimized out>, nargs=139960954384456, args=<optimized out>, co=<optimized out>) at ../Python/ceval.c:4933
#42 fast_function.lto_priv () at ../Python/ceval.c:4968
#43 0x00000000005076ed in call_function.lto_priv () at ../Python/ceval.c:4872
#44 0x00000000005092a4 in _PyEval_EvalFrameDefault () at ../Python/ceval.c:3335
#45 0x0000000000504fd4 in PyEval_EvalFrameEx (throwflag=0,
f=Frame 0x7f4b3301ea48, for file /usr/local/lib/python3.6/dist-packages/torch/distributed/run.py, line 719, in main (args=<Namespace(nnodes='1', nproc_per_node='2', rdzv_backend='static', rdzv_endpoint='', rdzv_id='none', rdzv_conf='', standalone=False, max_restarts=0, monitor_interval=5, start_method='spawn', role='default', module=False, no_python=False, run_path=False, log_dir=None, redirects='0', tee='0', node_rank=0, master_addr='127.0.0.1', master_port=29500, training_script='prostateai/gleasonai/model_2/main.py', training_script_args=[]) at remote 0x7f4b32f82470>)) at ../Python/ceval.c:754
#46 _PyEval_EvalCodeWithName.lto_priv.1836 () at ../Python/ceval.c:4166
#47 0x0000000000586641 in PyEval_EvalCodeEx (closure=<optimized out>, kwdefs=<optimized out>, defcount=1, defs=0x7f4c018fdae0, kwcount=0, kws=0x7f4c0418c060, argcount=<optimized out>, args=0x7f4c0418c060, locals=0x0, globals=<optimized out>, _co=<optimized out>) at ../Python/ceval.c:4187
#48 function_call.lto_priv () at ../Objects/funcobject.c:604
#49 0x000000000059d65e in PyObject_Call () at ../Objects/abstract.c:2261
#50 0x000000000050a752 in do_call_core (kwdict={}, callargs=(), func=<function at remote 0x7f4b32f7a7b8>) at ../Python/ceval.c:5120
#51 _PyEval_EvalFrameDefault () at ../Python/ceval.c:3404
#52 0x0000000000504fd4 in PyEval_EvalFrameEx (throwflag=0, f=Frame 0x4710f18, for file /usr/local/lib/python3.6/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py, line 345, in wrapper (args=(), kwargs={})) at ../Python/ceval.c:754
#53 _PyEval_EvalCodeWithName.lto_priv.1836 () at ../Python/ceval.c:4166
#54 0x0000000000506d00 in fast_function.lto_priv () at ../Python/ceval.c:4992
#55 0x00000000005076ed in call_function.lto_priv () at ../Python/ceval.c:4872
#56 0x00000000005092a4 in _PyEval_EvalFrameDefault () at ../Python/ceval.c:3335
#57 0x0000000000504fd4 in PyEval_EvalFrameEx (throwflag=0, f=Frame 0xd7bcc8, for file /usr/local/bin/torchrun, line 33, in <module> ()) at ../Python/ceval.c:754
#58 _PyEval_EvalCodeWithName.lto_priv.1836 () at ../Python/ceval.c:4166
#59 0x0000000000508103 in PyEval_EvalCodeEx (closure=0x0, kwdefs=0x0, defcount=0, defs=0x0, kwcount=0, kws=0x0, argcount=0, args=0x0, locals=<optimized out>, globals=<optimized out>, _co=<optimized out>) at ../Python/ceval.c:4187
#60 PyEval_EvalCode (co=<optimized out>, globals=<optimized out>, locals=<optimized out>) at ../Python/ceval.c:731
#61 0x0000000000634c32 in run_mod () at ../Python/pythonrun.c:1025
#62 0x0000000000634ce7 in PyRun_FileExFlags () at ../Python/pythonrun.c:978
#63 0x000000000063849f in PyRun_SimpleFileExFlags () at ../Python/pythonrun.c:419
#64 0x0000000000638675 in PyRun_AnyFileExFlags () at ../Python/pythonrun.c:81
#65 0x0000000000639041 in run_file (p_cf=0x7ffee3eda7ac, filename=<optimized out>, fp=<optimized out>) at ../Modules/main.c:340
#66 Py_Main () at ../Modules/main.c:810
#67 0x00000000004ad1f0 in main (argc=5, argv=0x7ffee3eda9a8) at ../Programs/python.c:69