Hello,
During training, my Jupyter notebook crashes (after the same number of steps each time) with the message “The kernel appears to have died. It will restart automatically”. The underlying error is a segmentation fault. I am training on a single GPU with PyTorch 1.9.1.
Here is the stack trace that gdb reports:
Thread 1 "python" received signal SIGSEGV, Segmentation fault.
0x00007ffea0a62a8a in std::_Sp_counted_base<(__gnu_cxx::_Lock_policy)2>::_M_release() ()
from /opt/conda/lib/python3.7/site-packages/pyarrow/libarrow.so.500
(gdb) backtrace
#0 0x00007ffea0a62a8a in std::_Sp_counted_base<(__gnu_cxx::_Lock_policy)2>::_M_release() ()
from /opt/conda/lib/python3.7/site-packages/pyarrow/libarrow.so.500
#1 0x00007ffea0a8901c in arrow::json::ChunkedListArrayBuilder::InsertNull(long, long) ()
from /opt/conda/lib/python3.7/site-packages/pyarrow/libarrow.so.500
#2 0x00007ffea0a8941f in arrow::json::ChunkedListArrayBuilder::Insert(long, std::shared_ptr<arrow::Field> const&, std::shared_ptr<arrow::Array> const&) () from /opt/conda/lib/python3.7/site-packages/pyarrow/libarrow.so.500
#3 0x00007ffea0a860bd in arrow::json::ChunkedStructArrayBuilder::Finish(std::shared_ptr<arrow::ChunkedArray>*) ()
from /opt/conda/lib/python3.7/site-packages/pyarrow/libarrow.so.500
#4 0x00007ffea0a7dc2c in arrow::json::ChunkedListArrayBuilder::Finish(std::shared_ptr<arrow::ChunkedArray>*) ()
from /opt/conda/lib/python3.7/site-packages/pyarrow/libarrow.so.500
#5 0x00007ffea0a8654d in arrow::json::ChunkedStructArrayBuilder::Finish(std::shared_ptr<arrow::ChunkedArray>*) () from /opt/conda/lib/python3.7/site-packages/pyarrow/libarrow.so.500
#6 0x00007ffea0a8654d in arrow::json::ChunkedStructArrayBuilder::Finish(std::shared_ptr<arrow::ChunkedArray>*) () from /opt/conda/lib/python3.7/site-packages/pyarrow/libarrow.so.500
#7 0x00007ffea0a94e02 in arrow::json::TableReaderImpl::Read() () from /opt/conda/lib/python3.7/site-packages/pyarrow/libarrow.so.500
#8 0x00007ffe9dc66f39 in __pyx_pw_7pyarrow_5_json_1read_json(_object*, _object*, _object*) () from /opt/conda/lib/python3.7/site-packages/pyarrow/_json.cpython-37m-x86_64-linux-gnu.so
#9 0x0000555555666919 in _PyMethodDef_RawFastCallKeywords (method=<optimized out>, self=0x0, args=<optimized out>, nargs=<optimized out>, kwnames=<optimized out>) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Objects/call.c:693
#10 0x00005555556c20b8 in _PyCFunction_FastCallKeywords (kwnames=<optimized out>, nargs=<optimized out>, args=0x55555d179860, func=0x7ffe9de56c80) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Objects/call.c:723
#11 call_function (pp_stack=0x7fffffffc970, oparg=<optimized out>, kwnames=<optimized out>) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/ceval.c:4568
#12 0x0000555555707fe8 in _PyEval_EvalFrameDefault (f=<optimized out>, throwflag=<optimized out>) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/ceval.c:3139
#13 0x00005555556d6e3f in PyEval_EvalFrameEx (throwflag=0, f=0x55555d179680) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/ceval.c:544
#14 gen_send_ex (closing=0, exc=0, arg=0x0, gen=0x7ffe0a8bb8d0) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Objects/genobject.c:221
#15 gen_iternext (gen=0x7ffe0a8bb8d0) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Objects/genobject.c:542
#16 0x0000555555707598 in _PyEval_EvalFrameDefault (f=<optimized out>, throwflag=<optimized out>) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/ceval.c:2809
#17 0x00005555556d6e3f in PyEval_EvalFrameEx (throwflag=0, f=0x7ffe9dc17650) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/ceval.c:544
#18 gen_send_ex (closing=0, exc=0, arg=0x0, gen=0x7ffe0a8bb850) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Objects/genobject.c:221
#19 gen_iternext (gen=0x7ffe0a8bb850) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Objects/genobject.c:542
#20 0x0000555555707598 in _PyEval_EvalFrameDefault (f=<optimized out>, throwflag=<optimized out>) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/ceval.c:2809
#21 0x00005555556d6e3f in PyEval_EvalFrameEx (throwflag=0, f=0x7ffe9dc1a910) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/ceval.c:544
#22 gen_send_ex (closing=0, exc=0, arg=0x0, gen=0x7ffe0a8bb7d0) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Objects/genobject.c:221
#23 gen_iternext (gen=0x7ffe0a8bb7d0) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Objects/genobject.c:542
#24 0x000055555562e295 in islice_next (lz=0x7ffd05728290) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Modules/itertoolsmodule.c:1552
#25 0x0000555555707598 in _PyEval_EvalFrameDefault (f=<optimized out>, throwflag=<optimized out>) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/ceval.c:2809
#26 0x000055555564afb2 in PyEval_EvalFrameEx (throwflag=0, f=0x7ffe310bb050) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/ceval.c:3930
#27 _PyEval_EvalCodeWithName (_co=<optimized out>, globals=<optimized out>, locals=<optimized out>, args=<optimized out>, argcount=<optimized out>, kwnames=<optimized out>, kwargs=<optimized out>, kwcount=<optimized out>, kwstep=<optimized out>, defs=<optimized out>, defcount=<optimized out>, kwdefs=<optimized out>, closure=<optimized out>, name=<optimized out>, qualname=<optimized out>) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/ceval.c:3930
#28 0x00005555556c1010 in _PyFunction_FastCallKeywords (func=<optimized out>, stack=0x55555d179f60, nargs=1, kwnames=<optimized out>) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Objects/call.c:433
#29 0x00005555556c1fa8 in call_function (pp_stack=0x7fffffffd0d8, oparg=<optimized out>, kwnames=<optimized out>) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/ceval.c:4616
#30 0x00005555557072ba in _PyEval_EvalFrameDefault (f=<optimized out>, throwflag=<optimized out>) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/ceval.c:3124
#31 0x0000555555678763 in PyEval_EvalFrameEx (throwflag=<optimized out>, f=0x55555d179d90) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/ceval.c:544
#32 gen_send_ex (gen=0x7ffe0a8bb750, arg=<optimized out>, exc=<optimized out>, closing=<optimized out>) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Objects/genobject.c:221
#33 0x000055555570850e in _PyGen_Send (arg=0x5555558aef30 <_Py_NoneStruct>, gen=0x7ffe0a8bb750) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Objects/genobject.c:289
#34 _PyEval_EvalFrameDefault (f=<optimized out>, throwflag=<optimized out>) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/ceval.c:1787
#35 0x00005555556d6e3f in PyEval_EvalFrameEx (throwflag=0, f=0x7ffe9dc1a750) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/ceval.c:544
#36 gen_send_ex (closing=0, exc=0, arg=0x0, gen=0x7ffe0a8bb6d0) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Objects/genobject.c:221
#37 gen_iternext (gen=0x7ffe0a8bb6d0) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Objects/genobject.c:542
#38 0x0000555555707598 in _PyEval_EvalFrameDefault (f=<optimized out>, throwflag=<optimized out>) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/ceval.c:2809
#39 0x00005555556d72ca in PyEval_EvalFrameEx (throwflag=0, f=0x55555d198f40) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/ceval.c:544
#40 gen_send_ex (closing=0, exc=0, arg=0x0, gen=0x7ffe0a8bb3d0) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Objects/genobject.c:221
#41 gen_iternext (gen=0x7ffe0a8bb3d0) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Objects/genobject.c:542
#42 builtin_next (self=<optimized out>, args=<optimized out>, nargs=<optimized out>) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/bltinmodule.c:1426
#43 0x00005555556666b8 in _PyMethodDef_RawFastCallKeywords (method=0x5555558a6c60 <builtin_methods+992>, self=0x7ffff7a91d10, args=0x7ffe0a8bc5f8, nargs=<optimized out>, kwnames=<optimized out>) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Objects/call.c:654
#44 0x00005555556c20b8 in _PyCFunction_FastCallKeywords (kwnames=<optimized out>, nargs=<optimized out>, args=0x7ffe0a8bc5f8, func=0x7ffff7a1d370) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Objects/call.c:723
#45 call_function (pp_stack=0x7fffffffd5f8, oparg=<optimized out>, kwnames=<optimized out>) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/ceval.c:4568
#46 0x00005555557072ba in _PyEval_EvalFrameDefault (f=<optimized out>, throwflag=<optimized out>) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/ceval.c:3124
#47 0x00005555556c0e74 in PyEval_EvalFrameEx (throwflag=0, f=0x7ffe0a8bc450) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/ceval.c:544
#48 function_code_fastcall (globals=0x7fff1aa5a640, nargs=<optimized out>, args=<optimized out>, co=<optimized out>) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Objects/call.c:283
#49 _PyFunction_FastCallKeywords (func=<optimized out>, stack=<optimized out>, nargs=<optimized out>, kwnames=<optimized out>) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Objects/call.c:408
#50 0x00005555556c1fa8 in call_function (pp_stack=0x7fffffffd7e0, oparg=<optimized out>, kwnames=<optimized out>) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/ceval.c:4616
#51 0x0000555555707350 in _PyEval_EvalFrameDefault (f=<optimized out>, throwflag=<optimized out>) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/ceval.c:3110
#52 0x00005555556c0e74 in PyEval_EvalFrameEx (throwflag=0, f=0x7ffe0a8cc210) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/ceval.c:544
#53 function_code_fastcall (globals=0x7fff1aa3a1e0, nargs=<optimized out>, args=<optimized out>, co=<optimized out>) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Objects/call.c:283
#54 _PyFunction_FastCallKeywords (func=<optimized out>, stack=<optimized out>, nargs=<optimized out>, kwnames=<optimized out>) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Objects/call.c:408
#55 0x00005555556c1fa8 in call_function (pp_stack=0x7fffffffd9d0, oparg=<optimized out>, kwnames=<optimized out>) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/ceval.c:4616
#56 0x0000555555707350 in _PyEval_EvalFrameDefault (f=<optimized out>, throwflag=<optimized out>) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/ceval.c:3110
#57 0x000055555564c36d in PyEval_EvalFrameEx (throwflag=0, f=0x7ffe0a8c2620) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/ceval.c:544
#58 function_code_fastcall (globals=<optimized out>, nargs=<optimized out>, args=<optimized out>, co=0x7fff1aa3e540) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Objects/call.c:283
#59 _PyFunction_FastCallDict (func=<optimized out>, args=<optimized out>, nargs=<optimized out>, kwargs=<optimized out>) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Objects/call.c:322
#60 0x00005555556c5fe2 in _PyObject_FastCallDict (kwargs=0x0, nargs=<optimized out>, args=0x7fffffffdb50, callable=0x7fff1aa68440) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Objects/call.c:84
#61 _PyObject_FastCall_Prepend (nargs=<optimized out>, args=0x0, obj=<optimized out>, callable=0x7fff1aa68440) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Objects/call.c:868
#62 call_unbound (nargs=<optimized out>, args=0x0, self=<optimized out>, func=0x7fff1aa68440, unbound=1) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Objects/typeobject.c:1503
#63 call_method (obj=<optimized out>, name=<optimized out>, args=0x0, nargs=<optimized out>) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Objects/typeobject.c:1535
#64 0x00005555556701b3 in enum_next (en=0x7ffe0aa050a0) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Objects/enumobject.c:156
#65 0x0000555555707598 in _PyEval_EvalFrameDefault (f=<optimized out>, throwflag=<optimized out>) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/ceval.c:2809
#66 0x000055555564afb2 in PyEval_EvalFrameEx (throwflag=0, f=0x55555c564ca0) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/ceval.c:3930
#67 _PyEval_EvalCodeWithName (_co=<optimized out>, globals=<optimized out>, locals=<optimized out>, args=<optimized out>, argcount=<optimized out>, kwnames=<optimized out>, kwargs=<optimized out>, kwcount=<optimized out>, kwstep=<optimized out>, defs=<optimized out>, defcount=<optimized out>, kwdefs=<optimized out>, closure=<optimized out>, name=<optimized out>, qualname=<optimized out>) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/ceval.c:3930
#68 0x00005555556c1010 in _PyFunction_FastCallKeywords (func=<optimized out>, stack=0x7ffff79a85b8, nargs=1, kwnames=<optimized out>) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Objects/call.c:433
#69 0x00005555556c1fa8 in call_function (pp_stack=0x7fffffffdf90, oparg=<optimized out>, kwnames=<optimized out>) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/ceval.c:4616
#70 0x0000555555707fe8 in _PyEval_EvalFrameDefault (f=<optimized out>, throwflag=<optimized out>) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/ceval.c:3139
#71 0x000055555564afb2 in PyEval_EvalFrameEx (throwflag=0, f=0x7ffff79a8450) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/ceval.c:3930
#72 _PyEval_EvalCodeWithName (_co=<optimized out>, globals=<optimized out>, locals=<optimized out>, args=<optimized out>, argcount=<optimized out>, kwnames=<optimized out>, kwargs=<optimized out>, kwcount=<optimized out>, kwstep=<optimized out>, defs=<optimized out>, defcount=<optimized out>, kwdefs=<optimized out>, closure=<optimized out>, name=<optimized out>, qualname=<optimized out>) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/ceval.c:3930
#73 0x000055555564c1c9 in PyEval_EvalCodeEx (_co=<optimized out>, globals=<optimized out>, locals=<optimized out>, args=<optimized out>, argcount=<optimized out>, kws=<optimized out>, kwcount=0, defs=0x0, defcount=0, kwdefs=0x0, closure=0x0) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/ceval.c:3959
#74 0x000055555572a47b in PyEval_EvalCode (co=<optimized out>, globals=<optimized out>, locals=<optimized out>) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/ceval.c:524
#75 0x0000555555792343 in run_mod (mod=<optimized out>, filename=<optimized out>, globals=0x7ffff7a11eb0, locals=0x7ffff7a11eb0, flags=<optimized out>, arena=<optimized out>) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/pythonrun.c:1035
#76 0x000055555579c747 in PyRun_FileExFlags (fp=0x5555559265f0, filename_str=<optimized out>, start=<optimized out>, globals=0x7ffff7a11eb0, locals=0x7ffff7a11eb0, closeit=1, flags=0x7fffffffe270) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/pythonrun.c:988
#77 0x000055555579c91c in PyRun_SimpleFileExFlags (fp=0x5555559265f0, filename=<optimized out>, closeit=1, flags=0x7fffffffe270) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Python/pythonrun.c:429
#78 0x000055555579ce79 in pymain_run_file (p_cf=0x7fffffffe270, filename=<optimized out>, fp=0x5555559265f0) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Modules/main.c:456
#79 pymain_run_filename (cf=0x7fffffffe270, pymain=0x7fffffffe380) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Modules/main.c:1646
#80 pymain_run_python (pymain=0x7fffffffe380) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Modules/main.c:2907
#81 pymain_main (pymain=0x7fffffffe380) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Modules/main.c:3068
#82 0x000055555579cfcc in _Py_UnixMain (argc=<optimized out>, argv=<optimized out>) at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Modules/main.c:3103
#83 0x00007ffff7c6c09b in __libc_start_main (main=0x55555562b390 <main>, argc=2, argv=0x7fffffffe4d8, init=<optimized out>, fini=<optimized out>, rtld_fini=<optimized out>, stack_end=0x7fffffffe4c8) at ../csu/libc-start.c:308
#84 0x0000555555716e21 in _start () at /home/conda/feedstock_root/build_artifacts/python_1631559780463/work/Parser/parser.c:325
Thanks so much for the help.