Hi,
I recently played with the compile option and DistributedDataParallel. Locally it works like a charm, but when I tried it on our cluster I ran into an exception that is related to Dynamo:
File "/raven/u/twagner/mambaforge/envs/tomotwin_pytorch2/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py", line 3795, in aot_module_simplified
compiled_fn = create_aot_dispatcher_function(
File "/raven/u/twagner/mambaforge/envs/tomotwin_pytorch2/lib/python3.10/site-packages/torch/_dynamo/utils.py", line 194, in time_wrapper
r = func(*args, **kwargs)
File "/raven/u/twagner/mambaforge/envs/tomotwin_pytorch2/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py", line 3333, in create_aot_dispatcher_function
compiled_fn = compiler_fn(flat_fn, fake_flat_args, aot_config, fw_metadata=fw_metadata)
File "/raven/u/twagner/mambaforge/envs/tomotwin_pytorch2/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py", line 2120, in aot_wrapper_dedupe
return compiler_fn(flat_fn, leaf_flat_args, aot_config, fw_metadata=fw_metadata)
File "/raven/u/twagner/mambaforge/envs/tomotwin_pytorch2/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py", line 2300, in aot_wrapper_synthetic_base
return compiler_fn(flat_fn, flat_args, aot_config, fw_metadata=fw_metadata)
File "/raven/u/twagner/mambaforge/envs/tomotwin_pytorch2/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py", line 1574, in aot_dispatch_base
compiled_fw = compiler(fw_module, flat_args)
File "/raven/u/twagner/mambaforge/envs/tomotwin_pytorch2/lib/python3.10/site-packages/torch/_dynamo/utils.py", line 194, in time_wrapper
r = func(*args, **kwargs)
File "/raven/u/twagner/mambaforge/envs/tomotwin_pytorch2/lib/python3.10/site-packages/torch/_inductor/compile_fx.py", line 1033, in fw_compiler_base
return inner_compile(
File "/raven/u/twagner/mambaforge/envs/tomotwin_pytorch2/lib/python3.10/contextlib.py", line 79, in inner
return func(*args, **kwds)
File "/raven/u/twagner/mambaforge/envs/tomotwin_pytorch2/lib/python3.10/site-packages/torch/_dynamo/repro/after_aot.py", line 80, in debug_wrapper
inner_compiled_fn = compiler_fn(gm, example_inputs)
File "/raven/u/twagner/mambaforge/envs/tomotwin_pytorch2/lib/python3.10/site-packages/torch/_inductor/debug.py", line 224, in inner
return fn(*args, **kwargs)
File "/raven/u/twagner/mambaforge/envs/tomotwin_pytorch2/lib/python3.10/contextlib.py", line 79, in inner
return func(*args, **kwds)
File "/raven/u/twagner/mambaforge/envs/tomotwin_pytorch2/lib/python3.10/site-packages/torch/_inductor/compile_fx.py", line 52, in newFunction
return old_func(*args, **kwargs)
File "/raven/u/twagner/mambaforge/envs/tomotwin_pytorch2/lib/python3.10/site-packages/torch/_inductor/compile_fx.py", line 312, in compile_fx_inner
compiled_graph: CompiledFxGraph = fx_codegen_and_compile(
File "/raven/u/twagner/mambaforge/envs/tomotwin_pytorch2/lib/python3.10/site-packages/torch/_inductor/compile_fx.py", line 534, in fx_codegen_and_compile
compiled_fn = graph.compile_to_fn()
File "/raven/u/twagner/mambaforge/envs/tomotwin_pytorch2/lib/python3.10/site-packages/torch/_inductor/graph.py", line 960, in compile_to_fn
return self.compile_to_module().call
File "/raven/u/twagner/mambaforge/envs/tomotwin_pytorch2/lib/python3.10/site-packages/torch/_dynamo/utils.py", line 194, in time_wrapper
r = func(*args, **kwargs)
File "/raven/u/twagner/mambaforge/envs/tomotwin_pytorch2/lib/python3.10/site-packages/torch/_inductor/graph.py", line 931, in compile_to_module
mod = PyCodeCache.load_by_key_path(key, path, linemap=linemap)
File "/raven/u/twagner/mambaforge/envs/tomotwin_pytorch2/lib/python3.10/site-packages/torch/_inductor/codecache.py", line 1068, in load_by_key_path
exec(code, mod.__dict__, mod.__dict__)
File "/tmp/torchinductor_twagner/fv/cfvfwaddcpvacy2mncxq7k727le6bt7dwj6uyrispgninqh36r6d.py", line 463, in <module>
async_compile.wait(globals())
File "/raven/u/twagner/mambaforge/envs/tomotwin_pytorch2/lib/python3.10/site-packages/torch/_inductor/codecache.py", line 1344, in wait
scope[key] = result.result()
File "/raven/u/twagner/mambaforge/envs/tomotwin_pytorch2/lib/python3.10/site-packages/torch/_inductor/codecache.py", line 1203, in result
self.future.result()
File "/raven/u/twagner/mambaforge/envs/tomotwin_pytorch2/lib/python3.10/concurrent/futures/_base.py", line 458, in result
return self.__get_result()
File "/raven/u/twagner/mambaforge/envs/tomotwin_pytorch2/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result
raise self._exception
torch._dynamo.exc.BackendCompilerFailed: backend='compile_fn' raised:
FileNotFoundError: [Errno 2] No such file or directory: 'ldconfig'
Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information
You can suppress this exception and fall back to eager by setting:
import torch._dynamo
torch._dynamo.config.suppress_errors = True
Here is the code related to compile and DistributedDataParallel.
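Roughly, a minimal sketch of the pattern I use, launched via torchrun (the model, data, and optimizer below are placeholders for illustration, not the actual training script):

import os
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

def main() -> None:
    # torchrun sets RANK, LOCAL_RANK and WORLD_SIZE for each process
    dist.init_process_group(backend="nccl")
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)

    # Placeholder model; the real network is more involved
    model = torch.nn.Sequential(
        torch.nn.Linear(128, 128),
        torch.nn.ReLU(),
        torch.nn.Linear(128, 10),
    ).cuda(local_rank)

    # Wrap the model in DDP first, then compile the wrapped module
    ddp_model = DDP(model, device_ids=[local_rank])
    compiled_model = torch.compile(ddp_model)

    optimizer = torch.optim.SGD(compiled_model.parameters(), lr=1e-3)
    loss_fn = torch.nn.CrossEntropyLoss()

    # Dummy training loop; the exception is raised on the first forward
    # pass, when Inductor compiles the graph
    for _ in range(10):
        x = torch.randn(32, 128, device=local_rank)
        y = torch.randint(0, 10, (32,), device=local_rank)
        optimizer.zero_grad()
        loss = loss_fn(compiled_model(x), y)
        loss.backward()
        optimizer.step()

    dist.destroy_process_group()

if __name__ == "__main__":
    main()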
I could simply suppress the exception now, but I would first like to understand what is happening.
Maybe someone could help me out?
Best,
Thorsten