PyTorch 2.1 on Slurm

Hello,

I upgraded PyTorch to 2.1, and there seem to be issues when I run it on a GPU on the Slurm cluster I use.

 File "/storage/home/hcoda1/6/user123/VIT/model_reg_square.py", line 292, in <module>
    y_pred = model(x_masked, attn_mask)
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 328, in _fn
    return fn(*args, **kwargs)
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 490, in catch_errors
    return callback(frame, cache_entry, hooks, frame_state)
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py", line 641, in _convert_frame
    result = inner_convert(frame, cache_size, hooks, frame_state)
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py", line 133, in _fn
    return fn(*args, **kwargs)
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py", line 389, in _convert_frame_assert
    return _compile(
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py", line 569, in _compile
    guarded_code = compile_inner(code, one_graph, hooks, transform)
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_dynamo/utils.py", line 189, in time_wrapper
    r = func(*args, **kwargs)
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py", line 491, in compile_inner
    out_code = transform_code_object(code, transform)
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_dynamo/bytecode_transformation.py", line 1028, in transform_code_object
    transformations(instructions, code_options)
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_dynamo/convert_frame.py", line 458, in transform
    tracer.run()
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py", line 2069, in run
    super().run()
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py", line 719, in run
    and self.step()
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py", line 683, in step
    getattr(self, inst.opname)(inst)
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_dynamo/symbolic_convert.py", line 2157, in RETURN_VALUE
    self.output.compile_subgraph(
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_dynamo/output_graph.py", line 833, in compile_subgraph
    self.compile_and_call_fx_graph(tx, list(reversed(stack_values)), root)
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/contextlib.py", line 79, in inner
    return func(*args, **kwds)
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_dynamo/output_graph.py", line 957, in compile_and_call_fx_graph
    compiled_fn = self.call_user_compiler(gm)
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_dynamo/utils.py", line 189, in time_wrapper
    r = func(*args, **kwargs)
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_dynamo/output_graph.py", line 1024, in call_user_compiler
    raise BackendCompilerFailed(self.compiler_fn, e).with_traceback(
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_dynamo/output_graph.py", line 1009, in call_user_compiler
    compiled_fn = compiler_fn(gm, self.example_inputs())
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_dynamo/repro/after_dynamo.py", line 117, in debug_wrapper
    compiled_gm = compiler_fn(gm, example_inputs)
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/__init__.py", line 1568, in __call__
    return compile_fx(model_, inputs_, config_patches=self.config)
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_inductor/compile_fx.py", line 1150, in compile_fx
    return aot_autograd(
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_dynamo/backends/common.py", line 55, in compiler_fn
    cg = aot_module_simplified(gm, example_inputs, **kwargs)
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py", line 3891, in aot_module_simplified
    compiled_fn = create_aot_dispatcher_function(
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_dynamo/utils.py", line 189, in time_wrapper
    r = func(*args, **kwargs)
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py", line 3429, in create_aot_dispatcher_function
    compiled_fn = compiler_fn(flat_fn, fake_flat_args, aot_config, fw_metadata=fw_metadata)
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py", line 2212, in aot_wrapper_dedupe
    return compiler_fn(flat_fn, leaf_flat_args, aot_config, fw_metadata=fw_metadata)
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py", line 2392, in aot_wrapper_synthetic_base
    return compiler_fn(flat_fn, flat_args, aot_config, fw_metadata=fw_metadata)
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_functorch/aot_autograd.py", line 2917, in aot_dispatch_autograd
    compiled_fw_func = aot_config.fw_compiler(
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_dynamo/utils.py", line 189, in time_wrapper
    r = func(*args, **kwargs)
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_inductor/compile_fx.py", line 1092, in fw_compiler_base
    return inner_compile(
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_dynamo/repro/after_aot.py", line 80, in debug_wrapper
    inner_compiled_fn = compiler_fn(gm, example_inputs)
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_inductor/debug.py", line 228, in inner
    return fn(*args, **kwargs)
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/contextlib.py", line 79, in inner
    return func(*args, **kwds)
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_inductor/compile_fx.py", line 54, in newFunction
    return old_func(*args, **kwargs)
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_inductor/compile_fx.py", line 341, in compile_fx_inner
    compiled_graph: CompiledFxGraph = fx_codegen_and_compile(
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_inductor/compile_fx.py", line 565, in fx_codegen_and_compile
    compiled_fn = graph.compile_to_fn()
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_inductor/graph.py", line 970, in compile_to_fn
    return self.compile_to_module().call
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_dynamo/utils.py", line 189, in time_wrapper
    r = func(*args, **kwargs)
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_inductor/graph.py", line 941, in compile_to_module
    mod = PyCodeCache.load_by_key_path(key, path, linemap=linemap)
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_inductor/codecache.py", line 1139, in load_by_key_path
    exec(code, mod.__dict__, mod.__dict__)
  File "/scratch/4152356/torchinductor_user123/qz/cqzkkur7b6tqcm6ybza4x3syp4mkqpyaemiydeztjwogbypaudam.py", line 1070, in <module>
    async_compile.wait(globals())
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_inductor/codecache.py", line 1418, in wait
    scope[key] = result.result()
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/site-packages/torch/_inductor/codecache.py", line 1277, in result
    self.future.result()
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/concurrent/futures/_base.py", line 458, in result
    return self.__get_result()
  File "/storage/home/hcoda1/6/user123/.conda/envs/myenv/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result
    raise self._exception
torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised:
PermissionError: [Errno 13] Permission denied: 'ldconfig'

If I try to run `ldconfig` myself, the cluster reports "command not found". How do I move forward?

1 Like