Hi, I’m a bit frustrated because I thought the whole point of PyTorch’s “eager execution” was that it enables interactive (e.g. pdb) debugging: any error-causing bug should immediately raise an exception, at which point pdb can interrupt and let you inspect the state.
I’m already using with torch.autograd.detect_anomaly():
but it is giving me a traceback which is probably wrong, since it contradicts the reported error (see below). Is there another setting that would give me the ideal interactive pdb debugging scenario I’m describing?
The error I’m facing is:
RuntimeError: Expected all tensors to be on the same device, but
found at least two devices, cpu and cuda:0! (when checking argument
for argument self in method wrapper_CUDA_softplus_backward)
But the traceback points to a line that only involves one tensor argument, so I have no idea how it can be complaining about two different devices.
File "/u/ddeighan/MOR_MoE2/model_agnostic_BNN.py", line 109, in forward
sigma_params = nn.functional.softplus(self._rho_params)
Full traceback:
/u/ddeighan/MOR_MoE2/JHTDB_sim_op.py:199: UserWarning: Anomaly Detection has been enabled. This mode will increase the runtime and should only be enabled for debugging.
with torch.autograd.detect_anomaly():
/u/ddeighan/MOR_MoE2/model_agnostic_BNN.py:94: UserWarning: Anomaly Detection has been enabled. This mode will increase the runtime and should only be enabled for debugging.
with torch.autograd.detect_anomaly():
/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/torch/autograd/graph.py:768: UserWarning: Error detected in SoftplusBackward0. Traceback of forward call that caused the error:
File “”, line 198, in _run_module_as_main
File “”, line 88, in _run_code
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/pdb.py”, line 1795, in
pdb.main()
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/pdb.py”, line 1768, in main
pdb._run(target)
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/pdb.py”, line 1646, in _run
self.run(target.code)
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/bdb.py”, line 597, in run
exec(cmd, globals, locals)
File “”, line 1, in
File “/u/ddeighan/MOR_MoE2/channel.py”, line 102, in
model = SimModelClass(n_inputs=ndims, n_outputs=ndims, ndims=ndims, n_experts=n_experts, n_layers=n_layers, lr=lr, make_optim=make_optim, T_max=T_max,
File “/u/ddeighan/MOR_MoE2/JHTDB_sim_op.py”, line 197, in init
super().init(*args, simulator=simulator, **kwd_args)
File “/u/ddeighan/MOR_MoE2/JHTDB_sim_op.py”, line 168, in init
super().init(*args, **kwd_args)
File “/u/ddeighan/MOR_MoE2/POU_net.py”, line 240, in init
model_agnostic_BNN.model_agnostic_dnn_to_bnn(self, train_dataset_size, prior_cfg=prior_cfg)
File “/u/ddeighan/MOR_MoE2/model_agnostic_BNN.py”, line 146, in model_agnostic_dnn_to_bnn
dnn.apply(visit_parametrize)
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/torch/nn/modules/module.py”, line 895, in apply
module.apply(fn)
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/torch/nn/modules/module.py”, line 895, in apply
module.apply(fn)
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/torch/nn/modules/module.py”, line 895, in apply
module.apply(fn)
[Previous line repeated 1 more time]
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/torch/nn/modules/module.py”, line 896, in apply
fn(self)
File “/u/ddeighan/MOR_MoE2/model_agnostic_BNN.py”, line 144, in visit_parametrize
parametrize.register_parametrization(module, name, _BayesianParameterization(param, **prior_cfg))
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/torch/nn/utils/parametrize.py”, line 573, in register_parametrization
parametrizations = ParametrizationList([parametrization], original, unsafe=unsafe)
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/torch/nn/utils/parametrize.py”, line 192, in init
Z = self()
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/torch/nn/modules/module.py”, line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/torch/nn/modules/module.py”, line 1562, in _call_impl
return forward_call(*args, **kwargs)
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/torch/nn/utils/parametrize.py”, line 276, in forward
x = self0
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/torch/nn/modules/module.py”, line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/torch/nn/modules/module.py”, line 1562, in _call_impl
return forward_call(*args, **kwargs)
File “/u/ddeighan/MOR_MoE2/model_agnostic_BNN.py”, line 109, in forward
sigma_params = nn.functional.softplus(self._rho_params)
(Triggered internally at /home/conda/feedstock_root/build_artifacts/libtorch_1727865399751/work/torch/csrc/autograd/python_anomaly_mode.cpp:111.)
return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
Traceback (most recent call last):
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/pdb.py”, line 1768, in main
pdb._run(target)
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/pdb.py”, line 1646, in _run
self.run(target.code)
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/bdb.py”, line 597, in run
exec(cmd, globals, locals)
File “”, line 1, in
File “/u/ddeighan/MOR_MoE2/channel.py”, line 128, in
trainer.fit(model=model, train_dataloaders=train_loader, val_dataloaders=val_dataloaders) #, ckpt_path=ckpt_path)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/pytorch_lightning/trainer/trainer.py”, line 539, in fit
call._call_and_handle_interrupt(
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/pytorch_lightning/trainer/call.py”, line 47, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/pytorch_lightning/trainer/trainer.py”, line 575, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/pytorch_lightning/trainer/trainer.py”, line 982, in _run
results = self._run_stage()
^^^^^^^^^^^^^^^^^
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/pytorch_lightning/trainer/trainer.py”, line 1026, in _run_stage
self.fit_loop.run()
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/pytorch_lightning/loops/fit_loop.py”, line 216, in run
self.advance()
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/pytorch_lightning/loops/fit_loop.py”, line 455, in advance
self.epoch_loop.run(self._data_fetcher)
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/pytorch_lightning/loops/training_epoch_loop.py”, line 150, in run
self.advance(data_fetcher)
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/pytorch_lightning/loops/training_epoch_loop.py”, line 320, in advance
batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/pytorch_lightning/loops/optimization/automatic.py”, line 192, in run
self._optimizer_step(batch_idx, closure)
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/pytorch_lightning/loops/optimization/automatic.py”, line 270, in _optimizer_step
call._call_lightning_module_hook(
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/pytorch_lightning/trainer/call.py”, line 171, in _call_lightning_module_hook
output = fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/pytorch_lightning/core/module.py”, line 1302, in optimizer_step
optimizer.step(closure=optimizer_closure)
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/pytorch_lightning/core/optimizer.py”, line 154, in step
step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/pytorch_lightning/strategies/strategy.py”, line 239, in optimizer_step
return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/pytorch_lightning/plugins/precision/precision.py”, line 123, in optimizer_step
return optimizer.step(closure=closure, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/torch/optim/lr_scheduler.py”, line 130, in wrapper
return func.get(opt, opt.class)(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/torch/optim/optimizer.py”, line 484, in wrapper
out = func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/torch/optim/optimizer.py”, line 89, in _use_grad
ret = func(self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/torch/optim/adam.py”, line 205, in step
loss = closure()
^^^^^^^^^
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/pytorch_lightning/plugins/precision/precision.py”, line 109, in _wrap_closure
closure_result = closure()
^^^^^^^^^
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/pytorch_lightning/loops/optimization/automatic.py”, line 146, in call
self._result = self.closure(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/torch/utils/_contextlib.py”, line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/pytorch_lightning/loops/optimization/automatic.py”, line 140, in closure
self._backward_fn(step_output.closure_loss)
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/pytorch_lightning/loops/optimization/automatic.py”, line 241, in backward_fn
call._call_strategy_hook(self.trainer, “backward”, loss, optimizer)
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/pytorch_lightning/trainer/call.py”, line 323, in _call_strategy_hook
output = fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/pytorch_lightning/strategies/strategy.py”, line 213, in backward
self.precision_plugin.backward(closure_loss, self.lightning_module, optimizer, *args, **kwargs)
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/pytorch_lightning/plugins/precision/precision.py”, line 73, in backward
model.backward(tensor, *args, **kwargs)
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/pytorch_lightning/core/module.py”, line 1097, in backward
loss.backward(*args, **kwargs)
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/torch/_tensor.py”, line 521, in backward
torch.autograd.backward(
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/torch/autograd/init.py”, line 289, in backward
_engine_run_backward(
File “/u/ddeighan/miniforge3/envs/uqops2/lib/python3.11/site-packages/torch/autograd/graph.py”, line 768, in _engine_run_backward
return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument self in method wrapper_CUDA_softplus_backward)