After enabling torch.autograd.set_detect_anomaly(True), training fails with this error:

RuntimeError: Function 'PowBackward1' returned nan values in its 1th output.

However, I am not sure which source line in my forward pass corresponds to this particular backward function. I am using PyTorch Lightning. How can I trace the error back to the offending line in my forward pass?
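For reference, this is roughly how I enable anomaly detection and start training (a minimal sketch; MyLitModel, dl_train, and dl_val are placeholders for my own LightningModule and dataloaders):

import torch
import pytorch_lightning as pl

# Turn on autograd anomaly detection globally, before training starts.
# With this enabled, the backward pass checks each function's outputs
# for NaNs and raises a RuntimeError naming the failing backward node.
torch.autograd.set_detect_anomaly(True)

model = MyLitModel()  # placeholder: my own LightningModule
trainer = pl.Trainer()
trainer.fit(model, train_dataloader=dl_train, val_dataloaders=dl_val)

Here is the full traceback: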
RuntimeError Traceback (most recent call last)
<ipython-input-5-b04366607296> in <module>
----> 1 trainer.fit(model, train_dataloader=dl_train, val_dataloaders = dl_val)
~/conda/envs/myenv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py in fit(self, model, train_dataloader, val_dataloaders, datamodule)
456 )
457
--> 458 self._run(model)
459
460 assert self.state.stopped
~/conda/envs/myenv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py in _run(self, model)
754
755 # dispatch `start_training` or `start_evaluating` or `start_predicting`
--> 756 self.dispatch()
757
758 # plugin will finalized fitting (e.g. ddp_spawn will load trained model)
~/conda/envs/myenv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py in dispatch(self)
795 self.accelerator.start_predicting(self)
796 else:
--> 797 self.accelerator.start_training(self)
798
799 def run_stage(self):
~/conda/envs/myenv/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py in start_training(self, trainer)
94
95 def start_training(self, trainer: 'pl.Trainer') -> None:
---> 96 self.training_type_plugin.start_training(trainer)
97
98 def start_evaluating(self, trainer: 'pl.Trainer') -> None:
~/conda/envs/myenv/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py in start_training(self, trainer)
142 def start_training(self, trainer: 'pl.Trainer') -> None:
143 # double dispatch to initiate the training loop
--> 144 self._results = trainer.run_stage()
145
146 def start_evaluating(self, trainer: 'pl.Trainer') -> None:
~/conda/envs/myenv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py in run_stage(self)
805 if self.predicting:
806 return self.run_predict()
--> 807 return self.run_train()
808
809 def _pre_training_routine(self):
~/conda/envs/myenv/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py in run_train(self)
867 with self.profiler.profile("run_training_epoch"):
868 # run train epoch
--> 869 self.train_loop.run_training_epoch()
870
871 if self.max_steps and self.max_steps <= self.global_step:
~/conda/envs/myenv/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py in run_training_epoch(self)
497 # ------------------------------------
498 with self.trainer.profiler.profile("run_training_batch"):
--> 499 batch_output = self.run_training_batch(batch, batch_idx, dataloader_idx)
500
501 # when returning -1 from train_step, we end epoch early
~/conda/envs/myenv/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py in run_training_batch(self, batch, batch_idx, dataloader_idx)
736
737 # optimizer step
--> 738 self.optimizer_step(optimizer, opt_idx, batch_idx, train_step_and_backward_closure)
739 if len(self.trainer.optimizers) > 1:
740 # revert back to previous state
~/conda/envs/myenv/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py in optimizer_step(self, optimizer, opt_idx, batch_idx, train_step_and_backward_closure)
432
433 # model hook
--> 434 model_ref.optimizer_step(
435 self.trainer.current_epoch,
436 batch_idx,
~/conda/envs/myenv/lib/python3.8/site-packages/pytorch_lightning/core/lightning.py in optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu, using_native_amp, using_lbfgs)
1401
1402 """
-> 1403 optimizer.step(closure=optimizer_closure)
1404
1405 def optimizer_zero_grad(self, epoch: int, batch_idx: int, optimizer: Optimizer, optimizer_idx: int):
~/conda/envs/myenv/lib/python3.8/site-packages/pytorch_lightning/core/optimizer.py in step(self, closure, *args, **kwargs)
212 profiler_name = f"optimizer_step_and_closure_{self._optimizer_idx}"
213
--> 214 self.__optimizer_step(*args, closure=closure, profiler_name=profiler_name, **kwargs)
215 self._total_optimizer_step_calls += 1
216
~/conda/envs/myenv/lib/python3.8/site-packages/pytorch_lightning/core/optimizer.py in __optimizer_step(self, closure, profiler_name, **kwargs)
132
133 with trainer.profiler.profile(profiler_name):
--> 134 trainer.accelerator.optimizer_step(optimizer, self._optimizer_idx, lambda_closure=closure, **kwargs)
135
136 def step(self, *args, closure: Optional[Callable] = None, **kwargs):
~/conda/envs/myenv/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py in optimizer_step(self, optimizer, opt_idx, lambda_closure, **kwargs)
327 )
328 if make_optimizer_step:
--> 329 self.run_optimizer_step(optimizer, opt_idx, lambda_closure, **kwargs)
330 self.precision_plugin.post_optimizer_step(optimizer, opt_idx)
331 self.training_type_plugin.post_optimizer_step(optimizer, opt_idx, **kwargs)
~/conda/envs/myenv/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py in run_optimizer_step(self, optimizer, optimizer_idx, lambda_closure, **kwargs)
334 self, optimizer: Optimizer, optimizer_idx: int, lambda_closure: Callable, **kwargs: Any
335 ) -> None:
--> 336 self.training_type_plugin.optimizer_step(optimizer, lambda_closure=lambda_closure, **kwargs)
337
338 def optimizer_zero_grad(self, current_epoch: int, batch_idx: int, optimizer: Optimizer, opt_idx: int) -> None:
~/conda/envs/myenv/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py in optimizer_step(self, optimizer, lambda_closure, **kwargs)
191
192 def optimizer_step(self, optimizer: torch.optim.Optimizer, lambda_closure: Callable, **kwargs):
--> 193 optimizer.step(closure=lambda_closure, **kwargs)
194
195 @property
~/conda/envs/myenv/lib/python3.8/site-packages/torch/optim/optimizer.py in wrapper(*args, **kwargs)
87 profile_name = "Optimizer.step#{}.step".format(obj.__class__.__name__)
88 with torch.autograd.profiler.record_function(profile_name):
---> 89 return func(*args, **kwargs)
90 return wrapper
91
~/conda/envs/myenv/lib/python3.8/site-packages/torch/autograd/grad_mode.py in decorate_context(*args, **kwargs)
25 def decorate_context(*args, **kwargs):
26 with self.__class__():
---> 27 return func(*args, **kwargs)
28 return cast(F, decorate_context)
29
~/conda/envs/myenv/lib/python3.8/site-packages/torch/optim/adam.py in step(self, closure)
64 if closure is not None:
65 with torch.enable_grad():
---> 66 loss = closure()
67
68 for group in self.param_groups:
~/conda/envs/myenv/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py in train_step_and_backward_closure()
730
731 def train_step_and_backward_closure():
--> 732 result = self.training_step_and_backward(
733 split_batch, batch_idx, opt_idx, optimizer, self.trainer.hiddens
734 )
~/conda/envs/myenv/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py in training_step_and_backward(self, split_batch, batch_idx, opt_idx, optimizer, hiddens)
834 if result is not None:
835 with self.trainer.profiler.profile("backward"):
--> 836 self.backward(result, optimizer, opt_idx)
837
838 # hook - call this hook only
~/conda/envs/myenv/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py in backward(self, result, optimizer, opt_idx, *args, **kwargs)
867 self.trainer.accelerator.backward(result, optimizer, opt_idx, should_accumulate, *args, **kwargs)
868 else:
--> 869 result.closure_loss = self.trainer.accelerator.backward(
870 result.closure_loss, optimizer, opt_idx, should_accumulate, *args, **kwargs
871 )
~/conda/envs/myenv/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py in backward(self, closure_loss, optimizer, optimizer_idx, should_accumulate, *args, **kwargs)
306 self.training_type_plugin.pre_backward(closure_loss, should_accumulate, optimizer, optimizer_idx)
307
--> 308 output = self.precision_plugin.backward(
309 self.lightning_module, closure_loss, optimizer, optimizer_idx, should_accumulate, *args, **kwargs
310 )
~/conda/envs/myenv/lib/python3.8/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py in backward(self, model, closure_loss, optimizer, opt_idx, should_accumulate, *args, **kwargs)
77 # do backward pass
78 if automatic_optimization:
---> 79 model.backward(closure_loss, optimizer, opt_idx)
80 else:
81 closure_loss.backward(*args, **kwargs)
~/conda/envs/myenv/lib/python3.8/site-packages/pytorch_lightning/core/lightning.py in backward(self, loss, optimizer, optimizer_idx, *args, **kwargs)
1273 """
1274 if self.automatic_optimization or self._running_manual_backward:
-> 1275 loss.backward(*args, **kwargs)
1276
1277 def toggle_optimizer(self, optimizer: Optimizer, optimizer_idx: int):
~/conda/envs/myenv/lib/python3.8/site-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph, inputs)
243 create_graph=create_graph,
244 inputs=inputs)
--> 245 torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
246
247 def register_hook(self, hook):
~/conda/envs/myenv/lib/python3.8/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
143 retain_graph = create_graph
144
--> 145 Variable._execution_engine.run_backward(
146 tensors, grad_tensors_, retain_graph, create_graph, inputs,
147 allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag
RuntimeError: Function 'PowBackward1' returned nan values in its 1th output.