I am trying to implement a model whose forward function calls an external function that computes its output from the model's parameters.
class R_model(torch.nn.Module):
    """Module holding three learnable scalars (``Kx``, ``Ky``, ``Kz``).

    ``forward`` takes no input: it passes the three parameters to
    ``generate_r`` and returns whatever that function returns.

    NOTE(review): ``generate_r`` is not defined in this block; it must be
    importable in the module that defines this class.
    """

    def __init__(self):
        super().__init__()
        # Each parameter is a shape-(1,) float tensor initialised from a random
        # integer in [-200, -100) scaled by 1e-4, i.e. a value in [-0.02, -0.01).
        self.Kx = torch.nn.Parameter(torch.randint(-200, -100, (1,)).float() * 0.0001)
        self.Ky = torch.nn.Parameter(torch.randint(-200, -100, (1,)).float() * 0.0001)
        self.Kz = torch.nn.Parameter(torch.randint(-200, -100, (1,)).float() * 0.0001)

    def forward(self):
        """Return ``generate_r`` evaluated at the current parameter values."""
        return generate_r(Kx=self.Kx, Ky=self.Ky, Kz=self.Kz)
and I train the model to converge on a fixed target value as follows:
import torch

from model import R_model

# NOTE(review): generate_r is used below but its import is not shown in this
# snippet; it must be imported from wherever it is defined.

# Anomaly detection makes backward() raise at the first op that produces NaN
# and prints the forward traceback of that op. It slows training noticeably,
# so it should be enabled only while debugging.
torch.autograd.set_detect_anomaly(True)

mod = R_model()

# Fixed target: the output of generate_r at known parameter values.
labels = generate_r(
    Kx=torch.tensor(-0.012),
    Ky=torch.tensor(-0.016),
    Kz=torch.tensor(-0.018),
)
optim = torch.optim.Adam(mod.parameters(), lr=0.001)

# Show the initial value of every trainable parameter.
for name, param in mod.named_parameters():
    if param.requires_grad:
        print(name, ': ', param.data)

# One optimisation step on the summed absolute error (L1 loss).
optim.zero_grad()
# Call the module rather than .forward() so that registered hooks run.
y = mod()
l = torch.abs((labels - y)).sum()
print("lossValue: ", l)
l.backward()
optim.step()
When I run the script, the parameter values become NaN after the gradient update. I therefore enabled anomaly detection with torch.autograd.set_detect_anomaly(True), which reports the following error:
UserWarning: Error detected in CosBackward0. Traceback of forward call that caused the error:
File "/home/jeet/anaconda3/envs/mpa/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/home/jeet/anaconda3/envs/mpa/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/home/jeet/anaconda3/envs/mpa/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
app.launch_new_instance()
File "/home/jeet/anaconda3/envs/mpa/lib/python3.10/site-packages/traitlets/config/application.py", line 1043, in launch_instance
app.start()
File "/home/jeet/anaconda3/envs/mpa/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 712, in start
self.io_loop.start()
File "/home/jeet/anaconda3/envs/mpa/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 199, in start
self.asyncio_loop.run_forever()
File "/home/jeet/anaconda3/envs/mpa/lib/python3.10/asyncio/base_events.py", line 603, in run_forever
self._run_once()
File "/home/jeet/anaconda3/envs/mpa/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once
handle._run()
File "/home/jeet/anaconda3/envs/mpa/lib/python3.10/asyncio/events.py", line 80, in _run
self._context.run(self._callback, *self._args)
File "/home/jeet/anaconda3/envs/mpa/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue
await self.process_one()
File "/home/jeet/anaconda3/envs/mpa/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 499, in process_one
await dispatch(*args)
File "/home/jeet/anaconda3/envs/mpa/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell
await result
...
File "/home/jeet/Documents/ai_fau_study/Sose23/thesis/SED/util/analyticDampingDensity.py", line 54, in func_F
u = torch.cos(phi)
(Triggered internally at /opt/conda/conda-bld/pytorch_1682343967769/work/torch/csrc/autograd/python_anomaly_mode.cpp:114.)
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
Output is truncated. View as a scrollable element or open in a text editor. Adjust cell output settings...
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
/tmp/ipykernel_39719/4045680841.py in <cell line: 16>()
14 print("lossValue: ", l)
15 l.retain_grad()
---> 16 l.backward()
17 print('Tracing back tensors:')
18 optim.step()
~/anaconda3/envs/mpa/lib/python3.10/site-packages/torch/_tensor.py in backward(self, gradient, retain_graph, create_graph, inputs)
485 inputs=inputs,
486 )
--> 487 torch.autograd.backward(
488 self, gradient, retain_graph, create_graph, inputs=inputs
489 )
~/anaconda3/envs/mpa/lib/python3.10/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
198 # some Python versions print out the first line of a multi-line function
199 # calls in the traceback and some print out the last line
--> 200 Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
201 tensors, grad_tensors_, retain_graph, create_graph, inputs,
202 allow_unreachable=True, accumulate_grad=True) # Calls into the C++ engine to run the backward pass
RuntimeError: Function 'SubBackward0' returned nan values in its 1th output.
The subtraction in question is in the following function:
def func_p(a, b, c):
    """Return ``+/-acos(-c / sqrt(a**2 + b**2)) - atan(-a / b)`` clipped to ``[0, pi/2]``.

    The ``acos`` term is multiplied by a ``(2, 1)`` column ``[[1], [-1]]``, so
    the result has a leading dimension of 2 (row 0 uses ``+acos``, row 1 uses
    ``-acos``) broadcast against the shape of the inputs.

    Args:
        a, b, c: tensors that broadcast against each other.

    Returns:
        Tensor of the two branches, clipped element-wise to ``[0, pi/2]``.

    Notes:
        The ``acos`` argument is clamped to ``[-0.99999, 0.99999]`` with an
        out-of-place ``torch.clamp``. The derivative of ``acos`` is infinite at
        exactly +/-1, and a strict ``< -1`` / ``> 1`` mask lets those endpoint
        values through; the backward pass then produces ``inf`` or
        ``0 * inf = nan`` even though every forward value looks valid.

        NOTE(review): ``b == 0`` or ``a == b == 0`` still makes the divisions
        produce ``inf``/``nan`` gradients; callers must keep the inputs away
        from those points (or guard them upstream).
    """
    bound = 0.99999
    # Clamp (endpoints included) so acos never sees +/-1, where its gradient
    # is unbounded.
    a_cos = torch.clamp(-c / torch.sqrt(a**2 + b**2), min=-bound, max=bound)
    a_tan = torch.atan(-a / b)
    # Build the sign column on the same dtype/device as the data instead of a
    # hard-coded CPU float32 tensor.
    signs = torch.tensor([[1.0], [-1.0]], dtype=a_cos.dtype, device=a_cos.device)
    t_cos = signs * torch.acos(a_cos)
    x = t_cos - a_tan
    return torch.clip(x, min=0, max=torch.pi / 2)
All forward values are as expected; I checked them in debugging mode.
Any idea how and why this can happen?