Function 'SubBackward0' returned nan values in its 1th output

I am trying to implement a model whose forward function calls an external function that computes the values using the model's parameters.

import torch

class R_model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # Random initialization, roughly in [-0.02, -0.01]
        self.Kx = torch.nn.Parameter(torch.randint(-200, -100, (1,)).float() * 0.0001)
        self.Ky = torch.nn.Parameter(torch.randint(-200, -100, (1,)).float() * 0.0001)
        self.Kz = torch.nn.Parameter(torch.randint(-200, -100, (1,)).float() * 0.0001)

    def forward(self):
        # generate_r is the external function mentioned above (defined elsewhere)
        return generate_r(Kx=self.Kx, Ky=self.Ky, Kz=self.Kz)

and I train the model to converge to a fixed target value as follows:

import torch

from model import R_model

torch.autograd.set_detect_anomaly(True)

mod = R_model()
# Target produced by the same external generate_r with known parameter values
labels = generate_r(Kx=torch.tensor(-0.012), Ky=torch.tensor(-0.016), Kz=torch.tensor(-0.018))
optim = torch.optim.Adam(mod.parameters(), lr=0.001)
for name, param in mod.named_parameters():
    if param.requires_grad:
        print(name, ': ', param.data)
optim.zero_grad()
y = mod()                         # equivalent to mod.forward()
l = torch.abs(labels - y).sum()   # L1 distance to the fixed target
print("lossValue: ", l)
l.backward()
optim.step()

When I run the script, the parameter values after the gradient update are NaN, so I enabled torch.autograd.set_detect_anomaly(True), which produces the following error:

UserWarning: Error detected in CosBackward0. Traceback of forward call that caused the error:
  File "/home/jeet/anaconda3/envs/mpa/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/home/jeet/anaconda3/envs/mpa/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/jeet/anaconda3/envs/mpa/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/home/jeet/anaconda3/envs/mpa/lib/python3.10/site-packages/traitlets/config/application.py", line 1043, in launch_instance
    app.start()
  File "/home/jeet/anaconda3/envs/mpa/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 712, in start
    self.io_loop.start()
  File "/home/jeet/anaconda3/envs/mpa/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 199, in start
    self.asyncio_loop.run_forever()
  File "/home/jeet/anaconda3/envs/mpa/lib/python3.10/asyncio/base_events.py", line 603, in run_forever
    self._run_once()
  File "/home/jeet/anaconda3/envs/mpa/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once
    handle._run()
  File "/home/jeet/anaconda3/envs/mpa/lib/python3.10/asyncio/events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
  File "/home/jeet/anaconda3/envs/mpa/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue
    await self.process_one()
  File "/home/jeet/anaconda3/envs/mpa/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 499, in process_one
    await dispatch(*args)
  File "/home/jeet/anaconda3/envs/mpa/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell
    await result
...
  File "/home/jeet/Documents/ai_fau_study/Sose23/thesis/SED/util/analyticDampingDensity.py", line 54, in func_F
    u = torch.cos(phi)
 (Triggered internally at /opt/conda/conda-bld/pytorch_1682343967769/work/torch/csrc/autograd/python_anomaly_mode.cpp:114.)
  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
/tmp/ipykernel_39719/4045680841.py in <cell line: 16>()
     14 print("lossValue: ", l)
     15 l.retain_grad()
---> 16 l.backward()
     17 print('Tracing back tensors:')
     18 optim.step()

~/anaconda3/envs/mpa/lib/python3.10/site-packages/torch/_tensor.py in backward(self, gradient, retain_graph, create_graph, inputs)
    485                 inputs=inputs,
    486             )
--> 487         torch.autograd.backward(
    488             self, gradient, retain_graph, create_graph, inputs=inputs
    489         )

~/anaconda3/envs/mpa/lib/python3.10/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
    198     # some Python versions print out the first line of a multi-line function
    199     # calls in the traceback and some print out the last line
--> 200     Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
    201         tensors, grad_tensors_, retain_graph, create_graph, inputs,
    202         allow_unreachable=True, accumulate_grad=True)  # Calls into the C++ engine to run the backward pass

RuntimeError: Function 'SubBackward0' returned nan values in its 1th output.

The subtraction in question is:

def func_p(a, b, c):
    a_cos = -c / torch.sqrt(a**2 + b**2)
    a_cos[a_cos < -1] = -0.99999
    a_cos[a_cos > 1] = 0.99999
    a_tan = torch.atan(-a / b)
    t_cos = torch.Tensor([[1], [-1]]) * torch.acos(a_cos)
    x = t_cos - a_tan    # <-- the subtraction where SubBackward0 returns NaN

    return torch.clip(x, min=0, max=torch.pi / 2)

All of the values are as they should be, which I verified in the debugger.
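
For reference, a programmatic version of that check would look something like this (sketched with made-up inputs; the real a, b and c come from the model's parameters):

import torch

# Sketch: probe func_p's intermediates for NaN/Inf (hypothetical inputs)
a = torch.tensor([-0.015])
b = torch.tensor([-0.016])
c = torch.tensor([-0.018])

a_cos = -c / torch.sqrt(a**2 + b**2)
a_tan = torch.atan(-a / b)
for name, t in (("a_cos", a_cos), ("a_tan", a_tan)):
    assert not torch.isnan(t).any(), f"{name} contains NaN"
    assert not torch.isinf(t).any(), f"{name} contains Inf"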

Any idea how and why this can happen?

It looks like the two error messages point to different places: SubBackward0 and CosBackward0. Are they from the same run of the program?

If it occurred in CosBackward0, it could be because you saved a NaN for backward, e.g. your phi is NaN.
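
As a minimal sketch of that failure mode (with made-up values, not your generate_r): once a NaN is in the tensor saved for cos's backward, CosBackward0's output, -sin(phi) * grad, is NaN, and anomaly mode flags it:

import torch

torch.autograd.set_detect_anomaly(True)

# Hypothetical repro: phi contains a NaN and is saved for cos's backward
phi = torch.tensor([0.5, float("nan")], requires_grad=True)
u = torch.cos(phi)
u.sum().backward()
# RuntimeError: Function 'CosBackward0' returned nan values in its 0th output.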