Using a custom loss function produces NaN

I have been trying to apply a new custom loss function to an algorithm that trains fine with BCEWithLogitsLoss. The loss is


$$\mathcal{L} = \frac{1}{N}\sum_{i}\bigl(-p_i \log_2 p_i\bigr)^{\lambda\,\lvert 1 - o_i - b_i \rvert},
\qquad p_i = o_i^{\,b_i}\,(1 - o_i)^{\,1 - b_i},$$

where $o_i$ represents the sigmoid of the decoded values and $b_i$ represents the correct labels.

import torch

eps = 1e-10

def NBER_Loss(decoded_value, correct_result, custom_lambda=1):
    # o_i: sigmoid of the decoded values, clamped away from exactly 0 and 1
    decoded_value = decoded_value.sigmoid().clamp(eps, 1 - eps)
    # p_i = o_i^{b_i} * (1 - o_i)^{1 - b_i}, the likelihood of the correct label
    loss1 = (torch.pow(decoded_value, correct_result)
             * torch.pow(1 - decoded_value, 1 - correct_result)).clamp(eps, 1 - eps)
    # (-p * log2 p) raised to lambda * |1 - o - b| + eps, averaged over the batch
    loss = torch.mean(torch.pow(-loss1 * torch.log2(loss1).clamp(-100.0, 0.0),
                                custom_lambda * torch.abs(1 - decoded_value - correct_result) + eps))
    return loss

The output is as follows:

decoded pre sigmoid tensor([[ -55.0442,   55.0442,  -55.0442,  ...,   -3.9323,   -9.0391,
            3.9323],
        [ 129.7956,   69.6005,  -62.5831,  ...,   -0.9936,   -0.9936,
           -6.7401],
        [-111.1677,  111.1677,   77.0329,  ...,   -2.4284,   -9.3211,
            2.8110],
        ...,
        [ 190.0725,  139.7126,  111.0150,  ...,  -29.0303,   28.2014,
           27.1241],
        [ 128.6071,   90.6857,  -82.5502,  ...,   -6.8028,   -4.6547,
           -4.6547],
        [-150.4684, -150.4684,  150.4684,  ...,   -5.2145,   -8.1823,
            5.2145]], device='cuda:0', grad_fn=<CopySlices>)
decoded post sigmoid tensor([[1.0000e-10, 1.0000e+00, 1.0000e-10,  ..., 1.9222e-02, 1.1866e-04,
         9.8078e-01],
        [1.0000e+00, 1.0000e+00, 1.0000e-10,  ..., 2.7021e-01, 2.7021e-01,
         1.1812e-03],
        [1.0000e-10, 1.0000e+00, 1.0000e+00,  ..., 8.1032e-02, 8.9505e-05,
         9.4327e-01],
        ...,
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 1.0000e-10, 1.0000e+00,
         1.0000e+00],
        [1.0000e+00, 1.0000e+00, 1.0000e-10,  ..., 1.1094e-03, 9.4274e-03,
         9.4274e-03],
        [1.0000e-10, 1.0000e-10, 1.0000e+00,  ..., 5.4077e-03, 2.7949e-04,
         9.9459e-01]], device='cuda:0', grad_fn=<ClampBackward1>)
Loss0  tensor([[1.0000e+00, 1.0000e-10, 1.0000e+00,  ..., 9.8078e-01, 1.1866e-04,
         1.9222e-02],
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 2.7021e-01, 2.7021e-01,
         9.9882e-01],
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 8.1032e-02, 9.9991e-01,
         5.6732e-02],
        ...,
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 1.0000e+00, 1.0000e+00,
         1.0000e+00],
        [1.0000e-10, 1.0000e-10, 1.0000e-10,  ..., 9.9889e-01, 9.9057e-01,
         9.4274e-03],
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 9.9459e-01, 9.9972e-01,
         9.9459e-01]], device='cuda:0', grad_fn=<ClampBackward1>)
Loss tensor(0.2589, device='cuda:0', grad_fn=<MeanBackward0>)
[W python_anomaly_mode.cpp:104] Warning: Error detected in MulBackward0. Traceback of forward call that caused the error:
  File "xxx.py", line 835, in <module>
    loss = NBER_Loss(decoded_bits,  0.5 *
  File "xxx.py", line 780, in NBER_Loss
    loss = torch.mean(torch.pow(-loss1*torch.log2(loss1).clamp(-100.0, 0.0),
 (function _print_stack)
Traceback (most recent call last):
  File "xxx.py", line 837, in <module>
    loss.backward()
  File "/home/xxx/.local/lib/python3.8/site-packages/torch/_tensor.py", line 255, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "/home/xxx/.local/lib/python3.8/site-packages/torch/autograd/__init__.py", line 147, in backward
    Variable._execution_engine.run_backward(
RuntimeError: Function 'MulBackward0' returned nan values in its 0th output.

Any idea where this error might be coming from?

Could you check the input values to torch.pow as well as torch.log2 and make sure they are not creating invalid outputs?
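For example, something along these lines could be dropped after each step (a minimal sketch; check_finite is just a placeholder helper, and t stands in for whichever intermediate tensor is being checked):

import torch

def check_finite(t, name):
    # Raises as soon as any forward value becomes NaN or Inf
    assert torch.isfinite(t).all(), f"{name} contains NaN/Inf"

torch.autograd.set_detect_anomaly(True) will additionally point to the backward function that first produced an invalid gradient.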


I tried that; I even added checks to detect whether there is a NaN in any of the intermediate tensors:
New Code:

def NBER_Loss(decoded_value_in, correct_result, custom_lambda=1):
    # eps = 1e-10 as above; every intermediate is printed along with a NaN check
    print("decoded pre sigmoid", decoded_value_in)
    decoded_value = decoded_value_in.sigmoid()
    print("decoded post sigmoid", decoded_value, decoded_value.isnan().any())
    pow1 = torch.pow(decoded_value, correct_result)
    print("pow1 ", pow1, pow1.isnan().any())
    pow2 = torch.pow(1-decoded_value, 1-correct_result)
    print("pow2 ", pow2,  pow2.isnan().any())
    loss1 = (pow1 * pow2)
    print("Loss1 ", loss1, loss1.isnan().any())
    logy1 = torch.log2(loss1).clamp(-100, eps)
    print("logy1 ", logy1, logy1.isnan().any())
    mul1 = (-loss1*logy1)
    print("mul1 ", mul1, mul1.isnan().any())
    exp1 = custom_lambda*torch.abs(1-decoded_value-correct_result)
    print("exp1 ", exp1, exp1.isnan().any())
    loss = torch.mean(torch.pow(mul1, exp1))
    print("Loss", loss, loss.isnan().any())
    return loss

Output:

Models are loaded!
decoded pre sigmoid tensor([[  81.2153,  -81.2153,   81.2153,  ...,   11.5352,    9.4217,
          -11.5352],
        [ 110.4419,  -87.9028,  -87.3531,  ...,   -9.1117,    8.5950,
           -8.5950],
        [ -94.5346,  -54.3137,  -44.0537,  ...,   -3.7948,    3.7948,
            3.7948],
        ...,
        [  51.4893,  -33.8269,  -45.4504,  ...,    9.6243,   -6.2635,
            4.6391],
        [-132.5985,  119.7083,  115.9472,  ...,    9.1817,   -5.7137,
           -5.7137],
        [  88.0838,  -88.0838,   17.6299,  ...,   -4.0679,    6.9286,
            4.0679]], device='cuda:0', grad_fn=<CopySlices>)
decoded post sigmoid tensor([[1.0000e+00, 5.3535e-36, 1.0000e+00,  ..., 9.9999e-01, 9.9992e-01,
         9.7800e-06],
        [1.0000e+00, 6.6723e-39, 1.1562e-38,  ..., 1.1036e-04, 9.9982e-01,
         1.8499e-04],
        [0.0000e+00, 2.5814e-24, 7.3741e-20,  ..., 2.1994e-02, 9.7801e-01,
         9.7801e-01],
        ...,
        [1.0000e+00, 2.0379e-15, 1.8245e-20,  ..., 9.9993e-01, 1.9010e-03,
         9.9043e-01],
        [0.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 9.9990e-01, 3.2896e-03,
         3.2896e-03],
        [1.0000e+00, 5.5682e-39, 1.0000e+00,  ..., 1.6825e-02, 9.9902e-01,
         9.8318e-01]], device='cuda:0', grad_fn=<SigmoidBackward>) tensor(False, device='cuda:0')
pow1  tensor([[1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 9.9999e-01, 9.9992e-01,
         1.0000e+00],
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 1.0000e+00, 9.9982e-01,
         1.0000e+00],
        [0.0000e+00, 1.0000e+00, 7.3741e-20,  ..., 2.1994e-02, 1.0000e+00,
         9.7801e-01],
        ...,
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 9.9993e-01, 1.9010e-03,
         9.9043e-01],
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 9.9990e-01, 1.0000e+00,
         1.0000e+00],
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 1.0000e+00, 1.0000e+00,
         9.8318e-01]], device='cuda:0', grad_fn=<PowBackward1>) tensor(False, device='cuda:0')
pow2  tensor([[1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 1.0000e+00, 1.0000e+00,
         9.9999e-01],
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 9.9989e-01, 1.0000e+00,
         9.9981e-01],
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 1.0000e+00, 2.1994e-02,
         1.0000e+00],
        ...,
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 1.0000e+00, 1.0000e+00,
         1.0000e+00],
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 1.0000e+00, 9.9671e-01,
         9.9671e-01],
        [1.0000e+00, 1.0000e+00, 0.0000e+00,  ..., 9.8318e-01, 9.7847e-04,
         1.0000e+00]], device='cuda:0', grad_fn=<PowBackward1>) tensor(False, device='cuda:0')
Loss1  tensor([[1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 9.9999e-01, 9.9992e-01,
         9.9999e-01],
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 9.9989e-01, 9.9982e-01,
         9.9981e-01],
        [0.0000e+00, 1.0000e+00, 7.3741e-20,  ..., 2.1994e-02, 2.1994e-02,
         9.7801e-01],
        ...,
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 9.9993e-01, 1.9010e-03,
         9.9043e-01],
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 9.9990e-01, 9.9671e-01,
         9.9671e-01],
        [1.0000e+00, 1.0000e+00, 0.0000e+00,  ..., 9.8318e-01, 9.7847e-04,
         9.8318e-01]], device='cuda:0', grad_fn=<MulBackward0>) tensor(False, device='cuda:0')
logy1  tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.4103e-05,
         -1.1678e-04, -1.4103e-05],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.5918e-04,
         -2.6686e-04, -2.6694e-04],
        [-1.0000e+02,  0.0000e+00, -6.3556e+01,  ..., -5.5068e+00,
         -5.5068e+00, -3.2084e-02],
        ...,
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -9.5454e-05,
         -9.0390e+00, -1.3879e-02],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.4851e-04,
         -4.7538e-03, -4.7538e-03],
        [ 0.0000e+00,  0.0000e+00, -1.0000e+02,  ..., -2.4480e-02,
         -9.9972e+00, -2.4480e-02]], device='cuda:0', grad_fn=<ClampBackward1>) tensor(False, device='cuda:0')
mul1  tensor([[-0.0000e+00, -0.0000e+00, -0.0000e+00,  ..., 1.4103e-05, 1.1677e-04,
         1.4103e-05],
        [-0.0000e+00, -0.0000e+00, -0.0000e+00,  ..., 1.5916e-04, 2.6681e-04,
         2.6689e-04],
        [0.0000e+00, -0.0000e+00, 4.6867e-18,  ..., 1.2111e-01, 1.2111e-01,
         3.1379e-02],
        ...,
        [-0.0000e+00, -0.0000e+00, -0.0000e+00,  ..., 9.5447e-05, 1.7183e-02,
         1.3746e-02],
        [-0.0000e+00, -0.0000e+00, -0.0000e+00,  ..., 1.4850e-04, 4.7381e-03,
         4.7381e-03],
        [-0.0000e+00, -0.0000e+00, 0.0000e+00,  ..., 2.4068e-02, 9.7819e-03,
         2.4068e-02]], device='cuda:0', grad_fn=<MulBackward0>) tensor(False, device='cuda:0')
exp1  tensor([[1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 9.9999e-01, 9.9992e-01,
         9.9999e-01],
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 9.9989e-01, 9.9982e-01,
         9.9981e-01],
        [0.0000e+00, 1.0000e+00, 0.0000e+00,  ..., 2.1994e-02, 2.1994e-02,
         9.7801e-01],
        ...,
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 9.9993e-01, 1.9010e-03,
         9.9043e-01],
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 9.9990e-01, 9.9671e-01,
         9.9671e-01],
        [1.0000e+00, 1.0000e+00, 0.0000e+00,  ..., 9.8318e-01, 9.7847e-04,
         9.8318e-01]], device='cuda:0', grad_fn=<MulBackward0>) tensor(False, device='cuda:0')
Loss tensor(0.2544, device='cuda:0', grad_fn=<MeanBackward0>) tensor(False, device='cuda:0')

[W python_anomaly_mode.cpp:104] Warning: Error detected in MulBackward0. Traceback of forward call that caused the error:
  File "train_KO_m2.py", line 868, in <module>
    loss = NBER_Loss(decoded_bits,  0.5 *
  File "train_KO_m2.py", line 801, in NBER_Loss
    mul1 = (-loss1*logy1)
 (function _print_stack)
Traceback (most recent call last):
  File "train_KO_m2.py", line 870, in <module>
    loss.backward()
  File "/home/marwan/.local/lib/python3.8/site-packages/torch/_tensor.py", line 255, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "/home/marwan/.local/lib/python3.8/site-packages/torch/autograd/__init__.py", line 147, in backward
    Variable._execution_engine.run_backward(
RuntimeError: Function 'MulBackward0' returned nan values in its 0th output.

The solution turned out to be clamping the input of the log rather than its output. With the clamp after the log, log2 is still evaluated at 0, where its derivative is infinite; clamp then passes a zero gradient back through that element, and 0 * inf gives NaN in the backward pass. Clamping the input keeps both the forward value and the gradient finite, so backpropagation goes through.
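A minimal sketch of the reworked function under that fix (not the exact final code; note that in float32 the bound 1 - 1e-10 rounds to exactly 1.0, so a larger eps such as 1e-6 keeps the base of the outer pow strictly positive):

import torch

eps = 1e-6  # 1e-10 would underflow against 1.0 in float32

def NBER_Loss(decoded_value_in, correct_result, custom_lambda=1):
    decoded_value = decoded_value_in.sigmoid()
    # p = o^b * (1 - o)^(1 - b), clamped *before* the log
    p = (torch.pow(decoded_value, correct_result)
         * torch.pow(1 - decoded_value, 1 - correct_result)).clamp(eps, 1 - eps)
    # log2 never sees 0, so its gradient 1 / (p * ln 2) stays finite
    logy = torch.log2(p)
    loss = torch.mean(torch.pow(-p * logy,
                                custom_lambda * torch.abs(1 - decoded_value - correct_result) + eps))
    return loss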