Using a custom loss function produces NaN

I have been trying to apply a new custom loss function to an algorithm that trains fine with BCEWithLogitsLoss. The loss is


$$\mathcal{L} = \frac{1}{N}\sum_{i}\bigl(-p_i \log_2 p_i\bigr)^{\lambda\,\lvert 1 - o_i - b_i \rvert},
\qquad p_i = o_i^{\,b_i}\,(1 - o_i)^{\,1 - b_i},$$

where $o_i$ represents the sigmoid of the decoded values and $b_i$ represents the correct labels.

import torch

eps = 1e-10

def NBER_Loss(decoded_value, correct_result, custom_lambda=1):
    # o_i: sigmoid of the decoded values, clamped away from exactly 0 and 1
    decoded_value = decoded_value.sigmoid().clamp(eps, 1 - eps)
    # p_i = o_i^{b_i} * (1 - o_i)^{1 - b_i}, the likelihood of the correct label
    loss1 = (torch.pow(decoded_value, correct_result)
             * torch.pow(1 - decoded_value, 1 - correct_result)).clamp(eps, 1 - eps)
    # (-p * log2 p) raised to lambda * |1 - o - b| + eps, averaged over the batch
    loss = torch.mean(torch.pow(-loss1 * torch.log2(loss1).clamp(-100.0, 0.0),
                                custom_lambda * torch.abs(1 - decoded_value - correct_result) + eps))
    return loss

The output is as follows:

decoded pre sigmoid tensor([[ -55.0442,   55.0442,  -55.0442,  ...,   -3.9323,   -9.0391,
            3.9323],
        [ 129.7956,   69.6005,  -62.5831,  ...,   -0.9936,   -0.9936,
           -6.7401],
        [-111.1677,  111.1677,   77.0329,  ...,   -2.4284,   -9.3211,
            2.8110],
        ...,
        [ 190.0725,  139.7126,  111.0150,  ...,  -29.0303,   28.2014,
           27.1241],
        [ 128.6071,   90.6857,  -82.5502,  ...,   -6.8028,   -4.6547,
           -4.6547],
        [-150.4684, -150.4684,  150.4684,  ...,   -5.2145,   -8.1823,
            5.2145]], device='cuda:0', grad_fn=<CopySlices>)
decoded post sigmoid tensor([[1.0000e-10, 1.0000e+00, 1.0000e-10,  ..., 1.9222e-02, 1.1866e-04,
         9.8078e-01],
        [1.0000e+00, 1.0000e+00, 1.0000e-10,  ..., 2.7021e-01, 2.7021e-01,
         1.1812e-03],
        [1.0000e-10, 1.0000e+00, 1.0000e+00,  ..., 8.1032e-02, 8.9505e-05,
         9.4327e-01],
        ...,
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 1.0000e-10, 1.0000e+00,
         1.0000e+00],
        [1.0000e+00, 1.0000e+00, 1.0000e-10,  ..., 1.1094e-03, 9.4274e-03,
         9.4274e-03],
        [1.0000e-10, 1.0000e-10, 1.0000e+00,  ..., 5.4077e-03, 2.7949e-04,
         9.9459e-01]], device='cuda:0', grad_fn=<ClampBackward1>)
Loss0  tensor([[1.0000e+00, 1.0000e-10, 1.0000e+00,  ..., 9.8078e-01, 1.1866e-04,
         1.9222e-02],
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 2.7021e-01, 2.7021e-01,
         9.9882e-01],
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 8.1032e-02, 9.9991e-01,
         5.6732e-02],
        ...,
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 1.0000e+00, 1.0000e+00,
         1.0000e+00],
        [1.0000e-10, 1.0000e-10, 1.0000e-10,  ..., 9.9889e-01, 9.9057e-01,
         9.4274e-03],
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 9.9459e-01, 9.9972e-01,
         9.9459e-01]], device='cuda:0', grad_fn=<ClampBackward1>)
Loss tensor(0.2589, device='cuda:0', grad_fn=<MeanBackward0>)
[W python_anomaly_mode.cpp:104] Warning: Error detected in MulBackward0. Traceback of forward call that caused the error:
  File "xxx.py", line 835, in <module>
    loss = NBER_Loss(decoded_bits,  0.5 *
  File "xxx.py", line 780, in NBER_Loss
    loss = torch.mean(torch.pow(-loss1*torch.log2(loss1).clamp(-100.0, 0.0),
 (function _print_stack)
Traceback (most recent call last):
  File "xxx.py", line 837, in <module>
    loss.backward()
  File "/home/xxx/.local/lib/python3.8/site-packages/torch/_tensor.py", line 255, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "/home/xxx/.local/lib/python3.8/site-packages/torch/autograd/__init__.py", line 147, in backward
    Variable._execution_engine.run_backward(
RuntimeError: Function 'MulBackward0' returned nan values in its 0th output.

Any idea where this error might be coming from?

Could you check the input values to torch.pow as well as torch.log2 and make sure they are not creating invalid outputs?
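For example, something along these lines could be dropped after each step (a minimal sketch; check_finite is just a placeholder helper, and t stands in for whichever intermediate tensor is being checked):

import torch

def check_finite(t, name):
    # Raises as soon as any forward value becomes NaN or Inf
    assert torch.isfinite(t).all(), f"{name} contains NaN/Inf"

torch.autograd.set_detect_anomaly(True) will additionally point to the backward function that first produced an invalid gradient.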


I tried that; I even added checks to detect whether there is a NaN in any of the intermediate tensors:
New Code:

def NBER_Loss(decoded_value_in, correct_result, custom_lambda=1):
    # eps = 1e-10 as above; every intermediate is printed along with a NaN check
    print("decoded pre sigmoid", decoded_value_in)
    decoded_value = decoded_value_in.sigmoid()
    print("decoded post sigmoid", decoded_value, decoded_value.isnan().any())
    pow1 = torch.pow(decoded_value, correct_result)
    print("pow1 ", pow1, pow1.isnan().any())
    pow2 = torch.pow(1-decoded_value, 1-correct_result)
    print("pow2 ", pow2,  pow2.isnan().any())
    loss1 = (pow1 * pow2)
    print("Loss1 ", loss1, loss1.isnan().any())
    logy1 = torch.log2(loss1).clamp(-100, eps)
    print("logy1 ", logy1, logy1.isnan().any())
    mul1 = (-loss1*logy1)
    print("mul1 ", mul1, mul1.isnan().any())
    exp1 = custom_lambda*torch.abs(1-decoded_value-correct_result)
    print("exp1 ", exp1, exp1.isnan().any())
    loss = torch.mean(torch.pow(mul1, exp1))
    print("Loss", loss, loss.isnan().any())
    return loss

Output:

Models are loaded!
decoded pre sigmoid tensor([[  81.2153,  -81.2153,   81.2153,  ...,   11.5352,    9.4217,
          -11.5352],
        [ 110.4419,  -87.9028,  -87.3531,  ...,   -9.1117,    8.5950,
           -8.5950],
        [ -94.5346,  -54.3137,  -44.0537,  ...,   -3.7948,    3.7948,
            3.7948],
        ...,
        [  51.4893,  -33.8269,  -45.4504,  ...,    9.6243,   -6.2635,
            4.6391],
        [-132.5985,  119.7083,  115.9472,  ...,    9.1817,   -5.7137,
           -5.7137],
        [  88.0838,  -88.0838,   17.6299,  ...,   -4.0679,    6.9286,
            4.0679]], device='cuda:0', grad_fn=<CopySlices>)
decoded post sigmoid tensor([[1.0000e+00, 5.3535e-36, 1.0000e+00,  ..., 9.9999e-01, 9.9992e-01,
         9.7800e-06],
        [1.0000e+00, 6.6723e-39, 1.1562e-38,  ..., 1.1036e-04, 9.9982e-01,
         1.8499e-04],
        [0.0000e+00, 2.5814e-24, 7.3741e-20,  ..., 2.1994e-02, 9.7801e-01,
         9.7801e-01],
        ...,
        [1.0000e+00, 2.0379e-15, 1.8245e-20,  ..., 9.9993e-01, 1.9010e-03,
         9.9043e-01],
        [0.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 9.9990e-01, 3.2896e-03,
         3.2896e-03],
        [1.0000e+00, 5.5682e-39, 1.0000e+00,  ..., 1.6825e-02, 9.9902e-01,
         9.8318e-01]], device='cuda:0', grad_fn=<SigmoidBackward>) tensor(False, device='cuda:0')
pow1  tensor([[1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 9.9999e-01, 9.9992e-01,
         1.0000e+00],
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 1.0000e+00, 9.9982e-01,
         1.0000e+00],
        [0.0000e+00, 1.0000e+00, 7.3741e-20,  ..., 2.1994e-02, 1.0000e+00,
         9.7801e-01],
        ...,
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 9.9993e-01, 1.9010e-03,
         9.9043e-01],
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 9.9990e-01, 1.0000e+00,
         1.0000e+00],
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 1.0000e+00, 1.0000e+00,
         9.8318e-01]], device='cuda:0', grad_fn=<PowBackward1>) tensor(False, device='cuda:0')
pow2  tensor([[1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 1.0000e+00, 1.0000e+00,
         9.9999e-01],
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 9.9989e-01, 1.0000e+00,
         9.9981e-01],
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 1.0000e+00, 2.1994e-02,
         1.0000e+00],
        ...,
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 1.0000e+00, 1.0000e+00,
         1.0000e+00],
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 1.0000e+00, 9.9671e-01,
         9.9671e-01],
        [1.0000e+00, 1.0000e+00, 0.0000e+00,  ..., 9.8318e-01, 9.7847e-04,
         1.0000e+00]], device='cuda:0', grad_fn=<PowBackward1>) tensor(False, device='cuda:0')
Loss1  tensor([[1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 9.9999e-01, 9.9992e-01,
         9.9999e-01],
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 9.9989e-01, 9.9982e-01,
         9.9981e-01],
        [0.0000e+00, 1.0000e+00, 7.3741e-20,  ..., 2.1994e-02, 2.1994e-02,
         9.7801e-01],
        ...,
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 9.9993e-01, 1.9010e-03,
         9.9043e-01],
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 9.9990e-01, 9.9671e-01,
         9.9671e-01],
        [1.0000e+00, 1.0000e+00, 0.0000e+00,  ..., 9.8318e-01, 9.7847e-04,
         9.8318e-01]], device='cuda:0', grad_fn=<MulBackward0>) tensor(False, device='cuda:0')
logy1  tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.4103e-05,
         -1.1678e-04, -1.4103e-05],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.5918e-04,
         -2.6686e-04, -2.6694e-04],
        [-1.0000e+02,  0.0000e+00, -6.3556e+01,  ..., -5.5068e+00,
         -5.5068e+00, -3.2084e-02],
        ...,
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -9.5454e-05,
         -9.0390e+00, -1.3879e-02],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -1.4851e-04,
         -4.7538e-03, -4.7538e-03],
        [ 0.0000e+00,  0.0000e+00, -1.0000e+02,  ..., -2.4480e-02,
         -9.9972e+00, -2.4480e-02]], device='cuda:0', grad_fn=<ClampBackward1>) tensor(False, device='cuda:0')
mul1  tensor([[-0.0000e+00, -0.0000e+00, -0.0000e+00,  ..., 1.4103e-05, 1.1677e-04,
         1.4103e-05],
        [-0.0000e+00, -0.0000e+00, -0.0000e+00,  ..., 1.5916e-04, 2.6681e-04,
         2.6689e-04],
        [0.0000e+00, -0.0000e+00, 4.6867e-18,  ..., 1.2111e-01, 1.2111e-01,
         3.1379e-02],
        ...,
        [-0.0000e+00, -0.0000e+00, -0.0000e+00,  ..., 9.5447e-05, 1.7183e-02,
         1.3746e-02],
        [-0.0000e+00, -0.0000e+00, -0.0000e+00,  ..., 1.4850e-04, 4.7381e-03,
         4.7381e-03],
        [-0.0000e+00, -0.0000e+00, 0.0000e+00,  ..., 2.4068e-02, 9.7819e-03,
         2.4068e-02]], device='cuda:0', grad_fn=<MulBackward0>) tensor(False, device='cuda:0')
exp1  tensor([[1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 9.9999e-01, 9.9992e-01,
         9.9999e-01],
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 9.9989e-01, 9.9982e-01,
         9.9981e-01],
        [0.0000e+00, 1.0000e+00, 0.0000e+00,  ..., 2.1994e-02, 2.1994e-02,
         9.7801e-01],
        ...,
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 9.9993e-01, 1.9010e-03,
         9.9043e-01],
        [1.0000e+00, 1.0000e+00, 1.0000e+00,  ..., 9.9990e-01, 9.9671e-01,
         9.9671e-01],
        [1.0000e+00, 1.0000e+00, 0.0000e+00,  ..., 9.8318e-01, 9.7847e-04,
         9.8318e-01]], device='cuda:0', grad_fn=<MulBackward0>) tensor(False, device='cuda:0')
Loss tensor(0.2544, device='cuda:0', grad_fn=<MeanBackward0>) tensor(False, device='cuda:0')

[W python_anomaly_mode.cpp:104] Warning: Error detected in MulBackward0. Traceback of forward call that caused the error:
  File "train_KO_m2.py", line 868, in <module>
    loss = NBER_Loss(decoded_bits,  0.5 *
  File "train_KO_m2.py", line 801, in NBER_Loss
    mul1 = (-loss1*logy1)
 (function _print_stack)
Traceback (most recent call last):
  File "train_KO_m2.py", line 870, in <module>
    loss.backward()
  File "/home/marwan/.local/lib/python3.8/site-packages/torch/_tensor.py", line 255, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "/home/marwan/.local/lib/python3.8/site-packages/torch/autograd/__init__.py", line 147, in backward
    Variable._execution_engine.run_backward(
RuntimeError: Function 'MulBackward0' returned nan values in its 0th output.

The solution turned out to be clamping the input of the log rather than its output. With the clamp after the log, log2 is still evaluated at 0, where its derivative is infinite; clamp then passes a zero gradient back through that element, and 0 * inf gives NaN in the backward pass. Clamping the input keeps both the forward value and the gradient finite, so backpropagation goes through.
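A minimal sketch of the reworked function under that fix (not the exact final code; note that in float32 the bound 1 - 1e-10 rounds to exactly 1.0, so a larger eps such as 1e-6 keeps the base of the outer pow strictly positive):

import torch

eps = 1e-6  # 1e-10 would underflow against 1.0 in float32

def NBER_Loss(decoded_value_in, correct_result, custom_lambda=1):
    decoded_value = decoded_value_in.sigmoid()
    # p = o^b * (1 - o)^(1 - b), clamped *before* the log
    p = (torch.pow(decoded_value, correct_result)
         * torch.pow(1 - decoded_value, 1 - correct_result)).clamp(eps, 1 - eps)
    # log2 never sees 0, so its gradient 1 / (p * ln 2) stays finite
    logy = torch.log2(p)
    loss = torch.mean(torch.pow(-p * logy,
                                custom_lambda * torch.abs(1 - decoded_value - correct_result) + eps))
    return loss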