print(final_inputs,final_inputs.isnan().any())
tensor([[ 4.5512e+03, 2.2965e+03, 3.7338e+03, -1.1410e+03],
[-1.2685e+04, 2.1673e+05, 2.7743e+05, 2.2019e+05],
[ 1.3482e+03, 4.0191e+04, 1.5236e+04, 8.1835e+03],
...,
[ 4.0465e+04, 2.3288e+04, 2.5682e+04, 6.2736e+03],
[ 7.3675e+05, -9.9548e+05, -1.4421e+06, 1.7742e+04],
[ 1.3802e+03, 9.2324e+02, 9.4439e+02, 3.3211e+01]],
grad_fn=<AddmmBackward>) tensor(False)
Epoch 0, Iteration: 0.000, Loss:72828.328
tensor([[ 4.8688e+17, -5.5222e+16, 2.8847e+17, -7.2008e+17],
[ 4.8875e+17, -5.5577e+16, 2.8895e+17, -7.2208e+17],
[ 2.8952e+19, -3.2968e+18, 1.7122e+19, -4.2774e+19],
...,
[ 2.7599e+15, -3.1438e+14, 1.6323e+15, -4.0776e+15],
[ 6.9775e+18, -7.9428e+17, 4.1225e+18, -1.0305e+19],
[ 2.7847e+19, -3.1713e+18, 1.6469e+19, -4.1142e+19]],
grad_fn=<AddmmBackward>) tensor(False)
Epoch 0, Iteration: 1.000, Loss:479207024681287680.000
tensor([[-7.4644e+29, 9.6666e+25, 7.4367e+29, 2.6739e+27],
[-2.0749e+33, 2.6870e+29, 2.0672e+33, 7.4327e+30],
[-1.8951e+30, 2.4542e+26, 1.8880e+30, 6.7886e+27],
...,
[-1.2698e+30, 1.6444e+26, 1.2650e+30, 4.5486e+27],
[-1.7603e+33, 2.2796e+29, 1.7537e+33, 6.3057e+30],
[-2.1945e+30, 2.8420e+26, 2.1864e+30, 7.8614e+27]],
grad_fn=<AddmmBackward>) tensor(False)
Epoch 0, Iteration: 2.000, Loss:301857268183970173654388944404480.000
tensor([[ 1.2171e+14, 2.3199e+10, -1.2212e+14, 3.9526e+11],
[ 1.2171e+14, 2.3199e+10, -1.2212e+14, 3.9526e+11],
[ 1.2171e+14, 2.3199e+10, -1.2212e+14, 3.9526e+11],
...,
[ 1.2171e+14, 2.3199e+10, -1.2212e+14, 3.9526e+11],
[ 1.2171e+14, 2.3199e+10, -1.2212e+14, 3.9526e+11],
[ 1.2171e+14, 2.3199e+10, -1.2212e+14, 3.9526e+11]],
grad_fn=<AddmmBackward>) tensor(False)
Epoch 0, Iteration: 3.000, Loss:67027314147328.000
tensor([[-8.5834e+19, 1.3941e+16, 8.5509e+19, 3.1127e+17],
[-8.5834e+19, 1.3941e+16, 8.5509e+19, 3.1127e+17],
[-8.5834e+19, 1.3941e+16, 8.5509e+19, 3.1127e+17],
...,
[-8.5834e+19, 1.3941e+16, 8.5509e+19, 3.1127e+17],
[-8.5834e+19, 1.3941e+16, 8.5509e+19, 3.1127e+17],
[-8.5834e+19, 1.3941e+16, 8.5509e+19, 3.1127e+17]],
grad_fn=<AddmmBackward>) tensor(False)
After this point, I get this error: RuntimeError: Function 'LogSoftmaxBackward' returned nan values in its 0th output.