The logits stay valid (all values finite), but the loss becomes invalid (NaN).
Some prints from the run are below; Train_Loss is the scaled loss:
Epoch 163 | Train_Loss: 5692348.000000 | loss 3.643103
torch.float32
logits: tensor([[ 0.3500, -0.0361, -0.1291, ..., 0.1045, -0.0760, 0.0441],
[ 0.2364, -0.0687, -0.0700, ..., 0.0250, -0.0508, -0.0074],
[ 0.4655, -0.0218, -0.1786, ..., 0.1280, -0.1019, 0.0736],
...,
[ 0.8182, -0.3446, 0.8706, ..., -0.6142, -0.1828, 0.1664],
[ 0.3388, -0.0269, -0.1201, ..., 0.0948, -0.0739, 0.0373],
[ 0.3903, -0.0325, -0.1476, ..., 0.1213, -0.0867, 0.0581]],
device='cuda:0', grad_fn=<AddBackward0>)
logp tensor([[0.0343, 0.0233, 0.0212, ..., 0.0268, 0.0224, 0.0252],
[0.0310, 0.0228, 0.0228, ..., 0.0251, 0.0232, 0.0243],
[0.0381, 0.0234, 0.0200, ..., 0.0272, 0.0216, 0.0257],
...,
[0.0006, 0.0002, 0.0007, ..., 0.0002, 0.0002, 0.0003],
[0.0340, 0.0236, 0.0215, ..., 0.0266, 0.0225, 0.0251],
[0.0355, 0.0233, 0.0207, ..., 0.0271, 0.0220, 0.0255]],
device='cuda:0', grad_fn=<SoftmaxBackward0>)
time per epoch 0:00:00.103240
Epoch 164 | Train_Loss: 5692348.000000 | loss 3.643103
torch.float32
logits: tensor([[ 0.3586, -0.0380, -0.1317, ..., 0.1071, -0.0782, 0.0441],
[ 0.2399, -0.0689, -0.0712, ..., 0.0268, -0.0518, -0.0072],
[ 0.4774, -0.0244, -0.1818, ..., 0.1304, -0.1045, 0.0734],
...,
[ 0.8164, -0.3412, 0.8585, ..., -0.6064, -0.1850, 0.1634],
[ 0.3458, -0.0285, -0.1219, ..., 0.0964, -0.0756, 0.0370],
[ 0.4001, -0.0347, -0.1504, ..., 0.1240, -0.0890, 0.0580]],
device='cuda:0', grad_fn=<AddBackward0>)
logp tensor([[0.0345, 0.0232, 0.0211, ..., 0.0268, 0.0223, 0.0252],
[0.0311, 0.0228, 0.0228, ..., 0.0251, 0.0232, 0.0243],
[0.0384, 0.0233, 0.0199, ..., 0.0272, 0.0215, 0.0257],
...,
[0.0006, 0.0002, 0.0007, ..., 0.0002, 0.0002, 0.0003],
[0.0342, 0.0235, 0.0214, ..., 0.0266, 0.0224, 0.0251],
[0.0358, 0.0232, 0.0206, ..., 0.0272, 0.0219, 0.0254]],
device='cuda:0', grad_fn=<SoftmaxBackward0>)
time per epoch 0:00:00.102102
Epoch 165 | Train_Loss: 5690994.500000 | loss 3.642236
torch.float32
logits: tensor([[ 0.3679, -0.0400, -0.1345, ..., 0.1098, -0.0805, 0.0441],
[ 0.2436, -0.0692, -0.0726, ..., 0.0286, -0.0528, -0.0071],
[ 0.4897, -0.0272, -0.1853, ..., 0.1328, -0.1074, 0.0732],
...,
[ 0.8156, -0.3383, 0.8464, ..., -0.5990, -0.1871, 0.1604],
[ 0.3530, -0.0303, -0.1240, ..., 0.0980, -0.0774, 0.0366],
[ 0.4107, -0.0371, -0.1536, ..., 0.1268, -0.0917, 0.0580]],
device='cuda:0', grad_fn=<AddBackward0>)
logp tensor([[0.0348, 0.0231, 0.0211, ..., 0.0269, 0.0222, 0.0252],
[0.0312, 0.0228, 0.0227, ..., 0.0251, 0.0232, 0.0243],
[0.0388, 0.0232, 0.0198, ..., 0.0272, 0.0214, 0.0256],
...,
[0.0006, 0.0002, 0.0007, ..., 0.0002, 0.0002, 0.0003],
[0.0344, 0.0235, 0.0214, ..., 0.0267, 0.0224, 0.0251],
[0.0361, 0.0231, 0.0205, ..., 0.0272, 0.0218, 0.0254]],
device='cuda:0', grad_fn=<SoftmaxBackward0>)
time per epoch 0:00:00.102901
Epoch 166 | Train_Loss: 5689459.000000 | loss 3.641254
torch.float32
logits: tensor([[ 0.3773, -0.0422, -0.1376, ..., 0.1124, -0.0831, 0.0440],
[ 0.2477, -0.0696, -0.0741, ..., 0.0304, -0.0540, -0.0070],
[ 0.5024, -0.0302, -0.1891, ..., 0.1351, -0.1105, 0.0729],
...,
[ 0.8138, -0.3359, 0.8343, ..., -0.5911, -0.1891, 0.1574],
[ 0.3606, -0.0321, -0.1262, ..., 0.0996, -0.0793, 0.0361],
[ 0.4215, -0.0397, -0.1572, ..., 0.1297, -0.0946, 0.0578]],
device='cuda:0', grad_fn=<AddBackward0>)
logp tensor([[0.0351, 0.0231, 0.0210, ..., 0.0269, 0.0221, 0.0251],
[0.0313, 0.0228, 0.0227, ..., 0.0252, 0.0232, 0.0243],
[0.0392, 0.0230, 0.0197, ..., 0.0272, 0.0213, 0.0255],
...,
[0.0006, 0.0002, 0.0007, ..., 0.0002, 0.0002, 0.0003],
[0.0347, 0.0234, 0.0213, ..., 0.0267, 0.0223, 0.0251],
[0.0364, 0.0230, 0.0204, ..., 0.0272, 0.0217, 0.0253]],
device='cuda:0', grad_fn=<SoftmaxBackward0>)
time per epoch 0:00:00.103105
Epoch 167 | Train_Loss: 5687721.000000 | loss 3.640141
torch.float32
logits: tensor([[ 0.3874, -0.0446, -0.1411, ..., 0.1151, -0.0859, 0.0439],
[ 0.2521, -0.0701, -0.0758, ..., 0.0322, -0.0553, -0.0070],
[ 0.5159, -0.0336, -0.1933, ..., 0.1375, -0.1139, 0.0724],
...,
[ 0.8125, -0.3326, 0.8230, ..., -0.5838, -0.1909, 0.1543],
[ 0.3684, -0.0342, -0.1287, ..., 0.1012, -0.0814, 0.0357],
[ 0.4331, -0.0426, -0.1611, ..., 0.1325, -0.0977, 0.0575]],
device='cuda:0', grad_fn=<AddBackward0>)
logp tensor([[0.0354, 0.0230, 0.0209, ..., 0.0270, 0.0221, 0.0251],
[0.0314, 0.0228, 0.0227, ..., 0.0252, 0.0231, 0.0243],
[0.0397, 0.0229, 0.0195, ..., 0.0272, 0.0211, 0.0255],
...,
[0.0006, 0.0002, 0.0007, ..., 0.0002, 0.0002, 0.0003],
[0.0349, 0.0233, 0.0212, ..., 0.0267, 0.0223, 0.0250],
[0.0368, 0.0229, 0.0203, ..., 0.0272, 0.0216, 0.0253]],
device='cuda:0', grad_fn=<SoftmaxBackward0>)
time per epoch 0:00:00.102965
Epoch 168 | Train_Loss: 5685748.000000 | loss 3.638879
torch.float32
logits: tensor([[ 0.3982, -0.0473, -0.1450, ..., 0.1179, -0.0890, 0.0437],
[ 0.2567, -0.0708, -0.0777, ..., 0.0340, -0.0568, -0.0069],
[ 0.5303, -0.0372, -0.1981, ..., 0.1399, -0.1177, 0.0719],
...,
[ 0.8111, -0.3305, 0.8112, ..., -0.5764, -0.1931, 0.1508],
[ 0.3768, -0.0365, -0.1314, ..., 0.1027, -0.0838, 0.0351],
[ 0.4454, -0.0458, -0.1654, ..., 0.1354, -0.1012, 0.0572]],
device='cuda:0', grad_fn=<AddBackward0>)
logp tensor([[0.0357, 0.0229, 0.0208, ..., 0.0270, 0.0219, 0.0251],
[0.0316, 0.0228, 0.0226, ..., 0.0253, 0.0231, 0.0243],
[0.0401, 0.0227, 0.0194, ..., 0.0272, 0.0210, 0.0254],
...,
[0.0006, 0.0002, 0.0006, ..., 0.0002, 0.0002, 0.0003],
[0.0352, 0.0233, 0.0212, ..., 0.0268, 0.0222, 0.0250],
[0.0371, 0.0227, 0.0202, ..., 0.0272, 0.0215, 0.0252]],
device='cuda:0', grad_fn=<SoftmaxBackward0>)
time per epoch 0:00:00.101241
Epoch 169 | Train_Loss: nan | loss nan
torch.float32
logits: tensor([[ 0.3982, -0.0473, -0.1450, ..., 0.1179, -0.0890, 0.0437],
[ 0.2567, -0.0708, -0.0777, ..., 0.0340, -0.0568, -0.0069],
[ 0.5303, -0.0372, -0.1981, ..., 0.1399, -0.1177, 0.0719],
...,
[ 0.8111, -0.3305, 0.8112, ..., -0.5764, -0.1931, 0.1508],
[ 0.3768, -0.0365, -0.1314, ..., 0.1027, -0.0838, 0.0351],
[ 0.4454, -0.0458, -0.1654, ..., 0.1354, -0.1012, 0.0572]],
device='cuda:0', grad_fn=<AddBackward0>)
logp tensor([[0.0357, 0.0229, 0.0208, ..., 0.0270, 0.0219, 0.0251],
[0.0316, 0.0228, 0.0226, ..., 0.0253, 0.0231, 0.0243],
[0.0401, 0.0227, 0.0194, ..., 0.0272, 0.0210, 0.0254],
...,
[0.0006, 0.0002, 0.0006, ..., 0.0002, 0.0002, 0.0003],
[0.0352, 0.0233, 0.0212, ..., 0.0268, 0.0222, 0.0250],
[0.0371, 0.0227, 0.0202, ..., 0.0272, 0.0215, 0.0252]],
device='cuda:0', grad_fn=<SoftmaxBackward0>)
time per epoch 0:00:00.102409
Epoch 170 | Train_Loss: nan | loss nan
torch.float32
logits: tensor([[ 0.3982, -0.0473, -0.1450, ..., 0.1179, -0.0890, 0.0437],
[ 0.2567, -0.0708, -0.0777, ..., 0.0340, -0.0568, -0.0069],
[ 0.5303, -0.0372, -0.1981, ..., 0.1399, -0.1177, 0.0719],
...,
[ 0.8111, -0.3305, 0.8112, ..., -0.5764, -0.1931, 0.1508],
[ 0.3768, -0.0365, -0.1314, ..., 0.1027, -0.0838, 0.0351],
[ 0.4454, -0.0458, -0.1654, ..., 0.1354, -0.1012, 0.0572]],
device='cuda:0', grad_fn=<AddBackward0>)
logp tensor([[0.0357, 0.0229, 0.0208, ..., 0.0270, 0.0219, 0.0251],
[0.0316, 0.0228, 0.0226, ..., 0.0253, 0.0231, 0.0243],
[0.0401, 0.0227, 0.0194, ..., 0.0272, 0.0210, 0.0254],
...,
[0.0006, 0.0002, 0.0006, ..., 0.0002, 0.0002, 0.0003],
[0.0352, 0.0233, 0.0212, ..., 0.0268, 0.0222, 0.0250],
[0.0371, 0.0227, 0.0202, ..., 0.0272, 0.0215, 0.0252]],
device='cuda:0', grad_fn=<SoftmaxBackward0>)
time per epoch 0:00:00.101980
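Two things I notice in the prints: the logp tensor has grad_fn=<SoftmaxBackward0>, so it looks like softmax probabilities rather than log-probabilities, and taking torch.log of a probability that underflows to 0 gives -inf and then NaN; also, once the loss goes NaN at epoch 169 the printed logits stop changing. Below is a minimal sketch of how I plan to instrument the training step to catch the first non-finite value; the names model, optimizer, batch, and targets are placeholders, not my actual code.

```python
import torch
import torch.nn.functional as F

# Report the backward op that first produces NaN/Inf (slow, debug only).
torch.autograd.set_detect_anomaly(True)

def training_step(model, optimizer, batch, targets):
    logits = model(batch)

    # log_softmax is numerically stable; torch.log(softmax(x)) can hit
    # log(0) = -inf when a probability underflows, which then turns the
    # loss and the gradients into NaN.
    logp = F.log_softmax(logits, dim=-1)
    loss = F.nll_loss(logp, targets)

    # Fail fast at the first non-finite value instead of letting NaN
    # propagate into the parameters on the next optimizer step.
    if not torch.isfinite(logits).all():
        raise RuntimeError("non-finite logits")
    if not torch.isfinite(loss):
        raise RuntimeError(f"non-finite loss: {loss.item()}")

    optimizer.zero_grad()
    loss.backward()
    # Optional guard against exploding gradients.
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    return loss.item()
```

This is just a debugging sketch under those assumptions, not my real loss code; the point is to stop at the epoch where the first NaN/Inf appears so I can see whether it comes from the logits, the log of the probabilities, or the gradients.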