Why did I train for more than 153440 iterations, but get NaN in the end?

@pading
This is a repeat of the question

Some possible reasons are:

  • Huge difference in scale of input data vs output data

  • Large fluctuation of values in the output data

Example

import os
import numpy as np
import time
import torch
import torchvision
from torch import nn
from torch.autograd import Variable
torch.manual_seed(42)

class sample_model(nn.Module):
    """Tiny regression network: 100 features -> 10 hidden units -> 1 output.

    The hidden activations are batch-normalised and then passed through an
    in-place ReLU before the final linear projection.
    """

    def __init__(self):
        super().__init__()
        # Build the layers in order first, then wrap them; the attribute name
        # ``sequence`` is kept so parameter names in the state dict stay the same.
        layers = [
            nn.Linear(100, 10),
            nn.BatchNorm1d(10),
            nn.ReLU(inplace=True),
            nn.Linear(10, 1),
        ]
        self.sequence = nn.Sequential(*layers)

    def forward(self, x):
        """Feed ``x`` through the layer stack and return its output."""
        return self.sequence(x)


# Synthetic regression data with deliberately mismatched scales: the inputs are
# small integers in [0, 10) while the targets range up to 10,000,000. This
# mismatch is what makes the loss blow up to inf and then NaN.
# NOTE: NumPy's RNG is not seeded in this block, so the exact loss values
# differ from run to run.
X = np.random.randint(10, size=(100, 100))
X = X.astype(np.float32)
y = np.random.randint(10000000, size=(100,))
X = torch.FloatTensor(X)
# Reshape the targets to (100, 1) so they line up with the model output.
# With a flat (100,) target, MSELoss broadcasts the pair to (100, 100) and
# compares every prediction against every target instead of its own.
y = torch.FloatTensor(y).unsqueeze(1)

num_epochs = 100
learning_rate = 0.01
model = sample_model()
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# Full-batch gradient descent: one forward/backward pass over all 100 samples
# per epoch, logging the loss every epoch.
for epoch in range(num_epochs):
    dataOutput = model(X)
    loss = criterion(dataOutput, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # .item() extracts the Python float from the 0-dim loss tensor.
    print('epoch [{}/{}], loss:{:.4f}'.format(epoch + 1, num_epochs, loss.item()))

# Results
epoch [1/100], loss:38531426680832.0000
epoch [2/100], loss:3803169763669049344.0000
epoch [3/100], loss:inf
epoch [4/100], loss:nan
epoch [5/100], loss:nan
epoch [6/100], loss:nan
epoch [7/100], loss:nan
epoch [8/100], loss:nan

You can simply read about backpropagation to see how large gradients can quickly get out of hand and overflow to inf, after which the loss becomes NaN.