@pading
This is a repeat of the question
Some of the reasons are:
- A huge difference in scale between the input data and the output data
- Large fluctuations of the values in the output data
Example
import os
import numpy as np
import time
import torch
import torchvision
from torch import nn
from torch.autograd import Variable
# Seed PyTorch's random number generator so the model's initial weights are
# repeatable between runs. NOTE(review): NumPy is not seeded in the visible
# code, so data drawn with np.random will still differ from run to run.
torch.manual_seed(42)
class sample_model(nn.Module):
    """Tiny fully connected regressor used to demonstrate a diverging loss.

    The network is a single ``nn.Sequential`` stored in ``self.sequence``:
    ``Linear(100, 10) -> BatchNorm1d(10) -> ReLU(inplace) -> Linear(10, 1)``.
    """

    def __init__(self):
        """Build the layer stack; takes no configuration arguments."""
        super().__init__()
        self.sequence = nn.Sequential(
            nn.Linear(100, 10),
            nn.BatchNorm1d(10),
            # In-place ReLU overwrites the BatchNorm output instead of
            # allocating a new tensor.
            nn.ReLU(inplace=True),
            nn.Linear(10, 1),
        )

    def forward(self, x):
        """Run ``x`` through the layer stack.

        Args:
            x: Float tensor whose last dimension is 100 (the first
                ``Linear`` layer's input size); used as ``(batch, 100)``
                in this example.

        Returns:
            The output of the final ``Linear(10, 1)`` layer, i.e. one value
            per input row with a trailing dimension of size 1.
        """
        return self.sequence(x)
# Synthetic regression data: 100 samples of 100 small integer features (0-9)
# against integer targets drawn from [0, 10_000_000).
# NOTE: the targets are deliberately left unscaled -- the example exists to
# show what such a scale mismatch does to the loss during training.
X = np.random.randint(10, size=(100, 100))
X = X.astype(np.float32)
y = np.random.randint(10000000, size=(100,))
X = torch.FloatTensor(X)
y = torch.FloatTensor(y)

num_epochs = 100
learning_rate = 0.01

model = sample_model()
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# Full-batch gradient descent: every epoch uses all 100 samples at once.
for epoch in range(num_epochs):
    dataOutput = model(X)
    # The model returns shape (100, 1) while y is (100,). Passing both to
    # MSELoss unchanged would broadcast them to (100, 100) and average the
    # error over every prediction/target pair, so drop the trailing
    # dimension to compare each prediction with its own target.
    loss = criterion(dataOutput.squeeze(1), y)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Always true with a modulus of 1; raise it to log less frequently.
    if epoch % 1 == 0:
        # .item() extracts the Python float so the format spec applies to a
        # plain number rather than a tensor.
        print('epoch [{}/{}], loss:{:.4f}'.format(epoch + 1, num_epochs, loss.item()))
# Results
epoch [1/100], loss:38531426680832.0000
epoch [2/100], loss:3803169763669049344.0000
epoch [3/100], loss:inf
epoch [4/100], loss:nan
epoch [5/100], loss:nan
epoch [6/100], loss:nan
epoch [7/100], loss:nan
epoch [8/100], loss:nan
You can simply read about backpropagation to see how large gradients can quickly get out of hand and grow to infinity.