Outputs of the model are sometimes nan, sometimes [0.]

I am training a simple model with three numerical input features and one numerical output. The printed outputs are sometimes nan, sometimes [0.] for every training example. I adjusted the number of layers and nodes, but it didn't help. I tried the model on another, random dataset and it produced reasonable outputs, so maybe the data is not being imported correctly?
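A quick way to rule out an import problem is to inspect the loaded DataFrame directly; a minimal sketch, assuming the same file:

import pandas as pd

df = pd.read_excel('nn_CDFarm_torch_profit_train.xlsx', header=0)
print(df.dtypes)        # all four columns should be numeric
print(df.isna().sum())  # missing cells become NaN inputs and poison the loss
print(df.describe())    # check the scale of the three prices and of the profit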


import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim

class DatasetCDFarm(Dataset): 

    def __init__(self, file_path):
        df = pd.read_excel(file_path, header = 0)
        df_array = df.to_numpy() # transform df to a numpy array
        self.len = df_array.shape[0]
        self.x = torch.from_numpy(df_array[:, 0:3]) # prices of barley, rapeseed, wheat 
        self.y = torch.from_numpy(df_array[:, [3]]) # profit

    def __getitem__(self, index):  
        return self.x[index], self.y[index]

    def __len__(self): 
        return self.len

train_dataset = DatasetCDFarm('nn_CDFarm_torch_profit_train.xlsx')
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=16,
                          shuffle=True)

class Net(nn.Module): 

    def __init__(self, input_size, hidden1_size, hidden2_size, output_size): 
        super(Net, self).__init__() 
        self.fc1 = nn.Linear(input_size, hidden1_size) # first layer
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden1_size, hidden2_size) # second layer
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden2_size, output_size) # output layer
        self.relu3 = nn.ReLU()

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu1(out)
        out = self.fc2(out)
        out = self.relu2(out)
        out = self.fc3(out) 
        out = self.relu3(out)
        return out

net = Net(3, 60, 60, 1)  # avoid shadowing the class name

criterion = nn.MSELoss()
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)

def train(epoch):
    for batch_id, data in enumerate(train_loader):

        inputs, labels = data

        # the Excel data arrives as float64; the model's weights are float32
        inputs = inputs.float()
        labels = labels.float()

        print(epoch, batch_id, "inputs", inputs.data, "labels", labels.data)
        out = net(inputs)
        print('out', type(out), out)
        print('labels', type(labels), labels)

        loss = criterion(out, labels)
        print(epoch, batch_id, loss.data)

        # standard update step: clear old gradients, backprop, apply the update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

for epoch in range(1, 100):
    train(epoch)

Answer: remove the last ReLU layer and try it again. A regression output must be unbounded: profit can be negative, and a ReLU on the output both clamps those values to zero and kills the gradient whenever its input is negative.
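A sketch of the suggested change, ending the forward pass at the last linear layer:

class Net(nn.Module):

    def __init__(self, input_size, hidden1_size, hidden2_size, output_size):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden1_size)
        self.fc2 = nn.Linear(hidden1_size, hidden2_size)
        self.fc3 = nn.Linear(hidden2_size, output_size)

    def forward(self, x):
        out = F.relu(self.fc1(x))
        out = F.relu(self.fc2(out))
        return self.fc3(out)  # no ReLU here: the regression output stays unbounded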

Reply: I still get out: nan, and some of the weights are nan; the biases are all nan.
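One way to confirm exploding gradients, or to locate the first NaN, is to inspect the gradients right after the backward pass; a minimal sketch based on the training loop above:

loss.backward()
for name, p in net.named_parameters():
    if p.grad is not None:
        print(name,
              'grad norm:', p.grad.norm().item(),
              'has nan:', torch.isnan(p.grad).any().item())
optimizer.step()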

Update: problem solved by switching to another optimizer, e.g. the Adam optimizer.
The reason was that I did not standardize the target variable, which caused exploding gradients; the SGD optimizer did not perform well on this problem.
And of course the last ReLU had to be removed.
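For completeness, a sketch of the two fixes on top of the model without the final ReLU: Adam in place of SGD, and a standardized profit target (the scaling constants come from the training data, so predictions must be un-scaled afterwards):

# standardize the target so the MSE loss and its gradients stay well-scaled
y = train_dataset.y.float()
y_mean, y_std = y.mean(), y.std()
train_dataset.y = (y - y_mean) / y_std

optimizer = optim.Adam(net.parameters(), lr=0.001)

# to report real profits at prediction time, undo the scaling:
# profit = net(x) * y_std + y_mean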