I am a beginner about pytorch.
I try to use pre-train model to do classification problem.
But I found my loss and predict nan both after the first epoch.
So I step by step to look what happen in the process,
-
I check my data have nan or not, the data doesn’t have nan.
-
Reduce the learning rate smaller, 1e-10, but the loss still nan
-
I write the break switch when I get nan predict, here I found something.
In the last batch, everything is OK, predict and loss are not nan.
In the break batch, the input x is not nan, parameters are not nan,
but the predict is nan, and then lead to everything all nan in the future.
How to understand what happen in the training process?
Is something wrong in my data? or I can modified my model that fix it?
Here is my code blew:
Main
##
## Load
from Requirement import *
os.makedirs("Holdout/", exist_ok = True)
##
## Data table
dataTable = ChestDataTable()
##
## Holdout
SetSeed()
dataTrainTable, dataValidTable = holdout(dataTable, test_size = 0.05, stratify = dataTable['Target'])
##
## Augmentation
dataTrainAugTable = AugChestTable(dataTrainTable, 3500)
##
## Data set
ptrTrainAugSet = ChestDataSet(dataTrainAugTable)
ptrValidSet = ChestDataSet(dataValidTable)
##
## Parameter set
listBatch = [2]
listEpoch = [5]
listOptimizer = ["Adam"]
listEta = [1e-6]
dictParameterSet = {"Batch": listBatch, "Epoch": listEpoch, "Optimizer": listOptimizer, "Eta": listEta}
dataParameterSet = DataFrame(ParameterGrid(dictParameterSet)).reset_index()
dataParameterSet.to_csv("Holdout/ParameterSet.csv", index = False)
##
## In parameter loop
for i, p in dataParameterSet.iterrows():
##
## Model
funModel = DenseBaseNet()
##
## Set parameter
intBatch = p['Batch']
intEpoch = p['Epoch']
strOptimizer = p['Optimizer']
floatEta = p['Eta']
funCriterion = CrossEntropyLoss()
if(strOptimizer=="Adam"):
ptrOptimizer = optim.Adam(funModel.parameters(), lr=floatEta)
##
## Loader
ptrTrainAugLoader = DataLoader(dataset=ptrTrainAugSet, batch_size=intBatch, shuffle=True, num_workers=0)
ptrValidLoader = DataLoader(dataset=ptrValidSet, batch_size=intBatch, shuffle=False, num_workers=0)
##
## Initial in the epoch
# listLoss = []
dictHistory = {"TrainLoss":[], "ValidLoss":[], "ValidAccuracy":[]}
listValidResult = []
for e in range(intEpoch):
bMax = ptrTrainAugLoader.dataset.len
nSum = 0
eLoss = 0.0
for _, b in enumerate(ptrTrainAugLoader, 0):
##
## Get x and y
x, y = b
n = x.shape[0]
##
## Zero the parameter gradients
ptrOptimizer.zero_grad()
##
## Forward + Backward + Optimize
tenOutput = funModel.cuda()(x.cuda())
bLoss = funCriterion(tenOutput, y.cuda())
bLoss.backward()
ptrOptimizer.step()
##
## Update
eLoss += (bLoss.item() * n)
nSum = nSum + n
if(nSum==bMax):
##
## Check valid
with torch.no_grad():
tenValidOutput = funModel.cpu()(ptrValidLoader.dataset.x)
floatValidLoss = funCriterion(tenValidOutput, ptrValidLoader.dataset.y).item()
_, tenValidPrediction = torch.max(tenValidOutput.data, 1)
floatAccuracy = accuracy_score(ptrValidLoader.dataset.y, tenValidPrediction)
##
## Update
eLoss = eLoss / bMax
# listLoss = listLoss + [eLoss]
dictHistory['ValidLoss'] = dictHistory['ValidLoss'] + [floatValidLoss]
dictHistory['ValidAccuracy'] = dictHistory['ValidAccuracy'] + [floatAccuracy]
dictHistory['TrainLoss'] = dictHistory['TrainLoss'] + [eLoss]
print("Epoch: %s" % e)
print("Train loss: %s" % eLoss, "Valid loss: %s " % floatValidLoss)
print("Valid accuracy: %s " % floatAccuracy)
##
## Summary on valid
_, tenValidPrediction = torch.max(tenValidOutput.data, 1)
dataValidPrediction = DataFrame({"Prediction":array(tenValidPrediction)})
dataValidOutput = DataFrame(array(tenValidOutput)).reset_index(drop=True)
dataValidOutput.columns = ["Prob-" + str(i) for i in dataValidOutput.columns]
dataValidTable = dataValidTable.reset_index(drop=True)
dataValidResult = pandas.concat([dataValidTable, dataValidOutput, dataValidPrediction], axis = 1)
dataValidResult.to_csv("Holdout/" + str(i) + "-ValidResult.csv", index=False)
##
## Save model and history
dataHistory = DataFrame(dictHistory)
dataHistory.to_csv("Holdout/" + str(i) + "-History.csv", index=False)
torch.save(funModel, "Holdout/" + str(i) + '-Model.h5')
dataParameterSet = pandas.read_csv("Holdout/ParameterSet.csv")
dataParameterSet['ValidAccuracy'] = None
for i, p in dataParameterSet.iterrows():
funModel = torch.load("Holdout/" + str(i) + "-Model.h5")
##
## Evaluate valid
with torch.no_grad():
tenValidOutput = funModel.cpu()(ptrValidLoader.dataset.x)
floatValidLoss = funCriterion(tenValidOutput, ptrValidLoader.dataset.y).item()
_, tenValidPrediction = torch.max(tenValidOutput.data, 1)
floatAccuracy = accuracy_score(ptrValidLoader.dataset.y, tenValidPrediction)
##
## Accuracy
dataParameterSet.at[i,'ValidAccuracy'] = floatAccuracy
Requirement
import pandas
import os
import shutil
import PIL.Image as pil
import xml.etree.ElementTree as et
import numpy
import xlrd
import cv2 as cv
import numpy
from sklearn.model_selection import ParameterGrid
import torch
import pandas
import numpy
import random
import PIL
import os
import sklearn
from os import listdir
from pandas import DataFrame
from numpy import array
from torch import cuda
from torch import FloatTensor, DoubleTensor
from torch import LongTensor
from torch import from_numpy
from torch.utils.data import Dataset, DataLoader
from PIL import Image as pil
from torch import optim
from torch import nn
from sklearn.model_selection import train_test_split as holdout
from sklearn.model_selection import StratifiedKFold as fold
from sklearn.metrics import roc_auc_score, accuracy_score
from torch import nn
from torch.nn import Linear, Softmax, functional, Sequential, Module, CrossEntropyLoss
from torch.nn.functional import relu
import torchvision
from torchvision import models
from torchvision import transforms
##
## Crop image
def CropChestImage(image, e = 5):
gray = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
binary = numpy.where(gray>250,255, 0).astype("uint8")
binary[0:int(binary.shape[1]/6), 0 :int(binary.shape[0]*2/5)] = 0
contour, _ = cv.findContours(binary, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE)
for i in contour:
i = numpy.squeeze(i, axis = 1)
Xmax = max(i[:,0])
Xmin = min(i[:,0])
Ymax = max(i[:,1])
Ymin = min(i[:,1])
crop = image[Ymin+e:Ymax-e, Xmin+e:Xmax-e]
stop = (crop.shape[0]>400) & ((crop.shape[1]>400))
if(stop):
break
return(crop)
##
## Data table
def ChestDataTable():
listId = listdir("./DataSet/Clean/")
listLabel = [str.split(i, "_")[0] for i in listId]
dataTable = DataFrame({"Id":listId, "Label": listLabel})
dataTable['Target'] = dataTable['Label'].replace({"n0":0, "n1":1, "n2":2, "n3":3})
return(dataTable)
##
## Data set
class ChestDataSet(Dataset):
def __init__(self, data):
funTransform = transforms.Compose([transforms.Resize((224, 224)),
transforms.RandomHorizontalFlip(),
transforms.RandomVerticalFlip(),
transforms.RandomRotation(degrees = 360),
transforms.ToTensor()])
##
## Get table
dataTable = data
##
## Get image
listImage = []
for i in dataTable['Id']:
iImage = pil.open("DataSet/Clean/" + i).resize((224,224))
iImage = funTransform(iImage)
iImage = array(iImage)
listImage.append(iImage)
pass
##
## Get torch format
tenImage = from_numpy(array(listImage)).type(DoubleTensor).float()
tenTarget = from_numpy(array(dataTable['Target'])).type(LongTensor)
##
## Custom
self.len = tenTarget.shape[0]
self.x = tenImage
self.y = tenTarget
def __getitem__(self, index):
return self.x[index], self.y[index]
def __len__(self):
return self.len
# ##
# ##
# def MakeFold(data, target, k=3):
# dataTable = data
# dataTable['Fold'] = None
# funFold = fold(n_splits=k)
# for i, (_, index) in enumerate(funFold.split(dataTable, dataTable['Target'])):
# dataTable.at[index,'Fold'] = i + 1
# return(dataTable)
def AugChestTable(data, each = 5000):
listAugTable = []
dataTable = data
setTarget = set(dataTable['Target'])
for t in setTarget:
try:
dataAug = dataTable.loc[dataTable['Target']==t].sample(each, replace = False)
except:
dataAug = dataTable.loc[dataTable['Target']==t].sample(each, replace = True)
listAugTable.append(dataAug)
dataAugTable = pandas.concat(listAugTable)
return(dataAugTable)
##
## Res base model
class ResBaseNet(Module):
def __init__(self):
super(ResBaseNet, self).__init__()
self.funFeatExtra = Sequential(*[i for i in list(models.resnet34().children())[:-1]])
self.funOutputLayer = Linear(512, 4)
self.funSoftmax = Softmax(dim=1)
def forward(self, x):
x = self.funFeatExtra(x)
x = x.view(-1, x.shape[1])
x = self.funOutputLayer(x)
x = self.funSoftmax(x)
return x
##
## Dense base model
class DenseBaseNet(Module):
def __init__(self):
super(DenseBaseNet, self).__init__()
self.funFeatExtra = Sequential(*[i for i in list(models.densenet121().children())[:-1]])
self.funOutputLayer = Linear(50176, 4)
self.funSoftmax = Softmax(dim=1)
def forward(self, x):
x = self.funFeatExtra(x)
x = x.view(-1, x.shape[1]*x.shape[2]*x.shape[3])
x = self.funOutputLayer(x)
x = self.funSoftmax(x)
return x
##
## Check gpu information
def CheckGpuInf():
intDeviceCount = cuda.current_device()
print("Current device: %s" % intDeviceCount)
print("Device count: %s" % cuda.device_count())
print("Device name: %s" % cuda.get_device_name(intDeviceCount))
return
##
## Set seed
def SetSeed():
##
## Fix result
random.seed(1)
numpy.random.seed(123)
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
return