LSTM outputs NaN values during FP16 training

My model is a simple stack of FC and LSTM as follows.

class NETWORK(nn.Module):
    """Stack of fully connected layers, an LSTM and a linear output layer.

    The hyper-parameter names read from ``hparams`` follow the ones used by
    ``SimpleSERClassifier`` further down in this post.
    NOTE(review): the original snippet never assigned these attributes, so
    the mapping from ``hparams`` is an assumption - confirm.
    """

    def __init__(self, hparams):
        """Build the layers.

        Args:
            hparams: object exposing ``SERinputDim``, ``SERoutputDim`` and
                ``SERnumNode``; ``SERbidirectional`` is optional and
                defaults to ``True``.
        """
        super(NETWORK, self).__init__()
        self.inputDim = hparams.SERinputDim
        self.outputDim = hparams.SERoutputDim
        self.numNode = hparams.SERnumNode
        self.bidirectional = getattr(hparams, 'SERbidirectional', True)
        # A bidirectional LSTM concatenates both directions, so the layer
        # after it sees twice the per-direction hidden size.
        num_direction = 2 if self.bidirectional else 1
        self.numLinear = num_direction * (self.numNode // 2)

        self.fc = nn.Sequential(
            nn.Linear(self.inputDim, self.numNode), nn.ReLU(True), nn.Dropout(0.5),
            nn.Linear(self.numNode, self.numNode), nn.ReLU(True), nn.Dropout(0.5)
        )
        self.LSTM = nn.LSTM(input_size=self.numNode,
                            hidden_size=self.numNode // 2,
                            num_layers=1,
                            bidirectional=self.bidirectional)
        self.outfc = nn.Sequential(
            nn.Linear(self.numLinear, self.outputDim))

    def forward(self, inSequence, h, c):
        """Run the FC stack, the LSTM and the output layer.

        Args:
            inSequence: input tensor fed to the FC stack; its output goes
                straight into the LSTM, which is built with the default
                ``batch_first=False``.
            h: initial LSTM hidden state.
            c: initial LSTM cell state.

        Returns:
            The output of ``outfc`` applied to every LSTM output step.
        """
        fcout = self.fc(inSequence)
        self.LSTM.flatten_parameters()
        lstmout, (h, c) = self.LSTM(fcout, (h, c))
        # The original passed an undefined ``output`` here; the LSTM output
        # is what the final layer must consume.
        output = self.outfc(lstmout)
        return output

I also train in an FP16 environment with the apex library from NVIDIA.
When I train in FP32, everything goes well.
But when I train in FP16, the LSTM output contains NaN values.

In particular, this NaN phenomenon only occurs when I initialize the hidden and cell states of the LSTM from a Normal distribution (zero mean, with a variance between 0.0001 and 1.0000).

On the other hand, zero initialization of the LSTM cell and hidden states doesn't show this NaN phenomenon. I want to initialize these states with small random numbers since it results in faster convergence of the model.

Are there any solutions or hints to resolve this problem?

1 Like

Which opt_level are you using for amp?
Could you post a small executable example to reproduce this issue?

My opt level is ‘O2’.
Since I use private data I cannot upload the whole code,
but I am trying to combine the following classifier modules with NVIDIA’s tacotron2 code.
(http://github.com/NVIDIA/tacotron2)

One weird point is that, even though there are RNN-family layers in the decoder of Tacotron2, this NaN error does not occur there. Only the output of the LSTM in SimpleSERClassifier shows NaN values.

import torch
from torch import nn
from torch.distributions import normal
import argparse

class Loss(nn.Module):
    """Cross-entropy loss between estimated class scores and target labels."""

    def __init__(self):
        super(Loss, self).__init__()

    def forward(self, SER_est_label, SERlabel):
        """Return ``nn.CrossEntropyLoss`` of the prediction against the target.

        Args:
            SER_est_label: predicted class scores (first argument of
                ``nn.CrossEntropyLoss``).
            SERlabel: target labels (second argument of
                ``nn.CrossEntropyLoss``).

        Returns:
            The loss tensor produced by ``nn.CrossEntropyLoss``.
        """
        SER_loss = nn.CrossEntropyLoss()(SER_est_label, SERlabel)
        return SER_loss

class SimpleSERClassifier(nn.Module):
    """Two FC layers, a single-layer bidirectional LSTM and a linear output.

    ``forward`` swaps the first two axes of the FC output before and after
    the LSTM, which is built with the default ``batch_first=False``.
    """

    def __init__(self, hparams):
        """Read the hyper-parameters and build the layers.

        Args:
            hparams: object exposing ``SERinputDim``, ``SERoutputDim``,
                ``SERnumNode``, ``SERnumLayer`` and ``batch_size``.
        """
        super(SimpleSERClassifier, self).__init__()
        self.inputDim = hparams.SERinputDim
        self.outputDim = hparams.SERoutputDim
        self.numNode = hparams.SERnumNode
        self.numLayer = hparams.SERnumLayer
        self.bidirectional = True
        self.numDirection = 2 if self.bidirectional else 1
        # Both directions are concatenated, so the output layer sees
        # numDirection * hidden_size features.
        self.numLinear = self.numDirection * (self.numNode // 2)
        self.numLSTMLayer = 1
        self.batch_size = hparams.batch_size

        self.fc = nn.Sequential(
            nn.Linear(self.inputDim, self.numNode), nn.ReLU(True), nn.Dropout(0.5),
            nn.Linear(self.numNode, self.numNode), nn.ReLU(True), nn.Dropout(0.5)
        )
        self.LSTM = nn.LSTM(input_size=self.numNode,
                            hidden_size=self.numNode // 2,
                            num_layers=self.numLSTMLayer,
                            bidirectional=self.bidirectional)
        self.outfc = nn.Sequential(
            nn.Linear(self.numLinear, self.outputDim))

    def init_hidden(self, batch_size=None):
        """Sample initial hidden and cell states from a standard Normal.

        Args:
            batch_size: number of sequences in the batch; defaults to the
                ``batch_size`` given in ``hparams`` so existing callers are
                unaffected. Pass the real size for a shorter final batch.

        Returns:
            Tuple ``(h, c)``, each of shape
            ``(numLSTMLayer * numDirection, batch_size, numNode // 2)``.
        """
        if batch_size is None:
            batch_size = self.batch_size
        n = normal.Normal(0.0, 1)
        state_shape = [self.numLSTMLayer * self.numDirection,
                       batch_size,
                       self.numNode // 2]
        h = n.sample(state_shape)
        c = n.sample(state_shape)
        return (h, c)

    def forward(self, inSequence, h, c):
        """Classify every step of the input sequence.

        Args:
            inSequence: input tensor fed to the FC stack.
            h: initial LSTM hidden state, e.g. from ``init_hidden``.
            c: initial LSTM cell state, e.g. from ``init_hidden``.

        Returns:
            Output of ``outfc`` with the first two axes in the same order
            as ``inSequence``.
        """
        fcout = self.fc(inSequence)
        # Follow the device and dtype of the activations instead of calling
        # .cuda() whenever CUDA is available: that broke a model living on
        # the CPU and left float32 states next to a half-precision LSTM.
        h = h.to(device=fcout.device, dtype=fcout.dtype)
        c = c.to(device=fcout.device, dtype=fcout.dtype)

        # Swap the first two axes for the LSTM (batch_first=False).
        fcout = fcout.permute(1, 0, 2)

        self.LSTM.flatten_parameters()
        lstmout, (h, c) = self.LSTM(fcout, (h, c))
        # Swap the axes back before the output layer.
        output = lstmout.permute(1, 0, 2)
        output = self.outfc(output)
        return output

def train(hparams):
    """Train a ``SimpleSERClassifier`` on the GPU, optionally with apex AMP.

    Args:
        hparams: object exposing ``seed``, ``learning_rate``, ``epochs``
            and whatever ``SimpleSERClassifier`` and ``prepare_dataloaders``
            read; ``fp16_run`` is optional and defaults to ``False``.
    """
    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    model = SimpleSERClassifier(hparams).cuda()
    learning_rate = hparams.learning_rate
    # Adam needs the parameters to optimise; the original call passed none.
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # The argument parser in this script does not define fp16_run, so fall
    # back to FP32 when the attribute is missing.
    fp16_run = getattr(hparams, 'fp16_run', False)
    if fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(
            model, optimizer, opt_level='O2')

    criterion = Loss()
    train_loader, valset, collate_fn = prepare_dataloaders(hparams)

    # No checkpoint is loaded in this snippet; counters start from zero.
    iteration = 0
    epoch_offset = 0
    model.train()
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, hparams.epochs):
        for i, batch in enumerate(train_loader):
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate

            # NOTE(review): assumes each batch is an (inputs, labels) pair of
            # tensors - confirm against prepare_dataloaders / collate_fn.
            x, y = batch
            x, y = x.cuda(), y.cuda()
            # Fresh random initial states for every batch; forward() needs
            # them as explicit arguments.
            h, c = model.init_hidden()

            model.zero_grad()
            y_pred = model(x, h, c)

            # Loss.forward takes (prediction, target); the original call had
            # the two swapped.
            # NOTE(review): nn.CrossEntropyLoss expects the class axis at
            # index 1 - confirm the label layout matches y_pred.
            loss = criterion(y_pred, y)
            if fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            optimizer.step()
            iteration += 1

def str2bool(value):
    """Parse a command-line boolean such as ``True``, ``false``, ``1`` or ``no``.

    ``argparse`` with ``type=bool`` treats every non-empty string, including
    ``"False"``, as ``True``; this converter interprets the text instead.

    Raises:
        argparse.ArgumentTypeError: if ``value`` is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError(
        'expected a boolean value, got %r' % (value,))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # --------------------------------------------------
    # Classifier param
    # ---------------------------------------------------
    parser.add_argument('--SERinputDim', type=int, default=32)
    parser.add_argument('--SERoutputDim', type=int, default=4)

    parser.add_argument('--SERnumNode', type=int, default=512)
    parser.add_argument('--SERnumLayer', type=int, default=3)
    parser.add_argument('--SERbidirectional', type=str2bool, default=True)
    # --------------------------------------------------
    parser.add_argument('--epochs', type=int, default=100, metavar='N',
                        help='number of epochs to train (default: 100)')
    # SimpleSERClassifier reads hparams.batch_size and train() reads
    # hparams.fp16_run, so both must exist on the parsed namespace.
    # NOTE(review): the default batch size is a placeholder - confirm.
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--fp16_run', action='store_true',
                        help='train in mixed precision with apex amp')
    parser.add_argument('--cudnn_enabled', type=str2bool, default=True)
    parser.add_argument('--cudnn_benchmark', type=str2bool, default=False)
    parser.add_argument('--seed', type=int, default=1234)
    parser.add_argument('--learning_rate', type=float, default=1e-3)
    args = parser.parse_args()

    torch.backends.cudnn.enabled = args.cudnn_enabled
    torch.backends.cudnn.benchmark = args.cudnn_benchmark

    train(args)

I also ran into this problem; I then switched opt_level to “O0”, and now it works.