Hello,
I have been training a modified version of a variational recurrent neural network (VRNN). When the loss gets close to zero, the model starts returning inf or NaN values. I tried to fix the problem by reducing the learning rate of the optimizer, adding gradient clipping, and initializing the weights, but I still get this error. I also applied some of the methods suggested on the web to find out where the NaN values first appear. Here is part of my model, together with the nan_hook I use to pinpoint where the NaN occurs:
def nan_hook(self, inp, out):
    """
    Check for NaN inputs or outputs at each layer in the model.
    Usage:
        # forward hook
        for submodule in model.modules():
            submodule.register_forward_hook(nan_hook)
    """
    outputs = isinstance(out, tuple) and out or [out]
    inputs = isinstance(inp, tuple) and inp or [inp]
    contains_nan = lambda x: torch.isnan(x).any()
    layer = self.__class__.__name__

    for i, inp in enumerate(inputs):
        if inp is not None and contains_nan(inp):
            raise RuntimeError(f'Found NaN input at index: {i} in layer: {layer}')

    for i, out in enumerate(outputs):
        if out is not None and contains_nan(out):
            raise RuntimeError(f'Found NaN output at index: {i} in layer: {layer}')
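For reference, the get_all_layers call that appears in the training loop further down is just a small helper that walks the model and registers nan_hook as a forward hook on every submodule. A minimal sketch of that helper (my actual version may differ slightly):

def get_all_layers(model, hook):
    # attach the NaN-checking forward hook to every submodule;
    # registering once before training is enough, since each call adds another hook
    for submodule in model.modules():
        submodule.register_forward_hook(hook)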
Here is the main class:
class VRNN_GMM(nn.Module):
    def __init__(self, u_dim, y_dim, h_dim, z_dim, n_layers, n_mixtures, device, batch_norm=False, bias=False):
        super(VRNN_GMM, self).__init__()

        self.y_dim = y_dim
        self.u_dim = u_dim
        self.h_dim = h_dim
        self.z_dim = z_dim
        self.n_layers = n_layers
        self.n_mixtures = n_mixtures
        self.device = device
        self.batch_norm = batch_norm

        # feature-extracting transformations (phi_y, phi_u and phi_z)
        layers_phi_y = [nn.Linear(self.y_dim, self.h_dim)]
        if self.batch_norm:  # add batch normalization when requested
            layers_phi_y.append(torch.nn.BatchNorm1d(self.h_dim))
        layers_phi_y = layers_phi_y + [
            nn.ReLU(),
            nn.Linear(self.h_dim, self.h_dim),
        ]
        self.phi_y = torch.nn.Sequential(*layers_phi_y)

        layers_phi_u = [nn.Linear(self.u_dim, self.h_dim)]
        if self.batch_norm:
            layers_phi_u.append(torch.nn.BatchNorm1d(self.h_dim))
        layers_phi_u = layers_phi_u + [
            nn.ReLU(),
            nn.Linear(self.h_dim, self.h_dim),
        ]
        self.phi_u = torch.nn.Sequential(*layers_phi_u)

        layers_phi_z = [nn.Linear(self.z_dim, self.h_dim)]
        if self.batch_norm:
            layers_phi_z.append(torch.nn.BatchNorm1d(self.h_dim))
        layers_phi_z = layers_phi_z + [
            nn.ReLU(),
            nn.Linear(self.h_dim, self.h_dim),
        ]
        self.phi_z = torch.nn.Sequential(*layers_phi_z)

        # -----------------------------------------
        # encoder function (phi_enc) -> inference
        encoder_layers = [nn.Linear(self.h_dim + self.h_dim, self.h_dim)]
        if self.batch_norm:
            encoder_layers.append(torch.nn.BatchNorm1d(self.h_dim))
        encoder_layers = encoder_layers + [
            nn.ReLU(),
            nn.Linear(self.h_dim, self.h_dim),
        ]
        if self.batch_norm:
            encoder_layers = encoder_layers + [nn.BatchNorm1d(self.h_dim)]

        # encoder mean
        encoder_layers_mean = encoder_layers + [
            nn.ReLU(),
            nn.Linear(self.h_dim, self.z_dim),
        ]
        self.enc_mean = torch.nn.Sequential(*encoder_layers_mean)

        # encoder log-variance
        encoder_layers_logvar = encoder_layers + [
            nn.ReLU(),
            nn.Linear(self.h_dim, self.z_dim),
        ]
        self.enc_logvar = torch.nn.Sequential(*encoder_layers_logvar)

        # -----------------------------------------
        # prior function (phi_prior) -> prior
        layers_prior = [nn.Linear(self.h_dim + self.h_dim, self.h_dim)]
        if self.batch_norm:
            layers_prior.append(torch.nn.BatchNorm1d(self.h_dim))

        layers_prior_mean = layers_prior + [
            nn.ReLU(),
            nn.Linear(self.h_dim, self.z_dim),
        ]
        self.prior_mean = torch.nn.Sequential(*layers_prior_mean)

        layers_prior_logvar = layers_prior + [
            nn.ReLU(),
            nn.Linear(self.h_dim, self.z_dim),
        ]
        self.prior_logvar = torch.nn.Sequential(*layers_prior_logvar)

        # -----------------------------------------
        # decoder function (phi_dec) -> generation
        layers_decoder = [
            nn.Linear(self.h_dim + self.h_dim, self.h_dim)
        ]
        if self.batch_norm:
            layers_decoder.append(torch.nn.BatchNorm1d(self.h_dim))
        layers_decoder = layers_decoder + [
            nn.ReLU(),
            nn.Linear(self.h_dim, self.h_dim)
        ]
        if self.batch_norm:
            layers_decoder.append(torch.nn.BatchNorm1d(self.h_dim))

        layers_decoder_mean = layers_decoder + [
            nn.ReLU(),
            nn.Linear(self.h_dim, self.y_dim * self.n_mixtures),
        ]
        self.dec_mean = nn.Sequential(*layers_decoder_mean)

        layers_decoder_logvar = layers_decoder + [
            nn.ReLU(),
            nn.Linear(self.h_dim, self.y_dim * self.n_mixtures),
        ]
        self.dec_logvar = nn.Sequential(*layers_decoder_logvar)

        # mixture weights (pi)
        layers_decoder_pi = layers_decoder + [
            nn.ReLU(),
            nn.Linear(self.h_dim, self.y_dim * self.n_mixtures),
        ]
        if self.batch_norm:
            layers_decoder_pi.append(torch.nn.BatchNorm1d(self.y_dim * self.n_mixtures))
        layers_decoder_pi = layers_decoder_pi + [nn.Softmax(dim=1)]
        self.dec_pi = nn.Sequential(*layers_decoder_pi)

        # recurrence function (f_theta) -> recurrence
        self.rnn = nn.GRU(self.h_dim + self.y_dim, self.h_dim, self.n_layers, bias)
        self.hidden_state_rnn = nn.GRU(self.y_dim, self.h_dim, self.n_layers, bias)
    def forward(self, u, y):
        batch_size = y.size(0)
        # seq_len = y.shape[-1]  # original
        seq_len = torch.LongTensor([x.count_nonzero(dim=-1).count_nonzero().item() for x in u])

        # allocation
        loss = 0
        # initialization
        h = torch.zeros(self.n_layers, batch_size, self.h_dim, device=self.device)

        # for all time steps
        for i in range(len(seq_len)):
            for t in range(seq_len[i]):
                # feature extraction: y_t
                phi_y_t = self.phi_y(y[:, :, t])
                # feature extraction: u_t
                phi_u_t = self.phi_u(u[:, :, t])

                # encoder: y_t, h_t -> z_t (posterior)
                encoder_input = torch.cat([phi_y_t, h[-1]], dim=1)
                enc_mean_t = self.enc_mean(encoder_input)
                enc_logvar_t = self.enc_logvar(encoder_input)
                enc_logvar_t = nn.Softplus()(enc_logvar_t)

                # prior: h_t -> z_t (for the KLD loss)
                prior_input = h[-1]
                prior_mean_t = self.prior_mean(prior_input)
                prior_logvar_t = self.prior_logvar(prior_input)
                prior_logvar_t = nn.Softplus()(prior_logvar_t)

                # sampling and reparameterization: get a new z_t
                # temp = tdist.Normal(enc_mean_t, enc_logvar_t.exp().sqrt())
                # z_t = tdist.Normal.rsample(temp)
                z_t = self.reparametrization(enc_mean_t, enc_logvar_t)

                # feature extraction: z_t
                phi_z_t = self.phi_z(z_t)

                # decoder: h_t, z_t -> y_t
                decoder_input = torch.cat([phi_z_t, h[-1]], dim=1)
                dec_mean_t = self.dec_mean(decoder_input).view(batch_size, self.y_dim, self.n_mixtures)
                dec_logvar_t = self.dec_logvar(decoder_input).view(batch_size, self.y_dim, self.n_mixtures)
                dec_logvar_t = nn.Softplus()(dec_logvar_t)
                dec_pi_t = self.dec_pi(decoder_input).view(batch_size, self.y_dim, self.n_mixtures)

                # recurrence: u_t+1, z_t -> h_t+1
                _, h = self.rnn(torch.cat([phi_u_t, phi_z_t], 1).unsqueeze(0), h)

                # computing the loss
                KLD = self.kld_gauss(enc_mean_t, enc_logvar_t, prior_mean_t, prior_logvar_t)
                loss_pred = self.loglikelihood_gmm(y[:, :, t], dec_mean_t, dec_logvar_t, dec_pi_t)
                loss += -loss_pred + KLD

        return loss
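The helpers reparametrization, kld_gauss, and loglikelihood_gmm are not shown above. The reparameterization step is the standard one and is meant to do the same thing as the commented-out tdist lines; a minimal sketch of that form (the exact code in my helper may differ):

    def reparametrization(self, mean, logvar):
        # standard reparameterization trick: z = mean + std * eps
        # std = exp(0.5 * logvar), which equals logvar.exp().sqrt() from the commented-out tdist version
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mean + eps * std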
Here is the training module:
def run_train(modelstate, loader_train, loader_valid, device, dataframe, path_general, file_name_general, nan_picker=True):
    train_options = {
        'clip': 2,
        'print_every': 1,
        'test_every': 50,
        'batch_size': 150,
        'n_epochs': 400,
        'lr_scheduler_nstart': 10,   # learning rate scheduler start epoch
        'lr_scheduler_nepochs': 5,   # check learning rate after this many epochs
        'lr_scheduler_factor': 10,   # adapt learning rate by this factor
        'min_lr': 1e-6,              # minimal learning rate
        'init_lr': 1e-4,             # initial learning rate
    }

    def validate(loader):
        modelstate.model.eval()
        total_vloss = 0
        total_batches = 0
        total_points = 0
        with torch.no_grad():
            for i, (u, y) in enumerate(loader):
                u = u.to(device)
                y = y.to(device)
                vloss_ = modelstate.model(u, y)

                total_batches += u.size()[0]
                total_points += np.prod(u.shape)
                total_vloss += vloss_.item()

        return total_vloss / total_points  # total_batches

    def train(epoch):
        # model in training mode
        modelstate.model.train()
        # initialization
        total_loss = 0
        total_batches = 0
        total_points = 0

        for i, (u, y) in enumerate(loader_train):
            u = u.to(device)
            y = y.to(device)

            # reset the gradients
            modelstate.optimizer.zero_grad()
            # forward pass over model
            loss_ = modelstate.model(u, y)
            if nan_picker:
                # debug for NaN values
                get_all_layers(modelstate.model.m, nan_hook)

            # NN optimization
            loss_.backward()
            torch.nn.utils.clip_grad_value_(modelstate.model.m.dec_mean.parameters(), train_options['clip'])
            torch.nn.utils.clip_grad_value_(modelstate.model.m.dec_logvar.parameters(), train_options['clip'])
            torch.nn.utils.clip_grad_value_(modelstate.model.m.dec_pi.parameters(), train_options['clip'])
            torch.nn.utils.clip_grad_value_(modelstate.model.m.phi_z.parameters(), train_options['clip'])
            torch.nn.utils.clip_grad_value_(modelstate.model.m.phi_u.parameters(), train_options['clip'])
            torch.nn.utils.clip_grad_value_(modelstate.model.m.phi_y.parameters(), train_options['clip'])
            torch.nn.utils.clip_grad_norm_(modelstate.model.m.prior_mean.parameters(), 1.0, norm_type=2)
            torch.nn.utils.clip_grad_norm_(modelstate.model.m.prior_logvar.parameters(), 1.0, norm_type=2)
            torch.nn.utils.clip_grad_value_(modelstate.model.m.rnn.parameters(), train_options['clip'])
            torch.nn.utils.clip_grad_norm_(modelstate.model.m.enc_mean.parameters(), 1.0, norm_type=2)
            torch.nn.utils.clip_grad_norm_(modelstate.model.m.enc_logvar.parameters(), 1.0, norm_type=2)
            modelstate.optimizer.step()

            total_batches += u.size()[0]
            total_points += np.prod(u.shape)
            total_loss += loss_.item()

            # output to console
            if i % train_options['print_every'] == 0:
                print(
                    'Train Epoch: [{:5d}/{:5d}], Batch [{:6d}/{:6d} ({:3.0f}%)]\tLearning rate: {:.2e}\tLoss: {:.3f}'.format(
                        epoch, train_options['n_epochs'], (i + 1), len(loader_train),
                        100. * (i + 1) / len(loader_train), lr, total_loss / total_points))  # total_batches

        return total_loss / total_points
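For context, the long block of per-submodule clipping calls in train() amounts to value-clipping one group of submodules and norm-clipping the other; this is only a compact sketch of the same idea, not necessarily the exact code I run:

    m = modelstate.model.m
    value_clipped = [m.dec_mean, m.dec_logvar, m.dec_pi, m.phi_z, m.phi_u, m.phi_y, m.rnn]
    norm_clipped = [m.prior_mean, m.prior_logvar, m.enc_mean, m.enc_logvar]

    for module in value_clipped:
        # clip each gradient element to [-clip, clip]
        torch.nn.utils.clip_grad_value_(module.parameters(), train_options['clip'])
    for module in norm_clipped:
        # rescale the submodule's gradient so its L2 norm is at most 1.0
        torch.nn.utils.clip_grad_norm_(module.parameters(), 1.0, norm_type=2)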
The error message is as follows:
1081 if nan_picker:
~/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1188 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1189 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1190 return forward_call(*input, **kwargs)
1191 # Do not call functions when jit is used
1192 full_backward_hooks, non_full_backward_hooks = [], []
/tmp/ipykernel_5250/3566447173.py in forward(self, u, y)
948 y = self.normalizer_output.normalize(y)
949
--> 950 loss = self.m(u, y)
951
952 return loss
~/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1188 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1189 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1190 return forward_call(*input, **kwargs)
1191 # Do not call functions when jit is used
1192 full_backward_hooks, non_full_backward_hooks = [], []
/tmp/ipykernel_5250/3566447173.py in forward(self, u, y)
654 for t in range(seq_len[i]):
655 # feature extraction: y_t
--> 656 phi_y_t = self.phi_y(y[:, :, t])
657 # feature extraction: u_t
658 phi_u_t = self.phi_u(u[:, :, t])
~/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1188 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1189 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1190 return forward_call(*input, **kwargs)
1191 # Do not call functions when jit is used
1192 full_backward_hooks, non_full_backward_hooks = [], []
~/anaconda3/lib/python3.9/site-packages/torch/nn/modules/container.py in forward(self, input)
202 def forward(self, input):
203 for module in self:
--> 204 input = module(input)
205 return input
206
/tmp/ipykernel_5250/3566447173.py in nan_hook(self, inp, out)
47 for i, out in enumerate(outputs):
48 if out is not None and contains_nan(out):
---> 49 raise RuntimeError(f'Found NaN output at index: {i} in layer: {layer}')
50
51 def find_modules(nn_module, type):
RuntimeError: Found NaN output at index: 0 in layer: Linear
Even though I normalize the data and initialize the linear layers, I still get this error. Is there any other approach to avoid these NaNs during training?