Calling a function inside with torch.no_grad() causes NaN in training

My training loop is as follows:

When I add the line ----- evaluation.make_sentence_one_by_one(inputs) ----, which has nothing to do with the model or torch, I get NaN in training after some epochs. Is there a problem with calling a function from another file inside torch.no_grad()?

# Pseudo-code outline of the training loop, with the indentation exactly as posted.
for epoch in epochs:
     encoder.train()
     decoder.train()
     for index, sample in enumerate(datalodader_train):
          encoder_optimizer.zero_grad()
          decoder_optimizer.zero_grad()
          #training
          loss.backward()
          encoder_optimizer.step()
          decoder_optimizer.step()

          # NOTE(review): as indented, the evaluation section below sits inside the batch loop, and
          # train() is only called at the top of each epoch - confirm the intended nesting and that
          # the models are back in train() mode before the next optimizer step.
          with torch.no_grad():
          recommender_encoder.eval()
          recommender_decoder.eval()
               for index, sample in enumerate(dataloader_test):
               #testing
               -----  evaluation.make_sentence_one_by_one(inputs)

If it happens only when you use model.eval(), maybe you can try checking the parameters of the BatchNorm layers.

Thank you for the reply. I do not have BatchNorm layers, only Dropout layers, and I checked with and without model.eval() and the problem does not occur.

Hi,

No, torch.no_grad() does not have such limitations.
Could you be clearer about what you tried in order to fix this? Does removing the torch.no_grad() fix the issue?

I tried removing the .eval() and .train() calls, but this did not work. I tried to isolate every piece, and I concluded that this specific function that I call causes the issue.

import matplotlib.pyplot as plt
from Preprocessing import preprocessing
import numpy as np
from rouge import Rouge
import os
import re
import string
from unicodedata import normalize
from Preprocessing import word2vec as w2v
import numpy as np
import keras
import tensorflow as tf
from matplotlib import pyplot as plt
import sys

#convert prediction into sentence
def make_sentence_one_by_one(prediction, ground_truth, target_reviews_length, tokenizer, role, count_predictions, rating_prediction, ground_truth_rating, print_sentence,max_review_length):
    """Decode a predicted and a reference index sequence and score them with ROUGE.

    Every index in ``prediction`` and ``ground_truth`` is mapped to a word with
    ``preprocessing.index_to_word_mapping(index, tokenizer)``. Decoding of the
    prediction stops right after the word ``'eos'`` or once
    ``max_review_length`` words have been produced; decoding of the ground
    truth stops right after ``'eos'``. Each sentence is the concatenation of
    ``' ' + str(word)`` pieces and both are handed to ``Rouge().get_scores``.

    ``target_reviews_length`` and ``count_predictions`` are accepted but not
    used here. ``role``, ``rating_prediction`` and ``ground_truth_rating`` are
    only echoed to stdout, and only when ``print_sentence`` is truthy.

    Returns the ``'f'`` entry of ``'rouge-1'`` from the first score record.
    """
    end_marker = '   |----- end of sentence -----|   '

    def say(*args, **kwargs):
        # all console output of this function is gated on print_sentence
        if print_sentence:
            print(*args, **kwargs)

    say('\n\n\n\n\n\n --------------------', role, '--------------------')

    scorer = Rouge()

    say('\n\n\n Sentence \n\n\n')
    say('\n ---------- Predicted ---------- \n')
    say('\n------ Rating prediction:', rating_prediction, 'Ground truth rating:', ground_truth_rating, '------\n')

    predicted_pieces = []
    for position, token_id in enumerate(prediction, start=1):
        token = preprocessing.index_to_word_mapping(token_id, tokenizer)
        predicted_pieces.append(' ' + str(token))
        say(token, end=" ")
        # stop when the model emits 'eos' or when the length cap is reached
        if token == 'eos' or position == max_review_length:
            say(end_marker)
            break

    say('\n ---------- Target ---------- \n\n')

    target_pieces = []
    for token_id in ground_truth:
        token = preprocessing.index_to_word_mapping(token_id, tokenizer)
        target_pieces.append(' ' + str(token))
        say(token, end=" ")
        if token == 'eos':
            say(end_marker)
            break

    score = scorer.get_scores(''.join(predicted_pieces), ''.join(target_pieces))

    say('\n ---------- Rouge score ---------- \n')
    say(score)
    say('\n\n\n')

    #return only the rouge-1 f score
    return score[0]['rouge-1']['f']

So this is not related to no_grad() then?
Can you give a small code sample (30-40 lines) that reproduces this issue so that we have a full picture please?

I do not think it is related to no_grad(). But every time I add the lines between {{}} to my code, I get NaN in my losses after some epochs. Sorry for the long code. The training loop is the same as the testing loop in validation; the only extra is the backpropagation procedure.

                {{
                # Evaluate on the test split and then on the training split with autograd disabled.
                # NOTE(review): nothing in this snippet switches the models back to train() -
                # confirm the caller does so before the next training step.
                with torch.no_grad():

                        recommender_encoder.eval()
                        recommender_decoder.eval()

                        role='Testing'
                        rouge_score_test,rating_loss_test = validation(recommender_encoder,recommender_decoder,epoch,role)

                        role='Training'
                        rouge_score_train,rating_loss_train = validation(recommender_encoder,recommender_decoder,epoch,role)
                        }}

#test and make the sentences
def validation(recommender_encoder,recommender_decoder,epoch,role):
    """Evaluate the encoder/decoder on one data split; report ROUGE and rating loss.

    Loads the pickle that belongs to ``role``, runs every batch through
    ``recommender_encoder`` and ``recommender_decoder``, records the MSE between
    the target and the predicted ratings, and scores the argmax-decoded sentence
    of every sample against its reference with
    ``evaluation.make_sentence_one_by_one``.

    The function neither switches the models to eval mode nor disables
    autograd; the calling code shown with it does both before calling.

    Args:
        recommender_encoder: called as ``encoder(user_reviews, product_reviews)``;
            must return seven values.
        recommender_decoder: called with the encoder outputs, the decoder
            inputs, the output length, the ratings, the mask and the
            teacher-forcing ratio; must return five values.
        epoch: accepted for interface compatibility; not used.
        role: ``'Testing'``, ``'Training'`` or ``'Development'``.

    Returns:
        ``(mean ROUGE-1 F over all samples, mean rating MSE over all batches)``.

    Raises:
        ValueError: if ``role`` is not one of the known splits, or if the
            split produced no batches/samples to average over.
    """
    # evaluation always uses a fixed ratio of 1
    # NOTE(review): the original comment says this makes the decoder feed on its own
    # outputs - confirm against the decoder's convention for this ratio.
    teacher_forcing_ratio_test = 1

    split_paths = {
        'Testing': 'data/REAL_minibatch_testing.pkl',
        'Training': 'data/REAL_minibatch_training.pkl',
        'Development': 'data/REAL_minibatch_development.pkl',
    }
    if role not in split_paths:
        # previously an unknown role only failed later with an unbound local
        raise ValueError('Unknown role %r; expected one of %s' % (role, sorted(split_paths)))
    dataset_test = dataset_loader.Dataset_yelp(split_paths[role], transform=None) #output_space = 8171

    batch_size_test = 32
    batch_sampler_test = ShuffleBatchSampler(SortedSampler(dataset_test,sort_key=lambda i: len(i[4])),batch_size=batch_size_test,drop_last=False,shuffle=False)

    # NOTE(review): a fresh DataLoader with 4 worker processes is created on every call;
    # consider building the loaders once and passing them in.
    loader_params_test = {'num_workers':4, 'pin_memory':True, 'collate_fn':custom_collate, 'batch_sampler':batch_sampler_test}
    dataloader_test = torch.utils.data.DataLoader(dataset_test, **loader_params_test)

    # the review (text) loss is not computed in this version; the criterion is still
    # constructed exactly as before so any setup done by initialize_loss_weights() is kept
    weights_test = initialize_loss_weights()
    class_weights_test = torch.FloatTensor(weights_test).cuda()
    criterion_review_test = torch.nn.CrossEntropyLoss(class_weights_test).cuda()

    # plain MSE between target and predicted ratings
    criterion_rating_test = torch.nn.MSELoss().cuda()

    overall_rating_loss_test = []
    overall_rouge_score_test = []

    # loop-invariant arguments of the sentence scorer
    count_predictions_test = 1
    print_sentence_test = False
    max_review_test = 500

    #iterate all the batches
    for (user_reviews_test, product_reviews_test, neighbourhood_reviews_test, product_ratings_test, text_reviews_test, rating_reviews_test, decoder_inputs_test) in dataloader_test:

        #convert the inputs to float tensors
        user_reviews_test = user_reviews_test.type(torch.FloatTensor)
        product_reviews_test = product_reviews_test.type(torch.FloatTensor)
        product_ratings_test = product_ratings_test.type(torch.FloatTensor)
        #the output length differs from batch to batch
        output_length_test = text_reviews_test.shape[1]

        #mask used by the decoder for the rating prediction
        text_reviews_mask_test = text_reviews_test.view(-1).type(torch.LongTensor).to(device1)
        mask_test = initialize_mask(text_reviews_mask_test)

        user_reviews_test = user_reviews_test.to(device1)
        product_reviews_test = product_reviews_test.to(device1)
        product_ratings_test = product_ratings_test.to(device1)

        #run the encoder
        overall_context_vector_encoder_test, overall_attention_weights_test, decoder_hidden_test, user_h_test, product_h_test, user_lstm_output_test, product_lstm_output_test = recommender_encoder(user_reviews_test, product_reviews_test)

        decoder_inputs_test = decoder_inputs_test.to(device1)

        #run the decoder
        activations_loss_test, rating_prediction_test, repetitiveness_loss_back_test, target_length_test, decoder_h_c_list_test = recommender_decoder(overall_context_vector_encoder_test, decoder_hidden_test, decoder_inputs_test, output_length_test, overall_attention_weights_test, product_ratings_test, mask_test, teacher_forcing_ratio_test)

        #----------------------------------------------------------------------------------------> RATING LOSS
        rating_reviews_test = rating_reviews_test.to(device1)
        rating_prediction_test = rating_prediction_test.view(-1)
        rating_prediction_test = rating_prediction_test.type(torch.DoubleTensor).to(device1)
        rating_loss_test = criterion_rating_test(rating_reviews_test, rating_prediction_test).type(torch.FloatTensor).to(device1)
        overall_rating_loss_test.append(rating_loss_test.item())

        #iterate every sample in the batch
        for sample_test in range(activations_loss_test.shape[0]):
            sample_activations_test = activations_loss_test[sample_test]
            target_reviews_length_test = sample_activations_test.shape[0]

            predicted_sentence_test = []
            ground_truth_test = []
            for word_test in range(target_reviews_length_test):
                #the position of the max activation is the predicted word ID
                predicted_sentence_test.append(torch.argmax(sample_activations_test[word_test]).item())
                ground_truth_test.append(text_reviews_test[sample_test][word_test].item())

            ground_truth_rating_test = rating_reviews_test[sample_test].item()
            rating_prediction_item_test = rating_prediction_test[sample_test].item()

            rouge_score_test = evaluation.make_sentence_one_by_one(predicted_sentence_test, ground_truth_test, target_reviews_length_test, tokenizer, role, count_predictions_test, rating_prediction_item_test, ground_truth_rating_test,print_sentence_test,max_review_test)
            # append to the list that is averaged below (the old code used undefined names here)
            overall_rouge_score_test.append(rouge_score_test)

    if not overall_rouge_score_test or not overall_rating_loss_test:
        raise ValueError('No batches were produced for role %r; nothing to average' % (role,))

    # averaged after the loop so that every batch contributes, not only the first one
    return sum(overall_rouge_score_test)/len(overall_rouge_score_test), sum(overall_rating_loss_test)/len(overall_rating_loss_test)

Hi,

It is really hard to help you given that most of what happens in this big function is using other custom functions from your code.
You would need to reduce the example to only contain a small subset of things so that you can share them here and it is easy for us to read it.

The custom functions are validation and make_sentence_one_by_one.
As I said before, when I use the function validation I get NaN in the training loss. When I comment it out and just print something inside torch.no_grad(), everything works fine. The problem is not torch.no_grad(); the problem is my function. Sorry for the long code again, but I tried to give some expressive code. I understand that this is a lot to read without knowing the subject, but thank you for your replies.

#!usr/bin/python

# Build the training DataLoader. Batches are produced by a ShuffleBatchSampler wrapped
# around a SortedSampler keyed on len(sample[4]).
# (presumably element 4 is the target review text - verify against Dataset_yelp)
batch_size = 64
train_sampler = ShuffleBatchSampler(SortedSampler(train_dataset,sort_key=lambda i: len(i[4])),batch_size=batch_size,drop_last=False,shuffle=True)

''' define the dataloader || define some proparties for the dataloader, NUMBER_OF_WORKERS = NUMBER_OF_CPU_CORES '''
#loader_params = {'batch_size':32, 'shuffle':True, 'num_workers':10, 'pin_memory':True, 'collate_fn':custom_collate}
# batching is delegated to train_sampler through 'batch_sampler', so no batch_size/shuffle keys are given here
loader_params = {'num_workers':10, 'pin_memory':True, 'collate_fn':custom_collate, 'batch_sampler':train_sampler}
train_dataset_loader = torch.utils.data.DataLoader(train_dataset, **loader_params)
# number of batches estimated by rounding len/batch_size (not a ceiling, although drop_last=False)
overall_batch = round(train_dataset.__len__()/batch_size)
length = train_dataset.__len__()
print('Dataset length:',length)

# One pass over the training loader: forward through encoder and decoder, add the review
# (cross-entropy) loss and the rating loss, backpropagate, clip gradients and step both optimizers.
for index, (user_reviews, product_reviews, neighbourhood_reviews, product_ratings, text_reviews, rating_reviews, decoder_inputs) in enumerate(train_dataset_loader):

        #make grads zero
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        #change some properties of the input. Move it to GPU and make it float tensor
        user_reviews = user_reviews.type(torch.FloatTensor)
        product_reviews = product_reviews.type(torch.FloatTensor)
        product_ratings = product_ratings.type(torch.FloatTensor)
        # sequence length of this batch, read before text_reviews is flattened below
        output_length =  text_reviews.shape[1]

        #initialize mask to use it in the prediction of rating
        # text_reviews is flattened to 1-D here and reused as the cross-entropy target further down
        text_reviews = text_reviews.view(-1).to(device1)
        mask = initialize_mask(text_reviews)

        user_reviews = user_reviews.to(device1)
        product_reviews = product_reviews.to(device1)
        product_ratings = product_ratings.to(device1)

        #run the encoder
        overall_context_vector_encoder, overall_attention_weights, decoder_hidden = recommender_encoder(user_reviews, product_reviews)

        decoder_inputs = decoder_inputs.to(device1)

        #run the decoder
        activations_loss, rating_prediction, repetitiveness_loss_back, target_length = recommender_decoder(overall_context_vector_encoder, decoder_hidden, decoder_inputs, output_length, overall_attention_weights, product_ratings, mask, teacher_forcing_ratio)


        # flatten the activations to (N, output_space) so they line up with the flattened targets
        activations_loss = activations_loss.view(-1, output_space)
        review_loss = criterion_review(activations_loss, text_reviews)
        overall_review_loss.append(review_loss.item())
        #----------------------------------------------------------------------------------------> RATING LOSS
        #move target to GPU, and change it's type
        rating_reviews = rating_reviews.to(device1)
        rating_prediction = rating_prediction.view(-1)
        # NOTE(review): torch.DoubleTensor / torch.FloatTensor are CPU tensor types, so these
        # .type(...) calls appear to route the values through the CPU before .to(device1) - confirm this is intended.
        rating_prediction = rating_prediction.type(torch.DoubleTensor).to(device1)
        #keep record of the rating prediction loss
        rating_loss = criterion_rating(rating_reviews, rating_prediction).type(torch.FloatTensor).to(device1)
        overall_rating_loss.append(rating_loss.item())

        # total loss is the unweighted sum of both terms
        # NOTE(review): there is no finiteness check on the loss before the optimizer steps;
        # when chasing NaNs, checking torch.isfinite(loss) here would pinpoint the first bad batch.
        loss = review_loss + rating_loss

        #backpropagate
        loss.backward()

        # gradient norms are clipped separately for the encoder and the decoder
        torch.nn.utils.clip_grad_norm_(recommender_encoder.parameters(), max_norm=0.25) #gradient clipping
        torch.nn.utils.clip_grad_norm_(recommender_decoder.parameters(), max_norm=0.25) #gradient clipping

        encoder_optimizer.step()
        decoder_optimizer.step()
        #mlt_optimizer.step()
        
# Evaluate on the test split and then on the training split.
#do not build the computational graph
# NOTE(review): the models are switched to eval() here and no train() call is visible afterwards -
# if this runs once per epoch, confirm train() is restored before the next training pass.
with torch.no_grad():

        recommender_encoder.eval()
        recommender_decoder.eval()

        role='Testing'
        rouge_score_test, rating_loss_test = validation(recommender_encoder,recommender_decoder,epoch,role)

        role='Training'
        rouge_score_train, rating_loss_train = validation(recommender_encoder,recommender_decoder,epoch,role)
        
        
#test and make the sentences
def validation(recommender_encoder,recommender_decoder,epoch,role):
    """Evaluate the encoder/decoder on one data split; report review and rating loss.

    Loads the pickle that belongs to ``role``, runs every batch through
    ``recommender_encoder`` and ``recommender_decoder``, and records the
    weighted cross-entropy of the predicted words and the MSE of the predicted
    ratings. Every sample is also argmax-decoded and scored against its
    reference with ``evaluation.make_sentence_one_by_one``.

    The function neither switches the models to eval mode nor disables
    autograd; the calling code shown with it does both before calling.

    Args:
        recommender_encoder: called as ``encoder(user_reviews, product_reviews)``;
            must return three values.
        recommender_decoder: called with the encoder outputs, the decoder
            inputs, the output length, the ratings, the mask and the
            teacher-forcing ratio; must return four values.
        epoch: accepted for interface compatibility; not used.
        role: ``'Testing'``, ``'Training'`` or ``'Development'``.

    Returns:
        ``(mean review loss over all batches, mean rating loss over all batches)``.

    Raises:
        ValueError: if ``role`` is not one of the known splits, or if the
            split produced no batches to average over.
    """
    #only in the validation
    from Keras import evaluation

    # evaluation always uses a fixed ratio of 1
    # NOTE(review): the original comment says this makes the decoder feed on its own
    # outputs - confirm against the decoder's convention for this ratio.
    teacher_forcing_ratio = 1

    split_paths = {
        'Testing': 'data/testing.pkl',
        'Training': 'data/training.pkl',
        'Development': 'data/development.pkl',
    }
    if role not in split_paths:
        # previously an unknown role only failed later with an unbound local
        raise ValueError('Unknown role %r; expected one of %s' % (role, sorted(split_paths)))
    dataset = dataset_loader.Dataset_yelp(split_paths[role], transform=None) #output_space = 8171

    batch_size = 64
    batch_sampler = ShuffleBatchSampler(SortedSampler(dataset,sort_key=lambda i: len(i[4])),batch_size=batch_size,drop_last=False,shuffle=False)

    # NOTE(review): a fresh DataLoader with 4 worker processes is created on every call;
    # consider building the loaders once and passing them in.
    loader_params = {'num_workers':4, 'pin_memory':True, 'collate_fn':custom_collate, 'batch_sampler':batch_sampler}
    dataloader = torch.utils.data.DataLoader(dataset, **loader_params)

    #class-weighted cross-entropy for the review words
    weights = initialize_loss_weights()
    class_weights = torch.FloatTensor(weights).cuda()
    criterion_review = torch.nn.CrossEntropyLoss(class_weights).cuda()

    #plain MSE between target and predicted ratings
    criterion_rating = torch.nn.MSELoss().cuda()

    overall_review_loss = []
    overall_rating_loss = []
    overall_rouge_score = []

    # loop-invariant arguments of the sentence scorer
    max_review_length = 500
    count_predictions = 1
    print_sentence = False

    #iterate all the batches
    for (user_reviews, product_reviews, neighbourhood_reviews, product_ratings, text_reviews, rating_reviews, decoder_inputs) in dataloader:

        #convert the inputs to float tensors
        user_reviews = user_reviews.type(torch.FloatTensor)
        product_reviews = product_reviews.type(torch.FloatTensor)
        product_ratings = product_ratings.type(torch.FloatTensor)
        #the output length differs from batch to batch
        output_length = text_reviews.shape[1]

        #mask used by the decoder for the rating prediction
        text_reviews_mask = text_reviews.view(-1).type(torch.LongTensor).to(device1)
        mask = initialize_mask(text_reviews_mask)

        user_reviews = user_reviews.to(device1)
        product_reviews = product_reviews.to(device1)
        product_ratings = product_ratings.to(device1)

        #run the encoder
        overall_context_vector_encoder, overall_attention_weights, decoder_hidden = recommender_encoder(user_reviews, product_reviews)

        decoder_inputs = decoder_inputs.to(device1)

        #run the decoder
        activations_loss, rating_prediction, repetitiveness_loss_back, target_length = recommender_decoder(overall_context_vector_encoder, decoder_hidden, decoder_inputs, output_length, overall_attention_weights, product_ratings, mask, teacher_forcing_ratio)

        #----------------------------------------------------------------------------------------> TEXT LOSS
        # Flattened copies are used for the loss only. The per-sample decoding below needs the
        # unflattened tensors; the old code overwrote them, so text_reviews[sample][word]
        # indexed a 0-dim tensor.
        # assumes the decoder returns (batch, sequence, output_space) - TODO confirm
        flat_activations = activations_loss.view(-1, output_space)
        flat_targets = text_reviews.view(-1).to(device1)

        review_loss = criterion_review(flat_activations, flat_targets)
        overall_review_loss.append(review_loss.item())

        #----------------------------------------------------------------------------------------> RATING LOSS
        rating_reviews = rating_reviews.to(device1)
        rating_prediction = rating_prediction.view(-1)
        rating_prediction = rating_prediction.type(torch.DoubleTensor).to(device1)
        rating_loss = criterion_rating(rating_reviews, rating_prediction).type(torch.FloatTensor).to(device1)
        overall_rating_loss.append(rating_loss.item())

        #iterate every sample in the batch
        for sample in range(activations_loss.shape[0]):
            sample_activations = activations_loss[sample]
            target_reviews_length = sample_activations.shape[0]

            predicted_sentence = []
            ground_truth = []
            for word in range(target_reviews_length):
                #the position of the max activation is the predicted word ID
                predicted_sentence.append(torch.argmax(sample_activations[word]).item())
                ground_truth.append(text_reviews[sample][word].item())

            ground_truth_rating = rating_reviews[sample].item()
            rating_prediction_item = rating_prediction[sample].item()

            rouge_score = evaluation.make_sentence_one_by_one(predicted_sentence, ground_truth, target_reviews_length, tokenizer, role, count_predictions, rating_prediction_item, ground_truth_rating,print_sentence,max_review_length)
            overall_rouge_score.append(rouge_score)

    if not overall_review_loss or not overall_rating_loss:
        raise ValueError('No batches were produced for role %r; nothing to average' % (role,))

    # averaged after the loop so that every batch contributes, not only the first one
    # NOTE(review): overall_rouge_score is collected but not returned, while the callers bind the
    # first return value to rouge_score_* - confirm whether the mean ROUGE should be returned instead.
    return sum(overall_review_loss)/len(overall_review_loss), sum(overall_rating_loss)/len(overall_rating_loss)
        
        
        
        

I do not know why, but as soon as I removed the dataloader creation from the validation function and placed it above the training loop, it worked fine.

Before the fix:

#test and make the sentences
def validation(recommender_encoder,recommender_decoder,epoch,role):
        """Excerpt of validation() showing only its data-loading prologue.

        For the given ``role`` ('Testing', 'Training' or 'Development') the
        matching pickle is loaded into a ``Dataset_yelp`` and wrapped in a new
        ``DataLoader`` with 4 workers on every call. Any other ``role`` leaves
        ``dataloader`` unassigned.
        """

        #only in the validation
        from Keras import evaluation

        #print("\n\n--------------------------- Loaded model from disk ---------------------------\n\n")

        ''' define the decay function in order to control the trade off between feeding the model with it's output or the real output | #initialize the decay '''
        teacher_forcing_ratio = 1 #because we want the decoder to feed itself only with it's own outputs       

        if (role=='Testing'):
                #define the datalodaer
                dataloader = dataset_loader.Dataset_yelp('data/testing.pkl', transform=None) #output_space = 8171
        elif (role=='Training'):
                #define the datalodaer
                dataloader = dataset_loader.Dataset_yelp('data/training.pkl', transform=None) #output_space = 8171
        elif (role=='Development'):      
                #define the datalodaer
                dataloader = dataset_loader.Dataset_yelp('data/development.pkl', transform=None) #output_space = 8171

        batch_size = 64
        # 'dataloader' still holds the Dataset at this point; it is rebound to the DataLoader below
        train_sampler = ShuffleBatchSampler(SortedSampler(dataloader,sort_key=lambda i: len(i[4])),batch_size=batch_size,drop_last=False,shuffle=False)


        #define some proparties for the dataloader
        loader_params = {'num_workers':4, 'pin_memory':True, 'collate_fn':custom_collate, 'batch_sampler':train_sampler}
        dataloader = torch.utils.data.DataLoader(dataloader, **loader_params)

After the fix:

#test and make the sentences
def validation(recommender_encoder,recommender_decoder,epoch,role,train_dataset_loader,dataloader_test,dataloader_development):

        teacher_forcing_ratio = 1 #because we want the decoder to feed itself only with it's own outputs

        if (role=='Testing'):
                #define the datalodaer
                dataloader = dataloader_test
        elif (role=='Training'):
                #define the datalodaer
                dataloader = train_dataset_loader
        elif (role=='Development'):      
                #define the datalodaer
                dataloader = dataloader_development