HELP --> RuntimeWarning: invalid value encountered in double_scalars

Hello,

I get this warning: RuntimeWarning: invalid value encountered in double_scalars
It is raised by this line of code: FILE.write("\nEpoch "+str(epoch)+": Training Loss = " +str(np.sum(trg_loss).max()/len(trg_loss)))

The output contains NaN and looks as follows:
Epoch : 0
Epoch 0: Training Loss = nan

The entire code is posted below:

> import os
> import numpy as np
> import torch
> import torch.nn as nn
> import torch.optim as optim
> from torch.utils import data
> from joblib import Parallel, delayed
> 
> model = MyModel()
> 
> print(model)
> 
> ## name of text file to save training stats
> TRAINING_LOG = "log.txt"
> FILE = open(TRAINING_LOG, 'w')
> 
> 
> #%% Here we train and evaluate the model
>         
> saved_models_path = os.getcwd() + '/models/'
> if(not os.path.isdir(saved_models_path)):
>     os.makedirs(saved_models_path)
> 
> 
> # Check if CUDA is available
> use_cuda = torch.cuda.is_available()
> # print("\n\n USE CUDA: ", use_cuda)
> # print("\n\n")
> 
> 
> ## Setting cuda as False for stability. Please comment out if using GPU
> use_cuda = False
> 
> device = torch.device("cuda:0" if use_cuda else "cpu")
> # cudnn.benchmark = True
> 
> 
> 
> 
> # Set training parameters
> params = {'batch_size': 64,
>           'shuffle': True,
>           'num_workers': 10}
> max_epochs = 100
> core_num = 20
> tune_every = 10 # test and validate the model every x epochs
> 
> data_library = {}
> prior = '100MHz'
> target = '250MHz'
> 
> ## Amount of dataset to use for the experiment
> data_frac = 1  ## What fraction of data to use for training (please set to 1 for final experiment ) 
> 
> 
> 
> # Load all the data from the txt file
> file_IDs = open('ID_list.txt','r').read().split('\n')
> file_IDs = file_IDs[:-1] # remove last line
> file_IDs = file_IDs[:int(len(file_IDs)*data_frac)]
> complete_dataset = Dataset(file_IDs)
> 
> 
> # create your optimizer
> optimizer = optim.SGD(model.parameters(), lr=0.005, momentum = 0.1)
> scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)
> 
> #%% Here we train the network
> 
> # Divide the dataset into the training and validation set
> lengths = [int(np.ceil(len(complete_dataset)*0.8)), int(np.floor(len(complete_dataset)*0.1)), int(np.floor(len(complete_dataset)*0.1))]
> training_set, validation_set, evaluation_set = torch.utils.data.random_split(complete_dataset, lengths)
> training_generator = data.DataLoader(training_set, **params)
> validation_generator = data.DataLoader(validation_set, **params)
> evaluation_generator = data.DataLoader(evaluation_set, **params)
> 
> 
> # cast the model parameters to double precision
> forward_model = model.double()
> 
> # # check if model works for random instance of the data 
> # t100 ,t250 = next(iter(training_generator))
> # one_prediction = forward_model(t100)
> 
> 
> loss_function = nn.MSELoss()
> 
> 
> for param in forward_model.parameters():
>     param.requires_grad = True
> 
> def train(low_res, high_res):
> # Perform one training step on a single batch
>     global forward_model, optimizer
>     low_res, high_res = low_res.to(device), high_res.to(device)
>     optimizer.zero_grad()
>     prediction_training = forward_model(low_res)
>     loss = loss_function(prediction_training, high_res) # Compute the value that estimates how far the output is from the target
>     #print("\nTraining Loss: ", loss)
>     loss.backward() # The whole graph is differentiated w.r.t. the loss; all tensors with requires_grad=True will have their .grad tensor accumulated with the gradient.
>     optimizer.step()
>     return loss
> 
> 
> def val(low_res, high_res):
>     global forward_model, optimizer
>     low_res, high_res = low_res.to(device), high_res.to(device)
>     optimizer.zero_grad()
>     prediction_training = forward_model(low_res)
>     loss = loss_function(prediction_training, high_res) # Compute the value that estimates how far the output is from the target
>     #print("\nValidation Loss: ", loss)
>     #scheduler.step()
>     return loss
> 
> 
> 
> for epoch in range(max_epochs):
>     print('\nEpoch : ' + str(epoch))   
>     FILE.write('\nEpoch : ' + str(epoch))
> 
>    	# Training
>     trg_loss = [] 
>     forward_model.train()
>     with torch.set_grad_enabled(True):
>         for data_library['100MHz'], data_library['250MHz'] in training_generator:
>             if(core_num>1):
>                 Parallel(n_jobs=core_num)
>                 #print(data_library['20MHz'].shape)
>                 delayed(train)(data_library[prior],data_library[target])
>             else:
>                 new_loss = train(data_library[prior],data_library[target])
>                 trg_loss.append(new_loss)
>             # Parallel(n_jobs=core_num)
>             # print(data_library['20MHz'].shape)
>             # delayed(train)(data_library[prior],data_library[target])
>     #print("\nTraining Loss: ", sum(trg_loss).detach().numpy().max()/len(trg_loss))
>     FILE.write("\nEpoch "+str(epoch)+": Training Loss = " +str(np.sum(trg_loss).max()/len(trg_loss)))
>            
>             
>     if epoch % tune_every == 0 and epoch > 0:
>    	# validation
>         val_loss = []
>         forward_model.eval()
>         with torch.set_grad_enabled(False):
>             for data_library['100MHz'], data_library['250MHz'] in validation_generator:
>                 if(core_num>1):
>                     Parallel(n_jobs=core_num)
>                     #print(data_library['20MHz'].shape)
>                     delayed(train)(data_library[prior],data_library[target])
>                 else:
>                     new_loss = val(data_library[prior],data_library[target])
>                     val_loss.append(new_loss)
> 
>         #print("\nValidation Loss: ", sum(val_loss).numpy().max()/len(val_loss))
>         FILE.write("\nEpoch "+str(epoch)+": Validation Loss = " +str(np.sum(val_loss).max()/len(val_loss)))
>             # HYPERPARAMETER TUNING GOES IN HERE
> 
> FILE.close()

Are you sure it is not numpy that throws this warning?
Also, I guess this is because you have NaN values. Fixing the NaN values will most likely make the warning go away.

Then would you suggest writing it without numpy? I.e.:

FILE.write("\nEpoch "+str(epoch)+": Training Loss = " +str(sum(trg_loss).max()/len(trg_loss)))

The NaN also needs to be fixed, but I have no clue where it comes from yet. Is there a way to check it quickly?

Yes, you actually do the same sum on the commented line just above without the np :smiley:
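
For instance, a minimal sketch of that logging line without numpy (the variable names are taken from your loop; calling .item() on each loss tensor is an assumption, not something the posted code does):

avg_trg_loss = sum(loss.item() for loss in trg_loss) / len(trg_loss)  # plain Python floats, no numpy
FILE.write("\nEpoch " + str(epoch) + ": Training Loss = " + str(avg_trg_loss))

Storing .item() values (or detached tensors) in trg_loss also avoids keeping the autograd graph of every batch alive until the end of the epoch.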

If it happens during the forward pass, you can add a few prints in your code to find out where it appears.
If it happens during the backward pass, you can enable anomaly detection to find out where it appears.
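
For example, a rough sketch of both checks, assuming the variable names used in your train() function:

# Backward: report the operation that produced the NaN/Inf during loss.backward()
torch.autograd.set_detect_anomaly(True)

# Forward: quick checks right after computing the prediction and the loss inside train()
if torch.isnan(prediction_training).any():
    print("NaN in the model output")
if torch.isnan(loss).any():
    print("NaN loss for this batch")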