Hello,
I get this warning:

> RuntimeWarning: invalid value encountered in double_scalars

It is raised by this line of code:

> FILE.write("\nEpoch " + str(epoch) + ": Training Loss = " + str(np.sum(trg_loss).max()/len(trg_loss)))
The output then looks as follows, with NaN for the training loss:

> Epoch : 0
> Epoch 0: Training Loss = nan
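The warning itself is reproducible with plain NumPy, so I suspect the logging expression rather than the model (a minimal sketch; the empty `trg_loss` here is only a hypothetical illustration):

> import numpy as np
>
> trg_loss = []  # hypothetical: suppose nothing was ever appended
> # np.sum([]) is 0.0, and 0.0 / 0 triggers
> # "RuntimeWarning: invalid value encountered in double_scalars" and yields nan
> print(np.sum(trg_loss).max() / len(trg_loss))  # nan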
The entire code is posted below:
>
> import os
> import numpy as np
> import torch
> from torch import nn, optim
> from torch.utils import data
> from joblib import Parallel, delayed
>
> # MyModel and Dataset are defined in my own modules (definitions omitted)
> model = MyModel()
>
> print(model)
>
> ## name of text file to save training stats
> TRAINING_LOG = "log.txt"
> FILE = open(TRAINING_LOG, 'w')
>
>
> #%% Here we train and evaluate the model
>
> saved_models_path = os.getcwd() + '/models/'
> if not os.path.isdir(saved_models_path):
>     os.makedirs(saved_models_path)
>
>
> # Check if CUDA is available
> use_cuda = torch.cuda.is_available()
> # print("\n\n USE CUDA: ", use_cuda)
> # print("\n\n")
>
>
> ## Force use_cuda to False for stability; comment this out if using a GPU
> use_cuda = False
>
> device = torch.device("cuda:0" if use_cuda else "cpu")
> # cudnn.benchmark = True
>
>
>
>
> # Set training parameters
> params = {'batch_size': 64,
>           'shuffle': True,
>           'num_workers': 10}
> max_epochs = 100
> core_num = 20
> tune_every = 10 # test and validate the model every x epochs
>
> data_library = {}
> prior = '100MHz'
> target = '250MHz'
>
> ## Amount of dataset to use for the experiment
> data_frac = 1  ## fraction of the data to use for training (set to 1 for the final experiment)
>
>
>
> # Load all the data from the txt file
> file_IDs = open('ID_list.txt','r').read().split('\n')
> file_IDs = file_IDs[:-1]  # drop the empty entry after the trailing newline
> file_IDs = file_IDs[:int(len(file_IDs)*data_frac)]
> complete_dataset = Dataset(file_IDs)
>
>
> # create your optimizer
> optimizer = optim.SGD(model.parameters(), lr=0.005, momentum = 0.1)
> scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)
>
> #%% Here we train the network
>
> # Divide the dataset into the training and validation set
> lengths = [int(np.ceil(len(complete_dataset)*0.8)),
>            int(np.floor(len(complete_dataset)*0.1)),
>            int(np.floor(len(complete_dataset)*0.1))]
> training_set, validation_set, evaluation_set = torch.utils.data.random_split(complete_dataset, lengths)
> training_generator = data.DataLoader(training_set, **params)
> validation_generator = data.DataLoader(validation_set, **params)
> evaluation_generator = data.DataLoader(evaluation_set, **params)
>
>
> # cast the model parameters to double precision (float64)
> forward_model = model.double()
>
> # # check if model works for random instance of the data
> # t100 ,t250 = next(iter(training_generator))
> # one_prediction = forward_model(t100)
>
>
> loss_function = nn.MSELoss()
>
>
> for param in forward_model.parameters():
>     param.requires_grad = True
>
> def train(low_res, high_res):
>     global forward_model, optimizer
>     low_res, high_res = low_res.to(device), high_res.to(device)
>     optimizer.zero_grad()
>     prediction_training = forward_model(low_res)
>     loss = loss_function(prediction_training, high_res)  # compute the value that estimates how far the output is from the target
>     #print("\nTraining Loss: ", loss)
>     loss.backward()  # the whole graph is differentiated w.r.t. the loss; all tensors with requires_grad=True have their .grad accumulated
>     optimizer.step()
>     return loss
>
>
> def val(low_res, high_res):
>     global forward_model, optimizer
>     low_res, high_res = low_res.to(device), high_res.to(device)
>     optimizer.zero_grad()
>     prediction_training = forward_model(low_res)
>     loss = loss_function(prediction_training, high_res)  # compute the value that estimates how far the output is from the target
>     #print("\nValidation Loss: ", loss)
>     #scheduler.step()
>     return loss
>
>
>
> # Loop over epochs
> for epoch in range(max_epochs):
>     print('\nEpoch : ' + str(epoch))
>     FILE.write('\nEpoch : ' + str(epoch))
>
>     # Training
>     trg_loss = []
>     forward_model.train()
>     with torch.set_grad_enabled(True):
>         for data_library['100MHz'], data_library['250MHz'] in training_generator:
>             if core_num > 1:
>                 Parallel(n_jobs=core_num)
>                 #print(data_library['20MHz'].shape)
>                 delayed(train)(data_library[prior], data_library[target])
>             else:
>                 new_loss = train(data_library[prior], data_library[target])
>                 trg_loss.append(new_loss)
>             # Parallel(n_jobs=core_num)
>             # print(data_library['20MHz'].shape)
>             # delayed(train)(data_library[prior], data_library[target])
>     #print("\nTraining Loss: ", sum(trg_loss).detach().numpy().max()/len(trg_loss))
>     FILE.write("\nEpoch " + str(epoch) + ": Training Loss = " + str(np.sum(trg_loss).max()/len(trg_loss)))
>
>
>     if epoch % tune_every == 0 and epoch > 0:
>         # Validation
>         val_loss = []
>         forward_model.eval()
>         with torch.set_grad_enabled(False):
>             for data_library['100MHz'], data_library['250MHz'] in validation_generator:
>                 if core_num > 1:
>                     Parallel(n_jobs=core_num)
>                     #print(data_library['20MHz'].shape)
>                     delayed(train)(data_library[prior], data_library[target])
>                 else:
>                     new_loss = val(data_library[prior], data_library[target])
>                     val_loss.append(new_loss)
>
>         #print("\nValidation Loss: ", sum(val_loss).numpy().max()/len(val_loss))
>         FILE.write("\nEpoch " + str(epoch) + ": Validation Loss = " + str(np.sum(val_loss).max()/len(val_loss)))
>         # HYPERPARAMETER TUNING GOES IN HERE
>
> FILE.close()
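For reference, this is how I expected the epoch average to be computed inside the loop (just a sketch of my intent; `epoch_losses` is a name I made up here, and I assume `trg_loss` ends up as a non-empty list of scalar loss tensors):

> # sketch: average the per-batch losses as plain floats
> epoch_losses = [l.item() for l in trg_loss]  # .item() detaches each loss to a Python float
> if epoch_losses:  # guard against dividing by zero when the list is empty
>     avg_loss = sum(epoch_losses) / len(epoch_losses)
>     FILE.write("\nEpoch " + str(epoch) + ": Training Loss = " + str(avg_loss))

Is `np.sum(trg_loss).max()/len(trg_loss)` on a list of loss tensors equivalent to this, or could that expression itself be producing the nan?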