The custom functions are validation and make_sentence_one_by_one.
As I said before, when I call validation I get a NaN in the training loss. When I comment those calls out and just print something inside torch.no_grad(), everything works fine, so the problem is not torch.no_grad() itself, it is my function. Sorry for the long code again, but I tried to make it expressive. I understand this is a lot to read without knowing the subject, but thank you for your replies.
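To make the comparison concrete, this is roughly the shape of the torch.no_grad() block in the two cases (a simplified sketch, not the exact code from the loop below):

# works: nothing but a print inside no_grad()
with torch.no_grad():
    print('finished epoch', epoch)

# training loss becomes NaN afterwards: the custom validation calls
with torch.no_grad():
    recommender_encoder.eval()
    recommender_decoder.eval()
    rouge_score_test, rating_loss_test = validation(recommender_encoder, recommender_decoder, epoch, 'Testing')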
#!/usr/bin/python
batch_size = 64
train_sampler = ShuffleBatchSampler(SortedSampler(train_dataset,sort_key=lambda i: len(i[4])),batch_size=batch_size,drop_last=False,shuffle=True)
''' define the dataloader || define some properties for the dataloader, NUMBER_OF_WORKERS = NUMBER_OF_CPU_CORES '''
#loader_params = {'batch_size':32, 'shuffle':True, 'num_workers':10, 'pin_memory':True, 'collate_fn':custom_collate}
loader_params = {'num_workers':10, 'pin_memory':True, 'collate_fn':custom_collate, 'batch_sampler':train_sampler}
train_dataset_loader = torch.utils.data.DataLoader(train_dataset, **loader_params)
overall_batch = round(train_dataset.__len__()/batch_size)
length = train_dataset.__len__()
print('Dataset length:',length)
for index, (user_reviews, product_reviews, neighbourhood_reviews, product_ratings, text_reviews, rating_reviews, decoder_inputs) in enumerate(train_dataset_loader):
    #make grads zero
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    #change some properties of the input. Move it to GPU and make it a float tensor
    user_reviews = user_reviews.type(torch.FloatTensor)
    product_reviews = product_reviews.type(torch.FloatTensor)
    product_ratings = product_ratings.type(torch.FloatTensor)
    output_length = text_reviews.shape[1]
    #initialize mask to use it in the prediction of rating
    text_reviews = text_reviews.view(-1).to(device1)
    mask = initialize_mask(text_reviews)
    user_reviews = user_reviews.to(device1)
    product_reviews = product_reviews.to(device1)
    product_ratings = product_ratings.to(device1)
    #run the encoder
    overall_context_vector_encoder, overall_attention_weights, decoder_hidden = recommender_encoder(user_reviews, product_reviews)
    decoder_inputs = decoder_inputs.to(device1)
    #run the decoder
    activations_loss, rating_prediction, repetitiveness_loss_back, target_length = recommender_decoder(overall_context_vector_encoder, decoder_hidden, decoder_inputs, output_length, overall_attention_weights, product_ratings, mask, teacher_forcing_ratio)
    activations_loss = activations_loss.view(-1, output_space)
    review_loss = criterion_review(activations_loss, text_reviews)
    overall_review_loss.append(review_loss.item())
    #----------------------------------------------------------------------------------------> RATING LOSS
    #move target to GPU, and change its type
    rating_reviews = rating_reviews.to(device1)
    rating_prediction = rating_prediction.view(-1)
    rating_prediction = rating_prediction.type(torch.DoubleTensor).to(device1)
    #keep record of the rating prediction loss
    rating_loss = criterion_rating(rating_reviews, rating_prediction).type(torch.FloatTensor).to(device1)
    overall_rating_loss.append(rating_loss.item())
    loss = review_loss + rating_loss
    #backpropagate
    loss.backward()
    torch.nn.utils.clip_grad_norm_(recommender_encoder.parameters(), max_norm=0.25) #gradient clipping
    torch.nn.utils.clip_grad_norm_(recommender_decoder.parameters(), max_norm=0.25) #gradient clipping
    encoder_optimizer.step()
    decoder_optimizer.step()
    #mlt_optimizer.step()
#do not build the computational graph
with torch.no_grad():
    recommender_encoder.eval()
    recommender_decoder.eval()
    role = 'Testing'
    rouge_score_test, rating_loss_test = validation(recommender_encoder, recommender_decoder, epoch, role)
    role = 'Training'
    rouge_score_train, rating_loss_train = validation(recommender_encoder, recommender_decoder, epoch, role)
#test and make the sentences
def validation(recommender_encoder,recommender_decoder,epoch,role):
    #only in the validation
    from Keras import evaluation
    #print("\n\n--------------------------- Loaded model from disk ---------------------------\n\n")
    ''' define the decay function in order to control the trade-off between feeding the model with its own output or the real output | #initialize the decay '''
    teacher_forcing_ratio = 1 #because we want the decoder to feed itself only with its own outputs
    if (role=='Testing'):
        #define the dataloader
        dataloader = dataset_loader.Dataset_yelp('data/testing.pkl', transform=None) #output_space = 8171
    elif (role=='Training'):
        #define the dataloader
        dataloader = dataset_loader.Dataset_yelp('data/training.pkl', transform=None) #output_space = 8171
    elif (role=='Development'):
        #define the dataloader
        dataloader = dataset_loader.Dataset_yelp('data/development.pkl', transform=None) #output_space = 8171
    batch_size = 64
    train_sampler = ShuffleBatchSampler(SortedSampler(dataloader,sort_key=lambda i: len(i[4])),batch_size=batch_size,drop_last=False,shuffle=False)
    #define some properties for the dataloader
    loader_params = {'num_workers':4, 'pin_memory':True, 'collate_fn':custom_collate, 'batch_sampler':train_sampler}
    dataloader = torch.utils.data.DataLoader(dataloader, **loader_params)
    #define the loss function for the review prediction
    weights = initialize_loss_weights()
    class_weights = torch.FloatTensor(weights).cuda()
    criterion_review = torch.nn.CrossEntropyLoss(class_weights).cuda() #to put the calculation on the GPU
    #for the review maybe we use Negative Log Likelihood loss
    #criterion_review = torch.nn.NLLLoss().cuda()
    #define the loss function for the rating prediction, we want to use RMSE so we have to use torch.sqrt(criterion(predicted, ground_truth))
    criterion_rating = torch.nn.MSELoss().cuda()
    overall_review_loss = []
    overall_rating_loss = []
    overall_rouge_score = []
    max_review_length = 500
    #iterate over all the batches
    for index, (user_reviews, product_reviews, neighbourhood_reviews, product_ratings, text_reviews, rating_reviews, decoder_inputs) in enumerate(dataloader):
        #for index, (user_reviews, product_reviews, neighbourhood_reviews, product_ratings, text_reviews, rating_reviews, decoder_inputs) in enumerate([first]):
        #change some properties of the input. Move it to GPU and make it a float tensor
        user_reviews = user_reviews.type(torch.FloatTensor)
        product_reviews = product_reviews.type(torch.FloatTensor)
        #format the ratings
        product_ratings = product_ratings.type(torch.FloatTensor)
        #take the output size, this is different for every batch
        output_length = text_reviews.shape[1]
        #initialize mask to use it in the prediction of rating
        text_reviews_mask = text_reviews.view(-1).type(torch.LongTensor).to(device1)
        mask = initialize_mask(text_reviews_mask)
        user_reviews = user_reviews.to(device1)
        product_reviews = product_reviews.to(device1)
        product_ratings = product_ratings.to(device1)
        #run the encoder
        overall_context_vector_encoder, overall_attention_weights, decoder_hidden = recommender_encoder(user_reviews, product_reviews)
        decoder_inputs = decoder_inputs.to(device1)
        #run the decoder
        activations_loss, rating_prediction, repetitiveness_loss_back, target_length = recommender_decoder(overall_context_vector_encoder, decoder_hidden, decoder_inputs, output_length, overall_attention_weights, product_ratings, mask, teacher_forcing_ratio)
        #----------------------------------------------------------------------------------------> TEXT LOSS
        #replace NaN values with 0s
        #activations_loss[activations_loss==float('nan')] = 0
        activations_loss = activations_loss.view(-1, output_space)
        text_reviews = text_reviews.view(-1).to(device1)
        #print(activations_loss, text_reviews)
        review_loss = criterion_review(activations_loss, text_reviews)
        overall_review_loss.append(review_loss.item())
        #----------------------------------------------------------------------------------------> RATING LOSS
        #move target to GPU, and change its type
        rating_reviews = rating_reviews.to(device1)
        rating_prediction = rating_prediction.view(-1)
        rating_prediction = rating_prediction.type(torch.DoubleTensor).to(device1)
        #keep record of the rating prediction loss
        rating_loss = criterion_rating(rating_reviews, rating_prediction).type(torch.FloatTensor).to(device1)
        overall_rating_loss.append(rating_loss.item())
        #iterate every sample in the batch
        for sample in range(activations_loss.shape[0]):
            predicted_sentence = []
            ground_truth = []
            target_reviews_length = activations_loss[sample].shape[0]
            for word in range(activations_loss[sample].shape[0]):
                #find the position of the max element, which represents the word ID
                predicted_index = torch.argmax(activations_loss[sample][word]).item()
                ground_truth.append(text_reviews[sample][word].item())
                predicted_sentence.append(predicted_index)
            ground_truth_rating = rating_reviews[sample].item()
            rating_prediction_item = rating_prediction[sample].item()
            count_predictions = 1
            print_sentence = False
            rouge_score = evaluation.make_sentence_one_by_one(predicted_sentence, ground_truth, target_reviews_length, tokenizer, role, count_predictions, rating_prediction_item, ground_truth_rating, print_sentence, max_review_length)
            overall_rouge_score.append(rouge_score)
    return sum(overall_review_loss)/len(overall_review_loss), sum(overall_rating_loss)/len(overall_rating_loss)