ValueError: array must not contain infs or NaNs

I am using pre-trained models (DenseNet161, ResNet50 & InceptionV3), with a boolean option to select one of them.
However, the model outputs NaN values. I checked whether only some parts of the output array were NaN, but unfortunately all of the values are NaN.
I’ve attached part of the code.
Any suggestions are welcome.

[screenshot of the training code]


Did you check where the NaNs are created and if the input tensors already contain invalid values?
If so, could you post a minimal, executable code snippet to reproduce the issue?
You can post code snippets by wrapping them in three backticks ```, which makes debugging easier than trying to understand the code from screenshots.
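
For example, a quick way to narrow this down (just a sketch, reusing the `model`, `device`, and `train_loader` from your code) is to test the inputs and the forward output for non-finite values and to enable autograd anomaly detection:

```python
import torch

# Debugging only: raises an error at the op that first creates a NaN during backward (slow)
torch.autograd.set_detect_anomaly(True)

for images, labels in train_loader:
    images, labels = images.to(device), labels.to(device)

    # Are the inputs already invalid (e.g. corrupt files or a bad normalization)?
    if not torch.isfinite(images).all():
        print("Non-finite values in the input batch")

    # Or are the NaNs first created by the forward pass?
    output = model(images)
    if not torch.isfinite(output).all():
        print("Non-finite values in the model output")
    break
```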


The NaNs first appear in the output of the model's forward pass.
Let me post the code with backticks.
```python
import time

import numpy as np
import torch
from scipy import stats


def train(model, train_loader, test_loader,
          epochs, optimizer, criterion, scheduler=None,
          name="model.pt", path=None):

    # compare overfitted
    train_loss_data, valid_loss_data = [], []
    # check for validation loss
    valid_loss_min = np.inf
    # calculate time
    since = time.time()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    for epoch in range(epochs):
        print("Epoch: {}/{}".format(epoch + 1, epochs))
        # monitor training loss
        train_loss = 0.0
        valid_loss = 0.0
        total = 0
        e_since = time.time()

        ###################
        # train the model #
        ###################
        model.train()  # prep model for training

        for images, labels in train_loader:
            # Move input and label tensors to the default device
            images, labels = images.to(device), labels.to(device)
            print(images)
            # clear the gradients of all optimized variables
            optimizer.zero_grad()
            # forward pass: compute predicted outputs by passing inputs to the model
            log_ps = model(images)
            # calculate the loss
            loss = criterion(log_ps, labels)
            # backward pass: compute gradient of the loss with respect to model parameters
            loss.backward()
            # perform a single optimization step (parameter update)
            optimizer.step()
            # update running training loss
            train_loss += loss.item() * images.size(0)

        ######################
        # validate the model #
        ######################
        print("\t\tGoing for validation")
        model.eval()  # prep model for evaluation
        losses = []
        for data, target in test_loader:
            # Move input and label tensors to the default device
            data, target = data.to(device), target.to(device)
            # forward pass: compute predicted outputs by passing inputs to the model
            output = model(data)
            # print(output)
            # calculate the loss
            loss_p = criterion(output, target)
            losses.append(loss_p.item())

            # update running validation loss
            valid_loss += loss_p.item() * data.size(0)
            # calculate accuracy
            proba = torch.exp(output)
            top_p, top_class = proba.topk(1, dim=1)
            # equals = top_class == target.view(*top_class.shape)
            # accuracy += torch.mean(equals.type(torch.FloatTensor)).item()
            # _, predicted = torch.max(output.data, 1)
            total += target.size(0)
            result = output.data.cpu().numpy()
            truth = np.array(target.data.cpu().numpy())
            print(result)
            print(truth)
            # result = np.nan_to_num(result)
            # truth = np.nan_to_num(truth)

            # pred_epoch = np.append(pred_epoch, pred_batch_numpy)
            # labels_epoch = np.append(labels_epoch, labels_batch_numpy)

            # Evaluation of performance
            mae = np.average(np.abs(result - truth))
            SRCC, p1 = stats.spearmanr(np.squeeze(truth), np.squeeze(result))
            PLCC, p2 = stats.pearsonr(np.squeeze(truth), np.squeeze(result))

            # Display
            # print("\n Model Parameter [epoch={0}]".format(args.epoch))
            print(" Mean Absolute Error with Truth: {0:.4f}".format(mae))
            print(" Spearman's Correlation Coefficient: {0:.3f}".format(SRCC))
            print(" Pearson's Linear Correlation Coefficient: {0:.3f}".format(PLCC))

            # correct += SRCC.sum().item()
            # correct += (stats.spearmanr(truth, result)).sum().item().cpu().numpy()
            # correct += np.sum((stats.spearmanr(predicted.cpu().numpy(), target.cpu().numpy()))).item()
            # correct += (predicted == target).sum().item()
            # print(correct)

        # print training/validation statistics
        ret_loss = np.mean(losses)
        # calculate average loss over an epoch
        train_loss = train_loss / len(train_loader.dataset)
        valid_loss = valid_loss / len(test_loader.dataset)

        # calculate train loss and running loss
        train_loss_data.append(train_loss * 100)
        valid_loss_data.append(valid_loss * 100)

        print("\tTrain loss:{:.6f}..".format(train_loss),
              "\tValid Loss:{:.6f}..".format(valid_loss),
              "\tLoss: {:.4f}".format(ret_loss))
        # "\tAccuracy: {:.4f}".format(correct / total * 100))

        if scheduler is not None:
            scheduler.step()  # step up scheduler

        # save model if validation loss has decreased
        if valid_loss <= valid_loss_min:
            print('\tValidation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
                valid_loss_min,
                valid_loss))
            torch.save(model.state_dict(), name)
            valid_loss_min = valid_loss
            # save to google drive
            if path is not None:
                torch.save(model.state_dict(), path)

        # Time taken for one epoch
        time_elapsed = time.time() - e_since
        print('\tEpoch:{} completed in {:.0f}m {:.0f}s'.format(
            epoch + 1, time_elapsed // 60, time_elapsed % 60))

    # compare total time
    time_elapsed = time.time() - since
    print('Training completed in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    # load best model
    model = load_latest_model(model, name)

    # return the model
    return [model, train_loss_data, valid_loss_data]
```
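
To see which part of the forward pass first produces the NaNs, I can also register forward hooks (a quick debugging sketch, separate from the training function above) that report the first module whose output contains non-finite values:

```python
import torch

def add_nan_hooks(model):
    """Register hooks that print the name of any module producing NaN/Inf outputs."""
    def make_hook(name):
        def hook(module, inputs, output):
            if isinstance(output, torch.Tensor) and not torch.isfinite(output).all():
                print("Non-finite output in module:", name)
        return hook

    for name, module in model.named_modules():
        module.register_forward_hook(make_hook(name))

# e.g. add_nan_hooks(densenet) before calling train(...)
```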

For the model setup:

```python
import torch
from torch import nn, optim
from torchvision import models

# - Model loading for transfer learning
densenet = models.densenet161(pretrained=True)
densenet = freeze_parameters(densenet)  # freeze feature extraction block
# - Change output network to regression
densenet.classifier = regression(in_n=densenet.classifier.in_features)
# - check model
print(densenet.classifier)
densenet.to(device)

# - Training setting
criterion = nn.HuberLoss()
optimizer = optim.Adam(densenet.classifier.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)

# - Training
densenet, train_loss, test_loss = train(densenet, train_loader, test_loader, epochs, optimizer, criterion)
check_overfitted(train_loss, test_loss)

# - Save
torch.save(densenet.state_dict(), "data/densenet.pth")
print("Saved Densenet to densenet.pth !")

model = densenet
```
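
I also plan to sanity-check the data loaders themselves, in case the inputs or targets already contain non-finite values (a quick check reusing my `train_loader`/`test_loader`):

```python
import torch

def check_loader(loader, name):
    # Scan one pass over the loader and report any non-finite inputs or targets
    for i, (images, labels) in enumerate(loader):
        if not torch.isfinite(images).all():
            print(f"{name}: non-finite image values in batch {i}")
        if not torch.isfinite(labels.float()).all():
            print(f"{name}: non-finite label values in batch {i}")

check_loader(train_loader, "train")
check_loader(test_loader, "test")
```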