RuntimeError: one of the variables needed for ensemble neural net gradient computation has been modified by an inplace operation

I am trying to train an ensemble of neural networks in PyTorch. During the backward pass, I am encountering the following error message:
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [240, 1]], which is output 0 of AsStridedBackward0, is at version 2; expected version 1 instead.

Here is the code that is causing the error:

```python
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
import numpy as np

# Check if a GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Load the dataset (Possible values: 'iris', 'digits', 'wine', 'breast_cancer', 'diabetes', 'boston')
dataset_name = 'breast_cancer'
X, y = load_dataset(dataset_name)

# Shuffle and split the dataset into training and testing sets
X, y = shuffle(X, y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data (zero mean, unit variance)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert NumPy arrays to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train = torch.tensor(y_train, dtype=torch.long).to(device)
X_test = torch.tensor(X_test, dtype=torch.float32).to(device)
y_test = torch.tensor(y_test, dtype=torch.long).to(device)

# Define a simple neural network
class SimpleNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.relu3 = nn.ReLU()
        self.fc4 = nn.Linear(hidden_size, hidden_size)
        self.relu4 = nn.ReLU()
        self.fc5 = nn.Linear(hidden_size, hidden_size)
        self.relu5 = nn.ReLU()
        self.fc6 = nn.Linear(hidden_size, hidden_size)
        self.relu6 = nn.ReLU()
        self.fc7 = nn.Linear(hidden_size, hidden_size)
        self.relu7 = nn.ReLU()
        self.fc8 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        x = self.relu3(x)
        x = self.fc4(x)
        x = self.relu4(x)
        x = self.fc5(x)
        x = self.relu5(x)
        x = self.fc6(x)
        x = self.relu6(x)
        x = self.fc7(x)
        x = self.relu7(x)
        x = self.fc8(x)
        return x

# Create a list of neural network instances
input_size = X.shape[1]
hidden_size = input_size * 8
output_size = 1
ensemble_size = 20
ensemble = [SimpleNN(input_size, hidden_size, output_size).to(device) for i in range(ensemble_size)]

# Enable anomaly detection
torch.autograd.set_detect_anomaly(True)

# Specify number of epochs
epochs = 7500

def entropy(p):
    eps = 1e-9
    return -p * torch.log2(p + eps) - (1 - p) * torch.log2(1 - p + eps)

def step_sigmoid(x):
    return torch.sigmoid(1000 * (x - .5))

def voted_predict(y_pred):
    return step_sigmoid(torch.mean(step_sigmoid(y_pred), axis=1))

# Compute the indicator function I
def indicator(y_pred, y_true):
    return (voted_predict(y_pred) - y_true) ** 2

# Calculate the probability
def positive_probability(y_pred):
    num_pos = torch.sum(step_sigmoid(y_pred), axis=1)
    total = y_pred.shape[1]
    p_pos = num_pos / total
    return p_pos

def concatenate_predictions(ens_y_pred, y_pred):
    if ens_y_pred is None:
        ens_y_pred = y_pred.clone()
    else:
        # Create a new tensor holding the concatenated values
        ens_y_pred = torch.cat([ens_y_pred, y_pred], dim=1)
    return ens_y_pred

# Define the custom loss function
def custom_loss(y_true, y_pred):
    # Make a copy of y_pred before passing it to the indicator function
    y_pred_copy = y_pred  # .clone().detach()
    y_true_copy = y_true  # .clone().detach()

    I = indicator(y_pred_copy, y_true_copy).detach().clone()
    print("I in custom_loss:", I)

    # Calculate the probability
    p_pos = positive_probability(y_pred_copy)
    print("p_pos in custom_loss:", p_pos)

    # Compute the entropy of the predicted probabilities
    H = entropy(p_pos)  # .detach().clone()
    print("H in custom_loss:", H)

    # Compute the loss function
    loss = I + torch.pow(H, 1 - I) * torch.pow(1 - H, I)
    print("loss in custom_loss:", loss)
    print("torch.mean(loss):", torch.mean(loss))

    return torch.mean(loss)

optimizer = optim.SGD([{'params': model.parameters()} for model in ensemble], lr=0.01)

# Initialize TensorBoard writer
writer = SummaryWriter()

# Train the ensemble of neural networks
prev_weights = {}

for epoch in range(epochs):
    if epoch % 100 == 0:
        print("epoch:", epoch)

    # Shuffle the training data for each epoch
    X_train_shuffled, y_train_shuffled = shuffle(X_train.clone(), y_train.clone(), random_state=epoch)

    # Initialize the ensemble predictions
    ens_y_pred = None

    # Train each neural network in the ensemble
    for i, model in enumerate(ensemble):
        # Set model to training mode
        model.train()

        # Forward pass
        y_pred = model(X_train)
        print("y_pred.shape:", y_pred.shape)

        # Concatenate the tensors along the first dimension
        ens_y_pred = concatenate_predictions(ens_y_pred, y_pred)

    # Reshape tensor
    ens_y_pred = ens_y_pred.reshape([ens_y_pred.shape[1], ens_y_pred.shape[0]])

    print("ens_y_pred.shape:", ens_y_pred.shape)

    for i, model in enumerate(ensemble):
        print('i:', i)

        # Calculate loss
        loss = custom_loss(y_train, ens_y_pred)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward(retain_graph=True)
        optimizer.step()
```

Using retain_graph=True often causes these issues, e.g. because the parameters were already updated in place and stale forward activations are then reused in the next backward pass.
Could you explain why you are using this argument?
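To illustrate this failure mode, here is a minimal sketch (not taken from your code) where retain_graph=True combined with an optimizer step triggers exactly this kind of inplace-modification error:

```python
import torch

# Two stacked layers: the second layer's weight is saved for the backward pass
model = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.Linear(4, 1))
opt = torch.optim.SGD(model.parameters(), lr=0.1)

x = torch.randn(8, 4)
out = model(x).sum()

out.backward(retain_graph=True)  # keeps the saved tensors of the graph alive
opt.step()                       # updates the parameters in place (version counter bump)
out.backward()                   # reuses the stale saved weight
# RuntimeError: one of the variables needed for gradient computation
# has been modified by an inplace operation
```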

@ptrblck, Thank you for your prompt response. If I don’t use retain_graph = True, then I get the following error:

RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.

This seems to be the actual issue then.
Your code is unfortunately not formatted properly and is thus hard to read (you can post code snippets by wrapping them in three backticks ```), but generally these errors are raised when you reuse tensors that are attached to a computation graph and then call backward multiple times.
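As a minimal sketch (hypothetical code, not your snippet) of that pattern, where two losses share the same concatenated graph and backward is called once per loss:

```python
import torch

m1 = torch.nn.Linear(4, 1)
m2 = torch.nn.Linear(4, 1)
x = torch.randn(8, 4)

# One computation graph containing both models' forward passes
ens = torch.cat([m1(x), m2(x)], dim=1)

loss1 = ens.mean()
loss2 = ens.pow(2).mean()

loss1.backward()  # frees the intermediates saved during both forward passes
loss2.backward()  # RuntimeError: Trying to backward through the graph a second time
```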

Ah, thank you for letting me know. I've included the formatted code below. Does this have anything to do with the fact that I'm sharing a single loss tensor across all the models in my ensemble?

```python
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
import numpy as np

# Check if a GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Load the dataset (Possible values: 'iris', 'digits', 'wine', 'breast_cancer', 'diabetes', 'boston')
dataset_name = 'breast_cancer'
X, y = load_dataset(dataset_name)

# Print data shapes
print("Dataset Name:", dataset_name)
print("Input Shape:", X.shape)
print("Possible Outputs:", len(np.unique(y)))
print("Unique Outputs:", np.unique(y))
print(" ")

# Shuffle and split the dataset into training and testing sets
X, y = shuffle(X, y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data (zero mean, unit variance)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert NumPy arrays to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train = torch.tensor(y_train, dtype=torch.long).to(device)
X_test = torch.tensor(X_test, dtype=torch.float32).to(device)
y_test = torch.tensor(y_test, dtype=torch.long).to(device)

# Define a simple neural network
class SimpleNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.relu3 = nn.ReLU()
        self.fc4 = nn.Linear(hidden_size, hidden_size)
        self.relu4 = nn.ReLU()
        self.fc5 = nn.Linear(hidden_size, hidden_size)
        self.relu5 = nn.ReLU()
        self.fc6 = nn.Linear(hidden_size, hidden_size)
        self.relu6 = nn.ReLU()
        self.fc7 = nn.Linear(hidden_size, hidden_size)
        self.relu7 = nn.ReLU()
        self.fc8 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        x = self.relu3(x)
        x = self.fc4(x)
        x = self.relu4(x)
        x = self.fc5(x)
        x = self.relu5(x)
        x = self.fc6(x)
        x = self.relu6(x)
        x = self.fc7(x)
        x = self.relu7(x)
        x = self.fc8(x)
        return x

# Create a list of neural network instances
input_size = X.shape[1]
hidden_size = input_size*8
output_size = 1
ensemble_size = 20
ensemble = [SimpleNN(input_size, hidden_size, output_size).to(device) for i in range(ensemble_size)]

# Enable anomaly detection
torch.autograd.set_detect_anomaly(True)

# Specify number of epochs
epochs = 7500

def entropy(p):
    eps = 1e-9
    return -p * torch.log2(p + eps) - (1 - p) * torch.log2(1 - p + eps)

def step_sigmoid(x):
  return torch.sigmoid(1000*(x -.5))

def voted_predict(y_pred):
  return step_sigmoid(torch.mean(step_sigmoid(y_pred), axis = 1))

# Compute the indicator function I
def indicator(y_pred, y_true):
    return (voted_predict(y_pred)- y_true)**2

# Calculate the probability
def positive_probability(y_pred):
  num_pos = torch.sum(step_sigmoid(y_pred), axis = 1)
  total = y_pred.shape[1]
  p_pos = num_pos / total
  return p_pos

def concatenate_predictions(ens_y_pred, y_pred):
    if ens_y_pred is None:
        ens_y_pred = y_pred.clone()
    else:
        # Create a new tensor to hold the concatenated values
        new_ens_y_pred = torch.cat([ens_y_pred, y_pred], dim=1)

        # Return the new tensor
        ens_y_pred = new_ens_y_pred
    return ens_y_pred
 

# Define the custom loss function
def custom_loss(y_true, y_pred):
    # print("y_true.shape in custom_loss:", y_true.shape)  
    # print("y_pred.shape in custom_loss:", y_pred.shape)

    # Make a copy of y_pred before passing it to the indicator function
    y_pred_copy = y_pred#.clone().detach()
    y_true_copy = y_true#.clone().detach()

    
    
    I = indicator(y_pred_copy, y_true_copy).detach().clone()
    # print("I in custom_loss:", I)

    # Calculate the probability
    p_pos = positive_probability(y_pred_copy)
    # print("p_pos in custom_loss:", p_pos)

    # Compute the entropy of the predicted probabilities
    H = entropy(p_pos)#.detach().clone()
    # print("H in custom_loss:", H)

    # Compute the loss function
    loss = I + torch.pow(H, 1 - I) * torch.pow(1 - H, I)
    print("loss in custom_loss:", loss)
    # print("torch.mean(loss):", torch.mean(loss))

    return torch.mean(loss)         

#optimizer = optim.SGD([{'params': model.parameters()} for model in ensemble], lr=0.01)
optimizer = optim.SGD([{'params': model.parameters()} for model in ensemble], lr=0.01)
# optimizer = torch.optim.SGD(model.parameters(), lr=1e-5)
# or
# optimizer = torch.optim.SGD(model.classifier.parameters(), lr=1e-5)

# # Initialize TensorBoard writer
# writer = SummaryWriter()


# # Train the ensemble of neural networks
# prev_weights = {}

for epoch in range(epochs):
  if epoch%100 == 0:
    print("epoch:", epoch)



  # Shuffle the training data for each epoch
  # X_train_shuffled, y_train_shuffled = shuffle(X_train.clone(), y_train.clone(), random_state=epoch)
  # X_train = X_train_shuffled.clone()
  # y_train = y_train_shuffled.clone()
  # Initialize the ensemble predictions
  ens_y_pred = None

  # Train each neural network in the ensemble
  for i, model in enumerate(ensemble):
      # Set model to training mode
      model.train()

      # Forward pass
      y_pred = model(X_train)
      # print("y_pred.shape:", y_pred.shape)

      # Concatenate the tensors along the first dimension
      ens_y_pred = concatenate_predictions(ens_y_pred, y_pred)

      # Reshape tensor
  # ens_y_pred = ens_y_pred.reshape([ens_y_pred.shape[1],ens_y_pred.shape[0]])

  print("ens_y_pred.shape:",ens_y_pred.shape )

  for i, model in enumerate(ensemble):      
      print('i:', i)
      # Calculate loss

      loss = custom_loss(y_train, ens_y_pred)

      # Backward pass and optimization
      optimizer.zero_grad()
      # loss.backward(retain_graph=True)
      loss.backward()
      optimizer.step()
      

      # # Record the training loss for this model
      # writer.add_scalar(f'Training Loss (Model {i + 1})', loss.item(), epoch)

      # # Record the requested information for this model
      # for j, layer in enumerate(model.children()):
      #     if isinstance(layer, nn.Linear):
      #         # Change in weights
      #         if (i, j) in prev_weights:
      #             weight_diff = layer.weight.detach() - prev_weights[(i, j)]['weight']
      #             writer.add_histogram(f'Model {i + 1}/Layer {j + 1}/Weight Change', weight_diff, epoch)
      #         prev_weights[(i, j)] = {'weight': layer.weight.detach().clone(), 'bias': layer.bias.detach().clone()}

      #         # Store weights
      #         if (i, j) in prev_weights:
      #             weight_ = layer.weight.detach()
      #             writer.add_histogram(f'Model {i + 1}/Layer {j + 1}/Weights', weight_, epoch)

      #         # Variance between input and output
      #         input_var = np.var(layer_outputs[i][j - 1]) if j > 0 else np.var(X_train.cpu().numpy())
      #         output_var = np.var(layer_outputs[i][j])
      #         writer.add_scalars(f'Model {i + 1}/Layer {j + 1}/Variance', {'input': input_var, 'output': output_var}, epoch)

      #         # Neuron standard deviation
      #         neuron_std = np.std(layer_outputs[i][j], axis=0)
      #         writer.add_histogram(f'Model {i + 1}/Layer {j + 1}/STD', torch.tensor(neuron_std), epoch)

      #         # Bias of each layer
      #         writer.add_histogram(f'Model {i + 1}/Layer {j + 1}/Bias', layer.bias.detach(), epoch)

              # # Entropy for each layer
              # if j == len(model) - 1:  # Only apply softmax to the output layer
              #     softmax = nn.Softmax(dim=1)
              #     probs = softmax(y_pred).detach().cpu().numpy()
              # else:
              #     probs = layer_outputs[i][j]
            
              # layer_entropy = entropy(probs)
              # writer.add_scalar(f'Model {i + 1}/Layer {j + 1}/Entropy', layer_entropy, epoch)

# Evaluate the ensemble of neural networks on the test set
with torch.no_grad():
    outputs = []
    for model in ensemble:
        model.eval()
        outputs.append(model(X_test))
    outputs = torch.stack(outputs)
    ensemble_output = torch.mean(outputs, dim=0)
    ensemble_loss = nn.functional.cross_entropy(ensemble_output, y_test)
    # writer.add_scalar('Ensemble Test Loss', ensemble_loss.item(), epoch)

    _, predicted = torch.max(ensemble_output, 1)
    correct = (predicted == y_test).sum().item()
    accuracy = correct / y_test.size(0)
    # writer.add_scalar('Ensemble Test Accuracy', accuracy, epoch)
```

I think that the error ("Trying to backward through the graph a second time…") may have been caused by the torch.cat call that I was using previously. I updated my code to store the ensemble predictions in a list instead of using torch.cat, as shown below. The code is running now, but the loss stays the same. Any suggestions @ptrblck?

```python
# Initialize the ensemble predictions
ens_y_pred = None
ens_y_pred_list = []

# Train each neural network in the ensemble
for i, model in enumerate(ensemble):
    # Set model to training mode
    model.train()

    # Forward pass
    y_pred = model(X_train)
    # print("y_pred.shape:", y_pred.shape)

    # Store the prediction in a list instead of concatenating it
    # ens_y_pred = concatenate_predictions(ens_y_pred, y_pred)
    ens_y_pred_list.append(y_pred)

# Reshape tensor
# ens_y_pred = ens_y_pred.reshape([ens_y_pred.shape[1], ens_y_pred.shape[0]])
# print("ens_y_pred.shape:", ens_y_pred.shape)

for i, model in enumerate(ensemble):
    # print('i:', i)

    # Calculate loss
    y_pred = ens_y_pred_list[i]
    loss = custom_loss(y_train, y_pred)
    # loss = custom_loss(y_train, ens_y_pred)

    # Backward pass and optimization
    optimizer.zero_grad()
    # loss.backward(retain_graph=True)
    loss.backward()
    optimizer.step()
```

The issue in the previously posted code is raised because you are creating ens_y_pred by concatenating the outputs of the different models and then calling backward on it multiple times instead of once. The new code treats each prediction individually, so you could use a single for loop instead of storing the outputs in a list and iterating over it afterwards.
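For example, a minimal sketch of that single-loop structure, reusing the names from your snippet (assuming each model should be trained only on its own prediction):

```python
for epoch in range(epochs):
    for model in ensemble:
        model.train()

        # Fresh forward pass: this graph belongs to this model only
        y_pred = model(X_train)
        loss = custom_loss(y_train, y_pred)

        # One backward per graph, so retain_graph is not needed
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
```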

Your model training is stuck because the intermediate losses are zero matrices when I execute the code: step_sigmoid saturates (its scaled inputs are large negative values), creating an output full of zeros, which then produces zeros in H, etc.
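You can verify the saturation with a quick, hypothetical check: with a slope of 1000 the sigmoid acts as a hard step, so its outputs collapse to 0/1 and its gradients vanish almost everywhere:

```python
import torch

x = torch.tensor([-2.0, 0.0, 0.4, 0.6, 2.0], requires_grad=True)
y = torch.sigmoid(1000 * (x - 0.5))  # the step_sigmoid from the posted code
print(y)       # ~[0., 0., 0., 1., 1.] -- saturated outputs
y.sum().backward()
print(x.grad)  # ~[0., 0., 0., 0., 0.] -- (almost) no gradient signal
```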