Variables needed for gradient computation has been modified

Hello!
I’m trying to implement the following optimization problem, minimizing a loss function built as follows:
[image: formulation of the loss to be minimized]
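
In symbols, matching the code below, the problem is of the surrogate-plus-linearization form

$$\hat{x}_i(y^k) \in \operatorname*{arg\,min}_{x_i} \; \tilde{f}_i(x_i;\, y^k) \;+\; \Big(\sum_{j \neq i} \nabla f_j(y^k)\Big)^{\top} (x_i - y^k)$$

where $y^k$ is the common iterate at step $k$.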

During the implementation I’ve developed the surrogate of agent i (f tilde) and the other term with respect to all the other agents j of the multi-agent system. Trying to solve this using loss.backward() raises an issue related to “variables needed for gradient computation has been modified”. This is my code:

import torch
import torch.nn as nn
import torch.optim as optim

class Agents:
    def __init__(self, train_loaders, val_loaders, test_loader, num_agents, pie=[], tau=10, epochs=5):
        self.train_loaders = train_loaders
        self.val_loaders = val_loaders
        self.test_loader = test_loader
        self.num_agents = num_agents
        self.epochs = epochs
        self.gamma_values = [0.5]
        self.models = [Net() for _ in range(num_agents)]
        self.optimizers = [optim.SGD(model.parameters(), lr=1/tau) for model in self.models]
        self.criterion = nn.CrossEntropyLoss()
        self.validation_accuracies = {i: 0 for i in range(num_agents)}  # Dictionary to store validation accuracies
        self.losses = {i: [] for i in range(num_agents)}  # Dictionary to store losses

    def gamma_update(self, epsilon=2):  # updating step-size rule
        gamma_next = self.gamma_values[-1] * (1 - (epsilon * self.gamma_values[-1]))  # gamma[0] = 1/eps
        self.gamma_values.append(gamma_next)

    def full_linearization(self, y_k, agent_idx):
        model_i = self.models[agent_idx]
        x_i = [param.clone() for param in model_i.parameters()]  # Store the current parameters (x_i)
        model_i.load_state_dict(y_k)
        model_i.train()

        # Initialize pi_i and difference with zeros (same shapes as model_i's state_dict)
        pi_i = {k: torch.zeros_like(v) for k, v in model_i.state_dict().items()}
        difference = {k: torch.zeros_like(v) for k, v in model_i.state_dict().items()}
        total_surrogate_loss = 0.0

        for inputs, labels in self.train_loaders[agent_idx]:
            optimizer = self.optimizers[agent_idx]
            optimizer.zero_grad()

            # Forward pass for agent i
            outputs_i = model_i(inputs)
            CE_surrogate = self.criterion(outputs_i, labels)
            total_surrogate_loss += CE_surrogate

            # Compute the pi_i(x_i[n]) gradients
            for j in range(self.num_agents):
                if j != agent_idx:
                    model_j = self.models[j]
                    model_j.load_state_dict(y_k)  # Load y_k for agent j

                    for inputs_j, labels_j in self.train_loaders[j]:
                        outputs_j = model_j(inputs_j)
                        loss_j = self.criterion(outputs_j, labels_j)
                        grads = torch.autograd.grad(loss_j, model_j.parameters(), retain_graph=True)

                        # Accumulate gradients in pi_i (note the use of `clone` to avoid in-place operations)
                        for (name, param), grad in zip(model_j.named_parameters(), grads):
                            pi_i[name] = pi_i[name] + grad.clone()

            # Flatten pi_i
            pi_flatten = torch.cat([v.view(-1) for v in pi_i.values()])

            # Compute the difference vector x_i - y_k
            for (name, y_k_param), x_i_param in zip(y_k.items(), x_i):
                difference[name] = difference[name] + (x_i_param - y_k_param).clone()

            diff_flat = torch.cat([v.view(-1) for v in difference.values()])

            # Add pi_flatten^T * diff_flat to the surrogate loss
            total_surrogate_loss += torch.dot(pi_flatten, diff_flat)

            # Backward pass and optimization
            total_surrogate_loss.backward(retain_graph=True)  # Using retain_graph=True since we might need it for further computations
            optimizer.step()

        pre_opt = x_i  # parameters before the optimization steps
        post_opt = model_i.state_dict()

        avg_loss = total_surrogate_loss.item() / len(self.train_loaders[agent_idx])
        self.losses[agent_idx].append(avg_loss)
        print(f"Surrogate agent {agent_idx}: {avg_loss}")

        return pre_opt, post_opt

Can anyone see the issue in this code? There are apparently no in-place operations that modify the tensors required by the optimization procedure. This is the error:


RuntimeError                              Traceback (most recent call last)
in <cell line: 4>()
      2 s = Agents(train_loaders, val_loaders, test_loader, num_agents)
      3 y_k = Net().state_dict()
----> 4 s.full_linearization(y_k, 0)

3 frames
/usr/local/lib/python3.10/dist-packages/torch/autograd/graph.py in _engine_run_backward(t_outputs, *args, **kwargs)
    766     unregister_hooks = _register_logging_hooks_on_whole_graph(t_outputs)
    767     try:
--> 768         return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
    769             t_outputs, *args, **kwargs
    770         )  # Calls into the C++ engine to run the backward pass

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [32, 10]], which is output 0 of AsStridedBackward0, is at version 4; expected version 3 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!

Hi Giuseppe!

Note that optimizer.step() performs inplace modifications on
the Parameters it is optimizing. This, together with your use of
retain_graph = True strongly suggests that optimizer.step()
is the cause of your issue.

You should think through with care how you are performing your
optimization. Do you really want to be optimizing an “agent” with
a stale computation graph? Or is your use of retain_graph = True
just an expedient to avoid doing the necessary bookkeeping?

In your case, I think you should track down the root cause of your
error and fix it because it likely indicates a flaw in the design of your
optimization loop.

However, you could also try fixing your issue “automatically” by
using pytorch’s sweep-inplace-modification-errors-under-the-rug
context manager.
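
The failure pattern, in miniature (an illustrative sketch, not your code):

import torch

# optimizer.step() updates the weight in place, invalidating the graph
# that retain_graph = True kept alive
lin = torch.nn.Linear(10, 2)
opt = torch.optim.SGD(lin.parameters(), lr=0.1)

x = torch.randn(4, 10, requires_grad=True)  # forces the weight to be saved for backward
loss = lin(x).sum()

loss.backward(retain_graph=True)
opt.step()       # in-place update of lin.weight bumps its version counter
loss.backward()  # RuntimeError: ... has been modified by an inplace operation

(The context manager I have in mind is, if I recall correctly,
torch.autograd.graph.allow_mutation_on_saved_tensors(), available in
recent versions of pytorch.)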

Best.

K. Frank

Dear KFrank, thanks a lot for your answer.
Retain graph is just an expedient to get the code to run. The main issue I’m struggling with is the fact that I want to optimize a strongly convex optimization problem like this:
[image: formulation of the strongly convex problem]
Where f is the loss function (f_tilde is a surrogate loss function) of the i-th/j-th agent of the system.
This problem is basically the sum of two terms (f_tilde and another one that is like a first-order Taylor expansion). The issue, from a code point of view, is that I’m not able to optimize with respect to the whole optimization problem. How can I implement this in torch? Can I do it using the simple .backward() procedure?
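
To make the question concrete, here is roughly what I would like to write (a sketch; grads_j is a hypothetical list holding the already-computed gradients of the other agents at y^k, treated as constants, and the model is assumed to have no buffers so that state_dict() lines up with parameters()):

# minimize f_tilde_i(x_i) + pi^T (x_i - y^k) with a single backward()
surrogate = criterion(model_i(inputs), labels)  # f_tilde_i term

x_flat  = torch.cat([p.reshape(-1) for p in model_i.parameters()])   # live x_i
y_flat  = torch.cat([v.reshape(-1) for v in y_k.values()]).detach()  # constant y^k
pi_flat = torch.cat([g.reshape(-1) for g in grads_j]).detach()       # constant pi

surrogate = surrogate + torch.dot(pi_flat, x_flat - y_flat)  # linear term
surrogate.backward()  # gradient of the linear term w.r.t. x_i is just pi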

Thanks in advance for your time!

Hi Giuseppe!

I’m not sure what you’re asking here.

Do you believe that the code you originally posted would perform
your optimization procedure correctly if you could resolve your
inplace-modification error?

Or is the code you posted just a red herring and your real question
is how to use pytorch to solve your optimization problem?

Best.

K. Frank

Hi Frank!
No, the code posted above is not a red herring.
I’ve changed the way I update the variable that keeps track of the loss function (total_surrogate_loss), and it seems that the inplace-modification issue is solved.
By the way, there is something strange in the optimization procedure: sometimes it works, and in other runs a different error arises:


I believe that the implemented optimization procedure should be correct, but I have been stuck on this point for days, and I’m asking whether someone can see an issue that I’m not able to see. I attach the latest implementation here:

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

class Agents:
    def __init__(self, train_loaders, val_loaders, test_loader, num_agents, pie=[], tau=100, epochs=5):
        self.train_loaders = train_loaders
        self.val_loaders = val_loaders
        self.test_loader = test_loader
        self.num_agents = num_agents
        self.epochs = epochs
        self.gamma_values = [np.random.uniform(0.2, 1/0.5)]
        self.models = [Net() for _ in range(num_agents)]
        self.optimizers = [optim.SGD(model.parameters(), lr=1/tau) for model in self.models]
        self.criterion = nn.CrossEntropyLoss()
        self.validation_accuracies = {i: 0 for i in range(num_agents)}  # Dictionary to store validation accuracies
        self.losses = {i: [] for i in range(num_agents)}  # Dictionary to store losses

    def gamma_update(self, epsilon=0.5):  # updating step-size rule
        gamma_next = self.gamma_values[-1] * (1 - (epsilon * self.gamma_values[-1]))  # gamma[0] < 1/eps and eps in (0,1]
        self.gamma_values.append(gamma_next)

    def centralized_fullL(self, y_k, agent_idx):  # best-response map implementation
        model_i = self.models[agent_idx]
        x_i = [param.clone() for param in model_i.parameters()]  # Store the current parameters (x_i)
        model_i.load_state_dict(y_k)
        model_i.train()

        # Initialize pi_i and difference with zeros (same shapes as model_i's state_dict)
        pi_i = {k: torch.zeros_like(v) for k, v in model_i.state_dict().items()}
        difference = {k: torch.zeros_like(v) for k, v in model_i.state_dict().items()}

        for inputs, labels in self.train_loaders[agent_idx]:
            optimizer = self.optimizers[agent_idx]
            optimizer.zero_grad()

            # Forward pass for agent i
            outputs_i = model_i(inputs)
            surrogate_loss = self.criterion(outputs_i, labels)  # Surrogate loss for agent i

            # Accumulate gradients from the other agents
            for j in range(self.num_agents):
                if j != agent_idx:
                    model_j = self.models[j]
                    model_j.load_state_dict(y_k)  # Load y_k for agent j

                    for inputs_j, labels_j in self.train_loaders[j]:
                        outputs_j = model_j(inputs_j)
                        loss_j = self.criterion(outputs_j, labels_j)
                        grads = torch.autograd.grad(loss_j, model_j.parameters(), retain_graph=True)

                        for (name, param), grad in zip(model_j.named_parameters(), grads):
                            pi_i[name] += grad.clone()  # Accumulate gradients

            # Flatten pi_i
            pi_flatten = torch.cat([v.view(-1) for v in pi_i.values()])

            # Compute the difference vector x_i - y_k
            for (name, y_k_param), x_i_param in zip(y_k.items(), x_i):
                difference[name] += (x_i_param - y_k_param).clone()

            diff_flat = torch.cat([v.view(-1) for v in difference.values()])

            # Add pi_flatten^T * diff_flat to the surrogate loss
            surrogate_loss += torch.dot(pi_flatten, diff_flat)

            # Backward pass and optimization
            surrogate_loss.backward()
            optimizer.step()

        avg_loss = surrogate_loss.item() / len(self.train_loaders[agent_idx])
        self.losses[agent_idx].append(avg_loss)
        print(f"Surrogate agent {agent_idx}: {avg_loss}")
        return x_i, model_i.state_dict()
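
For reference, I drive it the same way as in the traceback above (a sketch; the update of y_k between outer iterations is omitted here):

agents = Agents(train_loaders, val_loaders, test_loader, num_agents)
y_k = Net().state_dict()  # common reference point y^k

for i in range(num_agents):
    x_i, x_hat_i = agents.centralized_fullL(y_k, i)  # best response of agent i
agents.gamma_update()  # gamma_{k+1} = gamma_k*(1 - eps*gamma_k), e.g. 0.8 -> 0.48 -> 0.3648 for eps = 0.5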

Thanks,

Giuseppe