RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation [torch==1.9]

When I run the code below, I get the following error:
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [64, 1]], which is output 0 of TBackward, is at version 65; expected version 64 instead

import torch
from copy import deepcopy

from torch.autograd import Variable  # note: Variable is a deprecated no-op wrapper since torch 0.4
from torch.nn import functional as F
from collections import OrderedDict

from embeddings import item, user

torch.autograd.set_detect_anomaly(True)

class user_preference_estimator(torch.nn.Module):
    def __init__(self, config):
        super(user_preference_estimator, self).__init__()
        self.embedding_dim = config['embedding_dim']
        self.fc1_in_dim = config['embedding_dim'] * 8
        self.fc2_in_dim = config['first_fc_hidden_dim']
        self.fc2_out_dim = config['second_fc_hidden_dim']
        self.use_cuda = config['use_cuda']

        self.item_emb = item(config)
        self.user_emb = user(config)
        self.fc1 = torch.nn.Linear(self.fc1_in_dim, self.fc2_in_dim)
        self.fc2 = torch.nn.Linear(self.fc2_in_dim, self.fc2_out_dim)
        self.linear_out = torch.nn.Linear(self.fc2_out_dim, 1)

    def forward(self, x, training=True):
        # split the input feature vector into per-feature index slices
        # (Variable is a no-op since torch 0.4; plain slicing would work the same)
        rate_idx = Variable(x[:, 0], requires_grad=False)
        genre_idx = Variable(x[:, 1:26], requires_grad=False)
        director_idx = Variable(x[:, 26:2212], requires_grad=False)
        actor_idx = Variable(x[:, 2212:10242], requires_grad=False)
        gender_idx = Variable(x[:, 10242], requires_grad=False)
        age_idx = Variable(x[:, 10243], requires_grad=False)
        occupation_idx = Variable(x[:, 10244], requires_grad=False)
        area_idx = Variable(x[:, 10245], requires_grad=False)

        """
        - You can check the embedding class for complete detail but just to summarize 
            - We are using embeddings for single feature and for list we are using liner layer
            - Then we are concatenating it
            - in this method we are concatenating item and user embedding as well, then then we are passing it to the fully connected layers 
        """
        item_emb = self.item_emb(rate_idx, genre_idx, director_idx, actor_idx)
        user_emb = self.user_emb(gender_idx, age_idx, occupation_idx, area_idx)
        x = torch.cat((item_emb, user_emb), 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        return self.linear_out(x)


class MeLU(torch.nn.Module):
    def __init__(self, config):
        super(MeLU, self).__init__()
        self.use_cuda = config['use_cuda']
        self.model = user_preference_estimator(config)
        self.local_lr = config['local_lr']
        self.store_parameters()
        self.meta_optim = torch.optim.Adam(self.model.parameters(), lr=config['lr'])
        self.local_update_target_weight_name = ['fc1.weight', 'fc1.bias', 'fc2.weight', 'fc2.bias', 'linear_out.weight', 'linear_out.bias']

    def store_parameters(self):
        self.keep_weight = deepcopy(self.model.state_dict())
        self.weight_name = list(self.keep_weight.keys())
        self.weight_len = len(self.keep_weight)
        self.fast_weights = OrderedDict()

    def forward(self, support_set_x, support_set_y, query_set_x, num_local_update):
        # inner loop: MAML-style local adaptation on the support set
        for idx in range(num_local_update):
            if idx > 0:
                self.model.load_state_dict(self.fast_weights)
            weight_for_local_update = list(self.model.state_dict().values())
            support_set_y_pred = self.model(support_set_x)
            loss = F.mse_loss(support_set_y_pred, support_set_y.view(-1, 1))
            self.model.zero_grad()
            grad = torch.autograd.grad(loss, self.model.parameters(), create_graph=True)
            # local update: one SGD step on the selected fully connected weights only
            for i in range(self.weight_len):
                if self.weight_name[i] in self.local_update_target_weight_name:
                    self.fast_weights[self.weight_name[i]] = weight_for_local_update[i] - self.local_lr * grad[i]
                else:
                    self.fast_weights[self.weight_name[i]] = weight_for_local_update[i]
        # evaluate the query set with the adapted (fast) weights, then restore the originals
        self.model.load_state_dict(self.fast_weights)
        query_set_y_pred = self.model(query_set_x)
        self.model.load_state_dict(self.keep_weight)
        return query_set_y_pred

    def global_update(self, support_set_xs, support_set_ys, query_set_xs, query_set_ys, num_local_update):
        batch_sz = len(support_set_xs)
        losses_q = []
        if self.use_cuda:
            for i in range(batch_sz):
                support_set_xs[i] = support_set_xs[i].cuda()
                support_set_ys[i] = support_set_ys[i].cuda()
                query_set_xs[i] = query_set_xs[i].cuda()
                query_set_ys[i] = query_set_ys[i].cuda()
        # outer loop: average the query-set losses over tasks, then take one meta step
        for i in range(batch_sz):
            query_set_y_pred = self.forward(support_set_xs[i], support_set_ys[i], query_set_xs[i], num_local_update)
            loss_q = F.mse_loss(query_set_y_pred, query_set_ys[i].view(-1, 1))
            losses_q.append(loss_q)
        losses_q = torch.stack(losses_q).mean(0)
        self.meta_optim.zero_grad()
        losses_q.backward()
        self.meta_optim.step()
        self.store_parameters()
        return

When I remove the two lines below from forward, the error goes away:

self.model.load_state_dict(self.fast_weights)
self.model.load_state_dict(self.keep_weight)

Hi!

As you’ve observed, the problematic lines are the load_state_dict calls in the Module’s forward method. Loading model weights there breaks an assumption of torch autograd: tensors saved during the forward pass are expected to still be at the same version (i.e., unmodified) when the backward pass runs. load_state_dict copies the new weights into the existing parameter tensors in place, bumping their version counters, which is exactly the version mismatch the error message reports.
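
To make the mechanism concrete, here is a minimal sketch (my own, not from the code above) that reproduces the same class of error; the in-place copy under no_grad mimics what load_state_dict does to each parameter:

import torch

w = torch.ones(3, requires_grad=True)
y = w * w                    # autograd saves w (at version 0) for the backward pass
with torch.no_grad():
    w.copy_(torch.zeros(3))  # in-place copy, as load_state_dict does; w is now at version 1
y.sum().backward()           # RuntimeError: ... modified by an inplace operation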

I’d recommend moving the load_state_dict calls:

self.model.load_state_dict(self.fast_weights)
self.model.load_state_dict(self.keep_weight)

out of the forward method and into your training loop.
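
As an aside (a sketch of my own, not from this thread's code, assuming torch >= 2.0): many meta-learning implementations avoid load_state_dict in forward altogether by evaluating the module functionally with the fast weights, so the module's own parameters are never mutated in place:

from torch.func import functional_call  # torch.nn.utils.stateless.functional_call in torch >= 1.12

# inside MeLU.forward, replacing load_state_dict(self.fast_weights) + self.model(query_set_x):
query_set_y_pred = functional_call(self.model, self.fast_weights, (query_set_x,))

Because the parameters are never overwritten, restoring keep_weight afterwards also becomes unnecessary.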

In general the preferred way to avoid these errors is to reorder your operations, e.g. along the lines of what Sumanth suggests above. However, I’d point out that a catch-all solution for these types of issues is torch.autograd.graph.allow_mutation_on_saved_tensors() (see the torch.autograd documentation). Under that context manager, a tensor that autograd has saved for backward is cloned when it is mutated in place, so the backward pass can still use the original value in situations that would otherwise lead to such an error.
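
Usage looks roughly like this (a minimal sketch; the model and data are placeholders):

import torch

model = torch.nn.Linear(4, 1)
x = torch.randn(8, 4)

with torch.autograd.graph.allow_mutation_on_saved_tensors():
    loss = model(x).pow(2).mean()
    # load_state_dict mutates the saved weight in place; inside the context
    # the saved tensor is cloned first, so backward still sees the old value
    model.load_state_dict({k: torch.zeros_like(v) for k, v in model.state_dict().items()})
    loss.backward()  # no in-place RuntimeError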

Hi @soulitzer,

Although the solution worked on CPU, on GPU I am getting the error below

AttributeError: 'list' object has no attribute 'data_ptr'

at the line

self.meta_optim.step()

Any thoughts?

Sounds unexpected. Happy to take a look if you have a repro. A stack trace of the error would also be helpful.

Traceback (most recent call last):
  File "/home/manish.kumar/miniconda3/envs/melu/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/home/manish.kumar/miniconda3/envs/melu/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/home/manish.kumar/.vscode-server/extensions/ms-python.debugpy-2024.14.0-linux-x64/bundled/libs/debugpy/adapter/../../debugpy/launcher/../../debugpy/__main__.py", line 71, in <module>
    cli.main()
  File "/home/manish.kumar/.vscode-server/extensions/ms-python.debugpy-2024.14.0-linux-x64/bundled/libs/debugpy/adapter/../../debugpy/launcher/../../debugpy/../debugpy/server/cli.py", line 501, in main
    run()
  File "/home/manish.kumar/.vscode-server/extensions/ms-python.debugpy-2024.14.0-linux-x64/bundled/libs/debugpy/adapter/../../debugpy/launcher/../../debugpy/../debugpy/server/cli.py", line 351, in run_file
    runpy.run_path(target, run_name="__main__")
  File "/home/manish.kumar/.vscode-server/extensions/ms-python.debugpy-2024.14.0-linux-x64/bundled/libs/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_runpy.py", line 310, in run_path
    return _run_module_code(code, init_globals, run_name, pkg_name=pkg_name, script_name=fname)
  File "/home/manish.kumar/.vscode-server/extensions/ms-python.debugpy-2024.14.0-linux-x64/bundled/libs/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_runpy.py", line 127, in _run_module_code
    _run_code(code, mod_globals, init_globals, mod_name, mod_spec, pkg_name, script_name)
  File "/home/manish.kumar/.vscode-server/extensions/ms-python.debugpy-2024.14.0-linux-x64/bundled/libs/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_runpy.py", line 118, in _run_code
    exec(code, run_globals)
  File "/home/manish.kumar/melu/main.py", line 37, in <module>
    training(melu, total_dataset, batch_size=config['batch_size'], num_epoch=config['num_epoch'], model_save=True, model_filename=model_filename)
  File "/home/manish.kumar/melu/model_training.py", line 139, in training
    melu.global_update(supp_xs, supp_ys, query_xs, query_ys, config['inner'])
  File "/home/manish.kumar/melu/MeLU.py", line 139, in global_update
    self.meta_optim.step()
  File "/home/manish.kumar/miniconda3/envs/melu/lib/python3.9/site-packages/torch/optim/optimizer.py", line 487, in wrapper
    out = func(*args, **kwargs)
  File "/home/manish.kumar/miniconda3/envs/melu/lib/python3.9/site-packages/torch/optim/optimizer.py", line 91, in _use_grad
    ret = func(self, *args, **kwargs)
  File "/home/manish.kumar/miniconda3/envs/melu/lib/python3.9/site-packages/torch/optim/adam.py", line 223, in step
    adam(
  File "/home/manish.kumar/miniconda3/envs/melu/lib/python3.9/site-packages/torch/optim/optimizer.py", line 154, in maybe_fallback
    return func(*args, **kwargs)
  File "/home/manish.kumar/miniconda3/envs/melu/lib/python3.9/site-packages/torch/optim/adam.py", line 784, in adam
    func(
  File "/home/manish.kumar/miniconda3/envs/melu/lib/python3.9/site-packages/torch/optim/adam.py", line 524, in _multi_tensor_adam
    torch._foreach_add_(
  File "/home/manish.kumar/miniconda3/envs/melu/lib/python3.9/site-packages/torch/autograd/graph.py", line 676, in __torch_dispatch__
    tid = _get_tid(t)
  File "/home/manish.kumar/miniconda3/envs/melu/lib/python3.9/site-packages/torch/autograd/graph.py", line 600, in _get_tid
    data_ptr = tensor.data_ptr()
AttributeError: 'list' object has no attribute 'data_ptr'

This is the stack trace. I have already pasted the code from the file where the error occurs in my first comment. If you still need the entire repo, let me know.

Just for reference, I modified the code from this repo (https://github.com/hoyeoplee/MeLU) to work with my dataset, and yes, I have added with torch.autograd.graph.allow_mutation_on_saved_tensors():
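
Roughly like this (an illustrative sketch; the call is the one shown at model_training.py line 139 in the traceback above):

with torch.autograd.graph.allow_mutation_on_saved_tensors():
    # note: meta_optim.step() also runs inside the context here,
    # since it is called within global_update
    melu.global_update(supp_xs, supp_ys, query_xs, query_ys, config['inner'])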