One of the variables needed for gradient computation has been modified by an inplace operation (using PyTorch Geometric)

Hello, I am trying to implement the MAML meta-learning algorithm for a GNN, but I always get an inplace-operation error. You can find my code below. Can anyone help me solve this issue?

# imports inferred from the usage below
from collections import OrderedDict
from copy import deepcopy

import numpy as np
import torch
import torch.nn.functional as F
from torch_geometric.loader import DataLoader

def MAML(model, update_step, update_lr, outer_lr):
    
    list_pat = np.load("/home/arahmani/seizure_data/list_10pat.npy")
    model = model.to(device)
    model.train()
    meta_optim = torch.optim.Adam(model.parameters(), lr=outer_lr)
    dict_data = {}
    for pat in list_pat:
        X1_train, X1_val, patient1_trainloader = generate_data_GNN(df_4seiz_only, pat, 2500, "")
        dict_data[pat] = (X1_train, X1_val, patient1_trainloader)
        
    losses_q = [0 for _ in range(update_step + 1)]  # losses_q[i] is the loss on step i
    corrects = [0 for _ in range(update_step + 1)]

    keep_weight = deepcopy(model.state_dict())  # snapshot of the initial meta-parameters
    
    for i in range(10):  # 10 inner-adaptation runs (matches the /10 averaging at the end)
        fast_weights = OrderedDict()
        for data in patient1_trainloader:
            data = data.to(device)
            # 1. run the i-th task and compute the support loss for k=0
            support_outp = model(data.x, data.edge_index, data.edge_attr.float(), data.batch)
            loss = F.cross_entropy(support_outp, data.y)
            grad = torch.autograd.grad(loss, model.parameters())
            for idx, (weight_name, weight) in enumerate(model.named_parameters()):
                fast_weights[weight_name] = keep_weight[weight_name] - update_lr * grad[idx]
            
        query_train_loader = DataLoader(X1_val, batch_size = len(X1_val))
        # this is the loss and accuracy before first update
        with torch.no_grad():
            for data in query_train_loader:
                data = data.to(device)
                logits_q = model(data.x, data.edge_index, data.edge_attr.float(), data.batch)
                loss_q = F.cross_entropy(logits_q, data.y)
                losses_q[0] += loss_q

                pred_q = F.softmax(logits_q, dim=1).argmax(dim=1)
                correct = torch.eq(pred_q, data.y).sum().item()
                corrects[0] = corrects[0] + correct

        # this is the loss and accuracy after the first update
        with torch.no_grad():
            
            model.load_state_dict(fast_weights)
            for data in query_train_loader:
                data = data.to(device)
                logits_q = model(data.x, data.edge_index, data.edge_attr.float(), data.batch)
                loss_q = F.cross_entropy(logits_q, data.y)
                losses_q[1] += loss_q
           
                pred_q = F.softmax(logits_q, dim=1).argmax(dim=1)
                correct = torch.eq(pred_q, data.y).sum().item()
                corrects[1] = corrects[1] + correct

        for k in range(1, update_step):
            # 1. run the i-th task and compute loss for k=1~K-1
            model.load_state_dict(fast_weights)
            for data in patient1_trainloader:
                data = data.to(device)
                support_outp = model(data.x, data.edge_index, data.edge_attr.float(), data.batch)
                # compute the support loss for step k and take one more inner SGD step
                loss = F.cross_entropy(support_outp, data.y)
                grad = torch.autograd.grad(loss, model.parameters(), create_graph=True)
                for idx, (weight_name, weight) in enumerate(model.named_parameters()):
                    fast_weights[weight_name] = fast_weights[weight_name] - update_lr * grad[idx]
            
            model.load_state_dict(fast_weights)
            for data in query_train_loader:
                data = data.to(device)
                logits_q = model(data.x, data.edge_index, data.edge_attr.float(), data.batch)
                loss_q = F.cross_entropy(logits_q, data.y)
                losses_q[k+1] += loss_q
                with torch.no_grad():
                    pred_q = F.softmax(logits_q, dim=1).argmax(dim=1)
                    correct = torch.eq(pred_q, data.y).sum().item()
                    corrects[k+1] = corrects[k+1] + correct
    # end of all tasks
    # average the final-step query loss over the 10 inner runs
    loss_q = losses_q[-1] / 10
    model.load_state_dict(keep_weight)
    # optimize theta parameters
    meta_optim.zero_grad()
    torch.autograd.set_detect_anomaly(True)
    loss_q.backward()
    # print('meta update')
    # for p in self.net.parameters()[:5]:
    # 	print(torch.norm(p).item())
    meta_optim.step()


    #accs = np.array(corrects) / (querysz * task_num)

    return model
model1 = gnn_model.GCN(64)
best_model = MAML(model1, 3, 0.00001, 0.0001)

And here is the full traceback:
RuntimeError                              Traceback (most recent call last)
/home/arahmani/seizure_detection/new_meta_code.ipynb Cell 8' in <module>
      1 model1 = gnn_model.GCN(64)
----> 2 _ = MAML(model1, 3,0.00001,0.0001)

/home/arahmani/seizure_detection/new_meta_code.ipynb Cell 7' in MAML(model, update_step, update_lr, outer_lr)
     84 meta_optim.zero_grad()
     85 torch.autograd.set_detect_anomaly(True)
---> 86 loss_q.backward()
     87 # print('meta update')
     88 # for p in self.net.parameters()[:5]:
     89 # 	print(torch.norm(p).item())
     90 meta_optim.step()

File ~/.conda/envs/seizure_task/lib/python3.9/site-packages/torch/_tensor.py:307, in Tensor.backward(self, gradient, retain_graph, create_graph, inputs)
    298 if has_torch_function_unary(self):
    299     return handle_torch_function(
    300         Tensor.backward,
    301         (self,),
   (...)
    305         create_graph=create_graph,
    306         inputs=inputs)
--> 307 torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)

File ~/.conda/envs/seizure_task/lib/python3.9/site-packages/torch/autograd/__init__.py:154, in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
    151 if retain_graph is None:
    152     retain_graph = create_graph
--> 154 Variable._execution_engine.run_backward(
    155     tensors, grad_tensors_, retain_graph, create_graph, inputs,
    156     allow_unreachable=True, accumulate_grad=True)

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [64, 2]], which is output 0 of AsStridedBackward0, is at version 52; expected version 51 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!

Hi,

It seems that one of the variables, with size [64, 2], is changed after the loss computation.
Is this an FC layer's weight?

Also, did you try replacing every inplace operation with its out-of-place version? For example:

a += 1     # in-place addition
a = a + 1  # out-of-place addition
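For reference, here is a minimal, self-contained repro of that exact RuntimeError (an illustrative toy example, not taken from your code):

import torch

w = torch.randn(3, requires_grad=True)
y = (w ** 2).sum()     # pow saves w for its backward pass
with torch.no_grad():
    w.add_(1.0)        # the in-place edit bumps w's version counter
y.backward()           # RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation

Anything that rewrites a tensor autograd saved during the forward pass triggers this version check — not just explicit x += ... lines, but also optimizer steps and load_state_dict, which copy into the parameters in place.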

Hi, thank you for replying. That size does look like my final linear layer (its weight is [2, 64], i.e. [64, 2] after the transpose inside F.linear), but I don't use any explicit inplace operations, and the ReLUs are not inplace=True either. You can find my model architecture below.

class GCN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super(GCN, self).__init__()
        torch.manual_seed(12345)
        

        self.conv1 = GCNConv(2500, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.relu = nn.ReLU()
        
        self.lin = nn.Linear(hidden_channels, 2)

    def forward(self, x, edge_index, edge_attr, batch):
    
        #x = self.drop_out(x)
        x = self.conv1(x, edge_index, edge_attr)
        x = self.relu(x)
        x = self.conv2(x, edge_index, edge_attr)
        x = self.relu(x)
        x = self.conv3(x, edge_index, edge_attr)
        x = self.relu(x)
        x = global_mean_pool(x, batch)
        x = self.lin(x)
        return x

Hi,

Could you try moving model.load_state_dict(keep_weight) to after loss_q.backward()?
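Something like this (a sketch against your code, not tested):

meta_optim.zero_grad()
loss_q.backward()                   # run backprop first, while the saved weights are untouched
model.load_state_dict(keep_weight)  # restoring the weights is itself an in-place write
meta_optim.step()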

Hello,
Thank you for your time. I tried it, but I still get the same issue.

Hello,
Tracing the error showed me the trace below. I am just wondering whether it could be an issue with the F.linear layer in my PyTorch version.

File "/home/arahmani/.conda/envs/seizure_task/lib/python3.9/site-packages/ipykernel/kernelapp.py", line 677, in start
    self.io_loop.start()
  File "/home/arahmani/.conda/envs/seizure_task/lib/python3.9/site-packages/tornado/platform/asyncio.py", line 199, in start
    self.asyncio_loop.run_forever()
  File "/home/arahmani/.conda/envs/seizure_task/lib/python3.9/asyncio/base_events.py", line 596, in run_forever
    self._run_once()
  File "/home/arahmani/.conda/envs/seizure_task/lib/python3.9/asyncio/base_events.py", line 1890, in _run_once
    handle._run()
  File "/home/arahmani/.conda/envs/seizure_task/lib/python3.9/asyncio/events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
  File "/home/arahmani/.conda/envs/seizure_task/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 471, in dispatch_queue
    await self.process_one()
  File "/home/arahmani/.conda/envs/seizure_task/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 460, in process_one
    await dispatch(*args)
  File "/home/arahmani/.conda/envs/seizure_task/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 367, in dispatch_shell
    await result
  File "/home/arahmani/.conda/envs/seizure_task/lib/python3.9/site-packages/torch/nn/modules/linear.py", line 103, in forward
    return F.linear(input, self.weight, self.bias)
  File "/home/arahmani/.conda/envs/seizure_task/lib/python3.9/site-packages/torch/nn/functional.py", line 1848, in linear
    return torch._C._nn.linear(input, weight, bias)

Hi,

Sorry for not being more helpful. Maybe you could try updating your PyTorch version first, though I doubt F.linear itself is broken: with detect_anomaly enabled, the backtrace points at the forward call whose saved tensors were modified later, so it is more likely that the linear layer's weight is being overwritten.
I also notice that there are two different error messages in different sections of the thread:

     41     losses_q = torch.stack(losses_q).mean(0)
     42     global_optim.zero_grad()
---> 43     losses_q.backward()
     44     global_optim.step()
     46 return model

and

     84 meta_optim.zero_grad()
     85 torch.autograd.set_detect_anomaly(True)
---> 86 loss_q.backward()
     87 # print('meta update')
     88 # for p in self.net.parameters()[:5]:
     89 # 	print(torch.norm(p).item())
     90 meta_optim.step()

I have had a similar experience with this issue; I hope this helps.
If there are several .backward() and .step() calls in your system, PyTorch updates your model weights in place at the first .step().
That will cause this RuntimeError if the remaining loss computation is based on the same (now modified) model.
You could refer here.
This is the only solution I can think of, sorry about that.
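To make it concrete, here is a minimal sketch of that failure mode (a toy two-layer model, just for illustration):

import torch
import torch.nn as nn

net = nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 2))
opt = torch.optim.SGD(net.parameters(), lr=0.1)

x = torch.randn(8, 4)
loss1 = net(x).sum()
loss2 = net(x).sum()  # a second loss built from the same weight tensors

loss1.backward()
opt.step()            # .step() rewrites the weights in place
loss2.backward()      # fails: the weights saved for loss2's backward are now at a newer version

In your MAML loop, every model.load_state_dict(...) call plays the same role as the .step() here: it overwrites the parameters in place while earlier query losses still need them for backward.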

Thank you. I pasted two different errors because I kept changing my code while trying to fix it, but none of the versions worked. The main error is the second one; I have just edited it in the post. Thanks for your time.