One of the variables needed for gradient computation has been modified by an inplace operation error occured

I have an unknown problem in my learning model. Could you tell me why?

import torch
import torch.nn as nn
import os
import numpy as np


def cov(m, rowvar=False):
    if m.dim() > 2:
        raise ValueError('m has more than 2 dimensions')
    if m.dim() < 2:
        m = m.view(1, -1)
    if not rowvar and m.size(0) != 1:
        m = m.t()
    fact = 1.0 / (m.size(1) - 1)
    #특징에서 평균 빼기
    mean_m=torch.mean(m, dim=1, keepdim=True)
    m -= mean_m
    mt = m.t()  # if complex: mt = m.t().conj()
    m_squeeze=m.matmul(mt).squeeze()
    result=fact * m_squeeze
    return result
def feature_cov(feature):
    cov_arr=[]
    for i in range(feature.shape[0]):
        our_c=cov(feature[i])
        
        cov_arr.append(our_c)
    cov_arr=torch.stack(cov_arr)
    return cov_arr
def trace(matrix):
    
    trace_val=0
    for i in range(matrix.shape[0]):
        trace_val+=matrix[i][i]
    trace_val2=0.0001*trace_val
    result_trace_val=torch.mul(trace_val2,torch.eye(matrix.shape[0]).cuda())

    return result_trace_val

def normalize_cov(cov_matrix):
    normalized_cov=[]
    for i in range(cov_matrix.shape[0]):
        trace_val=trace(cov_matrix[i])
        tmp=cov_matrix[i]+trace_val
        normalized_cov.append(tmp)
    normalized_cov=torch.stack(normalized_cov)
    return normalized_cov

def tile(a, dim, n_tile):
    init_dim = a.size(dim)
    repeat_idx = [1] * a.dim()
    repeat_idx[dim] = n_tile
    a = a.repeat(*(repeat_idx))
    order_index = torch.cuda.LongTensor(np.concatenate([init_dim * np.arange(n_tile) + i for i in range(init_dim)]))
    out=torch.index_select(a, dim, order_index)
    return out

#(3,128,128) shape covariance pooling
def cal_cov_pooling(feature):

    feature=feature.view(feature.shape[0],feature.shape[1],-1)
    
    cov_matrix=feature_cov(feature)
    cov_regularized=normalize_cov(cov_matrix)

    return cov_regularized

# computes weights for BiMap Layer


def variable_with_orth_weight_decay(shape):
    s1=int(shape[2])
    s2=int(shape[2]/2)
    rand_val=torch.randn([s1, s2],device='cuda:0')
    w0_init, _ = torch.qr(rand_val)
    w0 = w0_init
    tmp1 = w0.view(1, s1, s2)
    tmp2 = w0.t().view(1, s2, s1)
    tmp1 = tile(tmp1,0,shape[0])
    tmp2 = tile(tmp2,0,shape[0])
    return tmp1, tmp2
# ReEig Layer
def cal_rect_cov(features):
    weight1, weight2 = variable_with_orth_weight_decay(features.shape)
    features = torch.bmm(torch.bmm(weight2, features), weight1)
#     print(features.requires_grad,weight1.requires_grad, weight2.requires_grad)
#     print(features.device,weight1.device, weight2.device)
    
#     features=features.detach().cpu()
    result=[]
    for i in range(features.shape[0]):
        s_f,v_f=torch.symeig(features[i], eigenvectors=True)
        s_f_clamp=torch.clamp(s_f,0.0001,10000)
        s_f_clamp2=torch.diag(s_f_clamp)
        sv_m=torch.matmul(v_f,s_f_clamp2)
        features_t=torch.matmul(sv_m,v_f.t())
        result.append(features_t)
    
    result = torch.stack(result)
    return result

# LogEig Layer
def cal_log_cov(features):
#     features=features.detach().cpu()
    result=[]
    for i in range(features.shape[0]):
        s_f,v_f=torch.symeig(features[i], eigenvectors=True)
        s_f_log = torch.log(s_f)
        s_f_log2=torch.diag(s_f_log)
        sv_m=torch.matmul(v_f,s_f_log2)
        features_t=torch.matmul(sv_m,v_f.t())
        result.append(features_t)
    result = torch.stack(result)

    return result

class Model(nn.Module):
    def __init__(self):
        super(Model,self).__init__()
        #conv_layer
        self.layer1=nn.Sequential(
            #1
            nn.Conv2d(3,64,kernel_size=3,stride=1,padding=1,bias=False),
#             nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2,padding=0),
            #2
            nn.Conv2d(64,96,kernel_size=3,stride=1,padding=1,bias=False),
#             nn.BatchNorm2d(96),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2,padding=0),
            #3
            nn.Conv2d(96,128,kernel_size=3,stride=1,padding=1,bias=False),
#             nn.BatchNorm2d(128),
            nn.ReLU(),
#             nn.MaxPool2d(kernel_size=2,padding=0),
            #4
            nn.Conv2d(128,128,kernel_size=3,stride=1,padding=1,bias=False),
#             nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2,padding=0),
            #5
            nn.Conv2d(128,256,kernel_size=3,stride=1,padding=1,bias=False),
#             nn.BatchNorm2d(64),
            nn.ReLU(),
#             nn.MaxPool2d(kernel_size=2,padding=0)
            
            nn.Conv2d(256,256,kernel_size=3,stride=1,padding=1,bias=False),
#             nn.BatchNorm2d(64),
            nn.ReLU(),
#             nn.MaxPool2d(kernel_size=2,padding=0)
            )
        self.fc1=nn.Sequential(
            nn.Linear(16384,2000),
            nn.ReLU(),
        )
        self.fc2=nn.Sequential(
            nn.Linear(2000,128),
            nn.ReLU(),
        )
        self.fc3=nn.Sequential(
            nn.Linear(128,7)
        )
        
    def forward(self,x):
        #cov matrix
        out=self.layer1(x)

        #공분산 행렬
    
        out=cal_cov_pooling(out)
        print("cov_pooling grad stage")
        check_grad(out)

        #bimap layer1
        out=cal_rect_cov(out)
        
        print("rect cov pooling grad stage")
        check_grad(out)

        #bimap layer2
#         out=cal_rect_cov(out)
#         print("rect cov pooling grad stage")
#         check_grad(out)

        out=cal_log_cov(out)
        print("log cov pooling grad stage")
        check_grad(out)

        out=out.view(out.shape[0],-1)
        out=self.fc1(out)
        out=self.fc2(out)
        out=self.fc3(out)
        return out

The contents of the error are as follows.

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-7-d4ada5de2326> in <module>
     22         # Backward and optimize
     23         optimizer.zero_grad()
---> 24         loss.backward()
     25         optimizer.step()
     26         total += labels.size(0)

~\Anaconda3\lib\site-packages\torch\tensor.py in backward(self, gradient, retain_graph, create_graph)
    105                 products. Defaults to ``False``.
    106         """
--> 107         torch.autograd.backward(self, gradient, retain_graph, create_graph)
    108 
    109     def register_hook(self, hook):

~\Anaconda3\lib\site-packages\torch\autograd\__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
     91     Variable._execution_engine.run_backward(
     92         tensors, grad_tensors, retain_graph, create_graph,
---> 93         allow_unreachable=True)  # allow_unreachable flag
     94 
     95 

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [256, 256]], which is output 0 of AsStridedBackward, is at version 128; expected version 127 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).```



please help me , thank you

--james

Could you change

m -= mean_m

to

m = m - mean

and run it again?

2 Likes

thank you for your help!

can you please explain why it fix that problem??
(i had the same problem and you just saved me…)

The first line of code modifies m inplace, while the second one creates a new m tensor with the modification.
The error message points to an unexpected inplace modification of one variable, so my guess was that this line of code could create the issue.
The general problem is, that some operations need the input to this particular operation to calculate the gradients.
If you modify the input inplace, Autograd cannot calculate the gradients and will raise this error.

2 Likes

I have 4 networks each being optimized individually. For 3 networks, backprop occurs without any error, but the last one throws the error:

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [4096, 3072]], which is output 0 of TBackward, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck

The shape of the tensor points me to the last linear layer in one of the network:

class Generator(nn.Module):
    def __init__(self):
        super().__init__()
        self.main = nn.Sequential(
            nn.Linear(256 * 2 + 10 + 1 +512, 4 * 1024),
            nn.BatchNorm1d(2 * 2 * 1024),
            nn.ReLU(True),
            nn.Linear(4 * 1024, 3 * 1024),
            nn.Tanh()
        )
    def forward(self, x):
        b = x.size(0)
        noise = torch.FloatTensor(b, 512).normal_(0, 1).to(x.device)
        noise = make_variable(noise)
        x = torch.cat([x, noise], 1)
        out = self.main(x)
        return out.view(b, 3, 1024, 1)

I don’t see any in place operation here, so what exactly is the issue here? @ptrblck

You could be hitting this issue which would be raised in case a backward pass tries to compute gradients with already updated parameters and thus also stale forward activations.

would it be a disadvantage if I calculate the loss of diff sub-networks and optimise them simultaneously with a single backward pass over all the losses?

Depending on the use case your memory usage might be higher, but you might increase the perf. by using a single backward pass. As long as your proposed workflow works for your use case and doesn’t trigger any other errors, it should be fine.