# "One of the variables needed for gradient computation has been modified by an inplace operation" error occurred

I am hitting an error in my model during training that I don't understand. Could you tell me what causes it?

``````import torch
import torch.nn as nn
import os
import numpy as np

def cov(m, rowvar=False):
    """Estimate the (unbiased, N-1 normalized) covariance matrix of `m`.

    Args:
        m: 1-D or 2-D tensor of samples. With ``rowvar=False`` (default),
            each row is an observation and each column a variable.
        rowvar: if True, rows of `m` are treated as variables instead.

    Returns:
        The covariance matrix of the variables in `m`.

    Note:
        The mean is now subtracted out-of-place (``m = m - mean_m``).
        The original in-place ``m -= mean_m`` wrote through a transpose
        *view* into the caller's tensor — a tensor autograd had saved for
        the backward pass — causing "one of the variables needed for
        gradient computation has been modified by an inplace operation".
    """
    if m.dim() > 2:
        raise ValueError('m has more than 2 dimensions')
    if m.dim() < 2:
        m = m.view(1, -1)
    if not rowvar and m.size(0) != 1:
        m = m.t()
    # Unbiased estimator: divide by (number of observations - 1).
    fact = 1.0 / (m.size(1) - 1)
    # Subtract the per-variable mean (out-of-place keeps autograd history valid).
    mean_m = torch.mean(m, dim=1, keepdim=True)
    m = m - mean_m
    mt = m.t()  # if complex: mt = m.t().conj()
    m_squeeze = m.matmul(mt).squeeze()
    return fact * m_squeeze
def feature_cov(feature):
    """Per-sample covariance: apply `cov` to every item along dim 0 and
    stack the results into one batched tensor."""
    return torch.stack([cov(sample) for sample in feature])
def trace(matrix):
    """Return ``1e-4 * tr(matrix) * I`` — the additive regularizer used by
    `normalize_cov`.

    Args:
        matrix: square 2-D tensor (a covariance matrix).

    Returns:
        An identity matrix of the same size scaled by 1e-4 times the trace.

    Note:
        The identity is created on ``matrix``'s own device and dtype
        instead of the original hard-coded ``.cuda()``, so the function
        now also works on CPU tensors (and on any CUDA device).
    """
    # Sum of the diagonal; torch.diagonal keeps this differentiable.
    trace_val = torch.diagonal(matrix).sum()
    eye = torch.eye(matrix.shape[0], device=matrix.device, dtype=matrix.dtype)
    return 0.0001 * trace_val * eye

def normalize_cov(cov_matrix):
    """Regularize each covariance in the batch as ``C_i + 1e-4*tr(C_i)*I``
    so every matrix is safely positive definite."""
    regularized = [c + trace(c) for c in cov_matrix]
    return torch.stack(regularized)

def tile(a, dim, n_tile):
    """Repeat every slice of `a` along `dim` `n_tile` times *consecutively*.

    Unlike ``a.repeat`` (which concatenates whole copies: 0,1,...,0,1,...),
    this interleaves the copies per index: 0,0,...,1,1,...

    Args:
        a: input tensor.
        dim: dimension to tile along.
        n_tile: number of consecutive copies of each slice.

    Returns:
        Tensor whose size along `dim` is ``a.size(dim) * n_tile``.

    Note:
        The gather index is built on ``a.device`` instead of the original
        hard-coded ``torch.cuda.LongTensor``, so CPU tensors work too.
    """
    init_dim = a.size(dim)
    repeat_idx = [1] * a.dim()
    repeat_idx[dim] = n_tile
    a = a.repeat(*repeat_idx)
    # Permutation grouping the n_tile copies of each original slice together.
    order = np.concatenate([init_dim * np.arange(n_tile) + i for i in range(init_dim)])
    order_index = torch.as_tensor(order, dtype=torch.long, device=a.device)
    return torch.index_select(a, dim, order_index)

#(3,128,128) shape covariance pooling
def cal_cov_pooling(feature):
    """Covariance pooling: flatten the spatial dimensions of each feature
    map, compute the per-sample channel covariance, then regularize it."""
    batch, channels = feature.shape[0], feature.shape[1]
    flat = feature.view(batch, channels, -1)
    return normalize_cov(feature_cov(flat))

# computes weights for BiMap Layer

def variable_with_orth_weight_decay(shape, device='cuda:0'):
    """Build a random orthonormal projection for the BiMap layer, batched.

    Args:
        shape: shape of the batched input, (batch, s1, s1).
        device: device the weights are created on. Defaults to 'cuda:0'
            for backward compatibility; pass e.g. ``features.device`` to
            run on CPU or a different GPU.

    Returns:
        (W, W^T) with shapes (batch, s1, s1//2) and (batch, s1//2, s1);
        the same projection is replicated across the batch.
    """
    s1 = int(shape[2])
    s2 = s1 // 2
    rand_val = torch.randn([s1, s2], device=device)
    # QR of a random Gaussian matrix yields a random orthonormal basis.
    # torch.qr is deprecated (removed in recent PyTorch); prefer linalg.qr.
    if hasattr(torch, 'linalg') and hasattr(torch.linalg, 'qr'):
        w0, _ = torch.linalg.qr(rand_val)
    else:
        w0, _ = torch.qr(rand_val)
    tmp1 = w0.view(1, s1, s2)
    tmp2 = w0.t().view(1, s2, s1)
    # Replicate the projection for every item of the batch.
    tmp1 = tile(tmp1, 0, shape[0])
    tmp2 = tile(tmp2, 0, shape[0])
    return tmp1, tmp2
# ReEig Layer
def cal_rect_cov(features):
    """BiMap projection followed by ReEig rectification.

    Projects each (n, n) covariance to (n/2, n/2) via W^T C W, then clamps
    its eigenvalues into [1e-4, 1e4] so every matrix stays symmetric
    positive definite.

    Args:
        features: (batch, n, n) batch of symmetric (covariance) matrices.

    Returns:
        (batch, n//2, n//2) batch of rectified SPD matrices.
    """
    weight1, weight2 = variable_with_orth_weight_decay(features.shape)
    # BiMap: W^T C W reduces dimensionality while preserving symmetry.
    features = torch.bmm(torch.bmm(weight2, features), weight1)
    result = []
    for i in range(features.shape[0]):
        # torch.symeig was removed in recent PyTorch; prefer linalg.eigh
        # (same ascending eigenvalues/eigenvectors for symmetric input).
        if hasattr(torch, 'linalg') and hasattr(torch.linalg, 'eigh'):
            s_f, v_f = torch.linalg.eigh(features[i])
        else:
            s_f, v_f = torch.symeig(features[i], eigenvectors=True)
        s_f_clamp = torch.clamp(s_f, 0.0001, 10000)
        # Reconstruct: V diag(clamped eigenvalues) V^T.
        result.append(v_f @ torch.diag(s_f_clamp) @ v_f.t())
    return torch.stack(result)

# LogEig Layer
def cal_log_cov(features):
    """LogEig layer: map each SPD matrix to its matrix logarithm.

    Computes ``V diag(log s) V^T`` per matrix, flattening the SPD manifold
    into a Euclidean tangent space suitable for fully connected layers.

    Args:
        features: (batch, n, n) batch of SPD matrices — eigenvalues must be
            strictly positive (e.g. the output of `cal_rect_cov`).

    Returns:
        (batch, n, n) batch of matrix logarithms.
    """
    result = []
    for i in range(features.shape[0]):
        # torch.symeig was removed in recent PyTorch; prefer linalg.eigh
        # (same ascending eigenvalues/eigenvectors for symmetric input).
        if hasattr(torch, 'linalg') and hasattr(torch.linalg, 'eigh'):
            s_f, v_f = torch.linalg.eigh(features[i])
        else:
            s_f, v_f = torch.symeig(features[i], eigenvectors=True)
        s_f_log = torch.log(s_f)
        result.append(v_f @ torch.diag(s_f_log) @ v_f.t())
    return torch.stack(result)

class Model(nn.Module):
    """SPD-pooling classifier.

    Pipeline: activation stack -> covariance pooling -> BiMap/ReEig ->
    LogEig -> three fully connected layers producing 7 class logits.
    """

    def __init__(self):
        super(Model, self).__init__()
        # Feature stage: the conv / batch-norm layers are currently
        # disabled in this experiment, leaving only the activations.
        self.layer1 = nn.Sequential(
            nn.ReLU(),
            nn.ReLU(),
            nn.ReLU(),
            nn.ReLU(),
            nn.ReLU(),
            nn.ReLU(),
        )
        # Classifier head over the flattened 128x128 log-covariance matrix.
        self.fc1 = nn.Sequential(nn.Linear(16384, 2000), nn.ReLU())
        self.fc2 = nn.Sequential(nn.Linear(2000, 128), nn.ReLU())
        self.fc3 = nn.Sequential(nn.Linear(128, 7))

    def forward(self, x):
        features = self.layer1(x)
        # Second-order (covariance) pooling of the feature maps.
        pooled = cal_cov_pooling(features)
        # BiMap + ReEig layer.
        rectified = cal_rect_cov(pooled)
        # LogEig layer.
        log_features = cal_log_cov(rectified)
        flat = log_features.view(log_features.shape[0], -1)
        return self.fc3(self.fc2(self.fc1(flat)))
``````

The contents of the error are as follows.

``````---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
22         # Backward and optimize
---> 24         loss.backward()
25         optimizer.step()
26         total += labels.size(0)

~\Anaconda3\lib\site-packages\torch\tensor.py in backward(self, gradient, retain_graph, create_graph)
105                 products. Defaults to ``False``.
106         """
108
109     def register_hook(self, hook):

91     Variable._execution_engine.run_backward(
---> 93         allow_unreachable=True)  # allow_unreachable flag
94
95

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [256, 256]], which is output 0 of AsStridedBackward, is at version 128; expected version 127 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).```

--james``````

Could you change

``````m -= mean_m
``````

to

m = m - mean_m
``````

and run it again?

2 Likes

Can you please explain why this fixes the problem?
(I had the same problem and you just saved me…)

The first line of code modifies `m` inplace, while the second one creates a new `m` tensor with the modification.
The error message points to an unexpected inplace modification of one variable, so my guess was that this line of code could create the issue.
The general problem is, that some operations need the input to this particular operation to calculate the gradients.
If you modify the input inplace, Autograd cannot calculate the gradients and will raise this error.

2 Likes

I have 4 networks each being optimized individually. For 3 networks, backprop occurs without any error, but the last one throws the error:

`RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [4096, 3072]], which is output 0 of TBackward, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck`

The shape of the tensor points me to the last linear layer in one of the network:

``````class Generator(nn.Module):
def __init__(self):
super().__init__()
self.main = nn.Sequential(
nn.Linear(256 * 2 + 10 + 1 +512, 4 * 1024),
nn.BatchNorm1d(2 * 2 * 1024),
nn.ReLU(True),
nn.Linear(4 * 1024, 3 * 1024),
nn.Tanh()
)
def forward(self, x):
b = x.size(0)
noise = torch.FloatTensor(b, 512).normal_(0, 1).to(x.device)
noise = make_variable(noise)
x = torch.cat([x, noise], 1)
out = self.main(x)
return out.view(b, 3, 1024, 1)
``````

I don’t see any in place operation here, so what exactly is the issue here? @ptrblck

You could be hitting this issue which would be raised in case a backward pass tries to compute gradients with already updated parameters and thus also stale forward activations.

Would it be a disadvantage if I calculated the losses of the different sub-networks and optimized them simultaneously with a single backward pass over all of the losses?

Depending on the use case your memory usage might be higher, but you might increase the perf. by using a single backward pass. As long as your proposed workflow works for your use case and doesn’t trigger any other errors, it should be fine.