Hi. I’m trying to implement a CNN-LSTM and I have the same problem. I don’t have any `+=` operations (I think), only squeezing and unsqueezing. I’ve been tweaking things, but still haven’t found the correct solution. I marked where the error occurs in the code with # <<<< ========ERROR.
class A2C(nn.Module):
    def __init__(self, input_size, num_actions, args, convolution=True):
        super(A2C, self).__init__()
        self.num_actions = num_actions
        self.convolution = convolution
        self.sequence_lenght = args.sequence_lenght
        self.batch_size = args.batch_size
        self.lstm_hidden_size = args.lstm_hidden_size
        if convolution:
            convolution_layers = [nn.Conv2d(input_size[0], 512, kernel_size=8, stride=4), nn.ReLU(),
                                  nn.Conv2d(512, 256, kernel_size=4, stride=2), nn.ReLU(),
                                  nn.Conv2d(256, 128, kernel_size=3, stride=2), nn.ReLU(),
                                  nn.Conv2d(128, 64, kernel_size=3, stride=1), nn.ReLU()]
            self.conv = nn.Sequential(*convolution_layers)
            self.input_size = self.get_conv_out(input_size)
        if convolution == False:
            self.input_size = input_size[0]
        self.lstm = nn.LSTM(self.input_size, self.lstm_hidden_size, num_layers=1, batch_first=True)
        if args.hidden_layers_num == 1:
            layers_a = [nn.Linear(self.lstm_hidden_size, args.hidden_1), nn.ReLU(), nn.Linear(args.hidden_1, num_actions)]
            layers_c = [nn.Linear(self.lstm_hidden_size, args.hidden_1), nn.ReLU(), nn.Linear(args.hidden_1, 1)]
        if args.hidden_layers_num != 1:
            layers_a = [nn.Linear(self.lstm_hidden_size, args.hidden_1), nn.ReLU(), nn.Linear(args.hidden_1, args.hidden_2), nn.ReLU(), nn.Linear(args.hidden_2, num_actions)]
            layers_c = [nn.Linear(self.lstm_hidden_size, args.hidden_1), nn.ReLU(), nn.Linear(args.hidden_1, args.hidden_2), nn.ReLU(), nn.Linear(args.hidden_2, 1)]
        self.Actor = nn.Sequential(*layers_a)
        self.Critic = nn.Sequential(*layers_c)

    def get_conv_out(self, shape):
        o = self.conv(torch.zeros(1, *shape))
        return int(np.prod(o.size()))

    def forward(self, obs, h_x, c_x):
        obs = torch.FloatTensor([obs]).to(device)
        if self.convolution:
            batch_size = obs.size()[0]
            obs = self.conv(obs).view(1, -1).unsqueeze(0)
        obs, (h_x, c_x) = self.lstm(obs, (h_x, c_x))
        logits = self.Actor(obs).squeeze(0)  # <<<< ========ERROR
        values = self.Critic(obs).squeeze(0)
        action_probs = F.softmax(logits, dim=1).cpu().detach().numpy()[0]
        action = np.random.choice(self.num_actions, p=action_probs)
        return action, logits, values, h_x, c_x

    def init_hidden(self):
        return torch.zeros(1, 1, self.lstm_hidden_size), torch.zeros(1, 1, self.lstm_hidden_size)
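To check whether the squeeze/unsqueeze calls give the shapes I expect, I run a quick sanity check like this (the input size and hyperparameter values here are made up, CPU only):

from types import SimpleNamespace

device = torch.device('cpu')  # only for this sanity check; normally device is 'cuda'
args_test = SimpleNamespace(sequence_lenght=8, batch_size=16, lstm_hidden_size=256,
                            hidden_layers_num=1, hidden_1=1024)
test_net = A2C(input_size=(4, 84, 84), num_actions=2, args=args_test)
h_x, c_x = test_net.init_hidden()
obs = np.zeros((4, 84, 84), dtype=np.float32)
action, logits, values, h_x, c_x = test_net(obs, h_x, c_x)
print(logits.shape, values.shape, h_x.shape)  # torch.Size([1, 2]) torch.Size([1, 1]) torch.Size([1, 1, 256])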
This is my sample generator. The env is continuous, so I take n steps as one sequence and save the last detached hidden state as the start of the next loop. (Could that be a problem when using multiple environments, or is detaching in every env enough to keep them separate?)
def play(self, net, device):
    if self.first == True:
        self.state = self.env.reset()
        self.first = False
    done = False
    if self.h_x == None:
        self.h_x, self.c_x = net.init_hidden()
    values = []
    logits_ = []
    actions = []
    rewards = []
    total_reward = 0.0
    _idx = 0
    while True:
        action, logits, value, self.h_x, self.c_x = net(self.state, self.h_x.to('cuda'), self.c_x.to('cuda'))  # <<<< ========ERROR
        next_state, reward, done, _ = self.env.step(action)
        if _idx == 0:
            reward -= 2 * (self.env.trade_fees) * self.env.leverage * 10_000
        _idx = _idx + 1
        values.append(value)
        logits_.append(logits)
        actions.append(action)
        if done and self.if_trading_env == False:
            reward = -1  # <---
        rewards.append(reward)
        total_reward += reward
        self.state = next_state
        if len(actions) >= args.sequence_lenght:
            self.h_x = self.h_x.detach()
            self.c_x = self.c_x.detach()
            return values, logits_, actions, discounted_rewards(rewards, self.gamma), total_reward
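(For completeness, discounted_rewards just turns the reward list into discounted returns, roughly like this:)

def discounted_rewards(rewards, gamma):
    # standard discounted return: R_t = r_t + gamma * R_{t+1}, computed backwards
    returns = []
    running = 0.0
    for r in reversed(rewards):
        running = r + gamma * running
        returns.append(running)
    returns.reverse()
    return returns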
My training loop.
idx = 0
while True:
    batch_counter = 0
    batch_values = []
    batch_logits = []
    batch_actions = []
    batch_vals_ref = []
    while True:
        for env in enviroments:
            values, _logits, actions, vals_ref, total_reward = env.play(net, device)  # <<<< ========ERROR
            batch_values.append(values)
            batch_logits.append(_logits)
            batch_actions.append(actions)
            batch_vals_ref.append(vals_ref)
            episodes_rewrds.append(total_reward)
            batch_counter += 1
            if batch_counter >= args.batch_size:
                break
        if batch_counter >= args.batch_size:
            break
    for i in range(len(batch_values)):
        torch.cuda.empty_cache()
        values_v = torch.stack(batch_values[i]).to(device)
        logits_v = torch.stack(batch_logits[i]).squeeze(1).to(device)
        actions_t = torch.LongTensor(batch_actions[i]).to(device)
        vals_ref_v = torch.FloatTensor(batch_vals_ref[i]).to(device)
        net.zero_grad()
        value_loss = args.zeta * F.mse_loss(values_v.squeeze(-1).squeeze(-1), vals_ref_v)
        advantage = vals_ref_v - values_v.detach()
        log_probs = F.log_softmax(logits_v, dim=1)
        log_prob_action = advantage * log_probs[range(len(actions_t)), actions_t]
        policy_loss = -log_prob_action.mean()
        actions_probs = F.softmax(logits_v, dim=1)
        entropy_loss = -args.entropy_beta * (actions_probs * log_probs).sum(dim=1).mean()
        total_policy_loss = policy_loss + entropy_loss
        total_policy_loss.backward(retain_graph=True)  # <<<< ========ERROR
        value_loss.backward()
        nn_utils.clip_grad_norm_(net.parameters(), args.clip_grad)
        optimizer.step()
    idx += 1
    print(idx, round(np.mean(episodes_rewrds), 2))
    torch.save(net.state_dict(), NET_PARAMS_PATH)
    if np.mean(episodes_rewrds) > 1_000_000:
        break
And this is my error.
Warning: Error detected in MmBackward. Traceback of forward call that caused the error:
File "E:\Market Data Collection\crypto_gym\A2C_LSTM_multi_1.0.py", line 320, in <module>
values, _logits, actions, vals_ref, total_reward = env.play(net, device)
File "E:\Market Data Collection\crypto_gym\A2C_LSTM_multi_1.0.py", line 257, in play
action, logits, value, self.h_x, self.c_x = net(self.state, self.h_x.to('cuda'), self.c_x.to('cuda'))
File "C:\Python38\lib\site-packages\torch\nn\modules\module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "E:\Market Data Collection\crypto_gym\A2C_LSTM_multi_1.0.py", line 202, in forward
logits = self.Actor(obs).squeeze(0)
File "C:\Python38\lib\site-packages\torch\nn\modules\module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "C:\Python38\lib\site-packages\torch\nn\modules\container.py", line 100, in forward
input = module(input)
File "C:\Python38\lib\site-packages\torch\nn\modules\module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "C:\Python38\lib\site-packages\torch\nn\modules\linear.py", line 87, in forward
return F.linear(input, self.weight, self.bias)
File "C:\Python38\lib\site-packages\torch\nn\functional.py", line 1612, in linear
output = input.matmul(weight.t())
(print_stack at ..\torch\csrc\autograd\python_anomaly_mode.cpp:60)
Traceback (most recent call last):
File "E:\Market Data Collection\crypto_gym\A2C_LSTM_multi_1.0.py", line 355, in <module>
total_policy_loss.backward(retain_graph=True)
File "C:\Python38\lib\site-packages\torch\tensor.py", line 198, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "C:\Python38\lib\site-packages\torch\autograd\__init__.py", line 98, in backward
Variable._execution_engine.run_backward(
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [1024, 2]], which is output 0 of TBackward, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
I also have a question: can I shuffle samples from multiple batches for training when using an LSTM in my code? The hidden states are already preserved in the samples, since I add them during the environment iteration.
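What I mean is roughly this (a hypothetical sketch, the sample layout is made up):

import random

# hypothetical sketch of "hidden states preserved in samples":
# each rollout is stored together with the (detached) hidden state it started from,
# so in principle the position of a sample inside the batch shouldn't matter
batch = []
for env in enviroments:
    start_h = env.h_x.detach() if env.h_x is not None else None
    start_c = env.c_x.detach() if env.c_x is not None else None
    values, logits_, actions, vals_ref, total_reward = env.play(net, device)
    batch.append((start_h, start_c, values, logits_, actions, vals_ref))

random.shuffle(batch)  # is shuffling like this safe for the LSTM, or does it break something?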