RuntimeError one of the variables needed for gradient computation has been modified by an inplace operation


(Saar) #1

hello, i am trying to train my net but i am getting this error: “one of the variables needed for gradient computation has been modified by an inplace operation”.
here is the code:

class PolyNet(nn.Module): # nn.Module is parent class
def init(self, rank=4):
super(PolyNet, self).init() #calls init of parent class
self.layer1 = nn.Sequential(
nn.Conv2d(3, 1, kernel_size=7, stride=1, padding=2),
nn.BatchNorm2d(1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2))
self.fc = nn.Linear(63631, rank)

 def forward(self, x):
     """
     Feed forward through network
     Args:
         x - input to the network
         
     Returns "out", which is the network's output
     """
     
     out = self.layer1(x)
     out = out.reshape(out.size(0), -1)
     out = self.fc(out)
    
     return out
def func(x,a,b,c,d):
    return (x**3)*a+(x**2)*b+c*x+d
def my_loss(outputs, labels):
    
    """
    Args:
        outputs - output of network ([batch size, rank+1]) 
        labels  - desired coefficients  ([batch size, rank+1])
    """
    
    loss = torch.zeros(1, dtype=torch.float, requires_grad=True)
    loss = loss.to(device)
    arr2 = torch.zeros(4, dtype=torch.float, requires_grad=True)
    arr = torch.zeros(10, dtype=torch.float, requires_grad=True)
    for i in range(0,4):
        x=0
        for j in range(0,10):
            x = x + 0.1*j
            y1=func(x,outputs[i,0],outputs[i,1],outputs[i,2],outputs[i,3]) 
            y2=func(x,labels[i,0],labels[i,1],labels[i,2],labels[i,3])
            n=torch.abs(y1-y2)
            arr[j]=n
        a=torch.max(arr)
        arr2[i] = a
        
        
    loss=(torch.sum(arr2))/4
    print(loss)

        
    # Observe: If you need to iterate and add certain values to loss defined above
    # you cannot write: loss +=... because this will raise the error: 
    # "Leaf variable was used in an inplace operation"
    # Instead, to avoid this error write: loss = loss + ...  
    
                                      
    return loss

model = PolyNet().to(device)
print ("Number of model trainable parameters:", get_train_params_num(model))

#----------------------------------------------
#  Choose your optimizer:
learning_rate = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) 
#----------------------------------------------

def train_model(model,
               optimizer,
               train_loader,
               validation_loader,
               train_losses,
               validation_losses,
               epochs=2):
   
   """
   Trains a neural network. 
   Args;
       model               - model to be trained
       optimizer           - optimizer used for training
       train_loader        - loader from which data for training comes 
       validation_loader   - loader from which data for validation comes (maybe at the end, you use test_loader)
       train_losses        - adding train loss value to this list for future analysis
       validation_losses   - adding validation loss value to this list for future analysis
       epochs              - number of runs over the entire data set 
   """
   
   
   for epoch in range(epochs):
       for i, data in enumerate(loader, 0):
           inputs = (data['image']).to(device)
           labels = (data['labels']).to(device)
           # Forward pass
           outputs = model(inputs.float())
           loss = my_loss(outputs, labels)
       
            # Backward and optimize
           optimizer.zero_grad()
           loss.backward()
           optimizer.step()
   
   return 
   
if not 'train_losses' in vars():
   train_losses = []
if not 'validation_losses' in vars():
   validation_losses = []


train_model(model, 
           optimizer,
           train_loader, 
           validation_loader, 
           train_losses, 
           validation_losses,
           epochs=2)

and the error:


RuntimeError Traceback (most recent call last)
in ()
17 train_losses,
18 validation_losses,
—> 19 epochs=2)

in train_model(model, optimizer, train_loader, validation_loader, train_losses, validation_losses, epochs)
30 # Backward and optimize
31 optimizer.zero_grad()
—> 32 loss.backward()
33 optimizer.step()
34

~\Miniconda3\lib\site-packages\torch\tensor.py in backward(self, gradient, retain_graph, create_graph)
91 products. Defaults to False.
92 “”"
—> 93 torch.autograd.backward(self, gradient, retain_graph, create_graph)
94
95 def register_hook(self, hook):

~\Miniconda3\lib\site-packages\torch\autograd_init_.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
87 Variable._execution_engine.run_backward(
88 tensors, grad_tensors, retain_graph, create_graph,
—> 89 allow_unreachable=True) # allow_unreachable flag
90
91

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation


#2

I believe the following lines are causing the error:

    ...
    arr[j]=n
...
arr2[i] = a
...

You could append n and a to a list and creating a tensor from it.
Have a look at this modified code:

def my_loss(outputs, labels):
    
    """
    Args:
        outputs - output of network ([batch size, rank+1]) 
        labels  - desired coefficients  ([batch size, rank+1])
    """
    
    loss = torch.zeros(1, dtype=torch.float, requires_grad=True)
    loss = loss.to(device)
    arr2 = []

    for i in range(0,4):
        x=0
        arr = []
        for j in range(0,10):
            x = x + 0.1*j
            y1=outputs[i,0]
            y2=labels[i,0]
            n=torch.abs(y1-y2)
            arr.append(n)
        arr = torch.stack(arr)
       
        a=torch.max(arr)
        arr2.append(a)
        
    arr2 = torch.stack(arr2)
    loss=(torch.sum(arr2))/4
    print(loss)
                                  
    return loss

Note that I had to move arr into the inner loop.
Would that work for you? I’m not familiar with your loss function, so could you check, if the values make sense?


(Thúy đỗ) #3

Thank you, I got the same problem and your tip worked for me. However, I don’t know what the root cause is.
In my case,

def forward(self, inputs, hidden):
...
     emb_to_hidden = self.first_hidden(embedded_inputs[timestep])
     previous_to_hidden = self.last_time_hiddens[0](hidden[0])
     hidden[0] = emb_to_hidden + previous_to_hidden //this line causes error
...
     return outputs, hidden

Why is the operator to re-assign hidden[0] considered as an inplace operation?