RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation in the loop

I have a problem when using my network. On the first iteration of the loop I can get the gradient with respect to the inputs, but when it runs a second time I get the error above.
I have tested the forward pass and the backpropagation for the first iteration, and the backpropagation is fine for a single loop, so I am not quite sure what is going wrong.
My code is as follows:

import torch
import torch.nn as nn
import torch.nn.functional as F
#import torchvision
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from gumbel_softmax import gsm
torch.autograd.set_detect_anomaly(True)

#from torch.utils.tensorboard import SummaryWriter

#writer = SummaryWriter()

# Hard (max, +) matrix product: res[i][j] = max_k (A[i][k] + B[k][j]); relies on the global s_ defined below
def max_matrix_multi(A, B):
    length_A = len(A)
    length_B = len(B)
    width_B = len(B[0])
    res = [[s_ for i in range(width_B)] for j in range(length_A)]
    for i in range(length_A):
        for j in range(width_B):
            temp = []
            for k in range(length_B):
                temp.append(A[i][k] + B[k][j])
            res[i][j] = max(temp)
    return res

# Initiate all locations as input variables
matrix_location = torch.LongTensor([[0,0],
                                     [0,2],
                                     [1,0],
                                     [1,2],
                                     [1,4],
                                     [2,0],
                                     [2,2],
                                     [2,4],
                                     [2,6],
                                     [3,0],
                                     [3,2],
                                     [3,4],
                                     [3,6],
                                     [4,1],
                                     [4,5],
                                     [5,1],
                                     [5,5],
                                     [5,7],
                                     [6,1],
                                     [6,3],
                                     [6,5],
                                     [6,7],
                                     [7,1],
                                     [7,3],
                                     [7,5],
                                     [7,7]])
matrix_positions = matrix_location.data.numpy()
input_size = matrix_location.shape[1] * 8 * matrix_location.shape[0]
hidden1_size = 300
hidden2_size = 140
hidden3_size = 60
output_size = matrix_location.shape[0]

class SamplingNet(nn.Module):
    def __init__(self, input_size, output_size):
        super(SamplingNet, self).__init__()
        self.input_size = input_size
        self.hidden1_size = 300
        self.hidden2_size = 140
        self.hidden3_size = 60
        self.output_size = output_size
        self.InputLayer = nn.Linear(self.input_size, self.hidden1_size)
        self.InputLayer.weight.data.normal_(0, 0.3)
        self.Hidden1Layer = nn.Linear(self.hidden1_size, self.hidden2_size)
        self.Hidden1Layer.weight.data.normal_(0,0.3)
        self.Hidden2Layer = nn.Linear(self.hidden2_size, self.hidden3_size)
        self.Hidden2Layer.weight.data.normal_(0, 0.3)
        self.OutputLayer = nn.Linear(self.hidden3_size, self.output_size)
        self.OutputLayer.weight.data.normal_(0, 0.3)
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()
        
    def forward(self, locations):
        Information = self.InputLayer(locations)
        Information = self.tanh(Information)
        hidInfo = self.Hidden1Layer(Information)
        hidInfo = self.tanh(hidInfo)
        extendInfo = self.Hidden2Layer(hidInfo)
        extendInfo = self.relu(extendInfo)
        output_lamda = self.OutputLayer(extendInfo)
        return output_lamda

class TestNet(nn.Module):
    def __init__(self):
        super(TestNet, self).__init__()
        self.forw = torch.nn.Linear(416, 26)
        
    def forward(self, ins):
        return self.forw(ins)
    
    
    
#Poisson = SamplingNet(input_size, output_size)
#position = matrix_location[0]
#lamda = Poisson(position)
#print(lamda)
# Construct the A0* B matrix for external input;
        
s_ = -float(10000)
A0_B = torch.FloatTensor([[s_ for _ in range(8)] for _ in range(8)])

# For each position in the A0_B matrix there is a Poisson rate lamda.
# We directly take lamda * (1 + 0.5 * uniform(0, 1)) as the value

learning_rate = 0.01

# Poisson = SamplingNet(input_size, output_size)
Poisson = SamplingNet(input_size, output_size)
Distribution_Optimizer = torch.optim.Adam(Poisson.parameters(), lr=learning_rate)
   
# Read simulation data into the system
source_data = pd.read_csv('SimulationData.csv',usecols=['u1','u2','u31','u32','u41','u42',
                                                           'u51','u52','q1','q2','q3','y1','q4',
                                                         'q5','q6','y2'])
training_data = source_data.values.tolist()
external_inputs = torch.FloatTensor([element[0:8] for element in training_data])
states = torch.FloatTensor([element[8:16] for element in training_data])
#img_grid = torchvision.utils.make_grid(states)


STEPS = len(external_inputs)

qval = torch.FloatTensor([[0 for i in range(8)] for step in range(STEPS)])
criterion = nn.L1Loss()
loss_list = []
soft_max = nn.Softmax(dim=0)
gumbel_softmax = gsm(t=0.01).apply
EPOCHS = 100

for epoch in range(EPOCHS):
    sum_loss = 0
    target = torch.randn(26)
    
    for step in range(STEPS):   
        qstep=torch.zeros(8, dtype=torch.float32)
        # One-hot encoding locations
        one_hot_location = F.one_hot(matrix_location,num_classes=8).view(-1).float()
        one_hot_location.requires_grad=True
        lamda_vector = Poisson(one_hot_location.float())
#        writer.add_graph(Poisson, one_hot_location.float())
#        writer.close()
        for p in range(0, matrix_location.shape[0]):
            i_index = int(matrix_positions[p][0])
            j_index = int(matrix_positions[p][1])
            lamda = lamda_vector[p]*1.0
            epion = torch.rand(1).detach()
            A0_B[i_index][j_index] = lamda * (1 + 0.5 * epion) 
        A0B = A0_B.clone()
        # Calculate the forward pass
        for index in range(len(A0B)):
            transition_timing = (A0B[index] + external_inputs[step]) * 1.0
            # qval[step][index] = torch.matmul(sm(gm(sm(tempd.clone()))), tempd.clone())
            qstep[index] = torch.matmul(soft_max(gumbel_softmax(soft_max(transition_timing))), transition_timing)  
#        Distribution_Optimizer.zero_grad()
#        LossTest.backward()
        qval[step] = qstep.clone()
 #      if step % 5 == 0:
 #       print(q[step])
        # Calculate the backpropagation
        qlog = torch.log(qstep.clone())
        loss = criterion(qlog,torch.log(states[step]))
        sum_loss += loss.item()
 #       for name, parms in Poisson.named_parameters():
 #           print('--name:',name,'--grad_requires:',parms.requires_grad,'--grad_value:',parms.grad)
        Distribution_Optimizer.zero_grad()    
        loss.backward(retain_graph=True)  
        oldvals = Poisson.InputLayer.weight * 1.0
        print(one_hot_location.grad)
        Distribution_Optimizer.step()
#        print(((oldvals==Poisson.InputLayer.weight).sum().float())/124800)
#    writer.add_scalar('sum_loss', sum_loss,epoch)
    loss_list.append(sum_loss)
    print('Epoch: %.f, Loss: %.4f' %((epoch + 1),sum_loss))

plt.plot(loss_list)
plt.show()
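A note on the qstep line: roughly speaking, the soft_max(gumbel_softmax(soft_max(...))) composition stands in for the hard max in max_matrix_multi as a softmax-weighted sum, so that the result stays differentiable with respect to lamda. A toy sketch of the general idea (a plain sharp softmax here, not the exact gsm function I import) is:

import torch

x = torch.tensor([1.0, 5.0, 3.0], requires_grad=True)

# A sharp softmax gives weights that are close to one-hot at the argmax position
weights = torch.softmax(x / 0.01, dim=0)

# The weighted sum is approximately max(x), but differentiable w.r.t. x
soft_max_value = torch.dot(weights, x)
print(soft_max_value)   # ~5.0
soft_max_value.backward()
print(x.grad)           # gradient is concentrated on the largest entry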

Hi,

Could you give more details about the stack trace? And also, what do you get when you enable anomaly mode, please?

I have just solved it; the problem is related to retain_graph. I am still working on testing.
It turns out that before each step I have to re-initialize the matrix:

        A0_B = torch.FloatTensor([[s_ for _ in range(8)] for _ in range(8)])
        qstep = torch.zeros(8, dtype=torch.float32)
        # One-hot encoding locations

Thank you very much for your attention.
In the backward pass, I also have to remove retain_graph=True:

        for p in range(0, matrix_location.shape[0]):
            i_index = int(matrix_positions[p][0])
            j_index = int(matrix_positions[p][1])
            lamda = lamda_vector[p]*1.0
            epion = torch.rand(1).detach()
            A0_B[i_index][j_index] = lamda * (1 + 0.5 * epion)
        for index in range(len(A0_B)):
            transition_timing = (A0_B[index].clone() + external_inputs[step]) * 1.0
            # qval[step][index] = torch.matmul(sm(gm(sm(tempd.clone()))), tempd.clone())
            qstep[index] = torch.matmul(soft_max(gumbel_softmax(soft_max(transition_timing))), transition_timing)  
        qval[step] = qstep
        qlog = torch.log(qstep)
        # Calculate the backpropagation
        loss = criterion(qlog,torch.log(states[step]))
        sum_loss += loss.item()
        Distribution_Optimizer.zero_grad()
        loss.backward()  
#        print(one_hot_location.grad)
        Distribution_Optimizer.step()

I have been testing the result. It turns out that the error comes from reusing some tensors (A0_B and qstep) across loss calculations and backward passes: the previous step's graph is still attached to them. Naively setting retain_graph = True does not help either, because the retained graph still references tensors that have since been modified in place (in particular the leaf tensors, i.e. the network weights updated by the optimizer). My solution is therefore to re-initialize the reused parts at the start of every step and to drop retain_graph=True. The resulting code is below for others who might hit the same problem (a minimal sketch of the failure pattern follows at the end of this post):

for epoch in range(EPOCHS):
    sum_loss = 0
    target = torch.randn(26)
    
    for step in range(STEPS): 
        A0_B = torch.FloatTensor([[s_ for _ in range(8)] for _ in range(8)])
        qstep = torch.zeros(8, dtype=torch.float32)
        one_hot_location = F.one_hot(matrix_location,num_classes=8).view(-1).float()
        one_hot_location.requires_grad=True
        lamda_vector = Poisson(one_hot_location.float())
        for p in range(0, matrix_location.shape[0]):
            i_index = int(matrix_positions[p][0])
            j_index = int(matrix_positions[p][1])
            lamda = lamda_vector[p]*1.0
            epion = torch.rand(1).detach()
            A0_B[i_index][j_index] = lamda * (1 + 0.5 * epion)
        for index in range(len(A0_B)):
            transition_timing = (A0_B[index].clone() + external_inputs[step]) * 1.0
            # qval[step][index] = torch.matmul(sm(gm(sm(tempd.clone()))), tempd.clone())
            qstep[index] = torch.matmul(soft_max(gumbel_softmax(soft_max(transition_timing))), transition_timing)  
        qval[step] = qstep
        qlog = torch.log(qstep)
        # Calculate the backpropagation
        loss = criterion(qlog,torch.log(states[step]))
        sum_loss += loss.item()
        Distribution_Optimizer.zero_grad()
        loss.backward()  
#        print(one_hot_location.grad)
        Distribution_Optimizer.step()
#        print(((oldvals==Poisson.InputLayer.weight).sum().float())/124800)
#    writer.add_scalar('sum_loss', sum_loss,epoch)
    loss_list.append(sum_loss)
    print('Epoch: %.f, Loss: %.4f' %((epoch + 1),sum_loss))
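
For anyone who runs into the same error, here is a minimal sketch of the failure pattern, with a toy linear layer standing in for my network (not my actual model). A buffer created once outside the loop keeps the previous step's graph alive through the in-place writes, and once optimizer.step() has updated the weights in place, the next backward pass walks back through that stale graph and raises the error:

import torch

lin = torch.nn.Linear(4, 1)
opt = torch.optim.SGD(lin.parameters(), lr=0.1)
buf = torch.zeros(2)                      # created once, like my original A0_B

for step in range(2):
    # The in-place write attaches buf to this step's graph and keeps the old one reachable
    buf[0] = lin(torch.randn(4)).squeeze()
    loss = buf.sum()
    opt.zero_grad()
    loss.backward(retain_graph=True)      # works at step 0, fails at step 1
    opt.step()                            # in-place update of lin.weight
    # At step 1 the backward pass reaches step 0's graph through buf's history,
    # where lin.weight was saved before the in-place update -> RuntimeError

Re-creating A0_B and qstep inside the step loop means each loss only sees the current step's graph, so retain_graph=True is no longer needed and the backward pass never reaches weights that have already been updated.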