Could you debug this? => RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation

Hey there, any PyTorch masters passing by? Can you help me?
Below is the neural network model I made; a terrible error occurs while backpropagating.

import torch
import torch.nn as nn

class ConvFeatureExtractor(nn.Module):
    
    def __init__(self, input_channel = 3, output_channel = 32):
        super(ConvFeatureExtractor, self).__init__()
        self.output_channel = [4, 8, 16, output_channel]
        
        self.ConvNet = nn.Sequential(nn.Conv2d(input_channel, self.output_channel[0], kernel_size = 11, stride = 3, padding = 1),
                                     nn.ReLU(inplace = True),
                                     nn.MaxPool2d(3, 3), # 4x56x56
                                     
                                     nn.Conv2d(self.output_channel[0], self.output_channel[1], 3, 1, 1),
                                     nn.ReLU(True),
                                     nn.MaxPool2d(2, 2), # 8x28x28
                                     
                                     nn.Conv2d(self.output_channel[1], self.output_channel[2], 3, 1, 1), # 16x28x28
                                     nn.ReLU(True),
                                     nn.Conv2d(self.output_channel[2], self.output_channel[2], 3, 1, 1),
                                     nn.MaxPool2d(2, 2), # 16x14x14
                                     
                                     nn.Conv2d(self.output_channel[2], self.output_channel[3], 3, 1, 1, bias = False),
                                     nn.BatchNorm2d(self.output_channel[3]), nn.ReLU(True), # 32x14x14
                                     nn.Conv2d(self.output_channel[3], self.output_channel[3], 3, 1, 1, bias = False),
                                     nn.BatchNorm2d(self.output_channel[3]), nn.ReLU(True),
                                     nn.MaxPool2d(2, 2), # 32x7x7
                                     nn.Conv2d(self.output_channel[3], self.output_channel[3], 3, 1, 0), #32x5x5
                                     nn.BatchNorm2d(self.output_channel[3]), nn.ReLU(True),
                                     nn.MaxPool2d(5, 5) # 32x1x1
                                    )
        
    def forward(self, x):
        x = self.ConvNet(x) # [batch, channel, w, h]
        x = x.view(-1, self.output_channel[3]) # [batch, 32]
        
        return x


class RecurrentLayer(nn.Module):
    
    def __init__(self, input_size = 32, hidden_size = 128, output_size = 1):
        super(RecurrentLayer, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        self.rnn = nn.LSTMCell(input_size = self.input_size, hidden_size = self.hidden_size)
        self.hx_outputs = []
        self.cx_outputs = []
        
        self.out = nn.Linear(self.hidden_size, self.output_size)
        
    def forward(self, x):
        if len(self.hx_outputs) == 0 and len(self.cx_outputs) == 0:
            hx, cx = self.rnn(x) # h0, c0 = 0
            self.hx_outputs.append(hx)
            self.cx_outputs.append(cx)
        
        else:
            hx = self.hx_outputs[-1]
            cx = self.cx_outputs[-1]

            hx, cx = self.rnn(x, (hx, cx))
            self.hx_outputs.append(hx)
            self.cx_outputs.append(cx)

        prediction = self.out(self.hx_outputs[-1])

        return prediction

class Model(nn.Module):
    
    def __init__(self):
        super(Model, self).__init__()
        
        self.FeatureExtraction = ConvFeatureExtractor()
        self.rnn = RecurrentLayer()
        
    def forward(self, input): # input : [batch, step, channel, w, h]
        input = input.reshape(-1, 5, 3, 512, 512)
        
        prediction_outputs = []
        for i in range(5):
            image = input[:, i, :, :, :] # [batch, channel, w, h]
            visual_feature = self.FeatureExtraction(image)
            prediction = self.rnn(visual_feature)
            prediction_outputs.append(prediction)
        
        return prediction_outputs[-1]

For example, if you put x = torch.randn(batch_size, 5, 3, 512, 512) into the model, you get this:

model = Model()
x = torch.randn(10, 5, 3, 512, 512)
model(x)

tensor([[0.0571],
        [0.0822],
        [0.0839],
        [0.1006],
        [0.0962],
        [0.1026],
        [0.0578],
        [0.0550],
        [0.0792],
        [0.1104]], grad_fn=<AddmmBackward>)

However, when I try to calculate the loss and backpropagate using “loss.backward()”, an error appears as shown below.

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [128, 512]], which is output 0 of TBackward, is at version 2; expected version 1 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).

I searched here and there on the PyTorch forum and realized that an in-place modification of some variable is what makes the gradient calculation fail.
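For reference, here is a tiny standalone snippet (not my model, just my understanding of the error) that seems to reproduce the same kind of failure:

import torch

a = torch.randn(3, requires_grad=True)
b = torch.exp(a)   # autograd saves b, since the gradient of exp is exp itself
b.add_(1)          # in-place edit bumps b's version counter
b.sum().backward() # RuntimeError: ... modified by an inplace operation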
I was also told to use “with torch.autograd.set_detect_anomaly(True):” to find the part where the error occurs, and to add “.clone()” to solve the problem.
It’s so frustrating that I don’t know how to do that. Which part of my model is broken?

You should just add at the beginning or your program torch.autograd.set_detect_anomaly(True).
Then you will see a second stack trace appear above the one in the error. And that one will point you to the forward call corresponding to the backward error and that will help you identify which part of your model is failing :slight_smile:
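For example, the placement is just this (a minimal sketch; the rest of your script stays the same):

import torch

torch.autograd.set_detect_anomaly(True)  # set once, before any forward/backward

# ... build the model and run forward/backward exactly as before; when
# backward fails, PyTorch now also prints the forward-pass stack trace
# of the operation that produced the offending tensor.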

Thanks for the answer! Could you check that I understand it correctly? Is it right to use it like this?

class Model(nn.Module):
    
    def __init__(self):
        super(Model, self).__init__()
        
        self.FeatureExtraction = ConvFeatureExtractor()
        self.rnn = RecurrentLayer()
        
    def forward(self, input): # input : [batch, step, channel, w, h]
        input = input.reshape(-1, 5, 3, 512, 512)
        
        prediction_outputs = []
        with torch.autograd.set_detect_anomaly(True):
            for i in range(5):
                image = input[:, i, :, :, :] # [batch, channel, w, h]
                visual_feature = self.FeatureExtraction(image)
                prediction = self.rnn(visual_feature)
                prediction_outputs.append(prediction)
        
        return prediction_outputs[-1]

model = Model()

x = torch.randn(10, 5, 3, 512, 512)
y = torch.randn(10, 1)

yhat = model(x)

mse = nn.MSELoss()
loss = mse(y, yhat)
loss.backward(retain_graph=True)

I’m running it in a Jupyter notebook, and the error message should pop up, but it doesn’t.

That code looks fine, yes.
Note that the retain_graph=True is not needed here, since you only call backward once on this graph.

Given the error in your original code, what might also be happening is that you’re doing an optimizer step between the forward and the backward. That optimizer step modifies the weights in-place, which could lead to that error message.
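In other words, the usual ordering is backward first, step after (a rough sketch, assuming a standard optimizer such as SGD):

import torch
import torch.nn as nn

model = Model()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)  # any optimizer behaves the same way here
mse = nn.MSELoss()

x = torch.randn(10, 5, 3, 512, 512)
y = torch.randn(10, 1)

optimizer.zero_grad()
yhat = model(x)
loss = mse(y, yhat)
loss.backward()   # gradients are computed against the forward-time weights
optimizer.step()  # the in-place weight update must happen only after backward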