Why does my model's training loss explode with in-place ReLU, while it trains fine with out-of-place ReLU?

With ReLU(inplace=True), my model cannot be trained: its loss climbs into the hundreds of thousands after a few iterations. However, when I replace it with ReLU(inplace=False), all the trouble disappears and the loss converges gradually.
PyTorch never complained about my in-place ReLU usage; everything seemed fine apart from the larger and larger training loss. I have heard people say "when PyTorch doesn't give you an error or warning about an in-place operation, it is almost certainly working correctly". But something seems to be going wrong internally. Why does this happen?
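
My rough mental model of what could go wrong (a minimal illustration, not my actual model) is that an in-place op overwrites a tensor's storage directly, so every other reference to that tensor silently sees the new values, with no error or warning:

import torch

a = torch.randn(4)
b = a            # b is just another reference to the same storage
a.relu_()        # in-place ReLU overwrites that storage
print(b)         # b has silently changed too, and nothing complains

If a tensor that backward still needs were overwritten this way, and autograd did not catch it through its version counters, the gradients would be silently wrong, which might explain the diverging loss. But I don't see why that would happen here.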

Below is my model code:

import torch.nn as nn
import torch.nn.functional as F

# BasicModule and weights_initialization are defined elsewhere in my project.

class ResidualBlock(nn.Module):
    def __init__(self, ch_in, ch_out, shortcut=None):
        super(ResidualBlock, self).__init__()
        self.left = nn.Sequential(
            nn.Conv2d(ch_in, 128, 3, 1, 1),  # ch_in, ch_out, kernel_size, stride, pad
            nn.ReLU(inplace=False),
            nn.Conv2d(128, ch_out, 3, 1, 1),
            nn.ReLU(inplace=False),
        )
        self.right = shortcut

    def forward(self, x):
        out = self.left(x)
        residual = x if self.right is None else self.right(x)
        out += residual  # note: += is itself an in-place addition
        return F.relu(out, inplace=False)
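
Side note on this block: besides the ReLUs, the += on the skip connection is itself an in-place add. A fully out-of-place version of the same forward method, shown here only for illustration, would be:

    def forward(self, x):
        out = self.left(x)
        residual = x if self.right is None else self.right(x)
        out = out + residual  # out-of-place addition allocates a new tensor
        return F.relu(out, inplace=False)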

class ContentWeightedCNN(BasicModule):
    def __init__(self, use_imp = True):
        super(ContentWeightedCNN, self).__init__()
        self.model_name = 'ContentWeightedCNN'
        self.use_imp = use_imp
        self.encoder = self.make_encoder()
        self.decoder = self.make_decoder()
        self.reset_parameters()
    
    def reset_parameters(self):
        self.apply(weights_initialization)
    
    def forward(self, x):
        enc_data = self.encoder(x)
        dec_data = self.decoder(enc_data)
        return dec_data

    def make_encoder(self):
        layers = [
            nn.Conv2d(3, 128, 8, 4, 2),
            nn.ReLU(inplace=True), # 54

            ResidualBlock(128, 128),

            nn.Conv2d(128, 256, 4, 2, 1), # 115
            nn.ReLU(inplace=True),

            ResidualBlock(256, 256),

            nn.Conv2d(256, 256, 3, 1, 1), #192
            nn.ReLU(inplace=True),

            ResidualBlock(256, 256),

            nn.Conv2d(256, 64, 1, 1, 0),    # conv 4  64 is n
            nn.Sigmoid(),                    
        ]
        return nn.Sequential(*layers)

    def make_decoder(self):
        layers = [
            nn.Conv2d(64, 512, 3, 1, 1),
            nn.ReLU(inplace=True),

            ResidualBlock(512, 512),

            nn.Conv2d(512, 512, 3, 1, 1),
            nn.ReLU(inplace=True),

            ResidualBlock(512, 512),

            nn.PixelShuffle(2),

            nn.Conv2d(128, 256, 3, 1, 1),
            nn.ReLU(inplace=True),

            ResidualBlock(256, 256),

            nn.PixelShuffle(4),

            nn.Conv2d(16, 32, 3, 1, 1),
            nn.ReLU(inplace=True),
            
            nn.Conv2d(32, 3, 1, 1, 0)   
        ]
        return nn.Sequential(*layers)
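
One thing I can still try is a dummy forward and backward pass with anomaly detection enabled, in case autograd can point at the op whose backward goes wrong. The 128x128 input size and the MSE loss below are just placeholders, not my real training setup:

import torch
import torch.nn.functional as F

torch.autograd.set_detect_anomaly(True)  # prints the forward op behind any failing backward node

model = ContentWeightedCNN()
x = torch.randn(1, 3, 128, 128)          # placeholder input size
out = model(x)
print(out.shape)                         # should be torch.Size([1, 3, 128, 128]) for a 128x128 input

loss = F.mse_loss(out, x)                # placeholder reconstruction loss
loss.backward()                          # raises here if autograd detects an illegal in-place modification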
 
            

I have also opened an issue on GitHub; here is the page. Thanks to anyone who takes a look!