Error on loss.backward() (One of the variables needed for gradient computation has been modified by an inplace operation)

Hello, I am trying to build a pipeline to improve the inference quality of quantised models. I use two very simple models on the MNIST dataset.

I am aware this is a recurring error that has been discussed many times, but I cannot find an in-place call in my code. I do not fully understand how PyTorch works behind the scenes, but I suspect the problem is somehow related to using the same criterion twice.

main

stats = {}
for epoch in range(1, 7):
    if epoch < 2:
        print("Regular NET training")
        stats = train(model, train_loader, model_opt, epoch, with_stats=True)
        test(model, test_loader)
    else:
        print("JOINT TRAINING")
        train_UDMC(model, prenet, train_loader,
                   model_opt, prenet_opt, epoch, stats)
        print("TESTING NET")
        test(model, test_loader)
        print("TESTING PRENET")
        test_prenet(model, prenet, test_loader)

Here I first train the main model on its own and gather activation statistics for the later quantisation. Then I train it jointly with a second model (which I would like to turn into some form of transformation that makes quantised inference better).
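
For reference, the statistics gathered here are just per-layer min/max activation values; quantForward (shown further down) indexes them like this. The numbers below are made up, only to show the shape of the dict:

# Illustrative structure of `stats` as consumed by quantForward; the values are made up.
stats = {
    'conv1': {'min': -0.42, 'max': 2.82},
    'conv2': {'min': 0.00, 'max': 5.10},
    'fc1':   {'min': 0.00, 'max': 7.35},
    'fc2':   {'min': 0.00, 'max': 9.87},
}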

trainer

def train_UDMC(model, prenet, train_loader, model_opt, prenet_opt, epoch, stats=None, num_bits=4):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    criterion = torch.nn.CrossEntropyLoss().cuda()

    progress_bar = tqdm(train_loader, desc='Epoch {}'.format(epoch))
    for batch_idx, (data, target) in enumerate(progress_bar):
        data, target = data.to(device), target.to(device)

        model.train()
        prenet.train()
        model_opt.zero_grad()
        prenet_opt.zero_grad()
        Z = prenet(data)

        outputs_T = model(Z)
   
        cross_entropy = criterion(outputs_T, target)
        cross_entropy.backward(retain_graph=True)
        model_opt.step()

        model.eval()
        prenet.train()
        retarget = quantForward(copy.deepcopy(
            model), data, stats, num_bits=num_bits, no_log=True)
        quant_loss = criterion(retarget, target)
        loss = cross_entropy + quant_loss
        loss.backward()
        prenet_opt.step()

        if batch_idx == 1:
            print ("[cross_entropy: {:f}] [quant_loss: {:f}]".format(cross_entropy.item(), quant_loss.item()))

        if batch_idx % 500 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

Here I want the secondary model to learn how to minimise the cross-entropy of both the full-precision model and the quantised model.

models

class Net(nn.Module):
    def __init__(self):

        super(Net, self).__init__()
        num_channels = 1
        self.conv1 = nn.Conv2d(num_channels, 20, 5, 1)
        self.conv2 = nn.Conv2d(20, 50, 5, 1)
        self.fc1 = nn.Linear(4*4*50, 500)
        self.flatten_shape = 4*4*50
        self.fc2 = nn.Linear(500, 10)

    def forward(self, x, out_features=False):
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, 2, 2)
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2, 2)
        x = x.view(-1, self.flatten_shape)
        features = x
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        if out_features:
            return x, features
        return x

class preNet(nn.Module):
    def __init__(self):
        super(preNet, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=5, padding=2)
        self.conv2 = nn.Conv2d(32, 1, kernel_size=5, padding=2)

    def forward(self, input):
        x = F.relu(self.conv1(input))
        x = self.conv2(x)
        return F.relu(input + x)

quantisation

QTensor = namedtuple('QTensor', ['tensor', 'scale', 'zero_point'])

def quantForward(model, x, stats, num_bits=8):
    x = quantize_tensor(
            x, min_val=stats['conv1']['min'], max_val=stats['conv1']['max'], num_bits=num_bits)
    x, scale_next, zero_point_next = quantizeLayer(
        x.tensor, model.conv1, stats['conv2'], x.scale, x.zero_point, num_bits=num_bits)
    x = F.max_pool2d(x, 2, 2)
    x, scale_next, zero_point_next = quantizeLayer(
        x, model.conv2, stats['fc1'], scale_next, zero_point_next, num_bits=num_bits)
    x = F.max_pool2d(x, 2, 2)
    x = x.view(-1, 4*4*50)
    x, scale_next, zero_point_next = quantizeLayer(
        x, model.fc1, stats['fc2'], scale_next, zero_point_next, num_bits=num_bits)
    # Dequantise for final layer
    x = dequantize_tensor(
        QTensor(tensor=x, scale=scale_next, zero_point=zero_point_next))
    x = model.fc2(x)
    return x

def quantizeLayer(x, layer, stat, scale_x, zp_x, num_bits=8):
    x = x.clone()

    # cache old values
    W = layer.weight.data
    B = layer.bias.data

    w = quantize_tensor(layer.weight.data, num_bits=num_bits)
    b = quantize_tensor(layer.bias.data, num_bits=num_bits)

    layer.weight.data = w.tensor.float()
    layer.bias.data = b.tensor.float()

    scale_w = w.scale
    zp_w = w.zero_point
    scale_b = b.scale
    zp_b = b.zero_point

    scale_next, zero_point_next = calcScaleZeroPoint(
            min_val=stat['min'], max_val=stat['max'])

    X = x.float() - zp_x
    layer.weight.data = ((scale_x * scale_w) /
                            scale_next)*(layer.weight.data - zp_w)
    layer.bias.data = (scale_b/scale_next)*(layer.bias.data + zp_b)

    x = (layer(X)) + zero_point_next

    # cast to int
    x.round_()

    # Perform leaky relu
    x = F.leaky_relu(x)

    # Reset weights for next forward pass
    layer.weight.data = W
    layer.bias.data = B

    return x, scale_next, zero_point_next

def quantize_tensor(x, num_bits=8, min_val=None, max_val=None):
    if min_val is None and max_val is None:
        min_val, max_val = x.min(), x.max()

    qmin = 0.
    qmax = 2.**num_bits - 1.

    scale, zero_point = calcScaleZeroPoint(min_val, max_val, num_bits)
    q_x = zero_point + x / scale
    q_x.clamp_(qmin, qmax).round_()
    q_x = q_x.round().byte()

    return QTensor(tensor=q_x, scale=scale, zero_point=zero_point)

def dequantize_tensor(q_x):
    return q_x.scale * (q_x.tensor.float() - q_x.zero_point)
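
calcScaleZeroPoint is defined elsewhere in my code and not shown here. As a quick round-trip sanity check of the two helpers above (just an illustration, assuming calcScaleZeroPoint is in scope):

# Quantise a float tensor to 4 bits and back; t_hat should match t up to quantisation error.
t = torch.randn(2, 3)
q = quantize_tensor(t, num_bits=4)   # QTensor: uint8 tensor plus scale and zero_point
t_hat = dequantize_tensor(q)
print((t - t_hat).abs().max())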

error
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [500, 10]], which is output 0 of TBackward, is at version 940; expected version 939 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!

Thank you for any feedback.

Hi,

Do you see a second backtrace that points to a forward function as mentioned in the message? Can you share it here?

You do have a few inplace operations, I think: the optimizer steps update the parameters inplace, and all the ops ending in _ are inplace as well (like the round_() you use).
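
One possible reordering that avoids this is to run every forward and backward pass before calling any optimizer step, so that .step() never modifies a parameter that the autograd graph still needs. A rough sketch (keeping the rest of your train_UDMC as it is; do double check that this still gives the gradient flow you intend):

# Compute both losses first...
model.train()
prenet.train()
model_opt.zero_grad()
prenet_opt.zero_grad()

Z = prenet(data)
cross_entropy = criterion(model(Z), target)

model.eval()
retarget = quantForward(copy.deepcopy(model), data, stats, num_bits=num_bits)
quant_loss = criterion(retarget, target)

# ...then a single backward over the combined objective (no retain_graph needed)...
(cross_entropy + quant_loss).backward()

# ...and only now do the inplace parameter updates.
model_opt.step()
prenet_opt.step()

You can also enable torch.autograd.set_detect_anomaly(True) to get the second backtrace that points at the forward operation whose result was modified.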