Hello, I am trying to build a pipeline to improve on the inference capabilities of quantised models. I use two really simple models on the MNIST dataset.
I am aware this is a recurrent error already discussed, but I cannot find an in-place call in my code. I do not fully understand how does PyTorch work behind the scenes, but I think the problem should be somehow related to me wanting to use the same criterion twice.
main
stats = {}
for epoch in range(1, 7):
if epoch < 2:
print("Regular NET training")
stats = train(model, train_loader, model_opt, epoch, with_stats=True)
test(model, test_loader)
else:
print("JOINT TRAINING")
train_UDMC(model, prenet, train_loader,
model_opt, prenet_opt, epoch, stats)
print("TESTING NET")
test(model, test_loader)
print("TESTING PRENET")
test_prenet(model, prenet, test_loader)
Here I run a couple of epochs to train the main model alone and gather activation statistics for posterior quantization. Then, train with a second model (which I would like to turn into some form of transformation that made quantized inference better).
trainer
def train_UDMC(model, prenet, train_loader, model_opt, prenet_opt, epoch, stats=None, num_bits=4):
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
criterion = torch.nn.CrossEntropyLoss().cuda()
progress_bar = tqdm(train_loader, desc='Epoch {}'.format(epoch))
for batch_idx, (data, target) in enumerate(progress_bar):
data, target = data.to(device), target.to(device)
model.train()
prenet.train()
model_opt.zero_grad()
prenet_opt.zero_grad()
Z = prenet(data)
outputs_T = model(Z)
cross_entropy = criterion(outputs_T, target)
cross_entropy.backward(retain_graph=True)
model_opt.step()
model.eval()
prenet.train()
retarget = quantForward(copy.deepcopy(
model), data, stats, num_bits=num_bits, no_log=True)
quant_loss = criterion(retarget, target)
loss = cross_entropy + quant_loss
loss.backward()
prenet_opt.step()
if batch_idx == 1:
print ("[cross_entropy: {:f}] [quant_loss: {:f}]".format(cross_entropy.item(), quant_loss.item()))
if batch_idx % 500 == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.item()))
Here I want the secondary model to learn how to minimise the CE of both the full-precision model and the quantised model.
models
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
num_channels = 1
self.conv1 = nn.Conv2d(num_channels, 20, 5, 1)
self.conv2 = nn.Conv2d(20, 50, 5, 1)
self.fc1 = nn.Linear(4*4*50, 500)
self.flatten_shape = 4*4*50
self.fc2 = nn.Linear(500, 10)
def forward(self, x, out_features=False):
x = F.relu(self.conv1(x))
x = F.max_pool2d(x, 2, 2)
x = F.relu(self.conv2(x))
x = F.max_pool2d(x, 2, 2)
x = x.view(-1, self.flatten_shape)
features = x
x = F.relu(self.fc1(x))
x = self.fc2(x)
if out_features:
return x, features
return x
class preNet(nn.Module):
def __init__(self):
super(preNet, self).__init__()
self.conv1 = nn.Conv2d(1, 32, kernel_size=5, padding=2)
self.conv2 = nn.Conv2d(32, 1, kernel_size=5, padding=2)
def forward(self, input):
x = F.relu(self.conv1(input))
x = self.conv2(x)
return F.relu(input + x)
quantisation
QTensor = namedtuple('QTensor', ['tensor', 'scale', 'zero_point'])
def quantForward(model, x, stats, num_bits=8):
x = quantize_tensor(
x, min_val=stats['conv1']['min'], max_val=stats['conv1']['max'], num_bits=num_bits)
x, scale_next, zero_point_next = quantizeLayer(
x.tensor, model.conv1, stats['conv2'], x.scale, x.zero_point, num_bits=num_bits)
x = F.max_pool2d(x, 2, 2)
x, scale_next, zero_point_next = quantizeLayer(
x, model.conv2, stats['fc1'], scale_next, zero_point_next, num_bits=num_bits)
x = F.max_pool2d(x, 2, 2)
x = x.view(-1, 4*4*50)
x, scale_next, zero_point_next = quantizeLayer(
x, model.fc1, stats['fc2'], scale_next, zero_point_next, num_bits=num_bits)
# Dequantise for final layer
x = dequantize_tensor(
QTensor(tensor=x, scale=scale_next, zero_point=zero_point_next))
x = model.fc2(x)
return x
def quantizeLayer(x, layer, stat, scale_x, zp_x, num_bits=8):
x = x.clone()
# cache old values
W = layer.weight.data
B = layer.bias.data
w = quantize_tensor(layer.weight.data, num_bits=num_bits)
b = quantize_tensor(layer.bias.data, num_bits=num_bits)
layer.weight.data = w.tensor.float()
layer.bias.data = b.tensor.float()
scale_w = w.scale
zp_w = w.zero_point
scale_b = b.scale
zp_b = b.zero_point
scale_next, zero_point_next = calcScaleZeroPoint(
min_val=stat['min'], max_val=stat['max'])
X = x.float() - zp_x
layer.weight.data = ((scale_x * scale_w) /
scale_next)*(layer.weight.data - zp_w)
layer.bias.data = (scale_b/scale_next)*(layer.bias.data + zp_b)
x = (layer(X)) + zero_point_next
# cast to int
x.round_()
# Perform leaky relu
x = F.leaky_relu(x)
# Reset weights for next forward pass
layer.weight.data = W
layer.bias.data = B
return x, scale_next, zero_point_next
def quantize_tensor(x, num_bits=8, min_val=None, max_val=None):
if not min_val and not max_val:
min_val, max_val = x.min(), x.max()
qmin = 0.
qmax = 2.**num_bits - 1.
scale, zero_point = calcScaleZeroPoint(min_val, max_val, num_bits)
q_x = zero_point + x / scale
q_x.clamp_(qmin, qmax).round_()
q_x = q_x.round().byte()
return QTensor(tensor=q_x, scale=scale, zero_point=zero_point)
def dequantize_tensor(q_x):
return q_x.scale * (q_x.tensor.float() - q_x.zero_point)
error
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [500, 10]], which is output 0 of TBackward, is at version 940; expected version 939 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
Thank you for any feedback.