Hello,
I have trained a simple linear model using QAT. I have also lowered this model to the XNNPACK backend via ExecuTorch. However, I would like to extract the actual int8 weights, weight scales, layer output scales, and zero points from the model. The reason for this is that I want to manually deploy the model to my embedded device using CMSIS-NN, and compare the speed/size with the lowered ExecuTorch model.
Currently, whenever I try printing out the weights (quantized_model.linear.weight), I only see floats, and there is no q_scale available with which to quantize those floats into int8. I remember that back in torch 2.6.0 there was a very helpful int_repr() method that let you do exactly that. Has the new PT2E framework discarded it?
Minimal example:
import torch as th
import torch.nn as nn
from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
from executorch.exir import to_edge_transform_and_lower
from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import XNNPACKQuantizer
from torchao.quantization.pt2e.quantize_pt2e import (
convert_pt2e,
prepare_qat_pt2e,
)
from torchao.quantization.pt2e import move_exported_model_to_eval
from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import (
get_symmetric_quantization_config,
)
# Hyperparameters for the minimal QAT example.
IN_NODES = 10        # number of input features to the linear layer
OUT_NODES = 10       # number of output features
USE_BIAS = True      # whether nn.Linear carries a bias term
BATCH_SIZE = 32      # training batch size (also used for the export sample)
NUM_EPOCHS = 10      # training iterations (one random batch per epoch)
INPUT_SHAPE = (1, IN_NODES)  # NOTE(review): defined but never used below — confirm intent
class Model(nn.Module):
    """Minimal model: a single fully-connected layer mapping
    in_nodes features to out_nodes features."""

    def __init__(self, in_nodes, out_nodes, use_bias):
        super().__init__()
        self.linear = nn.Linear(in_nodes, out_nodes, bias=use_bias)

    def forward(self, x):
        # Pure pass-through: one affine transform, no activation.
        return self.linear(x)
def train(model, in_nodes, out_nodes, batch_size, num_epochs):
    """Train *model* on freshly sampled random regression data.

    One random (inputs, targets) batch is drawn per epoch and the model
    is fit with Adam on MSE loss. Returns the (mutated) model.
    """
    optimizer = th.optim.Adam(model.parameters(), lr=0.01)
    for _ in range(num_epochs):
        inputs = th.randn(batch_size, in_nodes)
        targets = th.randn(batch_size, out_nodes)
        optimizer.zero_grad()
        loss = th.nn.functional.mse_loss(model(inputs), targets)
        loss.backward()
        optimizer.step()
    return model
# --- QAT export pipeline -------------------------------------------------

# Allow the model to accept variable batch sizes.
sample_inputs = (th.rand(BATCH_SIZE, IN_NODES),)
batch_dim = th.export.Dim("batch", min=1)
dynamic_shapes = ({0: batch_dim},)

# Prepare the model for QAT: export to a graph module, then insert
# fake-quant observers via the XNNPACK quantizer (PT2E flow).
model = Model(IN_NODES, OUT_NODES, USE_BIAS)
model = th.export.export(model, sample_inputs, dynamic_shapes=dynamic_shapes).module()
quantizer = XNNPACKQuantizer().set_global(
    get_symmetric_quantization_config(is_qat=True)
)
model = prepare_qat_pt2e(model, quantizer)

# Train with the fake-quant observers in place.
model = train(model, IN_NODES, OUT_NODES, BATCH_SIZE, NUM_EPOCHS)

# Convert fake-quant ops to explicit quantize/dequantize nodes.
# fold_quantize=False keeps the float weights alongside q/dq graph nodes,
# whose args carry the scales and zero-points.
quantized_model = convert_pt2e(model, fold_quantize=False)
quantized_model = move_exported_model_to_eval(quantized_model)

# Export using ExecuTorch.
# FIX: the original re-export omitted dynamic_shapes, silently
# re-specializing the program to a fixed batch of 1 even though
# Dim("batch") was declared above. Pass the same spec so the lowered
# program stays batch-dynamic.
sample_inputs = (th.rand(1, IN_NODES),)
exported_program = th.export.export(
    quantized_model, sample_inputs, dynamic_shapes=dynamic_shapes
)
et_program = to_edge_transform_and_lower(
    exported_program, partitioner=[XnnpackPartitioner()]
).to_executorch()
with open("./model.pte", "wb") as f:
    f.write(et_program.buffer)

# Where are relevant q_scales?
# NOTE(review): after PT2E conversion the parameter itself stays float;
# the int8 representation and scales live on the graph's q/dq nodes —
# presumably they must be read from quantized_model.graph, not from the
# parameter object. TODO confirm against the torchao PT2E docs.
print(f"Model weights are in float:\n {quantized_model.linear.weight}")
Python 3.12.12
Mac M4
executorch 1.0.1
pytorch-tokenizers 1.1.0
torch 2.9.1
torch-xla2 0.0.1.dev202412041639
torchao 0.14.0
Any help would be very appreciated, thank you.