Extracting int8 weights and other quant params after convert_pt2e

Hello,

I have trained a simple linear model using QAT. I have also lowered this model to the XNNPACK backend via ExecuTorch. However, I would like to extract the actual int8 weights, weight scales, layer output scales, and zero points from the model. The reason for this is that I want to manually deploy the model to my embedded device using CMSIS-NN, and compare the speed/size with the lowered ExecuTorch model.

Currently, whenever I try printing out the weights (quantized_model.linear.weight), I only see floats, and no q_scale available to quantize those floats into int8. I remember back in torch 2.6.0, there was a very helpful int_repr() method that allowed you to do just that. Has the new PT2E flow removed that API, and if so, what is the recommended way to recover the integer weights and quantization parameters?

Minimal example:

import torch as th
import torch.nn as nn

from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
from executorch.exir import to_edge_transform_and_lower

from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import XNNPACKQuantizer
from torchao.quantization.pt2e.quantize_pt2e import (
    convert_pt2e,
    prepare_qat_pt2e,
)
from torchao.quantization.pt2e import move_exported_model_to_eval
from executorch.backends.xnnpack.quantizer.xnnpack_quantizer import (
    get_symmetric_quantization_config,
)

# Model / training hyperparameters for the minimal repro.
IN_NODES = 10                # number of input features to the linear layer
OUT_NODES = 10               # number of output features from the linear layer
USE_BIAS = True              # whether nn.Linear gets a bias term
BATCH_SIZE = 32              # batch size used for the random training data
NUM_EPOCHS = 10              # training steps (one random batch per "epoch")
INPUT_SHAPE = (1, IN_NODES)  # NOTE(review): defined but unused in this snippet

class Model(nn.Module):
    """Minimal single-layer model: one nn.Linear mapping in_nodes -> out_nodes."""

    def __init__(self, in_nodes, out_nodes, use_bias):
        # Python 3 zero-argument super() — equivalent to super(Model, self)
        # but the idiomatic modern form.
        super().__init__()
        self.linear = nn.Linear(in_nodes, out_nodes, bias=use_bias)

    def forward(self, x):
        """Apply the linear layer to x of shape (batch, in_nodes)."""
        return self.linear(x)

def train(model, in_nodes, out_nodes, batch_size, num_epochs, lr=0.01):
    """Train `model` on random data with Adam + MSE loss.

    Each "epoch" draws one fresh random batch of shape (batch_size, in_nodes)
    with random targets of shape (batch_size, out_nodes) and takes a single
    optimizer step. The learning rate was hard-coded; it is now a keyword
    parameter defaulting to the original value.

    Returns the same `model` object (trained in place).
    """
    optimizer = th.optim.Adam(model.parameters(), lr=lr)
    for _ in range(num_epochs):  # epoch index itself is unused
        x = th.randn(batch_size, in_nodes)
        y = th.randn(batch_size, out_nodes)
        optimizer.zero_grad()
        preds = model(x)
        loss = th.nn.functional.mse_loss(preds, y)
        loss.backward()
        optimizer.step()
    return model

# Export with a symbolic batch dimension so the program accepts variable
# batch sizes at inference time.
sample_inputs = (th.rand(BATCH_SIZE, IN_NODES),)
batch_dim = th.export.Dim("batch", min=1)

# Export the eager model and insert fake-quantization observers for QAT.
model = th.export.export(
    Model(IN_NODES, OUT_NODES, USE_BIAS),
    sample_inputs,
    dynamic_shapes=({0: batch_dim},),
).module()
quantizer = XNNPACKQuantizer().set_global(
    get_symmetric_quantization_config(is_qat=True)
)
model = prepare_qat_pt2e(model, quantizer)

# QAT loop: the observers learn the quantization parameters during training.
model = train(model, IN_NODES, OUT_NODES, BATCH_SIZE, NUM_EPOCHS)

# Swap the fake-quant nodes for real quantize/dequantize ops, then freeze.
quantized_model = convert_pt2e(model, fold_quantize=False)
quantized_model = move_exported_model_to_eval(quantized_model)

# Re-export with a fixed batch of 1, lower to XNNPACK, and serialize.
sample_inputs = (th.rand(1, IN_NODES),)
exported_program = th.export.export(quantized_model, sample_inputs)
et_program = to_edge_transform_and_lower(
    exported_program, partitioner=[XnnpackPartitioner()]
).to_executorch()
with open("./model.pte", "wb") as f:
    f.write(et_program.buffer)

# Question: where are the int8 weights / scales / zero-points?
print(f"Model weights are in float:\n {quantized_model.linear.weight}")

Python 3.12.12
Mac M4

executorch                1.0.1
pytorch-tokenizers        1.1.0
torch                     2.9.1
torch-xla2                0.0.1.dev202412041639
torchao                   0.14.0

Any help would be very appreciated, thank you.