I recently read a post and attempted to execute the sample code provided, I changed a little bit, below. However, I encountered an AssertionError
.
The error message I received stated:
ERROR: torch_tensorrt [TensorRT Conversion Context]: IBuilder::buildSerializedNetwork: Error Code 4: Internal Error (Calibration failure occurred with no scaling factors detected. This could be due to no int8 calibrator or insufficient custom scales for network layers. Please see int8 sample to setup calibration correctly.).
However, it appears that the calibration was performed by using:
mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop)
Could someone help me understand the reason for this error? Additionally, I would appreciate any guidance on how to resolve this issue.
Thank you in advance for your assistance!
Codes:
import modelopt.torch.quantization as mtq
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_tensorrt as torchtrt
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from modelopt.torch.quantization.utils import export_torch_mode
import torchvision.models as models
class VGG(nn.Module):
def __init__(self, layer_spec, num_classes=1000, init_weights=False):
super(VGG, self).__init__()
layers = []
in_channels = 3
for l in layer_spec:
if l == "pool":
layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
else:
layers += [
nn.Conv2d(in_channels, l, kernel_size=3, padding=1),
nn.BatchNorm2d(l),
nn.ReLU(),
]
in_channels = l
self.features = nn.Sequential(*layers)
self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
self.classifier = nn.Sequential(
nn.Linear(512 * 1 * 1, 4096),
nn.ReLU(),
nn.Dropout(),
nn.Linear(4096, 4096),
nn.ReLU(),
nn.Dropout(),
nn.Linear(4096, num_classes),
)
if init_weights:
self._initialize_weights()
def _initialize_weights(self):
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
if m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.BatchNorm2d):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.Linear):
nn.init.normal_(m.weight, 0, 0.01)
nn.init.constant_(m.bias, 0)
def forward(self, x):
x = self.features(x)
x = self.avgpool(x)
x = torch.flatten(x, 1)
x = self.classifier(x)
return x
def vgg16(num_classes=1000, init_weights=False):
vgg16_cfg = [
64,
64,
"pool",
128,
128,
"pool",
256,
256,
256,
"pool",
512,
512,
512,
"pool",
512,
512,
512,
"pool",
]
return VGG(vgg16_cfg, num_classes, init_weights)
model = models.vgg16(pretrained=True)
model.classifier[6] = nn.Linear(4096, 10) # Change output layer from 1000 to 10 classes
model = model.cuda()
batch_size = 128
training_dataset = datasets.CIFAR10(
root="./data",
train=True,
download=True,
transform=transforms.Compose(
[
transforms.RandomCrop(32, padding=4),
transforms.RandomHorizontalFlip(),
transforms.ToTensor(),
transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
]
),
)
training_dataloader = torch.utils.data.DataLoader(
training_dataset,
batch_size=batch_size,
shuffle=True,
num_workers=2,
drop_last=True,
)
data = iter(training_dataloader)
images, _ = next(data)
crit = nn.CrossEntropyLoss()
def calibrate_loop(model):
# calibrate over the training dataset
total = 0
correct = 0
loss = 0.0
for data, labels in training_dataloader:
data, labels = data.cuda(), labels.cuda(non_blocking=True)
out = model(data)
loss += crit(out, labels)
preds = torch.max(out, 1)[1]
total += labels.size(0)
correct += (preds == labels).sum().item()
print("PTQ Loss: {:.5f} Acc: {:.2f}%".format(loss / total, 100 * correct / total))
quantize_type = "int8"
if quantize_type == "int8":
quant_cfg = mtq.INT8_DEFAULT_CFG
elif quantize_type == "fp8":
quant_cfg = mtq.FP8_DEFAULT_CFG
# PTQ with in-place replacement to quantized modules
mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop)
# model has FP8 qdq nodes at this point
# Load the testing dataset
testing_dataset = datasets.CIFAR10(
root="./data",
train=False,
download=True,
transform=transforms.Compose(
[
transforms.ToTensor(),
transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
]
),
)
testing_dataloader = torch.utils.data.DataLoader(
testing_dataset,
batch_size=batch_size,
shuffle=False,
num_workers=2,
drop_last=True,
) # set drop_last=True to drop the last incomplete batch for static shape `torchtrt.dynamo.compile()`
with torch.no_grad():
with export_torch_mode():
# Compile the model with Torch-TensorRT Dynamo backend
input_tensor = images.cuda()
# torch.export.export() failed due to RuntimeError: Attempting to use FunctionalTensor on its own. Instead, please use it with a corresponding FunctionalTensorMode()
from torch.export._trace import _export
exp_program = _export(model, (input_tensor,))
if quantize_type == "int8":
enabled_precisions = {torch.int8}
elif quantize_type == "fp8":
enabled_precisions = {torch.float8_e4m3fn}
trt_model = torchtrt.dynamo.compile(
exp_program,
inputs=[input_tensor],
enabled_precisions=enabled_precisions,
min_block_size=1,
debug=False,
)
# You can also use torch compile path to compile the model with Torch-TensorRT:
# trt_model = torch.compile(model, backend="tensorrt")
# Inference compiled Torch-TensorRT model over the testing dataset
total = 0
correct = 0
loss = 0.0
class_probs = []
class_preds = []
for data, labels in testing_dataloader:
data, labels = data.cuda(), labels.cuda(non_blocking=True)
out = trt_model(data)
loss += crit(out, labels)
preds = torch.max(out, 1)[1]
class_probs.append([F.softmax(i, dim=0) for i in out])
class_preds.append(preds)
total += labels.size(0)
correct += (preds == labels).sum().item()
test_probs = torch.cat([torch.stack(batch) for batch in class_probs])
test_preds = torch.cat(class_preds)
test_loss = loss / total
test_acc = correct / total
print("Test Loss: {:.5f} Test Acc: {:.2f}%".format(test_loss, 100 * test_acc))
Output 1:
Inserted 60 quantizers
PTQ Loss: 0.07708 Acc: 9.56%
VGG(
(features): Sequential(
(0): QuantConv2d(
3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)
(input_quantizer): TensorQuantizer(8 bit fake per-tensor amax=2.7537 calibrator=MaxCalibrator quant)
(output_quantizer): TensorQuantizer(disabled)
(weight_quantizer): TensorQuantizer(8 bit fake axis=0 amax=[0.1174, 1.2726](64) calibrator=MaxCalibrator quant)
)
(1): ReLU(inplace=True)
(2): QuantConv2d(
64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)
(input_quantizer): TensorQuantizer(8 bit fake per-tensor amax=24.3053 calibrator=MaxCalibrator quant)
(output_quantizer): TensorQuantizer(disabled)
(weight_quantizer): TensorQuantizer(8 bit fake axis=0 amax=[0.0975, 0.5394](64) calibrator=MaxCalibrator quant)
)
(3): ReLU(inplace=True)
(4): QuantMaxPool2d(
kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False
(input_quantizer): TensorQuantizer(8 bit fake per-tensor amax=54.2164 calibrator=MaxCalibrator quant)
(output_quantizer): TensorQuantizer(disabled)
)
(5): QuantConv2d(
64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)
(input_quantizer): TensorQuantizer(8 bit fake per-tensor amax=54.2164 calibrator=MaxCalibrator quant)
(output_quantizer): TensorQuantizer(disabled)
(weight_quantizer): TensorQuantizer(8 bit fake axis=0 amax=[0.0775, 0.5991](128) calibrator=MaxCalibrator quant)
)
(6): ReLU(inplace=True)
(7): QuantConv2d(
128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)
(input_quantizer): TensorQuantizer(8 bit fake per-tensor amax=71.7766 calibrator=MaxCalibrator quant)
(output_quantizer): TensorQuantizer(disabled)
(weight_quantizer): TensorQuantizer(8 bit fake axis=0 amax=[0.1018, 0.3622](128) calibrator=MaxCalibrator quant)
)
(8): ReLU(inplace=True)
(9): QuantMaxPool2d(
kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False
(input_quantizer): TensorQuantizer(8 bit fake per-tensor amax=195.4991 calibrator=MaxCalibrator quant)
(output_quantizer): TensorQuantizer(disabled)
)
(10): QuantConv2d(
128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)
(input_quantizer): TensorQuantizer(8 bit fake per-tensor amax=195.4991 calibrator=MaxCalibrator quant)
(output_quantizer): TensorQuantizer(disabled)
(weight_quantizer): TensorQuantizer(8 bit fake axis=0 amax=[0.0749, 0.5836](256) calibrator=MaxCalibrator quant)
)
(11): ReLU(inplace=True)
(12): QuantConv2d(
256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)
(input_quantizer): TensorQuantizer(8 bit fake per-tensor amax=177.7932 calibrator=MaxCalibrator quant)
(output_quantizer): TensorQuantizer(disabled)
(weight_quantizer): TensorQuantizer(8 bit fake axis=0 amax=[0.0684, 0.3851](256) calibrator=MaxCalibrator quant)
)
(13): ReLU(inplace=True)
(14): QuantConv2d(
256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)
(input_quantizer): TensorQuantizer(8 bit fake per-tensor amax=195.0479 calibrator=MaxCalibrator quant)
(output_quantizer): TensorQuantizer(disabled)
(weight_quantizer): TensorQuantizer(8 bit fake axis=0 amax=[0.0760, 0.6032](256) calibrator=MaxCalibrator quant)
)
(15): ReLU(inplace=True)
(16): QuantMaxPool2d(
kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False
(input_quantizer): TensorQuantizer(8 bit fake per-tensor amax=226.3649 calibrator=MaxCalibrator quant)
(output_quantizer): TensorQuantizer(disabled)
)
(17): QuantConv2d(
256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)
(input_quantizer): TensorQuantizer(8 bit fake per-tensor amax=226.3649 calibrator=MaxCalibrator quant)
(output_quantizer): TensorQuantizer(disabled)
(weight_quantizer): TensorQuantizer(8 bit fake axis=0 amax=[0.0598, 0.4622](512) calibrator=MaxCalibrator quant)
)
(18): ReLU(inplace=True)
(19): QuantConv2d(
512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)
(input_quantizer): TensorQuantizer(8 bit fake per-tensor amax=244.4577 calibrator=MaxCalibrator quant)
(output_quantizer): TensorQuantizer(disabled)
(weight_quantizer): TensorQuantizer(8 bit fake axis=0 amax=[0.0524, 0.2732](512) calibrator=MaxCalibrator quant)
)
(20): ReLU(inplace=True)
(21): QuantConv2d(
512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)
(input_quantizer): TensorQuantizer(8 bit fake per-tensor amax=179.2412 calibrator=MaxCalibrator quant)
(output_quantizer): TensorQuantizer(disabled)
(weight_quantizer): TensorQuantizer(8 bit fake axis=0 amax=[0.0538, 0.2263](512) calibrator=MaxCalibrator quant)
)
(22): ReLU(inplace=True)
(23): QuantMaxPool2d(
kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False
(input_quantizer): TensorQuantizer(8 bit fake per-tensor amax=143.1362 calibrator=MaxCalibrator quant)
(output_quantizer): TensorQuantizer(disabled)
)
(24): QuantConv2d(
512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)
(input_quantizer): TensorQuantizer(8 bit fake per-tensor amax=143.1362 calibrator=MaxCalibrator quant)
(output_quantizer): TensorQuantizer(disabled)
(weight_quantizer): TensorQuantizer(8 bit fake axis=0 amax=[0.0568, 0.3162](512) calibrator=MaxCalibrator quant)
)
(25): ReLU(inplace=True)
(26): QuantConv2d(
512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)
(input_quantizer): TensorQuantizer(8 bit fake per-tensor amax=96.6732 calibrator=MaxCalibrator quant)
(output_quantizer): TensorQuantizer(disabled)
(weight_quantizer): TensorQuantizer(8 bit fake axis=0 amax=[0.0512, 0.2501](512) calibrator=MaxCalibrator quant)
)
(27): ReLU(inplace=True)
(28): QuantConv2d(
512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)
(input_quantizer): TensorQuantizer(8 bit fake per-tensor amax=75.1997 calibrator=MaxCalibrator quant)
(output_quantizer): TensorQuantizer(disabled)
(weight_quantizer): TensorQuantizer(8 bit fake axis=0 amax=[0.0270, 0.1323](512) calibrator=MaxCalibrator quant)
)
(29): ReLU(inplace=True)
(30): QuantMaxPool2d(
kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False
(input_quantizer): TensorQuantizer(8 bit fake per-tensor amax=54.2732 calibrator=MaxCalibrator quant)
(output_quantizer): TensorQuantizer(disabled)
)
)
(avgpool): QuantAdaptiveAvgPool2d(
output_size=(7, 7)
(input_quantizer): TensorQuantizer(8 bit fake per-tensor amax=54.2732 calibrator=MaxCalibrator quant)
(output_quantizer): TensorQuantizer(disabled)
)
(classifier): Sequential(
(0): QuantLinear(
in_features=25088, out_features=4096, bias=True
(input_quantizer): TensorQuantizer(8 bit fake per-tensor amax=54.2732 calibrator=MaxCalibrator quant)
(output_quantizer): TensorQuantizer(disabled)
(weight_quantizer): TensorQuantizer(8 bit fake axis=0 amax=[0.0072, 0.0617](4096) calibrator=MaxCalibrator quant)
)
(1): ReLU(inplace=True)
(2): Dropout(p=0.5, inplace=False)
(3): QuantLinear(
in_features=4096, out_features=4096, bias=True
(input_quantizer): TensorQuantizer(8 bit fake per-tensor amax=116.5422 calibrator=MaxCalibrator quant)
(output_quantizer): TensorQuantizer(disabled)
(weight_quantizer): TensorQuantizer(8 bit fake axis=0 amax=[0.0064, 0.0609](4096) calibrator=MaxCalibrator quant)
)
(4): ReLU(inplace=True)
(5): Dropout(p=0.5, inplace=False)
(6): QuantLinear(
in_features=4096, out_features=10, bias=True
(input_quantizer): TensorQuantizer(8 bit fake per-tensor amax=57.4124 calibrator=MaxCalibrator quant)
(output_quantizer): TensorQuantizer(disabled)
(weight_quantizer): TensorQuantizer(8 bit fake axis=0 amax=[0.0156, 0.0156](10) calibrator=MaxCalibrator quant)
)
)
)
Output 2:
Files already downloaded and verified
INFO:torch_tensorrt.dynamo._compiler:Compilation Settings: CompilationSettings(enabled_precisions={<dtype.i8: 3>}, debug=False, workspace_size=0, min_block_size=1, torch_executed_ops=set(), pass_through_build_failures=False, max_aux_streams=None, version_compatible=False, optimization_level=None, use_python_runtime=False, truncate_double=False, use_fast_partitioner=True, enable_experimental_decompositions=False, device=Device(type=DeviceType.GPU, gpu_id=0), require_full_compilation=False, disable_tf32=False, assume_dynamic_shape_support=False, sparse_weights=False, refit=False, engine_capability=<EngineCapability.STANDARD: 1>, num_avg_timing_iters=1, dla_sram_size=1048576, dla_local_dram_size=1073741824, dla_global_dram_size=536870912, dryrun=False, hardware_compatible=False, timing_cache_path='/tmp/timing_cache.bin')
INFO:torch_tensorrt.dynamo._compiler:Partitioning the graph via the fast partitioner
INFO:torch_tensorrt [TensorRT Conversion Context]:[MemUsageChange] Init CUDA: CPU +0, GPU +0, now: CPU 1790, GPU 3542 (MiB)
INFO:torch_tensorrt.dynamo.conversion._TRTInterpreter:TRT INetwork construction elapsed time: 0:00:00.002286
INFO:torch_tensorrt [TensorRT Conversion Context]:BuilderFlag::kTF32 is set but hardware does not support TF32. Disabling TF32.
WARNING:torch_tensorrt [TensorRT Conversion Context]:Calibrator is not being used. Users must provide dynamic range for all tensors that are not Int32 or Bool.
ERROR:torch_tensorrt [TensorRT Conversion Context]:IBuilder::buildSerializedNetwork: Error Code 4: Internal Error (Calibration failure occurred with no scaling factors detected. This could be due to no int8 calibrator or insufficient custom scales for network layers. Please see int8 sample to setup calibration correctly.)
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
Cell In[15], line 34
32 elif quantize_type == "fp8":
33 enabled_precisions = {torch.float8_e4m3fn}
---> 34 trt_model = torchtrt.dynamo.compile(
35 exp_program,
36 inputs=[input_tensor],
37 enabled_precisions=enabled_precisions,
38 min_block_size=1,
39 debug=False,
40 )
41 # You can also use torch compile path to compile the model with Torch-TensorRT:
42 # trt_model = torch.compile(model, backend="tensorrt")
43
44 # Inference compiled Torch-TensorRT model over the testing dataset
45 total = 0
File /usr/local/lib/python3.8/dist-packages/torch_tensorrt/dynamo/_compiler.py:230, in compile(exported_program, inputs, device, disable_tf32, assume_dynamic_shape_support, sparse_weights, enabled_precisions, engine_capability, refit, debug, num_avg_timing_iters, workspace_size, dla_sram_size, dla_local_dram_size, dla_global_dram_size, truncate_double, require_full_compilation, min_block_size, torch_executed_ops, torch_executed_modules, pass_through_build_failures, max_aux_streams, version_compatible, optimization_level, use_python_runtime, use_fast_partitioner, enable_experimental_decompositions, dryrun, hardware_compatible, timing_cache_path, **kwargs)
228 settings = CompilationSettings(**compilation_options)
229 logger.info("Compilation Settings: %s\n", settings)
--> 230 trt_gm = compile_module(gm, inputs, settings)
231 return trt_gm
File /usr/local/lib/python3.8/dist-packages/torch_tensorrt/dynamo/_compiler.py:418, in compile_module(gm, sample_inputs, settings)
416 # Create TRT engines from submodule
417 if not settings.dryrun:
--> 418 trt_module = convert_module(
419 submodule,
420 submodule_inputs,
421 settings=settings,
422 name=name,
423 )
425 trt_modules[name] = trt_module
427 sample_outputs = gm(
428 *get_torch_inputs(sample_inputs, to_torch_device(settings.device))
429 )
File /usr/local/lib/python3.8/dist-packages/torch_tensorrt/dynamo/conversion/_conversion.py:106, in convert_module(module, inputs, settings, name)
91 def convert_module(
92 module: torch.fx.GraphModule,
93 inputs: Sequence[Input],
94 settings: CompilationSettings = CompilationSettings(),
95 name: str = "",
96 ) -> PythonTorchTensorRTModule | TorchTensorRTModule:
97 """Convert an FX module to a TRT module
98 Args:
99 module: FX GraphModule to convert
(...)
104 _PythonTorchTensorRTModule or TorchTensorRTModule
105 """
--> 106 interpreter_result = interpret_module_to_result(module, inputs, settings)
108 if settings.use_python_runtime or not ENABLED_FEATURES.torch_tensorrt_runtime:
109 if not settings.use_python_runtime:
File /usr/local/lib/python3.8/dist-packages/torch_tensorrt/dynamo/conversion/_conversion.py:87, in interpret_module_to_result(module, inputs, settings)
73 output_dtypes = infer_module_output_dtypes(
74 module,
75 inputs,
76 settings.device,
77 truncate_double=settings.truncate_double,
78 )
80 interpreter = TRTInterpreter(
81 module,
82 inputs,
(...)
85 compilation_settings=settings,
86 )
---> 87 interpreter_result = interpreter.run()
88 return interpreter_result
File /usr/local/lib/python3.8/dist-packages/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py:344, in TRTInterpreter.run(self, strict_type_constraints, algorithm_selector, tactic_sources)
337 self._create_timing_cache(
338 builder_config, self.compilation_settings.timing_cache_path
339 )
341 serialized_engine = self.builder.build_serialized_network(
342 self.ctx.net, builder_config
343 )
--> 344 assert serialized_engine
346 _LOGGER.info(
347 f"Build TRT engine elapsed time: {datetime.now() - build_engine_start_time}"
348 )
349 _LOGGER.info(f"TRT Engine uses: {serialized_engine.nbytes} bytes of Memory")
AssertionError:
Environment:
Python: 3.8
PyTorch: 2.4.1+cu121
Torch-TensorRT: 2.4.0+cu121