# Instantiate the model and prepare it for static quantization
quantized_model = GACNFuseNet()
quantized_model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
quantized_model_prepared = torch.quantization.prepare(quantized_model, inplace=True)
quantized_model_int8 = torch.quantization.convert(quantized_model_prepared, inplace=True)
quantized_model_int8.eval()
quantized_model_int8.load_state_dict(torch.load('gacn_quant.pth'))
quantized_model_int8.to('cuda')
# Move the unquantized FP32 model to the GPU for comparison
model.to('cuda')
# The two input images
img1_tensor = img1_tensor.to("cuda")
img2_tensor = img2_tensor.to("cuda")
t1 = time.time()
# Inference with the quantized INT8 model
quantized_model_int8.eval()
with torch.no_grad():
    output = quantized_model_int8(img1_tensor, img2_tensor)
t2 = time.time()
model.eval()
with torch.no_grad():
    output_2 = model(img1_tensor, img2_tensor)
t3 = time.time()
print(f'Done. ({(1E3 * (t3 - t2)):.1f}ms) FP32 inference.')
print(f'Done. ({(1E3 * (t2 - t1)):.1f}ms) INT8 quantized inference.')
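Side note on the timing itself: CUDA kernel launches are asynchronous, so reading time.time() right after a forward pass on the GPU can misreport the latency. A minimal synchronized-timing sketch, assuming the model and inputs are already on the GPU (the helper name time_forward_ms is made up for illustration):

import time
import torch

def time_forward_ms(m, a, b, n_warmup=3, n_runs=10):
    # Hypothetical helper: average forward latency in milliseconds,
    # with explicit CUDA synchronization around the timed region.
    with torch.no_grad():
        for _ in range(n_warmup):
            m(a, b)
        torch.cuda.synchronize()
        start = time.time()
        for _ in range(n_runs):
            m(a, b)
        torch.cuda.synchronize()
    return 1e3 * (time.time() - start) / n_runs

print(f'FP32: {time_forward_ms(model, img1_tensor, img2_tensor):.1f}ms')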
How can I run inference with this model on the GPU? I have the quantized model 'gacn_quant.pth' and I move both the model and the inputs to CUDA, but the script crashes after 'img1_tensor.to("cuda")':
/home/ubuntu/anaconda3/envs/zzy-quant/bin/python3.8 /home/ubuntu/Data1/zzy/GACN/gacn_quant.py
/home/ubuntu/anaconda3/envs/zzy-quant/lib/python3.8/site-packages/torch/ao/quantization/observer.py:220: UserWarning: Please use quant_min and quant_max to specify the range for observers. reduce_range will be deprecated in a future release of PyTorch.
warnings.warn(
/home/ubuntu/anaconda3/envs/zzy-quant/lib/python3.8/site-packages/torch/ao/quantization/observer.py:1272: UserWarning: must run observer before calling calculate_qparams. Returning default scale and zero point
warnings.warn(
/home/ubuntu/anaconda3/envs/zzy-quant/lib/python3.8/site-packages/torch/_utils.py:403: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()
device=storage.device,
img1.shape: (520, 520, 3)
img1.shape: torch.Size([1, 1, 520, 520])
Process finished with exit code 139
Segmentation fault (core dumped)
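For context on the crash: eager-mode static quantization with the 'fbgemm' backend produces INT8 operators that only have CPU kernels, so the converted model and its inputs are normally kept on the CPU; moving them to CUDA is not a supported path. A minimal CPU-only sketch, reusing the names from the snippet above and assuming the same two-input forward signature:

import torch

torch.backends.quantized.engine = 'fbgemm'  # x86 CPU backend

quantized_model = GACNFuseNet()
quantized_model.eval()
quantized_model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
prepared = torch.quantization.prepare(quantized_model)
quantized_model_int8 = torch.quantization.convert(prepared)
quantized_model_int8.load_state_dict(torch.load('gacn_quant.pth'))

# Keep the model and inputs on the CPU: the quantized ops have no CUDA kernels here.
with torch.no_grad():
    output = quantized_model_int8(img1_tensor.cpu(), img2_tensor.cpu())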
I also tried converting the model to ONNX and quantizing it, but it does not run correctly on CUDA: the inference time is the same as on the CPU, and also the same as the unquantized model. Why?
onnx_model_path = "mymodel.quant_static.onnx"
# onnx_model_path = "mymodel.onnx"
DEVICE_NAME = 'cuda' if torch.cuda.is_available() else 'cpu'
print(DEVICE_NAME)
DEVICE_INDEX = 0
DEVICE = f'{DEVICE_NAME}:{DEVICE_INDEX}'
# Get the model's input and output names
session = onnxruntime.InferenceSession(
    onnx_model_path,
    providers=['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider'])
# session = create_session(onnx_model_path, "cuda")
input_names = [input.name for input in session.get_inputs()]
output_names = [output.name for output in session.get_outputs()]
print("Model input names:", input_names)
print("Model output names:", output_names)
# Prepare the input dict; the keys must match the model's input names
inputs = {
    input_names[0]: img1_tensor.numpy(),
    input_names[1]: img2_tensor.numpy()
}
t1 = time.time()
print(session.run(None, inputs))
t3 = time.time()
print(f'Done. ({(1E3 * (t3 - t1)):.1f}ms) Inference.')
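On the ONNX side: ONNX Runtime silently assigns any node a requested provider cannot handle to the next provider in the list, and the QLinear/QDQ operators produced by static quantization are, in many builds, only implemented in the CPU execution provider, which would explain GPU timings that match the CPU. A quick way to check what the session actually uses, a sketch assuming the same onnx_model_path as above:

# Providers actually registered for this session (in priority order).
print(session.get_providers())

# A verbose session log should show which execution provider each node was assigned to.
opts = onnxruntime.SessionOptions()
opts.log_severity_level = 0  # 0 = verbose
session_dbg = onnxruntime.InferenceSession(
    onnx_model_path,
    sess_options=opts,
    providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])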