RuntimeError: quantized::conv2d_prepack() is missing value for argument 'stride'

I am trying to use torch.ao.quantization to quantize a model, but it raises the error below. If I remove the quantization lines, the script works.

Traceback (most recent call last):
  File "/dev/shm/test/compress.py", line 99, in <module>
    x_hat = compress_decompress(model, x)
  File "/dev/shm/test/compress.py", line 73, in compress_decompress
    compressed = model(x_padded)
  File "/usr/lib/python3.13/site-packages/torch/fx/graph_module.py", line 830, in call_wrapped
    return self._wrapped_call(self, *args, **kwargs)
           ~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.13/site-packages/torch/fx/graph_module.py", line 406, in __call__
    raise e
  File "/usr/lib/python3.13/site-packages/torch/fx/graph_module.py", line 393, in __call__
    return super(self.cls, obj).__call__(*args, **kwargs)  # type: ignore[misc]
           ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
    return forward_call(*args, **kwargs)
  File "<eval_with_key>.7", line 34, in forward
    conv2d_prepack = torch.ops.quantized.conv2d_prepack(quantize_per_channel, sub);  quantize_per_channel = sub = None
  File "/usr/lib/python3.13/site-packages/torch/_ops.py", line 1158, in __call__
    return self._op(*args, **(kwargs or {}))
           ~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: quantized::conv2d_prepack() is missing value for argument 'stride'. Declaration: quantized::conv2d_prepack(Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups) -> __torch__.torch.classes.quantized.Conv2dPackedParamsBase

Looking at the traceback, the generated graph calls conv2d_prepack with only the weight and bias (quantize_per_channel, sub), so stride, padding, dilation, and groups are never supplied.

Another way to make it work is to comment out _lower_static_weighted_ref_functional in the PyTorch source. With that change the printed time is 0.6259967585404714; if I instead remove the quantization lines entirely, the printed time is 1.8434200982252757, so the quantized model is roughly 3x faster.
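
For reference, a sketch of a workaround that avoids editing the PyTorch source: stop at the reference-quantized model with convert_to_reference_fx instead of convert_fx (using the same names as in the script below). The reference model keeps dequantize -> float op -> quantize patterns and never runs the native-backend lowering, so _lower_static_weighted_ref_functional is skipped entirely; the trade-off is that it still computes in fp32:

# workaround sketch: skip backend lowering entirely
from torch.ao.quantization.quantize_fx import convert_to_reference_fx

model = prepare_fx(model, qconfig_mapping, example_inputs=input_batch)
model(input_batch)                       # calibration pass
model = convert_to_reference_fx(model)   # reference model, no conv2d_prepack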

from time import time
import os

import torch
import torch.nn.functional as F
import torchvision.transforms as transforms
from compressai.zoo import cheng2020_anchor
from PIL import Image
from torch.ao.quantization import get_default_qconfig_mapping
from torch.ao.quantization.quantize_fx import convert_fx, prepare_fx

qconfig_mapping = get_default_qconfig_mapping()
device = "cpu"

def pad(x, p):
    h, w = x.size(2), x.size(3)
    new_h = (h + p - 1) // p * p  # round h up to the next multiple of p
    new_w = (w + p - 1) // p * p  # round w up likewise
    padding_left = (new_w - w) // 2
    padding_right = new_w - w - padding_left
    padding_top = (new_h - h) // 2
    padding_bottom = new_h - h - padding_top
    x_padded = F.pad(
        x,
        (padding_left, padding_right, padding_top, padding_bottom),
        mode="constant",
        value=0,
    )
    return x_padded, (padding_left, padding_right, padding_top, padding_bottom)
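
# Example (hypothetical shapes): pad(torch.zeros(1, 3, 100, 150), 64) returns a
# (1, 3, 128, 192) tensor, since 128 and 192 are the next multiples of 64, plus
# the per-side (left, right, top, bottom) padding that was applied.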

def pad_to_multiple_of_64(image_tensor):
    _, _, h, w = image_tensor.shape
    pad_h = (64 - h % 64) % 64
    pad_w = (64 - w % 64) % 64
    if pad_h == 0 and pad_w == 0:
        return image_tensor, (h, w)

    padded = F.pad(image_tensor, (0, pad_w, 0, pad_h), mode="reflect")
    return padded, (h, w)


def unpad(image_tensor, original_size):
    h, w = original_size
    return image_tensor[:, :, :h, :w]


def preprocess_image(image_path):
    transform = transforms.Compose([
        transforms.ToTensor(),
    ])
    img = Image.open(image_path).convert("RGB")
    x = transform(img).unsqueeze(0)  # [1, 3, H, W]
    return x


def compress_decompress(model, x):
    # note: x is unused below; the loop loads its own images from disk

    total = 0.0
    img_list = []
    path = "/home/wzy/Pictures/Kodak-Lossless-True-Color-Image-Suite/PhotoCD_PCD0992"
    for file in os.listdir(path):
        if file.lower().endswith((".jpg", ".jpeg", ".png")):
            img_list.append(file)
    count = 0
    with torch.no_grad():
        for img_name in img_list:
            img_path = os.path.join(path, img_name)
            img = transforms.ToTensor()(Image.open(img_path).convert('RGB')).to(device)
            x = img.unsqueeze(0)
            x_padded, padding = pad(x, 64)
            count += 1
            t = time()
            compressed = model(x_padded)
            t = time() - t
            total += t
            # decompressed_padded = model.decompress(
            #     compressed["strings"], compressed["shape"]
            # )["x_hat"]
    total /= count
    print(f"time: {total}")

_model = cheng2020_anchor(quality=3, pretrained=True)

image_path = "/home/wzy/Pictures/Kodak-Lossless-True-Color-Image-Suite/PhotoCD_PCD0992/23.png"  # replace with your image path
x = preprocess_image(image_path).to(next(_model.parameters()).device)

input_batch, original_size = pad_to_multiple_of_64(x)

model = _model
model.forward = model.compress
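# (the assignment above points forward() at compress(), so prepare_fx traces
# the compression path; in compressai, compress() also runs the entropy coder)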

# removing these three lines makes it work
model = prepare_fx(model, qconfig_mapping, example_inputs=input_batch)
model(input_batch)
model = convert_fx(model)
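# convert_fx builds a reference-quantized model and then lowers it for the
# native backend; that lowering (_lower_static_weighted_ref_functional) is what
# emits the quantized::conv2d_prepack call that fails above with the missing
# 'stride' argument.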

model.eval().to("cpu")

compress_decompress(model, x)  # prints the average per-image compress time

Is this a bug, or am I doing something wrong? TIA!

OS: Linux 6.15.2
Python: 3.13.3
PyTorch: 2.7.0