Need help debugging torch.compile issue

vivekvpandya · March 26, 2025, 9:38am

I have built an experimental torch.compile Inductor backend for a custom accelerator.
It's mostly based on the CPU codegen, but I have done some monkey patching, which will be removed later.
It works fine for tests, but if I try to use it together with the CPU Inductor backend in the same test, I see the following error:

    args = (CppCSEVariable(name: tmp0, bounds: VR[-oo, oo], is_vec: False, dtype: torch.float32, dependent_itervars: {x0}),)
kwargs = {}

    def inner(*args, **kwargs):
        bounds = CSEProxy._bound_variable(name, *args, **kwargs)

>       value = getattr(parent_handler, name)(*args, **kwargs)  # type: ignore[has-type]
E       torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised:
E       TypeError: CppOverrides.sigmoid() takes 1 positional argument but 2 were given
E
E       Set TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information
E
E
E       You can suppress this exception and fall back to eager by setting:
E           import torch._dynamo
E           torch._dynamo.config.suppress_errors = True

Could someone please guide me on how to debug this further?

Here is my test for reference:

@pytest.mark.parametrize("dtype", [torch.float, torch.float16])
def test_sigmoid_mean_mul(dtype):
    """Check that both backends agree on ``x * mean(sigmoid(x))``.

    The same function is compiled with the custom "my-inductor" backend
    (input on the "my" device) and with the stock "inductor" backend (input
    copied to CPU). The two outputs are compared on CPU with rtol and atol
    of 0.01.
    """

    def _mean_of_sigmoid(t):
        # Pointwise sigmoid followed by a mean over all elements.
        return torch.mean(torch.sigmoid(t))

    def _scale(t, factor):
        # Pointwise multiply of the input by the reduced value.
        return t * factor

    def fn(t):
        return _scale(t, _mean_of_sigmoid(t))

    sample = torch.randn((10, 10000), device="my", dtype=dtype)

    # The custom backend is compiled and run before the stock Inductor one.
    compiled_custom = torch.compile(fn, backend="my-inductor", fullgraph=True)
    custom_result = compiled_custom(sample)

    compiled_cpu = torch.compile(fn, backend="inductor", fullgraph=True)
    cpu_result = compiled_cpu(sample.cpu())

    # allclose scales rtol by its second argument, so the order matters:
    # stock Inductor result first, custom backend result second.
    assert torch.allclose(
        cpu_result.cpu(), custom_result.cpu(), rtol=0.01, atol=0.01
    )