Segmentation fault: calling a native CPU operator from inside a custom CPU operator

Hi,
I registered a custom CPU operator, `sum_op_forward`, for `aten::sum.dim_IntList`. When `input.dim() != 4`, I want to fall back to the native CPU operator `at::sum`. However, if I call `at::sum` directly inside `sum_op_forward`, the dispatcher routes the call back to `sum_op_forward` itself, so the program eventually crashes from the infinite recursion. How can I call the native CPU implementation of `at::sum`?

Could anyone give me some help? Thanks.

PyTorch version: 2.2

The code is as follows:

#include <ATen/ATen.h>
#include <ATen/NativeFunctions.h>
#include <ATen/ops/sum_cpu_dispatch.h>
#include <torch/all.h>
#include <torch/csrc/autograd/custom_function.h>
#include "torch/library.h"

namespace torch_ex {
namespace cpu {

// Custom autograd Function wrapping the custom sum kernel: forward() runs
// the custom implementation, backward() produces the gradient w.r.t. `input`.
// NOTE(review): `out` and `grad_input` are placeholders — the real bodies are
// elided in this snippet, so it does not compile as posted.
class ExSumOp : public torch::autograd::Function<ExSumOp> {
 public:
  // Forward pass. `ctx` may be used to stash tensors/sizes needed by
  // backward(); `dim`, `keepdim`, `dtype` mirror aten::sum.dim_IntList.
  static at::Tensor forward(
    torch::autograd::AutogradContext* ctx,
    const at::Tensor& input,
    c10::OptionalIntArrayRef dim,
    bool keepdim,
    c10::optional<c10::ScalarType> dtype){

      // some custom forward implementations
      // ....
      return out;
    }

  // Backward pass: receives the gradient of the forward output and returns
  // one entry per forward input. The three trailing undefined tensors stand
  // in for the non-differentiable arguments (dim, keepdim, dtype).
  static torch::autograd::variable_list backward(
      torch::autograd::AutogradContext* ctx,
      torch::autograd::variable_list grad_outputs){

      // some custom backward implementation 
      // ......
      return {grad_input, at::Tensor(), at::Tensor(), at::Tensor()};    
    }
};

// Autograd-aware entry point for the custom sum: funnels the call through
// ExSumOp::apply so the backward pass is recorded on the autograd tape.
at::Tensor sum_op(
    const at::Tensor& input,
    c10::OptionalIntArrayRef dim,
    bool keepdim,
    c10::optional<c10::ScalarType> dtype){

    // Delegate to the autograd Function and hand its result straight back.
    auto result = ExSumOp::apply(input, dim, keepdim, dtype);
    return result;
}

// Kernel registered for aten::sum.dim_IntList (CPU / AutogradCPU).
// 4-D inputs take the custom path; everything else falls back to the stock
// ATen CPU implementation.
//
// Why not `at::sum(...)` for the fallback? That call goes back through the
// dispatcher, and since m.impl REPLACED the upstream CPU/AutogradCPU kernels
// for this schema, the dispatcher routes aten::sum.dim_IntList right back to
// this very function — infinite recursion, stack overflow, segfault.
// `at::cpu::sum(...)` (from <ATen/ops/sum_cpu_dispatch.h>) instead calls the
// code-generated native CPU kernel directly, bypassing the dispatcher, so
// the original implementation runs.
// NOTE(review): bypassing the dispatcher also skips autograd recording on
// this fallback path — confirm that is acceptable for non-4-D inputs.
at::Tensor sum_op_forward(
    const at::Tensor& input,
    c10::OptionalIntArrayRef dim,
    bool keepdim,
    c10::optional<c10::ScalarType> dtype){

  if (input.dim() == 4){
    // Custom operator path (autograd-aware via ExSumOp).
    return sum_op(input, dim, keepdim, dtype);
  }
  // Fallback: native CPU kernel, invoked without re-entering the dispatcher.
  // (The original code had no return statement here — undefined behavior.)
  return at::cpu::sum(input, dim, keepdim, dtype);
}

// Override the kernels for aten::sum.dim_IntList. m.impl REPLACES the
// upstream kernel at each listed dispatch key — after this registration, a
// plain at::sum(...) on a CPU tensor dispatches to sum_op_forward, not to
// the stock ATen kernel.
// NOTE(review): the same function is registered for both CPU and
// AutogradCPU, so sum_op_forward is hit from the autograd layer as well —
// confirm this double registration is intentional.
TORCH_LIBRARY_FRAGMENT(aten, m) {
  m.impl(
      TORCH_SELECTIVE_NAME("aten::sum.dim_IntList"),
      c10::DispatchKey::CPU,
      torch_ex::cpu::sum_op_forward);

  m.impl(
      TORCH_SELECTIVE_NAME("aten::sum.dim_IntList"),
      c10::DispatchKey::AutogradCPU,
      torch_ex::cpu::sum_op_forward);
}

} // namespace cpu
} // namespace torch_ex