C++ extension is slower than Python

Hi everyone, I wrote a C++ extension to accelerate a custom autograd function, but I find that the C++ version is slower than the Python one. Here is the code:
test.cpp:

#include <iostream>
#include <torch/extension.h>
#include <math.h>
using namespace torch::autograd;

/// Element-wise Heaviside step: 1 where x >= 0, 0 elsewhere.
/// The comparison mask is converted back with `to(x)` so the result is
/// expressed in terms of x rather than left as a boolean tensor.
torch::Tensor heaviside_step(const torch::Tensor & x)
{
    const auto non_negative = x.ge(0);  // boolean mask of x >= 0
    return non_negative.to(x);          // bool -> same type as x
}
/// Custom autograd function whose forward pass is a Heaviside step on x and
/// whose backward pass uses the derivative of sigmoid(alpha * x) as the
/// gradient with respect to x. No gradient is produced for alpha.
class sigmoid_atf: public Function<sigmoid_atf>
{
    public:
  /// Forward pass: returns heaviside_step(x).
  /// x and alpha are stored for backward only when x requires grad.
  static torch::Tensor forward(AutogradContext *ctx, const torch::Tensor & x, const torch::Tensor & alpha)
  {
      if (x.requires_grad())
      {
        ctx->save_for_backward({x, alpha});
      }
      return heaviside_step(x);
  }
  /// Backward pass: returns {grad_x, undefined}, one entry per forward input.
  /// grad_x = alpha * sigmoid'(alpha * x) * grad_outputs[0].
  static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs)
  {
      const auto saved = ctx->get_saved_variables();
      if (saved.size() < 2)
      {
          // forward() saved nothing because x did not require grad (backward
          // can still run when only alpha does). Indexing the empty list
          // would be out of bounds, so report "no gradient" for both inputs.
          return {torch::Tensor(), torch::Tensor()};
      }
      const auto & x = saved[0];
      const auto & alpha = saved[1];
      auto grad_x = alpha * torch::sigmoid_backward(grad_outputs[0], torch::sigmoid(x * alpha));  // main contribution to acceleration
      return {grad_x, torch::Tensor()};
  }
};

/// Plain-function entry point that runs the sigmoid_atf autograd function
/// on (x, alpha) and returns its output tensor.
torch::Tensor sigmoid_apply(const torch::Tensor & x, const torch::Tensor & alpha)
{
    torch::Tensor result = sigmoid_atf::apply(x, alpha);
    return result;
}

// Python bindings for the extension module. The argument names are declared
// so callers may pass x / alpha by keyword, and a docstring is attached so
// help() on the bound function is informative. Positional calls still work.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("sigmoid_apply", &sigmoid_apply,
          "Heaviside step of x in forward; gradient of sigmoid(alpha * x) w.r.t. x in backward.",
          pybind11::arg("x"), pybind11::arg("alpha"));
}

test.py:

from torch.utils import cpp_extension
import torch
import time
import numpy as np
def heaviside_step(x: torch.Tensor):
    """Return 1 where ``x >= 0`` and 0 elsewhere, converted back to match ``x``."""
    non_negative = x >= 0  # boolean mask
    return non_negative.to(x)

class sigmoid(torch.autograd.Function):
    """Heaviside step in forward; derivative of ``sigmoid(alpha * x)`` in backward.

    Only ``x`` receives a gradient; the gradient slot for ``alpha`` is ``None``.
    """

    @staticmethod
    def forward(ctx, x, alpha):
        # Inputs are kept for backward only when x takes part in autograd.
        if x.requires_grad:
            ctx.save_for_backward(x, alpha)
        return heaviside_step(x)

    @staticmethod
    def backward(ctx, grad_output):
        # Nothing to compute when no gradient is requested for x.
        if not ctx.needs_input_grad[0]:
            return None, None

        saved_x, saved_alpha = ctx.saved_tensors
        # sigmoid(alpha * x), computed in place on the temporary product.
        sgax = (saved_x * saved_alpha).sigmoid_()
        grad_x = grad_output * (1 - sgax) * sgax * saved_alpha
        return grad_x, None
    
def cal_fun_t(n, f, *args, **kwargs):
    """Return the average time in seconds of one ``f(*args, **kwargs)`` call.

    ``f`` is called once as a warm-up (not timed), then ``n`` more times
    between two ``torch.cuda.synchronize()`` calls; the elapsed time is
    divided by ``n``.

    Args:
        n: number of timed calls (division by ``n``, so it must be non-zero).
        f: callable to benchmark.
        *args, **kwargs: forwarded to ``f`` unchanged on every call.

    Returns:
        Mean elapsed seconds per call as a float.
    """
    # warm up
    f(*args, **kwargs)
    torch.cuda.synchronize()
    # perf_counter is monotonic and high-resolution; time.time() can jump
    # with system clock adjustments and is coarse for short intervals.
    t_start = time.perf_counter()
    for _ in range(n):
        f(*args, **kwargs)
    # Synchronize before reading the clock so the end timestamp is taken
    # after the synchronize call returns.
    torch.cuda.synchronize()
    return (time.perf_counter() - t_start) / n

# Compile and load the C++ extension from ./test.cpp.
cext = cpp_extension.load(
    name='test',
    sources=['./test.cpp'],
    verbose=True,
)

device = 'cuda:0'
x = torch.rand([1024], device=device)
alpha = torch.rand([1], device=device)

# Only the call itself is timed, with autograd recording disabled;
# backward is never invoked in this benchmark.
with torch.no_grad():
    t1, t2 = (
        cal_fun_t(100, fn, x, alpha)
        for fn in (sigmoid.apply, cext.sigmoid_apply)
    )
    print(f'python:{t1}, c++:{t2}')