Hi everyone, I wrote a C++ extension to accelerate my code, but I find that the C++ version is slower than the Python one. Here is the code:
test.cpp:
#include <iostream>
#include <torch/extension.h>
#include <math.h>
using namespace torch::autograd;
/// Element-wise Heaviside step: 1 where x >= 0 and 0 elsewhere.
///
/// @param x Input tensor.
/// @return  Tensor converted via `to(x)`, i.e. matching x's dtype and device.
torch::Tensor heaviside_step(const torch::Tensor &x)
{
    // Comparison yields a boolean mask ...
    const torch::Tensor non_negative = x.ge(0);
    // ... which is then converted back to the type of x (bool -> float).
    return non_negative.to(x);
}
/// Custom autograd function: Heaviside step in the forward pass and a
/// sigmoid-derivative surrogate gradient in the backward pass.
///
/// forward(x, alpha) returns heaviside_step(x).
/// backward(grad)    returns, for x, alpha * grad * s * (1 - s) with
///                   s = sigmoid(x * alpha); no gradient is returned for alpha.
class sigmoid_atf : public Function<sigmoid_atf>
{
public:
    /// @param ctx   Autograd context used to stash tensors for backward().
    /// @param x     Input tensor.
    /// @param alpha Scale applied to x inside the surrogate sigmoid.
    /// @return      heaviside_step(x).
    static torch::Tensor forward(AutogradContext *ctx, const torch::Tensor &x, const torch::Tensor &alpha)
    {
        // The saved tensors are only needed to compute the gradient w.r.t. x.
        if (x.requires_grad())
        {
            ctx->save_for_backward({x, alpha});
        }
        return heaviside_step(x);
    }

    /// @param ctx          Autograd context filled in by forward().
    /// @param grad_outputs Gradient w.r.t. the single output of forward().
    /// @return             {grad_x, undefined}; both undefined when forward()
    ///                     saved nothing.
    static tensor_list backward(AutogradContext *ctx, tensor_list grad_outputs)
    {
        const auto saved = ctx->get_saved_variables();
        // forward() saves nothing when x does not require grad. backward() can
        // still be reached in that case (e.g. only alpha requires grad), so
        // indexing saved[0] / saved[1] would read past the end of the list.
        if (saved.size() < 2)
        {
            return {torch::Tensor(), torch::Tensor()};
        }
        const auto &x = saved[0];
        const auto &alpha = saved[1];
        // sigmoid_backward(g, s) evaluates g * (1 - s) * s in a single call;
        // this is the main contribution to acceleration.
        auto grad_x = alpha * torch::sigmoid_backward(grad_outputs[0], torch::sigmoid(x * alpha));
        // No gradient is propagated to alpha.
        return {grad_x, torch::Tensor()};
    }
};
/// Entry point exposed to Python: runs sigmoid_atf on (x, alpha).
///
/// @param x     Input tensor forwarded to sigmoid_atf::forward.
/// @param alpha Scale tensor forwarded to sigmoid_atf::forward.
/// @return      The tensor produced by sigmoid_atf::apply.
torch::Tensor sigmoid_apply(const torch::Tensor &x, const torch::Tensor &alpha)
{
    torch::Tensor result = sigmoid_atf::apply(x, alpha);
    return result;
}
// Python bindings for this extension module.
// TORCH_EXTENSION_NAME is not defined in this file; it is expected to be
// supplied by the PyTorch extension build.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
// Expose the C++ function sigmoid_apply to Python as `sigmoid_apply`.
m.def("sigmoid_apply", &sigmoid_apply);
}
test.py:
from torch.utils import cpp_extension
import torch
import time
import numpy as np
def heaviside_step(x: torch.Tensor):
    """Element-wise Heaviside step of ``x``.

    Args:
        x: Input tensor.

    Returns:
        A tensor holding 1 where ``x >= 0`` and 0 elsewhere, converted with
        ``.to(x)`` so that it matches the dtype and device of ``x``.
    """
    # ge(0) yields a boolean mask; .to(x) converts it back (bool -> float).
    return x.ge(0).to(x)
class sigmoid(torch.autograd.Function):
    """Heaviside step in the forward pass with a sigmoid surrogate gradient.

    The gradient returned for ``x`` is
    ``grad_output * (1 - s) * s * alpha`` with ``s = sigmoid(x * alpha)``.
    No gradient is returned for ``alpha``.
    """

    @staticmethod
    def forward(ctx, x, alpha):
        """Return ``heaviside_step(x)``; save ``x`` and ``alpha`` if ``x`` requires grad."""
        # The saved tensors are only used for the gradient w.r.t. x.
        if x.requires_grad:
            ctx.save_for_backward(x, alpha)
        return heaviside_step(x)

    @staticmethod
    def backward(ctx, grad_output):
        """Return ``(grad_x, None)``; ``grad_x`` is None when x needs no gradient."""
        grad_x = None
        if ctx.needs_input_grad[0]:
            # In-place sigmoid on the temporary product avoids an extra tensor.
            sgax = (ctx.saved_tensors[0] * ctx.saved_tensors[1]).sigmoid_()
            grad_x = grad_output * (1 - sgax) * sgax * ctx.saved_tensors[1]
        return grad_x, None
def cal_fun_t(n, f, *args, **kwargs):
    """Return the mean wall-clock time, in seconds, of one ``f(*args, **kwargs)`` call.

    ``f`` is called once as a warm-up (not timed) and then ``n`` more times
    inside the timed region.

    Args:
        n: Number of timed calls; the elapsed time is divided by ``n``.
        f: Callable to benchmark.
        *args: Positional arguments forwarded to ``f``.
        **kwargs: Keyword arguments forwarded to ``f``.

    Returns:
        Elapsed seconds divided by ``n``.
    """
    # Query once, outside the timed region, so the check itself is not measured.
    # Guarding the synchronize calls keeps the helper usable without CUDA.
    use_cuda = torch.cuda.is_available()

    # warm up
    f(*args, **kwargs)
    if use_cuda:
        # Wait for the warm-up work to finish before starting the clock.
        torch.cuda.synchronize()
    # perf_counter is monotonic and high resolution, unlike time.time().
    t_start = time.perf_counter()
    for _ in range(n):
        f(*args, **kwargs)
    if use_cuda:
        # Wait for all queued GPU work so it is included in the measurement.
        torch.cuda.synchronize()
    return (time.perf_counter() - t_start) / n
# Build and load the C++ extension from ./test.cpp.
cext = cpp_extension.load(name='test',
                          sources=['./test.cpp'], verbose=True)
device = 'cuda:0'
x = torch.rand([1024], device=device)
alpha = torch.rand([1], device=device)
# NOTE(review): x is created without requires_grad and the timing runs under
# torch.no_grad(), so backward() is never called here; the numbers below only
# cover calling each function's forward path.
with torch.no_grad():
    t1 = cal_fun_t(100, sigmoid.apply, x, alpha)
    t2 = cal_fun_t(100, cext.sigmoid_apply, x, alpha)
print(f'python:{t1}, c++:{t2}')