In the following example, it takes 900ns just to add 4 numbers.
Is there a way to reduce this time?
Plain C++ without torch performs the same calculation in less than 1 ns.
#include <benchmark/benchmark.h>
#include <torch/torch.h>
// Times a single element-wise addition of a 2x2 random tensor with itself.
// The NoGradGuard stays alive for the whole function, so every timed
// iteration runs with it in effect.
static void BM_TorchAdd(benchmark::State& state) {
  torch::NoGradGuard grad_off;
  torch::Tensor input = torch::rand({2, 2});
  for (auto _ : state) {
    // The result is created and destroyed inside each timed iteration.
    auto sum = input + input;
  }
}
// Hook BM_TorchAdd into the benchmark runner.
BENCHMARK(BM_TorchAdd);
// Baseline: element-wise addition of four floats using plain C++ arrays.
//
// Two things keep this measurement honest:
//   * the input array is initialized, because reading uninitialized floats
//     is undefined behavior;
//   * benchmark::DoNotOptimize / benchmark::ClobberMemory force the compiler
//     to reload the inputs and to actually store the results on every
//     iteration. Without them the optimizer may delete the additions as dead
//     code, and the reported time is that of an empty loop.
static void BM_SimpleAdd(benchmark::State& state) {
  float tensor[4] = {0.1f, 0.2f, 0.3f, 0.4f};
  float buf[4] = {};
  float* in = tensor;
  float* out = buf;
  for (auto _ : state) {
    // Make the compiler assume the inputs may have changed since last time.
    benchmark::DoNotOptimize(in);
    for (int i = 0; i < 4; i++)
      out[i] = in[i] + in[i];
    // Make the stores to the output buffer observable so they are kept.
    benchmark::DoNotOptimize(out);
    benchmark::ClobberMemory();
  }
}
// Register the function as a benchmark
BENCHMARK(BM_SimpleAdd);
// Provides the program's main(), which runs the benchmarks registered above.
BENCHMARK_MAIN();
$ g++ --version
g++ (Ubuntu 11.2.0-19ubuntu1) 11.2.0
$ cat /usr/build-version # The version of libtorch
1.11.0+cu113
$ g++ -O libtorch-bench.cpp -ltorch -lc10 -lbenchmark -ltorch_cpu
$ ./a.out
2022-10-25T23:04:51+09:00
Running ./a.out
Run on (16 X 4890 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x8)
L1 Instruction 32 KiB (x8)
L2 Unified 512 KiB (x8)
L3 Unified 16384 KiB (x1)
Load Average: 0.27, 0.46, 0.57
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
-------------------------------------------------------
Benchmark Time CPU Iterations
-------------------------------------------------------
BM_TorchAdd 894 ns 894 ns 785041
BM_SimpleAdd 0.309 ns 0.309 ns 1000000000