Libtorch backward() is very slow compared with MXNet

I tested libtorch (CPU version) on my project data. The training and testing sets are each about 500 MB, with roughly 7 million instances.

The following are my observations:

  1. Each iteration takes 17 seconds (MXNet takes only 5 seconds on the same dataset).
  2. When I comment out the backward step, “loss.backward()”, the time drops to 4 seconds.
    So I suspect that “loss.backward()” is the bottleneck. I know that a dynamic graph may be slower than a static graph, but I’m still wondering whether there is any way to improve the efficiency of libtorch (CPU version). Thanks a lot.
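To pin down where the time goes more precisely than by commenting out backward(), each phase can be timed separately. Below is a minimal sketch with std::chrono; it reuses model, train_iter, and optimizer from the demo code below (on CPU the ops run synchronously, so wall-clock timestamps around each phase are meaningful):

// Per-phase timing for one epoch (sketch only; needs #include <chrono>).
double fwd_ms = 0.0, bwd_ms = 0.0, step_ms = 0.0;
for (auto& batch : *train_iter) {
    model->zero_grad();

    auto t0 = std::chrono::steady_clock::now();
    auto pred = model->forward(batch.data);
    auto loss = torch::mse_loss(pred, batch.target);   // default mean reduction: scalar loss
    auto t1 = std::chrono::steady_clock::now();

    loss.backward();
    auto t2 = std::chrono::steady_clock::now();

    optimizer.step();
    auto t3 = std::chrono::steady_clock::now();

    fwd_ms  += std::chrono::duration<double, std::milli>(t1 - t0).count();
    bwd_ms  += std::chrono::duration<double, std::milli>(t2 - t1).count();
    step_ms += std::chrono::duration<double, std::milli>(t3 - t2).count();
}
std::cerr << "forward+loss: " << fwd_ms << " ms, backward: " << bwd_ms
          << " ms, optimizer.step: " << step_ms << " ms" << std::endl;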

The following is my demo code:

class YYModel : public torch::nn::Module
{
public:
    YYModel(int dim) {
        fc1 = register_module("fc1", torch::nn::Linear(dim, 64));
        fc2 = register_module("fc2", torch::nn::Linear(64, 32));
        fc3 = register_module("fc3", torch::nn::Linear(32, 1));
        bn = register_module("bn", torch::nn::BatchNorm(dim));
    }

    torch::Tensor forward(torch::Tensor x) {
        x = torch::relu(fc1->forward(bn->forward(x)));
        x = torch::relu(fc2->forward(x));
        x = fc3->forward(x);
        return x.squeeze(1);
    }
private:
    torch::nn::Linear fc1{nullptr}, fc2{nullptr}, fc3{nullptr};
    torch::nn::BatchNorm bn{nullptr};
};

std::string train_data = "train_data";
std::string test_data = "test_data";
int epochs = 50;
int dim = 134;
int batch_size = 64;

auto train_dataset = YYDataSet(train_data, dim).map(torch::data::transforms::Stack<>());
auto test_dataset = YYDataSet(test_data, dim).map(torch::data::transforms::Stack<>());
int train_size = train_dataset.size().value();
int test_size = test_dataset.size().value();
auto train_iter = torch::data::make_data_loader(std::move(train_dataset), torch::data::DataLoaderOptions(batch_size).workers(4));
auto test_iter = torch::data::make_data_loader(std::move(test_dataset), torch::data::DataLoaderOptions(batch_size).workers(4));

auto model = std::make_shared<YYModel>(dim);
torch::optim::Adam optimizer(model->parameters(), torch::optim::AdamOptions(0.001));

for (int i = 0; i < epochs; i++) {
    float error = 0.0;
    model->train();
    for (auto& batch : *train_iter) {
        model->zero_grad();
        auto pred = model->forward(batch.data);
        // use a summed (scalar) loss: reduction 0 (None) returns a per-element tensor and backward() would fail
        torch::Tensor loss = torch::mse_loss(pred, batch.target, torch::Reduction::Sum);
        error += loss.item<float>();
        loss.backward();
        optimizer.step();
    }

    float eval_loss = 0.0;
    model->eval();
    for (auto& batch : *test_iter) {
        auto pred = model->forward(batch.data);
        torch::Tensor loss = torch::mse_loss(pred, batch.target, torch::Reduction::Sum);
        eval_loss += loss.item<float>();
    }

    std::cerr << "Epoch " << i << " with train_error=" << error/train_size << " test_error=" << eval_loss/test_size << std::endl;
}
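Note that the evaluation loop above still records autograd history for every forward pass. It probably does not explain the training-time gap, but wrapping it in a torch::NoGradGuard keeps that bookkeeping out of the picture; a minimal sketch of the same loop:

float eval_loss = 0.0;
model->eval();
{
    torch::NoGradGuard no_grad;   // disables gradient tracking inside this scope
    for (auto& batch : *test_iter) {
        auto pred = model->forward(batch.data);
        torch::Tensor loss = torch::mse_loss(pred, batch.target, torch::Reduction::Sum);
        eval_loss += loss.item<float>();
    }
}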

How are you compiling your code? Did you turn optimizations on?

I used -O2 optimization. The following is my CMakeLists.txt:

cmake_minimum_required(VERSION 3.0 FATAL_ERROR)
project(yy_trainer)

set(CMAKE_PREFIX_PATH /mnt/e/libtorch/libtorch)

include_directories(include)

list(APPEND SRCS src/trainer.cpp)
list(APPEND SRCS src/YYDataSet.cpp)

set(CMAKE_CXX_FLAGS "-g -O2")

find_package(Torch REQUIRED)
add_executable(yy_trainer ${SRCS})
target_link_libraries(yy_trainer "${TORCH_LIBRARIES}")
set_property(TARGET yy_trainer PROPERTY CXX_STANDARD 11)
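
For comparison, the example CMakeLists.txt in the libtorch documentation also appends the compile flags exported by the Torch package to CMAKE_CXX_FLAGS after find_package(Torch). I am not sure it changes the timings here, but keeping that line rules out a flag mismatch:

find_package(Torch REQUIRED)
# append the compile flags exported by the Torch package (ABI defines, etc.)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")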