I tested libtorch (CPU version) with my project data. Both the training and testing datasets are about 500 MB, with roughly 7 million (7,000 thousand) instances.
The following are my observations:
- Each iteration takes 17 seconds (Mxnet only uses 5 seconds on the same dataset)
- When I comment the backward step: “loss.backward()”, time cost reduces to 4 seconds.
Thus, I suspect that "loss.backward()" is the slow part. I know that a dynamic graph may be slower than a static graph. However, I'm still wondering if there is any way to improve the efficiency of libtorch (CPU version). Thanks a lot.
The following is my demo code:
// Simple 3-layer MLP regressor: BatchNorm over the raw input features,
// then dim -> 64 -> 32 -> 1 with ReLU activations, squeezed to shape (N,).
class YYModel : public torch::nn::Module
{
public:
// dim: number of input features per instance.
explicit YYModel(int dim) {
fc1 = register_module("fc1", torch::nn::Linear(dim, 64));
fc2 = register_module("fc2", torch::nn::Linear(64, 32));
fc3 = register_module("fc3", torch::nn::Linear(32, 1));
// NOTE(review): torch::nn::BatchNorm is deprecated in newer libtorch
// releases in favor of torch::nn::BatchNorm1d — switch if your version has it.
bn = register_module("bn", torch::nn::BatchNorm(dim));
}
// Take the tensor by value instead of non-const reference: torch::Tensor is a
// cheap refcounted handle, and the original `torch::Tensor&` signature
// silently rebound the caller's tensor (e.g. batch.data) to the network
// output. Passing by value is backward-compatible for all existing callers.
torch::Tensor forward(torch::Tensor x) {
x = torch::relu(fc1->forward(bn->forward(x)));
x = torch::relu(fc2->forward(x));
x = fc3->forward(x);
return x.squeeze(1); // (N, 1) -> (N,)
}
private:
torch::nn::Linear fc1{nullptr}, fc2{nullptr}, fc3{nullptr};
torch::nn::BatchNorm bn{nullptr};
};
// ---- Configuration ----
std::string train_data = "train_data";
std::string test_data = "test_data";
int epoches = 50;
int dim = 134;       // number of input features per instance
int batch_size = 64;
// Use `dim` instead of repeating the magic constant 134, so the feature
// count is defined in exactly one place.
auto train_dataset = YYDataSet(train_data, dim).map(torch::data::transforms::Stack<>());
auto test_dataset = YYDataSet(test_data, dim).map(torch::data::transforms::Stack<>());
int train_size = train_dataset.size().value();
int test_size = test_dataset.size().value();
// 4 worker threads prefetch and collate batches in the background.
auto train_iter = torch::data::make_data_loader(std::move(train_dataset), torch::data::DataLoaderOptions(batch_size).workers(4));
auto test_iter = torch::data::make_data_loader(std::move(test_dataset), torch::data::DataLoaderOptions(batch_size).workers(4));
auto model = std::make_shared<YYModel>(dim);
torch::optim::Adam optimizer(model->parameters(), torch::optim::AdamOptions(0.001));
for (int i = 0; i < epoches; i++) {
float error = 0.0f;
model->train();
for (auto& batch : *train_iter) {
// Clear gradients of exactly the parameters the optimizer steps.
optimizer.zero_grad();
auto pred = model->forward(batch.data);
// Reduction::Sum yields a scalar loss: backward() requires a scalar, and
// the per-epoch total is exactly what `error` accumulates. (The original
// passed reduction = 0, i.e. Reduction::None, which produces a per-element
// loss tensor that backward() cannot be called on.)
torch::Tensor loss = torch::mse_loss(pred, batch.target, torch::Reduction::Sum);
error += loss.item<float>();
loss.backward();
optimizer.step();
}
float eval_loss = 0.0f;
model->eval();
{
// Disable autograd while evaluating: no graph is recorded for the test
// batches, which saves both time and memory.
torch::NoGradGuard no_grad;
for (auto& batch : *test_iter) {
auto pred = model->forward(batch.data);
torch::Tensor loss = torch::mse_loss(pred, batch.target, torch::Reduction::Sum);
eval_loss += loss.item<float>();
}
}
// Report mean squared error per instance for both splits.
std::cerr << "Epoch " << i << " with train_error=" << error/train_size << " test_error=" << eval_loss/test_size << std::endl;
}