I have implemented the time_sequence_prediction example in C++ as follows:
#include <torch/torch.h>
#include <iostream>
#include <string>
#include <cmath>
#include <vector>
#include <random>
#include <chrono>
using Tensor = torch::Tensor;
// Generates `exampleNum` sine-wave sequences of `pointNum` points each and
// serializes the resulting {exampleNum, pointNum} float tensor to `filePath`.
// Each row is sin(x) sampled at `pointPerRad` points per radian, with a random
// integer phase shift drawn from [-4 * pointPerRad, 4 * pointPerRad).
void generateData(const std::string& filePath, int exampleNum, int pointNum) {
    const int pointPerRad = 20;
    // torch::range is deprecated (it includes the upper bound);
    // arange(0, N) yields the same 0..N-1 sequence. Request kFloat explicitly
    // so the dtype matches what torch::range produced.
    Tensor index = torch::arange(0, exampleNum * pointNum, torch::kFloat)
                       .reshape({exampleNum, pointNum});
    std::cout << index.sizes() << std::endl;
    std::cout << index << std::endl;
    // One random phase shift per example; broadcast over the row on subtract.
    Tensor shift = torch::randint((-4 * pointPerRad), (4 * pointPerRad), {exampleNum, 1});
    std::cout << shift.sizes() << std::endl;
    std::cout << shift << std::endl;
    // Convert point indices to radians: (index - shift) / pointPerRad.
    index = index.sub(shift).div(pointPerRad);
    std::cout << index << std::endl;
    auto data = torch::sin(index);
    std::cout << data << std::endl;
    torch::save(data, filePath);
}
// Splits `dataTensor` row-wise into a training part (first `split` fraction of
// examples) and a validation part, and builds (input, target) pairs where the
// target is the input shifted one time step ahead.
// Returns {trainInput, trainTarget, validInput, validTarget}.
std::vector<Tensor> createInput(Tensor dataTensor, float split) {
    const int rows = dataTensor.size(0);
    const int cols = dataTensor.size(1);
    const int trainRows = rows * split;

    // For a slice of examples: input = columns [0, cols-1),
    // target = columns [1, cols) — i.e. predict the next point.
    auto appendPair = [cols](const Tensor& slice, std::vector<Tensor>& out) {
        out.push_back(slice.narrow(1, 0, cols - 1));
        out.push_back(slice.narrow(1, 1, cols - 1));
    };

    std::vector<Tensor> result;
    appendPair(dataTensor.narrow(0, 0, trainRows), result);
    appendPair(dataTensor.narrow(0, trainRows, rows - trainRows), result);
    return result;
}
// Two stacked LSTMs followed by a linear head, applied one time step at a
// time: for every input column the network emits a one-step-ahead prediction.
struct Net: torch::nn::Module {
    const int lstmHiddenSize;  // hidden-state width of both LSTM layers
    torch::nn::LSTM lstm0;
    torch::nn::LSTM lstm1;
    torch::nn::Linear fc;

    // seqLen: feature size of a single time step (1 for a scalar series).
    // hiddenSize: hidden units per LSTM layer.
    Net(int seqLen, int hiddenSize): lstmHiddenSize(hiddenSize),
        lstm0(torch::nn::LSTM(torch::nn::LSTMOptions(seqLen, hiddenSize))),
        lstm1(torch::nn::LSTM(torch::nn::LSTMOptions(hiddenSize, hiddenSize))),
        fc(torch::nn::Linear(hiddenSize, 1)) {
        register_module("lstm0", lstm0);
        register_module("lstm1", lstm1);
        register_module("fc", fc);
    }

    // input: 2-D tensor; processed column-by-column (dim 1), carrying the
    // LSTM hidden/cell state across steps. Returns the per-step predictions
    // concatenated back along dim 1.
    Tensor forward(Tensor input) {
        auto steps = torch::chunk(input, input.size(1), 1);
        std::vector<Tensor> outputs;
        outputs.reserve(steps.size());
        Tensor state0;
        Tensor state1;
        // NOTE(fix): the original called set_requires_grad(true) on every
        // chunk here. That is redundant — the caller already enables grad on
        // the whole input before forward() — and it adds per-step autograd
        // bookkeeping overhead, so it has been removed.
        for (const auto& step : steps) {
            auto rnnOutput0 = lstm0->forward(step, state0);
            state0 = rnnOutput0.state;
            auto rnnOutput1 = lstm1->forward(rnnOutput0.output, state1);
            state1 = rnnOutput1.state;
            auto output = fc->forward(rnnOutput1.output);
            // Drop the trailing size-1 feature dim so steps can be cat'd
            // along dim 1.
            outputs.push_back(output.view({output.size(0), output.size(1)}));
        }
        return torch::cat(outputs, 1);
    }
};
// Trains the two-layer LSTM on sine data loaded from `filePath` with LBFGS,
// then reports a validation loss on the held-out fraction (1 - split).
void train(const std::string& filePath, float split) {
    const int SeqLen = 1;       // one scalar feature per time step
    const int HiddenCell = 51;  // LSTM hidden width
    const int EpochNum = 2;
    // readData is assumed to be defined elsewhere in this file (the load
    // counterpart of generateData's torch::save) — TODO confirm it returns
    // the saved 2-D tensor.
    Tensor raw = readData(filePath);
    std::vector<Tensor> datas = createInput(raw, split);
    Tensor input = datas[0];
    Tensor target = datas[1];
    input.set_requires_grad(true);
    std::cout << "Target " << target.sizes() << std::endl;

    Net net(SeqLen, HiddenCell);
    torch::optim::LBFGS optimizer(net.parameters(), torch::optim::LBFGSOptions(0.8));

    int index = 0;  // counts closure evaluations across LBFGS line searches
    auto cost = [&]() {
        optimizer.zero_grad();
        auto output = net.forward(input);
        auto loss = torch::mse_loss(output, target);
        loss.backward();
        index++;
        return loss;
    };

    for (int i = 0; i < EpochNum; i++) {
        std::cout << "Step " << i << std::endl;
        auto startIndex = index;
        auto start = std::chrono::high_resolution_clock::now();
        // LBFGS re-evaluates the closure multiple times per step.
        auto loss = optimizer.step(cost);
        auto stop = std::chrono::high_resolution_clock::now();
        // Separator added so the epoch index no longer fuses with the loss
        // value (the original printed e.g. "loss 00.506419").
        std::cout << "loss " << i << " " << loss << std::endl;
        std::cout << "run cost times: " << (index - startIndex)
            << ", duration: " << std::chrono::duration_cast<std::chrono::seconds>(stop - start).count()
            << " seconds" << std::endl;
    }
    {
        // Evaluation only: disable autograd so no graph is built, saving
        // both time and memory.
        torch::NoGradGuard noGrad;
        Tensor validInput = datas[2];
        Tensor validTarget = datas[3];
        auto validOutput = net.forward(validInput);
        auto validLoss = torch::mse_loss(validOutput, validTarget);
        std::cout << "valid loss " << validLoss << std::endl;
    }
}
// Entry point: trains on the pre-generated sine dataset with a 99%/1%
// train/validation split.
int main() {
    const std::string dataPath = "./data/sin/testdata1000.pt";
    const float trainFraction = 0.99;
    // Uncomment to (re)generate the dataset first:
    // generateData(dataPath, 1000, 100);
    train(dataPath, trainFraction);
}
The performance output of my C++ run is:
Step 0
loss 00.506419
[ CPUFloatType{} ]
run cost times: 20, duration: 1892seconds
Step 1
loss 10.0468959
[ CPUFloatType{} ]
run cost times: 20, duration: 1819seconds
valid loss 0.0021026
[ CPUFloatType{} ]
But the performance of the original Python implementation is:
Step 0
time 50.72736859321594 seconds
closure: 20
Step 1
time 43.49410343170166 seconds
closure: 20
My environment:
CPU,
pytorch 1.4.0a0
Python 3.7.5
Why does the C++ implementation take 30+ times longer than the original Python version?