C++ port of the time_sequence_prediction.py example is slow

I have implemented the time_sequence_prediction example in C++ as follows:

#include <torch/torch.h>


#include <iostream>
#include <string>
#include <cmath>
#include <vector>
#include <random>
#include <chrono>

using Tensor = torch::Tensor;

// Generates `exampleNum` sine-wave sequences of `pointNum` points each, with a
// random per-example phase shift, and saves the resulting tensor to `filePath`.
//
// Row i holds sin((index - shift_i) / pointPerRad): a sine wave sampled every
// 1/pointPerRad radian, starting from a random offset.
void generateData(const std::string& filePath, int exampleNum, int pointNum) {
	const int pointPerRad = 20;  // samples per radian

	// [exampleNum, pointNum] grid of running sample indices 0..N-1, as float
	// (torch::range is deprecated; arange with an explicit float dtype keeps
	// the original floating-point result).
	Tensor index = torch::arange(0, exampleNum * pointNum, torch::kFloat).reshape({exampleNum, pointNum});
	std::cout << index.sizes() << std::endl;
	std::cout << index << std::endl;

	// One random phase shift per example in [-4*pointPerRad, 4*pointPerRad).
	Tensor shift = torch::randint((-4 * pointPerRad), (4 * pointPerRad), {exampleNum, 1});
	std::cout << shift.sizes() << std::endl;
	std::cout << shift << std::endl;

	// Convert sample indices to radians: (index - shift) / pointPerRad.
	index = index.sub(shift).div(pointPerRad);
	std::cout << index << std::endl;

	auto data = torch::sin(index);
	std::cout << data << std::endl;

	torch::save(data, filePath);
}

std::vector<Tensor> createInput(Tensor dataTensor, float split) {
	const int trainNum = dataTensor.size(0) * split;
	const int validNum = dataTensor.size(0) - trainNum;
	const int pointNum = dataTensor.size(1);
	std::vector<Tensor> rc;

	Tensor trainTensor = torch::narrow(dataTensor, 0, 0, trainNum);
	Tensor inputTensor = torch::narrow(trainTensor, 1, 0, pointNum- 1);
	Tensor targetTensor = torch::narrow(trainTensor, 1, 1, pointNum - 1);
	rc.push_back(inputTensor);
	rc.push_back(targetTensor);

	Tensor validTensor = torch::narrow(dataTensor, 0, trainNum, validNum);
	Tensor validInputTensor = torch::narrow(validTensor, 1, 0, pointNum - 1);
	Tensor validTargetTensor = torch::narrow(validTensor, 1, 1, pointNum - 1);
	rc.push_back(validInputTensor);
	rc.push_back(validTargetTensor);

	return rc;
}

// Two stacked LSTM layers followed by a linear projection to one scalar,
// applied one time step at a time (see NOTE in forward()).
struct Net: torch::nn::Module {
	const int lstmHiddenSize;  // hidden units per LSTM layer
	torch::nn::LSTM lstm0;     // input  -> hidden
	torch::nn::LSTM lstm1;     // hidden -> hidden
	torch::nn::Linear fc;      // hidden -> 1 (predicted next point)

	// seqLen: input feature size fed to lstm0 (1 in this example — one
	// scalar per time step). hiddenSize: width of both LSTM hidden states.
	Net(int seqLen, int hiddenSize): lstmHiddenSize(hiddenSize),
			lstm0(torch::nn::LSTM(torch::nn::LSTMOptions(seqLen, hiddenSize))),
			lstm1(torch::nn::LSTM(torch::nn::LSTMOptions(hiddenSize, hiddenSize))),
			fc (torch::nn::Linear(hiddenSize, 1)) {
		register_module("lstm0", lstm0);
		register_module("lstm1", lstm1);
		register_module("fc", fc);
	}

	// Runs the whole sequence through both LSTMs step by step and returns the
	// per-step predictions concatenated along dim 1. `input` is assumed to be
	// 2-D [examples, points] as produced by createInput — TODO confirm.
	Tensor forward (Tensor input) {
		// NOTE(review): splitting the sequence into size(1) single-column
		// chunks and invoking each LSTM once per time step bypasses the
		// fused multi-step LSTM path; this per-step loop is a likely cause
		// of the slowness reported below — confirm by feeding the full
		// sequence to lstm0/lstm1 in one call, as the Python example does.
		auto inputs = torch::chunk(input, input.size(1), 1);
		std::vector<Tensor> outputs;

		// Hidden/cell state carried across time steps (default-constructed,
		// i.e. empty, before the first step).
		Tensor state0;
		Tensor state1;

		for (auto input: inputs) {
			// NOTE(review): setting requires_grad on every chunk inside
			// forward() looks unnecessary — gradients already flow through
			// the graph from the caller's input — verify it can be dropped.
			input.set_requires_grad(true);

			auto rnnOutput0 = lstm0->forward(input, state0);
			state0 = rnnOutput0.state;
			auto rnnOutput1 = lstm1->forward(rnnOutput0.output, state1);
			state1 = rnnOutput1.state;
			auto output = fc->forward(rnnOutput1.output);

			// fc yields size 1 in its last dim (Linear(hiddenSize, 1));
			// the view keeps only the first two dims.
			outputs.push_back(output.view({output.size(0), output.size(1)}));
		}

		// Reassemble the per-step predictions along the sequence dimension.
		return torch::cat(outputs, 1);
	}
};

// Trains Net on the sine dataset at `filePath` with an LBFGS optimizer,
// printing per-epoch loss and wall-clock timing, then evaluates the loss on
// the held-out validation split. `split` is the training fraction of examples.
void train(const std::string filePath, float split) {
	const int SeqLen = 1;       // one scalar feature per time step
	const int HiddenCell = 51;  // LSTM hidden width
	const int EpochNum = 2;
	const int BatchSize = 64;   // NOTE(review): declared but never used below

	// NOTE(review): readData is not defined in this snippet — presumably a
	// torch::load wrapper matching generateData's torch::save; confirm.
	Tensor raw = readData(filePath);
	std::vector<Tensor> datas = createInput(raw, split);

	Tensor input = datas[0];
	Tensor target = datas[1];
	input.set_requires_grad(true);
	std::cout << "Target " << target.sizes() << std::endl;

	Net net(SeqLen, HiddenCell);

	// LBFGS with learning rate 0.8, matching the Python example's setup.
	torch::optim::LBFGS optimizer (net.parameters(), torch::optim::LBFGSOptions(0.8));

	// LBFGS may re-evaluate the closure several times per step; `index`
	// counts how many closure evaluations each epoch actually performs
	// (the logs below show 20 per step).
	int index = 0;
	auto cost = [&]() {
		optimizer.zero_grad();
		auto output = net.forward(input);
		auto loss = torch::mse_loss(output, target);
		loss.backward();
		index ++;

		return loss;
	};

	for (int i = 0; i < EpochNum; i ++) {
		std::cout << "Step " << i << std::endl;
		auto startIndex = index;
		auto start = std::chrono::high_resolution_clock::now();
		auto loss = optimizer.step(cost);
		auto stop = std::chrono::high_resolution_clock::now();
		std::cout << "loss " << i << loss << std::endl;
		std::cout << "run cost times: " << (index - startIndex)
				<< ", duration: " << std::chrono::duration_cast<std::chrono::seconds>(stop - start).count()
				<< "seconds" << std::endl;
	}

	// Validation pass on the held-out rows. NOTE(review): runs with gradient
	// tracking still enabled; a torch::NoGradGuard here would save memory.
	{
		Tensor validInput = datas[2];
		Tensor validTarget = datas[3];
		auto validOutput = net.forward(validInput);
		auto validLoss = torch::mse_loss(validOutput, validTarget);
		std::cout << "valid loss " << validLoss << std::endl;
	}
}


// Entry point: trains on the pre-generated sine dataset.
// Uncomment the generateData call to (re)create the dataset first.
int main() {
	const std::string dataFile = "./data/sin/testdata1000.pt";
	const float trainFraction = 0.99;

	// generateData(dataFile, 1000, 100);  // one-time dataset creation

	train(dataFile, trainFraction);
	return 0;
}

The performance output of the C++ version is:

Step 0
loss 00.506419
[ CPUFloatType{} ]
run cost times: 20, duration: 1892seconds
Step 1
loss 10.0468959
[ CPUFloatType{} ]
run cost times: 20, duration: 1819seconds
valid loss 0.0021026
[ CPUFloatType{} ]

But the original Python implementation's performance is:

Step 0
time  50.72736859321594  seconds
closure:  20
Step 1
time  43.49410343170166  seconds
closure:  20

My environment:
CPU,
pytorch 1.4.0a0
Python 3.7.5

Why does the C++ implementation take 30+ times longer than the original Python version?

Results when run on an AWS t2.2xlarge:
Step 0
loss 00.503087
[ CPUFloatType{} ]
run cost times: 20, duration: 1508seconds
Step 1
loss 10.0161174
[ CPUFloatType{} ]
run cost times: 20, duration: 1504seconds
valid loss 0.00212624
[ CPUFloatType{} ]

Or is there an error in my test code?