When I test model inference with libtorch, I found that copying a tensor (size [600]) from CUDA to CPU always takes a long time. Code:
#include <torch/script.h>

#include <chrono>
#include <iostream>
#include <thread>
#include <vector>

int textcnn_simcse_test() {
  // Load the TorchScript model onto GPU 0.
  torch::jit::Module model = torch::jit::load(
      "../textcnn_simcse_qqsearch_jit.pt", torch::Device(torch::kCUDA, 0));
  model.eval();
  while (true) {
    // Dummy input: 601 rows of token ids (1 query + 600 titles).
    torch::Tensor tensor = torch::rand({601, 50});
    tensor = tensor.to(torch::kInt64).to(torch::kCUDA);
    std::vector<torch::jit::IValue> inputs;
    inputs.emplace_back(tensor);
    torch::Tensor out = model.forward(inputs).toTensor();

    auto begin = std::chrono::duration_cast<std::chrono::microseconds>(
                     std::chrono::system_clock::now().time_since_epoch())
                     .count();
    // Row 0 is the query embedding, the remaining rows are title embeddings.
    torch::Tensor query = out.index({torch::indexing::Slice(0, 1)});
    torch::Tensor titles =
        out.index({torch::indexing::Slice(1, torch::indexing::None)});
    // Dot product of the query with every title -> tensor of size [600].
    torch::Tensor result = torch::sum(query * titles, 1);
    auto calc_end = std::chrono::duration_cast<std::chrono::microseconds>(
                        std::chrono::system_clock::now().time_since_epoch())
                        .count();
    // Copy the result tensor from CUDA to CPU.
    result = result.to(torch::kCPU);
    auto cp_end = std::chrono::duration_cast<std::chrono::microseconds>(
                      std::chrono::system_clock::now().time_since_epoch())
                      .count();
    std::cout << "calc: " << calc_end - begin
              << " us, cp: " << cp_end - calc_end << " us" << std::endl;
    std::this_thread::sleep_for(std::chrono::seconds(1));
  }
  return 0;
}
The log looks like this:
calc: 84 us, cp: 5616 us
calc: 83 us, cp: 5600 us
calc: 82 us, cp: 5587 us
calc: 82 us, cp: 5606 us
I don't know why. Is there something wrong in my code?
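I am also not sure whether CUDA's asynchronous execution affects these numbers, i.e. whether the to(torch::kCPU) call is simply waiting for the preceding kernels to finish before it can copy. Below is a minimal sketch of the timing section with an explicit device synchronization before each timestamp, assuming torch::cuda::synchronize() from torch/cuda.h is the appropriate call in this libtorch build:

#include <torch/cuda.h>  // assumption: provides torch::cuda::synchronize()

// Same computation as above, but wait for the GPU to go idle before each
// timestamp, so "calc" and "cp" measure the synchronous wall time of each
// step rather than only the kernel-launch overhead.
torch::Tensor out = model.forward(inputs).toTensor();
torch::cuda::synchronize();
auto begin = std::chrono::duration_cast<std::chrono::microseconds>(
                 std::chrono::system_clock::now().time_since_epoch())
                 .count();

torch::Tensor query = out.index({torch::indexing::Slice(0, 1)});
torch::Tensor titles =
    out.index({torch::indexing::Slice(1, torch::indexing::None)});
torch::Tensor result = torch::sum(query * titles, 1);
torch::cuda::synchronize();
auto calc_end = std::chrono::duration_cast<std::chrono::microseconds>(
                    std::chrono::system_clock::now().time_since_epoch())
                    .count();

// With the device already idle, this measures the copy itself.
result = result.to(torch::kCPU);
auto cp_end = std::chrono::duration_cast<std::chrono::microseconds>(
                  std::chrono::system_clock::now().time_since_epoch())
                  .count();

If with these synchronizations the "cp" time drops to a few microseconds while "calc" grows, that would suggest the copy itself is not slow and the original "cp" number was mostly the GPU finishing queued work.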