Hello everyone,
I’m implementing a network in C++ with libtorch inside a Qt application. I successfully built and trained the network, but I now have a problem while trying to deploy it.
To summarize my issue: when I loop over the forward function of my model, it runs well for several iterations, then suddenly runs around 40x slower.
Here is an example of what I get using a simple network.
main.cpp:
#include <torch/torch.h>
#include <torch/script.h>
#include <QDebug>
#include <QElapsedTimer>

// Simple network made of five linear layers
struct NetImpl : torch::nn::Module {
    torch::nn::Linear linear_1;
    torch::nn::Linear linear_2;
    torch::nn::Linear linear_3;
    torch::nn::Linear linear_4;
    torch::nn::Linear linear_5;

    NetImpl(std::vector<int64_t> linear_dim_in, std::vector<int64_t> linear_dim_out)
        : linear_1(linear_dim_in[0], linear_dim_out[0]),
          linear_2(linear_dim_in[1], linear_dim_out[1]),
          linear_3(linear_dim_in[2], linear_dim_out[2]),
          linear_4(linear_dim_in[3], linear_dim_out[3]),
          linear_5(linear_dim_in[4], linear_dim_out[4])
    {
        register_module("linear_1", linear_1);
        register_module("linear_2", linear_2);
        register_module("linear_3", linear_3);
        register_module("linear_4", linear_4);
        register_module("linear_5", linear_5);
    }

    torch::Tensor forward(torch::Tensor x)
    {
        x = torch::flatten(x, 1, -1); // Flatten everything but the batch dimension
        x = linear_1->forward(x);
        x = linear_2->forward(x);
        x = linear_3->forward(x);
        x = linear_4->forward(x);
        x = linear_5->forward(x);
        return x;
    }
};
TORCH_MODULE(Net); // Creates the module holder for NetImpl

int main()
{
    // Use the GPU if available, the CPU otherwise
    torch::Device device("cpu");
    if (torch::cuda::is_available())
    {
        device = torch::Device("cuda:0");
    }

    // Layer dimensions
    std::vector<int64_t> linear_dim_in  = {72960, 1024, 512, 256, 128};
    std::vector<int64_t> linear_dim_out = {1024, 512, 256, 128, 3};

    // Create the network
    Net net(linear_dim_in, linear_dim_out);
    // Set the model in eval mode
    net->eval();
    // Move the model to the selected device
    net->to(device);

    // Declare a timer and the desired number of iterations
    QElapsedTimer timer;
    int loop_nbr = 100;

    {
        // Disable gradient computation; otherwise autograd state accumulates in GPU memory
        torch::NoGradGuard no_grad;

        // Run loop_nbr + 1 iterations (i = 0 .. loop_nbr)
        for (int i = 0; i <= loop_nbr; i++)
        {
            qDebug() << "\n" << "Loop nbr.: " << i;
            // Start the timer
            timer.restart();
            // Declare a random input directly on the device
            torch::Tensor X_batch = torch::rand({1, 1, 240, 304}, device);
            // Show the elapsed time
            qDebug() << "T_tensorDeclaration: " << timer.nsecsElapsed() << "ns";
            // Run the model on the input data
            torch::Tensor prediction = net->forward(X_batch);
            // Show the elapsed time
            qDebug() << "T_forward: " << timer.nsecsElapsed() << "ns";
        }
    }
    return 0;
}
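A side note on the measurements: CUDA kernel launches are asynchronous, so I'm not certain the timings above measure the actual GPU work rather than just the launch overhead. A variant of the timed section that blocks until the GPU has finished would look roughly like this (a sketch, not what I currently run; it assumes #include <cuda_runtime.h> and linking against the CUDA runtime, e.g. LIBS += -lcudart):

    // Time forward() including the GPU work itself:
    // cudaDeviceSynchronize() blocks until all queued kernels have completed
    timer.restart();
    torch::Tensor prediction = net->forward(X_batch);
    cudaDeviceSynchronize();
    qDebug() << "T_forward (synchronized): " << timer.nsecsElapsed() << "ns";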
project.pro:
QT += core gui
greaterThan(QT_MAJOR_VERSION, 4): QT += widgets
CONFIG += c++11
CONFIG += no_keywords
# The following define makes your compiler emit warnings if you use
# any Qt feature that has been marked deprecated (the exact warnings
# depend on your compiler). Please consult the documentation of the
# deprecated API in order to know how to port your code away from it.
DEFINES += QT_DEPRECATED_WARNINGS
SOURCES += main.cpp
INCLUDEPATH += $$PWD/../../libtorch/include
DEPENDPATH += $$PWD/../../libtorch/include
INCLUDEPATH += $$PWD/../../libtorch/include/torch/csrc/api/include
DEPENDPATH += $$PWD/../../libtorch/include/torch/csrc/api/include
LIBS += -L$$PWD/../../libtorch/lib/ -ltorch -lc10
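At runtime the libtorch shared libraries must also be found by the dynamic loader. Assuming the same relative layout as the INCLUDEPATH entries above, one way to handle that in qmake is an rpath entry (alternatively, LD_LIBRARY_PATH can point at libtorch/lib):

QMAKE_LFLAGS += -Wl,-rpath,$$PWD/../../libtorch/lib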
Results:
20:26:42: Debugging starts
Loop nbr.: 0
T_tensorDeclaration: 73586 ns
T_forward: 672337 ns
Loop nbr.: 1
T_tensorDeclaration: 16926 ns
T_forward: 106928 ns
Loop nbr.: 2
T_tensorDeclaration: 12575 ns
T_forward: 96994 ns
Loop nbr.: 3
T_tensorDeclaration: 12155 ns
T_forward: 95574 ns
Loop nbr.: 4
T_tensorDeclaration: 11896 ns
T_forward: 95337 ns
Loop nbr.: 5
T_tensorDeclaration: 11910 ns
T_forward: 95338 ns
Loop nbr.: 6
T_tensorDeclaration: 11818 ns
T_forward: 94641 ns
Loop nbr.: 7
T_tensorDeclaration: 11889 ns
T_forward: 95124 ns
Loop nbr.: 8
T_tensorDeclaration: 11942 ns
T_forward: 95305 ns
Loop nbr.: 9
T_tensorDeclaration: 11877 ns
T_forward: 94863 ns
Loop nbr.: 10
T_tensorDeclaration: 11875 ns
T_forward: 94936 ns
Loop nbr.: 11
T_tensorDeclaration: 11967 ns
T_forward: 94624 ns
Loop nbr.: 12
T_tensorDeclaration: 11690 ns
T_forward: 94956 ns
Loop nbr.: 13
T_tensorDeclaration: 11882 ns
T_forward: 94759 ns
Loop nbr.: 14
T_tensorDeclaration: 11825 ns
T_forward: 94982 ns
Loop nbr.: 15
T_tensorDeclaration: 11894 ns
T_forward: 94875 ns
Loop nbr.: 16
T_tensorDeclaration: 11854 ns
T_forward: 95101 ns
Loop nbr.: 17
T_tensorDeclaration: 11891 ns
T_forward: 94930 ns
Loop nbr.: 18
T_tensorDeclaration: 11838 ns
T_forward: 94664 ns
Loop nbr.: 19
T_tensorDeclaration: 11817 ns
T_forward: 94819 ns
Loop nbr.: 20
T_tensorDeclaration: 11778 ns
T_forward: 94940 ns
Loop nbr.: 21
T_tensorDeclaration: 11930 ns
T_forward: 95169 ns
Loop nbr.: 22
T_tensorDeclaration: 11840 ns
T_forward: 95371 ns
Loop nbr.: 23
T_tensorDeclaration: 11811 ns
T_forward: 94682 ns
Loop nbr.: 24
T_tensorDeclaration: 11885 ns
T_forward: 94658 ns
Loop nbr.: 25
T_tensorDeclaration: 11908 ns
T_forward: 94840 ns
Loop nbr.: 26
T_tensorDeclaration: 11973 ns
T_forward: 100463 ns
Loop nbr.: 27
T_tensorDeclaration: 13257 ns
T_forward: 98160 ns
Loop nbr.: 28
T_tensorDeclaration: 11934 ns
T_forward: 95079 ns
Loop nbr.: 29
T_tensorDeclaration: 11860 ns
T_forward: 95174 ns
Loop nbr.: 30
T_tensorDeclaration: 11840 ns
T_forward: 95035 ns
Loop nbr.: 31
T_tensorDeclaration: 11814 ns
T_forward: 96194 ns
Loop nbr.: 32
T_tensorDeclaration: 11870 ns
T_forward: 95375 ns
Loop nbr.: 33
T_tensorDeclaration: 11822 ns
T_forward: 95376 ns
Loop nbr.: 34
T_tensorDeclaration: 11726 ns
T_forward: 94956 ns
Loop nbr.: 35
T_tensorDeclaration: 11861 ns
T_forward: 95158 ns
Loop nbr.: 36
T_tensorDeclaration: 11707 ns
T_forward: 95276 ns
Loop nbr.: 37
T_tensorDeclaration: 11738 ns
T_forward: 95323 ns
Loop nbr.: 38
T_tensorDeclaration: 11945 ns
T_forward: 95335 ns
Loop nbr.: 39
T_tensorDeclaration: 11914 ns
T_forward: 95224 ns
Loop nbr.: 40
T_tensorDeclaration: 11771 ns
T_forward: 95094 ns
Loop nbr.: 41
T_tensorDeclaration: 11677 ns
T_forward: 95076 ns
Loop nbr.: 42
T_tensorDeclaration: 11944 ns
T_forward: 95172 ns
Loop nbr.: 43
T_tensorDeclaration: 11729 ns
T_forward: 95244 ns
Loop nbr.: 44
T_tensorDeclaration: 11973 ns
T_forward: 95500 ns
Loop nbr.: 45
T_tensorDeclaration: 11858 ns
T_forward: 95404 ns
Loop nbr.: 46
T_tensorDeclaration: 11948 ns
T_forward: 95370 ns
Loop nbr.: 47
T_tensorDeclaration: 11728 ns
T_forward: 95435 ns
Loop nbr.: 48
T_tensorDeclaration: 12010 ns
T_forward: 95462 ns
Loop nbr.: 49
T_tensorDeclaration: 11789 ns
T_forward: 95408 ns
Loop nbr.: 50
T_tensorDeclaration: 11750 ns
T_forward: 95111 ns
Loop nbr.: 51
T_tensorDeclaration: 11722 ns
T_forward: 95340 ns
Loop nbr.: 52
T_tensorDeclaration: 11809 ns
T_forward: 95448 ns
Loop nbr.: 53
T_tensorDeclaration: 11802 ns
T_forward: 95283 ns
Loop nbr.: 54
T_tensorDeclaration: 11733 ns
T_forward: 94973 ns
Loop nbr.: 55
T_tensorDeclaration: 11885 ns
T_forward: 95266 ns
Loop nbr.: 56
T_tensorDeclaration: 11904 ns
T_forward: 95313 ns
Loop nbr.: 57
T_tensorDeclaration: 11845 ns
T_forward: 95340 ns
Loop nbr.: 58
T_tensorDeclaration: 11844 ns
T_forward: 94570 ns
Loop nbr.: 59
T_tensorDeclaration: 11853 ns
T_forward: 95225 ns
Loop nbr.: 60
T_tensorDeclaration: 11875 ns
T_forward: 94859 ns
Loop nbr.: 61
T_tensorDeclaration: 11816 ns
T_forward: 95111 ns
Loop nbr.: 62
T_tensorDeclaration: 11861 ns
T_forward: 95106 ns
Loop nbr.: 63
T_tensorDeclaration: 12010 ns
T_forward: 96033 ns
Loop nbr.: 64
T_tensorDeclaration: 11941 ns
T_forward: 97326 ns
Loop nbr.: 65
T_tensorDeclaration: 11956 ns
T_forward: 95006 ns
Loop nbr.: 66
T_tensorDeclaration: 11985 ns
T_forward: 122965 ns
Loop nbr.: 67
T_tensorDeclaration: 25619 ns
T_forward: 109450 ns
Loop nbr.: 68
T_tensorDeclaration: 11941 ns
T_forward: 95234 ns
Loop nbr.: 69
T_tensorDeclaration: 11843 ns
T_forward: 95319 ns
Loop nbr.: 70
T_tensorDeclaration: 11895 ns
T_forward: 95040 ns
Loop nbr.: 71
T_tensorDeclaration: 11932 ns
T_forward: 95334 ns
Loop nbr.: 72
T_tensorDeclaration: 11891 ns
T_forward: 95289 ns
Loop nbr.: 73
T_tensorDeclaration: 11805 ns
T_forward: 95366 ns
Loop nbr.: 74
T_tensorDeclaration: 11795 ns
T_forward: 95348 ns
Loop nbr.: 75
T_tensorDeclaration: 11955 ns
T_forward: 95634 ns
Loop nbr.: 76
T_tensorDeclaration: 11753 ns
T_forward: 94713 ns
Loop nbr.: 77
T_tensorDeclaration: 11856 ns
T_forward: 95359 ns
Loop nbr.: 78
T_tensorDeclaration: 11912 ns
T_forward: 94837 ns
Loop nbr.: 79
T_tensorDeclaration: 11818 ns
T_forward: 95065 ns
Loop nbr.: 80
T_tensorDeclaration: 11819 ns
T_forward: 94696 ns
Loop nbr.: 81
T_tensorDeclaration: 11834 ns
T_forward: 95074 ns
Loop nbr.: 82
T_tensorDeclaration: 11797 ns
T_forward: 95438 ns
Loop nbr.: 83
T_tensorDeclaration: 11821 ns
T_forward: 95284 ns
Loop nbr.: 84
T_tensorDeclaration: 11881 ns
T_forward: 94846 ns
Loop nbr.: 85
T_tensorDeclaration: 11867 ns
T_forward: 95104 ns
Loop nbr.: 86
T_tensorDeclaration: 11830 ns
T_forward: 94943 ns
Loop nbr.: 87
T_tensorDeclaration: 11865 ns
T_forward: 95280 ns
Loop nbr.: 88
T_tensorDeclaration: 11604 ns
T_forward: 94547 ns
Loop nbr.: 89
T_tensorDeclaration: 11758 ns
T_forward: 94695 ns
Loop nbr.: 90
T_tensorDeclaration: 11761 ns
T_forward: 94878 ns
Loop nbr.: 91
T_tensorDeclaration: 11788 ns
T_forward: 95150 ns
Loop nbr.: 92
T_tensorDeclaration: 11890 ns
T_forward: 95654 ns
Loop nbr.: 93
T_tensorDeclaration: 11898 ns
T_forward: 94998 ns
Loop nbr.: 94
T_tensorDeclaration: 11762 ns
T_forward: 95416 ns
Loop nbr.: 95
T_tensorDeclaration: 11848 ns
T_forward: 3099025 ns
Loop nbr.: 96
T_tensorDeclaration: 12879 ns
T_forward: 4235451 ns
Loop nbr.: 97
T_tensorDeclaration: 12708 ns
T_forward: 4268075 ns
Loop nbr.: 98
T_tensorDeclaration: 12960 ns
T_forward: 4278626 ns
Loop nbr.: 99
T_tensorDeclaration: 12897 ns
T_forward: 4275762 ns
Loop nbr.: 100
T_tensorDeclaration: 12794 ns
T_forward: 4086642 ns
20:26:49: Debugging has finished
...
The forward time is around 100 µs for the first 95 iterations (loops 0-94), but jumps to around 4000 µs from loop 95 onward. In comparison, the tensor declaration time stays roughly constant.
It looks a bit like a memory issue, but I don’t see where it comes from.
The model is in eval() mode and a torch::NoGradGuard is in place.
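To test the memory hypothesis, one thing I could do is print the free GPU memory at each iteration, roughly like this (a sketch; cudaMemGetInfo() comes from the CUDA runtime, so it would need #include <cuda_runtime.h> and -lcudart, which my project doesn't link yet):

    // Query free/total device memory from the CUDA runtime
    size_t free_bytes = 0, total_bytes = 0;
    cudaMemGetInfo(&free_bytes, &total_bytes);
    qDebug() << "Free GPU memory:" << free_bytes / (1024 * 1024) << "MiB";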
I found this recent post, which looked similar, but the reply there doesn’t seem to help.
Is there anything obvious that I’m missing? What could I do to solve this issue?
Here is my configuration:
OS: Kubuntu
Processor: Intel® Core™ i7-8850H CPU @ 2.60GHz
GPU: GP107GLM [Quadro P1000 Mobile]
CUDA version: 9.1
libtorch version: libtorch-cxx11-abi-shared-with-deps-1.4.0+cu92
Thanks a lot, and I wish you a great day!
Best regards,
Florent