Are there any tricks to reduce run time with libtorch?

Hi, I tried to export a model from PyTorch to libtorch, and I found that the libtorch model's forward time is the same as PyTorch's (my model is ResNet-50). Is there any guide to help me improve the speed of libtorch? For example, gcc flags, coding considerations, and so on… and what factors will affect the running time of net->forward?
I know that Python is not necessarily slower than C++; I just want to make sure I can get the highest performance with libtorch. :pray:

This is the CMakeLists I use:

# 3.0 is obsolete and triggers deprecation warnings on modern CMake;
# 3.5 is the oldest version current releases still accept without noise.
cmake_minimum_required(VERSION 3.5)
project(pvanet_release)

# Default libtorch location; overridable from the command line with
#   cmake -DTorch_DIR=/path/to/libtorch/share/cmake/Torch ..
# instead of editing this file.
set(Torch_DIR "/data/home/ryankang/Workspace/pvanet/libtorch_example/pvanet_libtorch/libtorch/share/cmake/Torch/"
    CACHE PATH "Directory containing TorchConfig.cmake")

find_package(Torch REQUIRED)
find_package(OpenCV REQUIRED)
# cuda_add_library() used below comes from the legacy FindCUDA module;
# require it explicitly rather than relying on TorchConfig to pull it in.
find_package(CUDA REQUIRED)

# Configure-time diagnostics so a wrong Torch/OpenCV pickup is visible
# immediately in the cmake output.
message(STATUS "Pytorch status:")
message(STATUS "    libraries: ${TORCH_LIBRARIES}")
message(STATUS "OpenCV library status:")
message(STATUS "    version: ${OpenCV_VERSION}")
message(STATUS "    libraries: ${OpenCV_LIBS}")
message(STATUS "    include path: ${OpenCV_INCLUDE_DIRS}")

# Request C++11 and make a compiler that cannot provide it a hard configure
# error instead of a silent downgrade.
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_VERBOSE_MAKEFILE ON)

# NOTE(review): directory-scoped include_directories() leaks into every target
# defined below; with target-based CMake prefer target_include_directories()
# on the individual targets (kept here because the targets are defined later).
include_directories(${TORCH_INCLUDE_DIRS})
include_directories(${OpenCV_INCLUDE_DIRS})

# Directory-wide host compiler options: OpenMP, pthreads, SSE3 vectorization,
# and a strict-but-practical warning set.
add_compile_options(-fopenmp -pthread -msse3 -Wall -Wextra -Wno-unused-parameter)


# nvcc flags (consumed by FindCUDA's cuda_add_library below). Host-side
# warnings are forwarded via --compiler-options; per-configuration
# optimisation/debug flags mirror the usual CMake defaults.
set(CUDA_NVCC_FLAGS "-std=c++11")
list(APPEND CUDA_NVCC_FLAGS
     "--compiler-options" "-Wall"
     "--compiler-options" "-Wextra"
     "--compiler-options" "-Wno-unused-parameter"
     "--compiler-options" "-Wno-unknown-pragmas")
set(CUDA_NVCC_FLAGS_RELEASE "-O3" "-DNDEBUG")
set(CUDA_NVCC_FLAGS_DEBUG "-g" "-G" "-O0")
set(CUDA_PROPAGATE_HOST_FLAGS ON)


# Consolidated host C++ flags. The original appended two overlapping strings
# containing duplicate -Wall/-g, the conflicting pair -std=c++0x / -std=c++11,
# and a bare "-d", which is not a valid gcc option; all deduplicated here.
# _GLIBCXX_USE_CXX11_ABI=0 must match the ABI the libtorch binaries were built
# with (the official pre-cxx11-ABI downloads) or linking fails with undefined
# std::string symbols.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fPIC -O3 -g -DNDEBUG -fwrapv -fno-common -funroll-loops -ftree-vectorize -D__INTRIN__ENABLED__=1 -D_GLIBCXX_USE_CXX11_ABI=0 -pthread -Wall -Wpedantic -Wsign-compare -Wunreachable-code -Wno-unused-result")

# All library sources (headers listed too so IDEs index them).
set(SOURCE_FILES
        # region proposal / box utilities
        anchors.cpp anchors.h
        boxutils.cpp boxutils.h
        proposal_layer.cpp proposal_layer.h
        # configuration and dataset metadata
        config.cpp config.h
        dataset_classes.cpp dataset_classes.h
        # image handling and debugging helpers
        image_utils.cpp image_utils.h
        debug.cpp debug.h
        # NMS and ROI pooling, CPU and CUDA variants
        nms.cpp nms.h
        roi_pooling.cpp roi_pooling.h
        nms/nms.cpp nms/nms.h
        nms/nms_cuda.cu nms/nms_cuda.h
        roipool/roipool_cuda.cu roipool/roipool_cuda.h
        )


# Build the support code once as a static library. cuda_add_library (from
# FindCUDA) compiles the .cu files with nvcc and everything else with the
# host compiler.
cuda_add_library("${CMAKE_PROJECT_NAME}_lib" STATIC ${SOURCE_FILES})
# The original linked ${REQUIRED_LIBS}, a variable never defined in this file
# (it expanded to nothing); link the real dependencies instead.
target_link_libraries("${CMAKE_PROJECT_NAME}_lib" ${TORCH_LIBRARIES} ${OpenCV_LIBS})

# Link the demo against the library rather than passing ${SOURCE_FILES} to
# add_executable again — the original compiled every source twice, doubling
# build time and risking duplicate-symbol errors (and plain add_executable
# cannot compile the .cu files anyway).
add_executable(pvanet_release pvanet_demo.cpp)
target_link_libraries(pvanet_release
    "${CMAKE_PROJECT_NAME}_lib"
    ${TORCH_LIBRARIES}
    ${OpenCV_LIBS})

thank you!

Currently, libtorch has no advantage in inference speed, and there are some problems with parallel inference. See this https://github.com/pytorch/pytorch/issues/26948