Hi everyone, I am building an Android app with Java, C++, and LibTorch. My lightweight model is a VQA model that takes 4 arguments: image_features, input_ids, attention_mask, token_type_ids.
Currently, I cannot forward the inputs. The call raises a c10::Error:
"index out of range in self\n \n Debug info for handle(s): -1, was not found.\n \nException raised from operator() at ../aten/src/ATen/native/TensorAdvancedIndexing.cpp:980 (most recent call first):\n(no backtrace available)"
I don't know what this error means, and I have searched without finding a solution. Please help me.
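From what I can find, this message comes from an index lookup: the file in the trace, TensorAdvancedIndexing.cpp, contains the index_select kernel, which raises "index out of range in self" when an index is larger than the dimension being indexed (and nn.Embedding lookups go through it). To have something to compare against, this toy desktop LibTorch snippet (my assumption about where the message comes from, not my app code) should raise the same message:

#include <torch/torch.h>
#include <iostream>

int main() {
    // An "embedding table" with 10 rows; index 12 is out of range.
    auto table = torch::randn({10, 4});
    auto idx = torch::tensor({12}, torch::kLong);
    try {
        auto rows = torch::index_select(table, 0, idx);
    } catch (const c10::Error& e) {
        // The message contains "index out of range in self",
        // just like the error I get on the phone.
        std::cerr << e.what() << std::endl;
    }
    return 0;
}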
This is my CMakeLists.txt:
cmake_minimum_required(VERSION 3.4.1)
# THIS HAS TO COME BEFORE THE PROJECT LINE
set(CMAKE_C_COMPILER "gcc")
set(CMAKE_CXX_COMPILER "g++")
set(CMAKE_CXX_FLAGS_DEBUG "")
set(CMAKE_CXX_FLAGS_RELEASE "")
#set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
set(CMAKE_CXX_FLAGS "-D_GLIBCXX_USE_CXX11_ABI=1")
set(TORCH_CXX_FLAGS "-D_GLIBCXX_USE_CXX11_ABI=1")
#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
#set(TORCH_CXX_FLAGS "${TORCH_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
# THIS HAS TO COME BEFORE THE PROJECT LINE
set(TARGET pytorch_nativeapp_VQA)
set(CMAKE_VERBOSE_MAKEFILE TRUE CACHE BOOL "" FORCE)
project(${TARGET} VERSION 0.0.0 LANGUAGES C CXX)
set(CMAKE_CXX_STANDARD 17)
if("${ANDROID_ABI}" STREQUAL "")
EXECUTE_PROCESS(COMMAND uname -m COMMAND tr -d '\n' OUTPUT_VARIABLE ARCHITECTURE)
set(ANDROID_ABI ${ARCHITECTURE})
endif()
set(ANDROID_ABI "arm64-v8a")
message(STATUS "Compile for architecture: ${ANDROID_ABI}")
message("Compile for architecture: ${ANDROID_ABI}")
set(build_DIR ${CMAKE_SOURCE_DIR}/build)
set(pytorch_testapp_cpp_DIR ${CMAKE_CURRENT_LIST_DIR}/src/main/cpp)
message("CMAKE_CURRENT_LIST_DIR: ${CMAKE_CURRENT_LIST_DIR}")
message("CMAKE_SOURCE_DIR: ${CMAKE_SOURCE_DIR}")
message("CMAKE_CURRENT_SOURCE_DIR: ${CMAKE_CURRENT_SOURCE_DIR}")
file(GLOB pytorch_testapp_SOURCES
${pytorch_testapp_cpp_DIR}/native-lib.cpp
)
message("pytorch_testapp_SOURCES: ${pytorch_testapp_SOURCES}")
# OpenCV
set(distribution_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../distribution)
set(OpenCV_DIR ${distribution_DIR}/libs/arm64-v8a/sdk/native/jni)
message("distribution_DIR: ${distribution_DIR}")
message("OpenCV_DIR: ${OpenCV_DIR}")
find_package(OpenCV 4.5.4 REQUIRED)
#set(OPENCV_INCLUDE_DIR "${CMAKE_CURRENT_LIST_DIR}/path/to/opencv/include") # Replace with the path to your OpenCV include directory
#set(OPENCV_LIB_DIR "${CMAKE_CURRENT_LIST_DIR}/path/to/opencv/libs/${ANDROID_ABI}") # Replace with the path to your OpenCV libs directory
include_directories(${distribution_DIR}/libs/arm64-v8a/opencv2)
# Find PyTorch libraries and include directories
#set(PYTORCH_ROOT_DIR $ENV{HOME}/.gradle/caches/modules-2/files-2.1/org.pytorch)
set(PYTORCH_ROOT_DIR ${distribution_DIR}/libs/arm64-v8a/org.pytorch)
file(GLOB PYTORCH_INCLUDE_DIRS "${build_DIR}/pytorch_android*.aar/headers")
file(GLOB PYTORCH_LINK_DIRS "${build_DIR}/pytorch_android*.aar/jni/${ANDROID_ABI}")
message("######### PYTORCH_INCLUDE_DIRS: ${PYTORCH_INCLUDE_DIRS}")
message("######### PYTORCH_LINK_DIRS: ${PYTORCH_LINK_DIRS}")
find_library(PYTORCH_LIBRARY pytorch_jni_lite
PATHS ${PYTORCH_LINK_DIRS}
NO_CMAKE_FIND_ROOT_PATH)
find_library(FBJNI_LIBRARY fbjni
PATHS ${PYTORCH_LINK_DIRS}
NO_CMAKE_FIND_ROOT_PATH)
message("######### PYTORCH_LIBRARY: ${PYTORCH_LIBRARY}")
message("######### FBJNI_LIBRARY: ${FBJNI_LIBRARY}")
# PyTorch
add_library(${TARGET} SHARED
${pytorch_testapp_SOURCES}
)
target_include_directories(${TARGET} PRIVATE
${CMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES}
${OpenCV_INCLUDE_DIRS}
${PYTORCH_INCLUDE_DIRS}
)
#target_link_libraries(${TARGET} -lpytorch_jni ${OpenCV_LIBS} log)
target_link_libraries(${TARGET}
${OpenCV_LIBS}
-pthread
${PYTORCH_LIBRARY} # use the PYTORCH_LIBRARY found above
${FBJNI_LIBRARY}
log)
target_compile_options(${TARGET} PRIVATE -fexceptions)
And this is my build.gradle:
plugins {
id 'com.android.application'
}
android {
configurations {
extractForNativeBuild
}
compileSdk 34
ndkVersion '21.4.7075529'
defaultConfig {
applicationId "org.pytorch.helloworld"
minSdk 28
targetSdk 34
versionCode 1
versionName "1.0"
ndk {
abiFilters "arm64-v8a"
}
testInstrumentationRunner "androidx.test.runner.AndroidJUnitRunner"
signingConfig signingConfigs.debug
externalNativeBuild {
cmake {
cppFlags ''
arguments "-DANDROID_STL=c++_shared"
}
}
}
externalNativeBuild {
cmake {
path './CMakeLists.txt'
version '3.22.1'
}
}
buildTypes {
release {
minifyEnabled false
}
}
buildFeatures {
viewBinding true
}
sourceSets {
main {
jniLibs.srcDirs = ['src/main/jniLibs']
}
}
}
dependencies {
implementation 'androidx.core:core-ktx:1.8.0'
implementation 'androidx.appcompat:appcompat:1.6.1'
implementation 'com.google.android.material:material:1.4.0'
implementation 'androidx.constraintlayout:constraintlayout:2.1.4'
implementation 'com.google.code.gson:gson:2.8.2'
implementation 'org.jetbrains.bio:npy:0.3.5'
implementation 'org.pytorch:pytorch_android_lite:1.10.0'
extractForNativeBuild 'org.pytorch:pytorch_android_lite:1.10.0'
}
task extractAARForNativeBuild {
doLast {
configurations.extractForNativeBuild.files.each {
def file = it.absoluteFile
copy {
from zipTree(file)
into "$buildDir/$file.name"
include "headers/**"
include "jni/**"
}
}
}
}
tasks.whenTaskAdded { task ->
if (task.name.contains('externalNativeBuild')) {
task.dependsOn(extractAARForNativeBuild)
}
}
This is my native-lib.cpp:
#include <jni.h>
#include <android/log.h>
#include <torch/script.h>
#include <torch/csrc/jit/api/module.h>
#include <torch/csrc/jit/mobile/function.h>
#include <torch/csrc/jit/mobile/import.h>
#include <torch/csrc/jit/mobile/interpreter.h>
#include <torch/csrc/jit/mobile/observer.h>
#include <ATen/ATen.h>
#include <torch/csrc/autograd/grad_mode.h>
#include <c10/core/TensorOptions.h>
#include "opencv2/opencv.hpp"
#include "fastBPE.hpp"
#include "npy.hpp"
#include <iostream>
#include <fstream>
#include <sstream>
#include <memory>
#include <string>
#include <vector>
#include <unordered_map>
using namespace std;
using namespace fastBPE;
std::unordered_map<std::string, int> loadVocab(const std::string& filename) {
std::unordered_map<std::string, int> vocab;
std::ifstream file(filename);
if (!file.is_open()) {
std::cerr << "Không thể mở file " << filename << std::endl;
return vocab;
}
std::string line;
bool inVocabSection = false;
while (std::getline(file, line)) {
// Find the start of the "vocab" section in the JSON
if (line.find("\"vocab\": {") != std::string::npos) {
inVocabSection = true;
continue;
}
// End of the "vocab" section
bool found = line.find("},") != std::string::npos;
if (inVocabSection && found) {
file.close();
break;
}
// Process lines inside the "vocab" section
if (inVocabSection) {
size_t colonPosFirst = line.find('"');
size_t colonPos = line.find_last_of('"');
if (colonPos != std::string::npos) {
std::string key = line.substr(colonPosFirst + 1, colonPos - colonPosFirst - 1); // strip the surrounding quotes
key.erase(0, key.find_first_not_of(" \t")); // trim leading whitespace
key.erase(key.find_last_not_of(" \t") + 1); // trim trailing whitespace
std::string valueStr = line.substr(colonPos + 2);
valueStr.erase(0, valueStr.find_first_not_of(" \t")); // trim leading whitespace
valueStr.erase(valueStr.find_last_not_of(" \t,") + 1); // trim trailing whitespace and comma
int value = std::stoi(valueStr);
vocab[key] = value;
}
}
}
// file.close();
return vocab;
}
extern "C" JNIEXPORT jstring JNICALL
Java_org_pytorch_helloworld_NativeInterface_processNpyWithModel(
JNIEnv* env,
jclass clazz,
jstring imagePath,
jstring segmentedText,
jstring modelPath) {
// Convert jstring to std::string
const char *modelPathCStr = env->GetStringUTFChars(modelPath, nullptr);
std::string modelPathStr(modelPathCStr);
auto options1 = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCPU);
auto img_tensor_1 = torch::ones({1, 3, 224, 224}, options1);
auto input_ids_tensor_1 = torch::ones({1, 20}, torch::TensorOptions().dtype(torch::kInt32));
auto attention_mask_tensor_1 = torch::ones({1, 20}, torch::TensorOptions().dtype(torch::kInt32));
auto token_type_ids_tensor_1 = torch::ones({1, 20}, torch::TensorOptions().dtype(torch::kInt32));
std::vector<torch::jit::IValue> inputs;
inputs.push_back(img_tensor_1);
inputs.push_back(input_ids_tensor_1);
inputs.push_back(attention_mask_tensor_1);
inputs.push_back(token_type_ids_tensor_1);
// Load the PyTorch model
c10::optional<at::Device> device = at::Device(at::kCPU);
torch::jit::ExtraFilesMap extra_files;
torch::jit::mobile::Module model = torch::jit::_load_for_mobile(modelPathStr, device, extra_files);
torch::Tensor outputTensor;
try {
outputTensor = model.forward(inputs).toTensor();
std::cout << "Output tensor: " << outputTensor.sizes() << ", " << outputTensor.dtype() << std::endl;
} catch (const c10::Error& e) {
std::cerr << "Error during forward pass: " << e.what() << std::endl;
}
std::string result = "Processed image and question with model.";
return env->NewStringUTF(result.c_str());
}
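One thing I am not sure about on the C++ side: the tokenizer output used for tracing should be int64 (Long) tensors as far as I know, while the JNI code above builds kInt32 tensors, and the Python wrapper below returns a 3-tuple (out, _, _) while I call .toTensor() on the result. This is only a guess, but the variant I plan to try next is a small helper like this (the name forwardWithLongIds is just mine; it would sit next to processNpyWithModel in native-lib.cpp):

// Hypothetical variant (my guess, not verified): int64 ids to match the dtypes
// seen at trace time, and unpack the first element of the returned tuple.
static torch::Tensor forwardWithLongIds(torch::jit::mobile::Module& model,
                                        const torch::Tensor& image_features) {
    auto long_opts = torch::TensorOptions().dtype(torch::kInt64);
    std::vector<torch::jit::IValue> inputs{
        image_features,
        torch::ones({1, 20}, long_opts),   // input_ids
        torch::ones({1, 20}, long_opts),   // attention_mask
        torch::ones({1, 20}, long_opts)};  // token_type_ids
    auto result = model.forward(inputs);
    // The export wrapper's forward returns (out, _, _), so the IValue is a tuple.
    return result.toTuple()->elements()[0].toTensor();
}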
This is the script I use to convert the .pt model into a .ptl file:
import torch
from torch.utils.mobile_optimizer import optimize_for_mobile
from classifier import SimpleClassifier, StudentSimpleClassifier
from PIL import Image
import numpy as np
from transformers import ViTFeatureExtractor,AutoTokenizer
import argparse
import base_model
import mobilenetv3
from collections import namedtuple
import torch.utils
import torch.utils.bundled_inputs
from transformers import BatchEncoding
import torch.nn as nn
def get_arguments():
parser = argparse.ArgumentParser()
parser.add_argument('-gpu', type=str, default='0')
# Choices of attention models
parser.add_argument('--model', type=str, default='CrossAtt', choices=['CMSA', 'CrossAtt', 'GuidedAtt'],
help='the model we use')
# Model setting
parser.add_argument('--object_detection', action='store_true', default=False, help='Use Object Detection model?')
parser.add_argument('--vit_backbone', type=str, default='vit')
parser.add_argument('--vit_image_pretrained', type=str, default='google/vit-base-patch16-224-in21k')
parser.add_argument('--cnn_backbone', type=str, default='resnet34')
parser.add_argument('--cnn_image_pretrained', type=str, default='google/vit-base-patch16-224-in21k')
parser.add_argument('--bert_type', type=str, default='phobert')
parser.add_argument('--bert_pretrained', type=str, default='vinai/phobert-base')
parser.add_argument('--input_size', type=int, default=224)
parser.add_argument('--data_dir', type=str, default='/content/dataset')
parser.add_argument('--output', type=str, default='/content')
# Define dimensions
parser.add_argument('--v_vit_dim', type=int, default=768,
help='dim of image features')
parser.add_argument('--v_cnn_dim', type=int, default=512,
help='dim of image features')
parser.add_argument('--q_dim', type=int, default=768,
help='dim of bert question features')
parser.add_argument('--f_mid_dim', type=int, default=1024,
help='dim of middle layer of fusion layer')
parser.add_argument('--joint_dim', type=int, default=1024,
help='dim of joint features of fusion layer')
parser.add_argument('--glimpse', type=int, default=1,
help='number of glimpse for the attention reduction')
# Multihead self-attention config
parser.add_argument('--hidden_dim', type=int, default=2048,
help='dim of hidden layer of feed forward layers of transformers')
parser.add_argument('--num_heads', type=int, default=8,
help='number of heads of transformers encoder')
# BAN - Bilinear Attention Networks
parser.add_argument('--gamma', type=int, default=2,
help='glimpse in Bilinear Attention Networks')
# Choices of RNN models
parser.add_argument('--rnn', type=str, default='LSTM', choices=['LSTM', 'GRU'],
help='the RNN we use')
parser.add_argument('--op', type=str, default='c',
help='concatenated 600-D word embedding')
parser.add_argument('--question_len', default=20, type=int, metavar='N',
help='maximum length of input question')
parser.add_argument('--tfidf', type=bool, default=None,
help='tfitrain_log_df word embedding?')
# Activation function + dropout for classification module
parser.add_argument('--activation', type=str, default='relu', choices=['relu'],
help='the activation to use for final classifier')
parser.add_argument('--dropout', default=0.2, type=float, metavar='dropout',
help='dropout of rate of final classifier')
parser.add_argument('--clip_norm', default=.25, type=float, metavar='NORM',
help='clip threshold of gradients')
# Training setting
parser.add_argument('--seed', type=int, default=1234)
parser.add_argument('--batch_size', type=int, default=32)
parser.add_argument('--nepochs', type=int, default=100)
parser.add_argument('--resume_epoch', type=int, default=100)
parser.add_argument('--train_fold', type=str, default='/content')
parser.add_argument('--run_id', type=int, default=-1)
parser.add_argument('--T', type=int, default=2)
# Optimizer setting
parser.add_argument('--init_lr', type=float, default=1e-5)
parser.add_argument('--max_lr', type=float, default=5e-5)
parser.add_argument('--weight_decay', type=float, default=5e-4)
parser.add_argument('--momentum', type=float, default=0.9)
parser.add_argument('--warmup_steps', type=int, default=20)
parser.add_argument('--label_smooth', type=float, default=0.0)
parser.add_argument('--threshold', type=float, default=0.7)
parser.add_argument('--print_summary', action='store_true', default=False, help='Print model summary?')
parser.add_argument('--save_every', type=int, default=5)
parser.add_argument('--log_every', type=int, default=25)
parser.add_argument('--emb_init', type=str, default='biowordvec', choices=['glove', 'biowordvec', 'biosentvec'])
parser.add_argument('--self_att', action='store_true', default=False, help='Use Self Attention?')
parser.add_argument('--use_spatial', action='store_true', default=False, help='Use spatial feature?')
parser.add_argument('--use_cma', action='store_true', default=False, help='Use CMA?')
parser.add_argument('--result_fold', type=str, default='results')
return parser.parse_args()
args = get_arguments()
feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
def Model():
# device = torch.device(f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu")
device = "cpu"
args.device = device
mobiNetCNN = mobilenetv3.mobilenetv3_small()
mobiNetViT = mobilenetv3.mobilenetv3_smallViT()
current_model_dict = mobiNetCNN.state_dict()
loaded_state_dict = torch.load("./pretrained_mobileNet/mobilenetv3-small-55df8e1f.pth")
new_state_dict={k:v if v.size()==current_model_dict[k].size() else current_model_dict[k] for k,v in zip(current_model_dict.keys(), loaded_state_dict.values())}
mobiNetCNN.load_state_dict(new_state_dict, strict=False)
current_model_dict = mobiNetViT.state_dict()
loaded_state_dict = torch.load("./pretrained_mobileNet/mobilenetv3-small-55df8e1f.pth")
new_state_dict={k:v if v.size()==current_model_dict[k].size() else current_model_dict[k] for k,v in zip(current_model_dict.keys(), loaded_state_dict.values())}
mobiNetViT.load_state_dict(new_state_dict, strict=False)
args.num_classes = 353
student_model = base_model.build_GuidedAtt_replaceResNetViT(args, mobiNetCNN,mobiNetViT)
return student_model
def createInput(img_path = "/home/dmp/1.Users/05.huy.hhoang/1/1271.jpg", question = "có bao nhiêu người đàn ông trong nhà bếp, hai người đang ngồi trên quầy"):
img = Image.open(img_path)
img_np = np.array(img)
if len(img_np.shape) == 2:
img_np = np.stack((img_np, img_np, img_np), axis=-1)
img = Image.fromarray(img_np.astype(np.uint8))
visual_feature = feature_extractor([img], return_tensors='pt')
embed_question = tokenizer([question], padding='max_length',
max_length=20, truncation=True, return_tensors='pt')
return (visual_feature, embed_question)
class MyModel(nn.Module):
def __init__(self, model):
super().__init__()
self.model = model
def forward(self, image_features, input_ids, attention_mask, token_type_ids):
print("start forward mymodel")
output= self.model(image_features, (input_ids, attention_mask, token_type_ids))
print("--------------")
return output
def optimizeSaver():
print("START CONVERT")
model = Model()
model.load_state_dict(torch.load("GuidedAtt_vit_resnet34_phobert_31_12_2023__01_19_41.pt", map_location='cpu'),strict=True)
model.eval()
my_model = MyModel(model)
print("DONE LOAD MODEL")
temp_input = createInput()
visual_feature, embed_question = temp_input
print("DONE CREATE INPUT")
#test forward input:
print("embed_question:",embed_question)
print("==============================================")
image_features = visual_feature['pixel_values']
input_ids = embed_question['input_ids']
attention_mask = embed_question['attention_mask']
token_type_ids = embed_question['token_type_ids']
out, _, _ = my_model.forward(image_features, input_ids, attention_mask, token_type_ids)
print("mymodel out:",out)
print("mymodel out shape:",out.shape)
# print("inputs:",inputs)
module = torch.jit.trace(my_model, (image_features, input_ids, attention_mask, token_type_ids))
traced_script_module_optimized = optimize_for_mobile(module)
#save
traced_script_module_optimized._save_for_lite_interpreter("common.ptl")
print("DONE")
optimizeSaver()
I have re-tested the shapes of the input tensors and can confirm (a desktop sanity check I plan to run is sketched after this list):
- image_features: torch.Size([1, 3, 224, 224])
- input_ids: torch.Size([1, 20])
- attention_mask: torch.Size([1, 20])
- token_type_ids: torch.Size([1, 20])
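To separate a model/export problem from an Android/NDK problem, I also plan to load the common.ptl saved by the script with desktop LibTorch and push through exactly the same all-ones inputs as in native-lib.cpp. A sketch of that check (assuming a desktop libtorch build that can read the 1.10 lite-interpreter format, and that common.ptl is in the working directory):

#include <torch/csrc/jit/mobile/import.h>
#include <torch/csrc/jit/mobile/module.h>
#include <torch/script.h>
#include <iostream>
#include <vector>

int main() {
    c10::optional<at::Device> device = at::Device(at::kCPU);
    torch::jit::mobile::Module model = torch::jit::_load_for_mobile("common.ptl", device);

    // Same dummy inputs as in processNpyWithModel.
    std::vector<torch::jit::IValue> inputs{
        torch::ones({1, 3, 224, 224}, torch::kFloat32),
        torch::ones({1, 20}, torch::kInt32),
        torch::ones({1, 20}, torch::kInt32),
        torch::ones({1, 20}, torch::kInt32)};

    try {
        auto out = model.forward(inputs);
        std::cout << "forward OK, output isTuple=" << out.isTuple() << std::endl;
    } catch (const c10::Error& e) {
        std::cerr << "same error off-device: " << e.what() << std::endl;
    }
    return 0;
}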