Hi everyone, I am building an Android app with Java, C++, and LibTorch. My lightweight model is a VQA model that takes 4 arguments: image_features, input_ids, attention_mask, token_type_ids.
Currently, I cannot forward the inputs. The call raises a c10::Error:
"index out of range in self\n \n Debug info for handle(s): -1, was not found.\n \nException raised from operator() at ../aten/src/ATen/native/TensorAdvancedIndexing.cpp:980 (most recent call first):\n(no backtrace available)"
I don't know what this error means, and I have searched without finding a solution. Please help me.
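From what I can find, this message comes from an index lookup: the file in the trace, TensorAdvancedIndexing.cpp, contains the index_select kernel, which raises "index out of range in self" when an index is larger than the dimension being indexed (and nn.Embedding lookups go through it). To have something to compare against, this toy desktop LibTorch snippet (my assumption about where the message comes from, not my app code) should raise the same message:

#include <torch/torch.h>
#include <iostream>

int main() {
    // An "embedding table" with 10 rows; index 12 is out of range.
    auto table = torch::randn({10, 4});
    auto idx = torch::tensor({12}, torch::kLong);
    try {
        auto rows = torch::index_select(table, 0, idx);
    } catch (const c10::Error& e) {
        // The message contains "index out of range in self",
        // just like the error I get on the phone.
        std::cerr << e.what() << std::endl;
    }
    return 0;
}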
This is my CMakeLists.txt:
cmake_minimum_required(VERSION 3.4.1)
# THIS HAS TO COME BEFORE THE PROJECT LINE
set(CMAKE_C_COMPILER "gcc")
set(CMAKE_CXX_COMPILER "g++")
set(CMAKE_CXX_FLAGS_DEBUG "")
set(CMAKE_CXX_FLAGS_RELEASE "")
#set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
set(CMAKE_CXX_FLAGS "-D_GLIBCXX_USE_CXX11_ABI=1")
set(TORCH_CXX_FLAGS "-D_GLIBCXX_USE_CXX11_ABI=1")
#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
#set(TORCH_CXX_FLAGS "${TORCH_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
# THIS HAS TO COME BEFORE THE PROJECT LINE
set(TARGET pytorch_nativeapp_VQA)
set(CMAKE_VERBOSE_MAKEFILE TRUE CACHE BOOL "" FORCE)
project(${TARGET} VERSION 0.0.0 LANGUAGES C CXX)
set(CMAKE_CXX_STANDARD 17)
if("${ANDROID_ABI}" STREQUAL "")
EXECUTE_PROCESS(COMMAND uname -m COMMAND tr -d '\n' OUTPUT_VARIABLE ARCHITECTURE)
set(ANDROID_ABI ${ARCHITECTURE})
endif()
set(ANDROID_ABI "arm64-v8a")
message(STATUS "Compile for architecture: ${ANDROID_ABI}")
message("Compile for architecture: ${ANDROID_ABI}")
set(build_DIR ${CMAKE_SOURCE_DIR}/build)
set(pytorch_testapp_cpp_DIR ${CMAKE_CURRENT_LIST_DIR}/src/main/cpp)
message("CMAKE_CURRENT_LIST_DIR: ${CMAKE_CURRENT_LIST_DIR}")
message("CMAKE_SOURCE_DIR: ${CMAKE_SOURCE_DIR}")
message("CMAKE_CURRENT_SOURCE_DIR: ${CMAKE_CURRENT_SOURCE_DIR}")
file(GLOB pytorch_testapp_SOURCES
${pytorch_testapp_cpp_DIR}/native-lib.cpp
)
message("pytorch_testapp_SOURCES: ${pytorch_testapp_SOURCES}")
# OpenCV
set(distribution_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../distribution)
set(OpenCV_DIR ${distribution_DIR}/libs/arm64-v8a/sdk/native/jni)
message("distribution_DIR: ${distribution_DIR}")
message("OpenCV_DIR: ${OpenCV_DIR}")
find_package(OpenCV 4.5.4 REQUIRED)
#set(OPENCV_INCLUDE_DIR "${CMAKE_CURRENT_LIST_DIR}/path/to/opencv/include") # Replace with the path to your OpenCV include directory
#set(OPENCV_LIB_DIR "${CMAKE_CURRENT_LIST_DIR}/path/to/opencv/libs/${ANDROID_ABI}") # Replace with the path to your OpenCV libs directory
include_directories(${distribution_DIR}/libs/arm64-v8a/opencv2)
# Find PyTorch libraries and include directories
#set(PYTORCH_ROOT_DIR $ENV{HOME}/.gradle/caches/modules-2/files-2.1/org.pytorch)
set(PYTORCH_ROOT_DIR ${distribution_DIR}/libs/arm64-v8a/org.pytorch)
file(GLOB PYTORCH_INCLUDE_DIRS "${build_DIR}/pytorch_android*.aar/headers")
file(GLOB PYTORCH_LINK_DIRS "${build_DIR}/pytorch_android*.aar/jni/${ANDROID_ABI}")
message("######### PYTORCH_INCLUDE_DIRS: ${PYTORCH_INCLUDE_DIRS}")
message("######### PYTORCH_LINK_DIRS: ${PYTORCH_LINK_DIRS}")
find_library(PYTORCH_LIBRARY pytorch_jni_lite
PATHS ${PYTORCH_LINK_DIRS}
NO_CMAKE_FIND_ROOT_PATH)
find_library(FBJNI_LIBRARY fbjni
PATHS ${PYTORCH_LINK_DIRS}
NO_CMAKE_FIND_ROOT_PATH)
message("######### PYTORCH_LIBRARY: ${PYTORCH_LIBRARY}")
message("######### FBJNI_LIBRARY: ${FBJNI_LIBRARY}")
# PyTorch
add_library(${TARGET} SHARED
${pytorch_testapp_SOURCES}
)
target_include_directories(${TARGET} PRIVATE
${CMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES}
${OpenCV_INCLUDE_DIRS}
${PYTORCH_INCLUDE_DIRS}
)
#target_link_libraries(${TARGET} -lpytorch_jni ${OpenCV_LIBS} log)
target_link_libraries(${TARGET}
${OpenCV_LIBS}
-pthread
${PYTORCH_LIBRARY} # use the PYTORCH_LIBRARY found above
${FBJNI_LIBRARY}
log)
target_compile_options(${TARGET} PRIVATE -fexceptions)
And this is my build.gradle:
plugins {
id 'com.android.application'
}
android {
configurations {
extractForNativeBuild
}
compileSdk 34
ndkVersion '21.4.7075529'
defaultConfig {
applicationId "org.pytorch.helloworld"
minSdk 28
targetSdk 34
versionCode 1
versionName "1.0"
ndk {
abiFilters "arm64-v8a"
}
testInstrumentationRunner "androidx.test.runner.AndroidJUnitRunner"
signingConfig signingConfigs.debug
externalNativeBuild {
cmake {
cppFlags ''
arguments "-DANDROID_STL=c++_shared"
}
}
}
externalNativeBuild {
cmake {
path './CMakeLists.txt'
version '3.22.1'
}
}
buildTypes {
release {
minifyEnabled false
}
}
buildFeatures {
viewBinding true
}
sourceSets {
main {
jniLibs.srcDirs = ['src/main/jniLibs']
}
}
}
dependencies {
implementation 'androidx.core:core-ktx:1.8.0'
implementation 'androidx.appcompat:appcompat:1.6.1'
implementation 'com.google.android.material:material:1.4.0'
implementation 'androidx.constraintlayout:constraintlayout:2.1.4'
implementation 'com.google.code.gson:gson:2.8.2'
implementation 'org.jetbrains.bio:npy:0.3.5'
implementation 'org.pytorch:pytorch_android_lite:1.10.0'
extractForNativeBuild 'org.pytorch:pytorch_android_lite:1.10.0'
}
task extractAARForNativeBuild {
doLast {
configurations.extractForNativeBuild.files.each {
def file = it.absoluteFile
copy {
from zipTree(file)
into "$buildDir/$file.name"
include "headers/**"
include "jni/**"
}
}
}
}
tasks.whenTaskAdded { task ->
if (task.name.contains('externalNativeBuild')) {
task.dependsOn(extractAARForNativeBuild)
}
}
This is my native-lib.cpp:
#include <jni.h>
#include <android/log.h>
#include <torch/script.h>
#include <torch/csrc/jit/api/module.h>
#include <torch/csrc/jit/mobile/function.h>
#include <torch/csrc/jit/mobile/import.h>
#include <torch/csrc/jit/mobile/interpreter.h>
#include <torch/csrc/jit/mobile/observer.h>
#include <ATen/ATen.h>
#include <torch/csrc/autograd/grad_mode.h>
#include <c10/core/TensorOptions.h>
#include "opencv2/opencv.hpp"
#include "fastBPE.hpp"
#include "npy.hpp"
#include <iostream>
#include <fstream>
#include <sstream>
#include <memory>
#include <string>
#include <vector>
#include <unordered_map>
using namespace std;
using namespace fastBPE;
std::unordered_map<std::string, int> loadVocab(const std::string& filename) {
std::unordered_map<std::string, int> vocab;
std::ifstream file(filename);
if (!file.is_open()) {
std::cerr << "Không thể mở file " << filename << std::endl;
return vocab;
}
std::string line;
bool inVocabSection = false;
while (std::getline(file, line)) {
// Find the start of the "vocab" section in the JSON
if (line.find("\"vocab\": {") != std::string::npos) {
inVocabSection = true;
continue;
}
// End of the "vocab" section
bool found = line.find("},") != std::string::npos;
if (inVocabSection && found) {
file.close();
break;
}
// Process lines inside the "vocab" section
if (inVocabSection) {
size_t colonPosFirst = line.find('"');
size_t colonPos = line.find_last_of('"');
if (colonPos != std::string::npos) {
std::string key = line.substr(colonPosFirst + 1, colonPos - colonPosFirst - 1); // strip the surrounding quotes
key.erase(0, key.find_first_not_of(" \t")); // trim leading whitespace
key.erase(key.find_last_not_of(" \t") + 1); // trim trailing whitespace
std::string valueStr = line.substr(colonPos + 2);
valueStr.erase(0, valueStr.find_first_not_of(" \t")); // trim leading whitespace
valueStr.erase(valueStr.find_last_not_of(" \t,") + 1); // trim trailing whitespace and comma
int value = std::stoi(valueStr);
vocab[key] = value;
}
}
}
// file.close();
return vocab;
}
extern "C" JNIEXPORT jstring JNICALL
Java_org_pytorch_helloworld_NativeInterface_processNpyWithModel(
JNIEnv* env,
jclass clazz,
jstring imagePath,
jstring segmentedText,
jstring modelPath) {
// Convert jstring to std::string
const char *modelPathCStr = env->GetStringUTFChars(modelPath, nullptr);
std::string modelPathStr(modelPathCStr);
auto options1 = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCPU);
auto img_tensor_1 = torch::ones({1, 3, 224, 224}, options1);
auto input_ids_tensor_1 = torch::ones({1, 20}, torch::TensorOptions().dtype(torch::kInt32));
auto attention_mask_tensor_1 = torch::ones({1, 20}, torch::TensorOptions().dtype(torch::kInt32));
auto token_type_ids_tensor_1 = torch::ones({1, 20}, torch::TensorOptions().dtype(torch::kInt32));
std::vector<torch::jit::IValue> inputs;
inputs.push_back(img_tensor_1);
inputs.push_back(input_ids_tensor_1);
inputs.push_back(attention_mask_tensor_1);
inputs.push_back(token_type_ids_tensor_1);
// Load the PyTorch model
c10::optional<at::Device> device = at::Device(at::kCPU);
torch::jit::ExtraFilesMap extra_files;
torch::jit::mobile::Module model = torch::jit::_load_for_mobile(modelPathStr, device, extra_files);
torch::Tensor outputTensor;
try {
outputTensor = model.forward(inputs).toTensor();
std::cout << "Output tensor: " << outputTensor.sizes() << ", " << outputTensor.dtype() << std::endl;
} catch (const c10::Error& e) {
std::cerr << "Error during forward pass: " << e.what() << std::endl;
}
std::string result = "Processed image and question with model.";
return env->NewStringUTF(result.c_str());
}
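One thing I am not sure about on the C++ side: the tokenizer output used for tracing should be int64 (Long) tensors as far as I know, while the JNI code above builds kInt32 tensors, and the Python wrapper below returns a 3-tuple (out, _, _) while I call .toTensor() on the result. This is only a guess, but the variant I plan to try next is a small helper like this (the name forwardWithLongIds is just mine; it would sit next to processNpyWithModel in native-lib.cpp):

// Hypothetical variant (my guess, not verified): int64 ids to match the dtypes
// seen at trace time, and unpack the first element of the returned tuple.
static torch::Tensor forwardWithLongIds(torch::jit::mobile::Module& model,
                                        const torch::Tensor& image_features) {
    auto long_opts = torch::TensorOptions().dtype(torch::kInt64);
    std::vector<torch::jit::IValue> inputs{
        image_features,
        torch::ones({1, 20}, long_opts),   // input_ids
        torch::ones({1, 20}, long_opts),   // attention_mask
        torch::ones({1, 20}, long_opts)};  // token_type_ids
    auto result = model.forward(inputs);
    // The export wrapper's forward returns (out, _, _), so the IValue is a tuple.
    return result.toTuple()->elements()[0].toTensor();
}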
This is the script I use to convert the .pt model into a .ptl file:
import torch
from torch.utils.mobile_optimizer import optimize_for_mobile
from classifier import SimpleClassifier, StudentSimpleClassifier
from PIL import Image
import numpy as np
from transformers import ViTFeatureExtractor,AutoTokenizer
import argparse
import base_model
import mobilenetv3
from collections import namedtuple
import torch.utils
import torch.utils.bundled_inputs
from transformers import BatchEncoding
import torch.nn as nn
def get_arguments():
parser = argparse.ArgumentParser()
parser.add_argument('-gpu', type=str, default='0')
# Choices of attention models
parser.add_argument('--model', type=str, default='CrossAtt', choices=['CMSA', 'CrossAtt', 'GuidedAtt'],
help='the model we use')
# Model setting
parser.add_argument('--object_detection', action='store_true', default=False, help='Use Object Detection model?')
parser.add_argument('--vit_backbone', type=str, default='vit')
parser.add_argument('--vit_image_pretrained', type=str, default='google/vit-base-patch16-224-in21k')
parser.add_argument('--cnn_backbone', type=str, default='resnet34')
parser.add_argument('--cnn_image_pretrained', type=str, default='google/vit-base-patch16-224-in21k')
parser.add_argument('--bert_type', type=str, default='phobert')
parser.add_argument('--bert_pretrained', type=str, default='vinai/phobert-base')
parser.add_argument('--input_size', type=int, default=224)
parser.add_argument('--data_dir', type=str, default='/content/dataset')
parser.add_argument('--output', type=str, default='/content')
# Define dimensions
parser.add_argument('--v_vit_dim', type=int, default=768,
help='dim of image features')
parser.add_argument('--v_cnn_dim', type=int, default=512,
help='dim of image features')
parser.add_argument('--q_dim', type=int, default=768,
help='dim of bert question features')
parser.add_argument('--f_mid_dim', type=int, default=1024,
help='dim of middle layer of fusion layer')
parser.add_argument('--joint_dim', type=int, default=1024,
help='dim of joint features of fusion layer')
parser.add_argument('--glimpse', type=int, default=1,
help='number of glimpse for the attention reduction')
# Multihead self-attention config
parser.add_argument('--hidden_dim', type=int, default=2048,
help='dim of hidden layer of feed forward layers of transformers')
parser.add_argument('--num_heads', type=int, default=8,
help='number of heads of transformers encoder')
# BAN - Bilinear Attention Networks
parser.add_argument('--gamma', type=int, default=2,
help='glimpse in Bilinear Attention Networks')
# Choices of RNN models
parser.add_argument('--rnn', type=str, default='LSTM', choices=['LSTM', 'GRU'],
help='the RNN we use')
parser.add_argument('--op', type=str, default='c',
help='concatenated 600-D word embedding')
parser.add_argument('--question_len', default=20, type=int, metavar='N',
help='maximum length of input question')
parser.add_argument('--tfidf', type=bool, default=None,
help='tfitrain_log_df word embedding?')
# Activation function + dropout for classification module
parser.add_argument('--activation', type=str, default='relu', choices=['relu'],
help='the activation to use for final classifier')
parser.add_argument('--dropout', default=0.2, type=float, metavar='dropout',
help='dropout of rate of final classifier')
parser.add_argument('--clip_norm', default=.25, type=float, metavar='NORM',
help='clip threshold of gradients')
# Training setting
parser.add_argument('--seed', type=int, default=1234)
parser.add_argument('--batch_size', type=int, default=32)
parser.add_argument('--nepochs', type=int, default=100)
parser.add_argument('--resume_epoch', type=int, default=100)
parser.add_argument('--train_fold', type=str, default='/content')
parser.add_argument('--run_id', type=int, default=-1)
parser.add_argument('--T', type=int, default=2)
# Optimizer setting
parser.add_argument('--init_lr', type=float, default=1e-5)
parser.add_argument('--max_lr', type=float, default=5e-5)
parser.add_argument('--weight_decay', type=float, default=5e-4)
parser.add_argument('--momentum', type=float, default=0.9)
parser.add_argument('--warmup_steps', type=int, default=20)
parser.add_argument('--label_smooth', type=float, default=0.0)
parser.add_argument('--threshold', type=float, default=0.7)
parser.add_argument('--print_summary', action='store_true', default=False, help='Print model summary?')
parser.add_argument('--save_every', type=int, default=5)
parser.add_argument('--log_every', type=int, default=25)
parser.add_argument('--emb_init', type=str, default='biowordvec', choices=['glove', 'biowordvec', 'biosentvec'])
parser.add_argument('--self_att', action='store_true', default=False, help='Use Self Attention?')
parser.add_argument('--use_spatial', action='store_true', default=False, help='Use spatial feature?')
parser.add_argument('--use_cma', action='store_true', default=False, help='Use CMA?')
parser.add_argument('--result_fold', type=str, default='results')
return parser.parse_args()
args = get_arguments()
feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
def Model():
# device = torch.device(f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu")
device = "cpu"
args.device = device
mobiNetCNN = mobilenetv3.mobilenetv3_small()
mobiNetViT = mobilenetv3.mobilenetv3_smallViT()
current_model_dict = mobiNetCNN.state_dict()
loaded_state_dict = torch.load("./pretrained_mobileNet/mobilenetv3-small-55df8e1f.pth")
new_state_dict={k:v if v.size()==current_model_dict[k].size() else current_model_dict[k] for k,v in zip(current_model_dict.keys(), loaded_state_dict.values())}
mobiNetCNN.load_state_dict(new_state_dict, strict=False)
current_model_dict = mobiNetViT.state_dict()
loaded_state_dict = torch.load("./pretrained_mobileNet/mobilenetv3-small-55df8e1f.pth")
new_state_dict={k:v if v.size()==current_model_dict[k].size() else current_model_dict[k] for k,v in zip(current_model_dict.keys(), loaded_state_dict.values())}
mobiNetViT.load_state_dict(new_state_dict, strict=False)
args.num_classes = 353
student_model = base_model.build_GuidedAtt_replaceResNetViT(args, mobiNetCNN,mobiNetViT)
return student_model
def createInput(img_path = "/home/dmp/1.Users/05.huy.hhoang/1/1271.jpg", question = "có bao nhiêu người đàn ông trong nhà bếp, hai người đang ngồi trên quầy"):
img = Image.open(img_path)
img_np = np.array(img)
if len(img_np.shape) == 2:
img_np = np.stack((img_np, img_np, img_np), axis=-1)
img = Image.fromarray(img_np.astype(np.uint8))
visual_feature = feature_extractor([img], return_tensors='pt')
embed_question = tokenizer([question], padding='max_length',
max_length=20, truncation=True, return_tensors='pt')
return (visual_feature, embed_question)
class MyModel(nn.Module):
def __init__(self, model):
super().__init__()
self.model = model
def forward(self, image_features, input_ids, attention_mask, token_type_ids):
print("start forward mymodel")
output= self.model(image_features, (input_ids, attention_mask, token_type_ids))
print("--------------")
return output
def optimizeSaver():
print("START CONVERT")
model = Model()
model.load_state_dict(torch.load("GuidedAtt_vit_resnet34_phobert_31_12_2023__01_19_41.pt", map_location='cpu'),strict=True)
model.eval()
my_model = MyModel(model)
print("DONE LOAD MODEL")
temp_input = createInput()
visual_feature, embed_question = temp_input
print("DONE CREATE INPUT")
#test forward input:
print("embed_question:",embed_question)
print("==============================================")
image_features = visual_feature['pixel_values']
input_ids = embed_question['input_ids']
attention_mask = embed_question['attention_mask']
token_type_ids = embed_question['token_type_ids']
out, _, _ = my_model.forward(image_features, input_ids, attention_mask, token_type_ids)
print("mymodel out:",out)
print("mymodel out shape:",out.shape)
# print("inputs:",inputs)
module = torch.jit.trace(my_model, (image_features, input_ids, attention_mask, token_type_ids))
traced_script_module_optimized = optimize_for_mobile(module)
#save
traced_script_module_optimized._save_for_lite_interpreter("common.ptl")
print("DONE")
optimizeSaver()
I have re-tested the shapes of the input tensors and can confirm (a desktop sanity check I plan to run is sketched after this list):
- image_features: torch.Size([1, 3, 224, 224])
- input_ids: torch.Size([1, 20])
- attention_mask: torch.Size([1, 20])
- token_type_ids: torch.Size([1, 20])
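To separate a model/export problem from an Android/NDK problem, I also plan to load the common.ptl saved by the script with desktop LibTorch and push through exactly the same all-ones inputs as in native-lib.cpp. A sketch of that check (assuming a desktop libtorch build that can read the 1.10 lite-interpreter format, and that common.ptl is in the working directory):

#include <torch/csrc/jit/mobile/import.h>
#include <torch/csrc/jit/mobile/module.h>
#include <torch/script.h>
#include <iostream>
#include <vector>

int main() {
    c10::optional<at::Device> device = at::Device(at::kCPU);
    torch::jit::mobile::Module model = torch::jit::_load_for_mobile("common.ptl", device);

    // Same dummy inputs as in processNpyWithModel.
    std::vector<torch::jit::IValue> inputs{
        torch::ones({1, 3, 224, 224}, torch::kFloat32),
        torch::ones({1, 20}, torch::kInt32),
        torch::ones({1, 20}, torch::kInt32),
        torch::ones({1, 20}, torch::kInt32)};

    try {
        auto out = model.forward(inputs);
        std::cout << "forward OK, output isTuple=" << out.isTuple() << std::endl;
    } catch (const c10::Error& e) {
        std::cerr << "same error off-device: " << e.what() << std::endl;
    }
    return 0;
}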