TorchServe uses a lot of system RAM when run with GPU

Hello guys. I’m facing this issue when serving my model: TorchServe uses a lot of system RAM when run with a GPU, but when serving with the CPU it doesn’t. My models are in .ONNX format and I converted them to .MAR format. This is the handler file:

import onnxruntime
import torch
import cv2
import numpy as np

from ts.torch_handler.base_handler import BaseHandler
from PIL import Image
import os
import io
import urllib.request
from ultils_v8 import *

class FaceDetection(BaseHandler):
    """TorchServe custom handler for a YOLOv8 face-detection ONNX model.

    Runs inference through onnxruntime and returns detections (boxes,
    labels, scores) as JSON-serializable lists.
    """

    def __init__(self):
        self._context = None
        self.initialized = False
        self.session = None   # onnxruntime.InferenceSession, built in initialize()
        self.img = None       # last decoded RGB frame (np.ndarray); read by postprocess()
        self.blob = None      # last preprocessed input blob
        self.idx_frame = None
        self.count = 0

    def initialize(self, context):
        """Load the .onnx file shipped inside the .mar and create the ORT session.

        NOTE(review): the high *system* RAM usage you see with the GPU build
        is typically the CUDA execution provider's arena allocator plus
        pinned host buffers, not a handler bug. Consider passing provider
        options, e.g.
        ``("CUDAExecutionProvider", {"arena_extend_strategy": "kSameAsRequested", "gpu_mem_limit": ...})``
        — confirm against the ONNX Runtime CUDA EP documentation.
        """
        self._context = context
        self.manifest = context.manifest
        properties = context.system_properties
        model_dir = properties.get("model_dir")

        # Resolve the serialized model path from the archive manifest.
        serialized_file = self.manifest["model"]["serializedFile"]
        model_file_path = os.path.join(model_dir, serialized_file)

        # Prefer CUDA; onnxruntime falls back to CPU if it is unavailable.
        providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
        sess_options = onnxruntime.SessionOptions()
        self.session = onnxruntime.InferenceSession(
            model_file_path, sess_options=sess_options, providers=providers
        )  # FIX: the pasted code was missing this closing parenthesis
        self.initialized = True

    def preprocess(self, data):
        """Decode the request payload into the model's input array.

        The request's "data"/"body" field may be either a URL string or raw
        image bytes. Returns the numpy array produced by ``pre_process``
        (imported from ultils_v8).
        """
        data_get = data[0].get("data")
        if data_get is None:
            data_get = data[0].get("body")
        if isinstance(data_get, str):
            # FIX(reconstructed): the paste dropped this assignment — a URL
            # payload is downloaded and decoded with PIL. Confirm against the
            # original handler source.
            req = urllib.request.urlopen(data_get)
            image = Image.open(io.BytesIO(req.read()))
        else:
            # FIX(reconstructed): raw-bytes payload decoded with PIL.
            image = Image.open(io.BytesIO(data_get))
        raw_data = np.array(image)
        # Keep the original frame so postprocess() can rescale boxes to it.
        self.img = raw_data
        img = cv2.cvtColor(raw_data, cv2.COLOR_RGB2BGR)
        im = pre_process(img)
        return im.detach().numpy()

    def inference(self, blob):
        """Run the ONNX session on the preprocessed blob; return the first output."""
        self.blob = blob
        # FIX(reconstructed): the paste dropped "self.session.run(None, ...)".
        outputs = self.session.run(
            None, {self.session.get_inputs()[0].name: blob}
        )
        return outputs[0]

    def postprocess(self, preds):
        """Apply NMS, rescale boxes to the original image, build the JSON reply."""
        res = []
        preds = non_max_suppression(torch.from_numpy(np.asarray(preds)))[0]
        # Predictions are on the 640x640 letterboxed input; map back to the
        # original frame stored by preprocess().
        bbox = scale_boxes([640, 640], preds[:, :4], self.img.shape).round()
        score = preds[:, 4]
        cls = preds[:, 5]
        preds = preds.detach().numpy()
        bbox = bbox.detach().numpy()

        # FIX(reconstructed): the paste dropped the res.append({...}) wrapper
        # around these four key/value lines.
        res.append(
            {
                "output": preds.tolist(),
                "bbox": bbox.tolist(),
                "label": cls.tolist(),
                "score": score.tolist(),
            }
        )
        return [res]

# NOTE(review): from here on the post pastes a SECOND file — the TorchServe
# entry-point module. The FaceDetection class above presumably lives in
# handler_face_detect.py (verify the filename against your archive).
from handler_face_detect import FaceDetection

# Module-level singleton reused across requests by handle() below.
_service = FaceDetection()

def handle(data, context):
    if not _service.initialized:

    if data is None:
        return None

    data = _service.preprocess(data)
    data = _service.inference(data)
    data = _service.postprocess(data)

    return data file to generate .MAR file:

torch-model-archiver --model-name FaceDetection \
--version 1.0 \
--serialized-file ./yolov8n-face.onnx \
--extra-files ./ultils_v8.py,./handler_face_detect.py \
--handler ./handler_face_detect.py \
--export-path model-store -f

(Note: the forum paste dropped the filenames after `--extra-files` and `--handler`; the names above are reconstructed from the imports and may differ from the originals.)


That is one of the 5 models I chose to serve on my server. My server hardware specs:
CPU: I5-10400F

I have been searching the issues on the GitHub repo, but only a few people have the same issue as me, and none of them have a solution.