TorchServe uses a lot of system RAM when running with GPU

Hello everyone. I'm facing an issue when serving my model: TorchServe uses a lot of system RAM when running with the GPU, but when serving on the CPU it doesn't. My models are in ONNX format, and I converted them to .MAR format. This is the handler_face_detect.py file:

import onnxruntime
import torch
import cv2
import numpy as np

from ts.torch_handler.base_handler import BaseHandler
from PIL import Image
import os
import io
import urllib.request
from ultils_v8 import *


class FaceDetection(BaseHandler):
    def __init__(self):
        self._context = None
        self.initialized = False
        self.session = None
        self.img = None
        self.blob = None
        self.idx_frame = None
        self.count = 0

    def initialize(self, context):
        self._context = context
        self.manifest = context.manifest
        properties = context.system_properties
        model_dir = properties.get("model_dir")

        # Read the serialized model file from the archive
        serialized_file = self.manifest["model"]["serializedFile"]
        model_file_path = os.path.join(model_dir, serialized_file)

        providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
        sess_options = onnxruntime.SessionOptions()
        self.session = onnxruntime.InferenceSession(
            model_file_path, sess_options=sess_options, providers=providers
        )
        self.initialized = True

    def preprocess(self, data):
        # The request carries either an image URL (string) or raw image bytes.
        data_get = data[0].get("data")
        if data_get is None:
            data_get = data[0].get("body")
        if isinstance(data_get, str):
            req = urllib.request.urlopen(data_get)
            image = Image.open(io.BytesIO(req.read()))
        else:
            image = Image.open(io.BytesIO(data_get))
        raw_data = np.array(image)
        self.img = raw_data  # keep the original image for box scaling in postprocess
        img = cv2.cvtColor(raw_data, cv2.COLOR_RGB2BGR)
        im = pre_process(img)  # from ultils_v8
        return im.detach().numpy()

    def inference(self, blob):
        self.blob = blob
        outputs = self.session.run(None, {self.session.get_inputs()[0].name: blob})
        # print("time :{:.3f} s".format(time.perf_counter()-start))
        return outputs[0]

    def postprocess(self, preds):
        # if self.count>1000000:
        #     sys.exit()
        res = []
        # logging.info(preds)
        preds = non_max_suppression(torch.from_numpy(np.asarray(preds)))[0]
        # print(preds)
        bbox = scale_boxes([640, 640], preds[:, :4], self.img.shape).round()
        score = preds[:, 4]
        cls = preds[:, 5]
        preds = preds.detach().numpy()
        bbox = bbox.detach().numpy()

        # self.count+=1
        res.append(
            {
                "output": preds.tolist(),
                "bbox": bbox.tolist(),
                "label": cls.tolist(),
                "score": score.tolist(),
            }
        )
        return [res]

    # def handle(self, data, context):
    #     if not self.initialized:
    #         self.initialize(context)
    #     model_input = self.preprocess(data)
    #     model_output = self.inference(model_input)
    #     return self.postprocess(model_output)
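
For what it's worth, below is a variant of the session setup I've been considering to rein in ONNX Runtime's memory arenas. Two caveats: gpu_mem_limit caps the CUDA arena on the device, so I'm not sure it addresses the host-RAM growth (much of that may simply be each worker's CUDA context), and the 4 GB value is only a placeholder:

import onnxruntime

model_file_path = "yolov8n-face.onnx"  # resolved from model_dir in the real handler

sess_options = onnxruntime.SessionOptions()
sess_options.enable_cpu_mem_arena = False  # don't grow a host-RAM arena

providers = [
    (
        "CUDAExecutionProvider",
        {
            "device_id": 0,
            "gpu_mem_limit": 4 * 1024 * 1024 * 1024,  # bytes; placeholder value
            "arena_extend_strategy": "kSameAsRequested",
        },
    ),
    "CPUExecutionProvider",
]

session = onnxruntime.InferenceSession(
    model_file_path, sess_options=sess_options, providers=providers
)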

my_handler.py:

from handler_face_detect import FaceDetection

_service = FaceDetection()


def handle(data, context):
    if not _service.initialized:
        _service.initialize(context)

    if data is None:
        return None

    data = _service.preprocess(data)
    data = _service.inference(data)
    data = _service.postprocess(data)

    return data
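
The handler accepts either raw image bytes or an image URL as the request body. I hit the endpoint roughly like this (face.jpg and the URL are placeholders; inference port 8090 comes from config.properties below):

# send raw image bytes
curl -X POST http://0.0.0.0:8090/predictions/FaceDetection -T face.jpg

# or send an image URL as a form field
curl -X POST http://0.0.0.0:8090/predictions/FaceDetection -d "data=https://example.com/face.jpg"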

archive.sh, the script I use to generate the .MAR file:

torch-model-archiver --model-name FaceDetection \
  --version 1.0 \
  --serialized-file ./yolov8n-face.onnx \
  --extra-files ./ultils_v8.py,./handler_face_detect.py \
  --handler my_handler.py \
  --export-path model-store -f
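
With load_models=all (see config.properties below), TorchServe loads every archive in the model store at startup. The worker count per model can be inspected and scaled through the management API on port 8091, which is one way to keep the number of GPU workers down:

# inspect the workers a model has spawned
curl http://0.0.0.0:8091/models/FaceDetection

# scale the model down to a single worker
curl -X PUT "http://0.0.0.0:8091/models/FaceDetection?min_worker=1&synchronous=true"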

config.properties:

inference_address=http://0.0.0.0:8090
management_address=http://0.0.0.0:8091
metrics_address=http://0.0.0.0:8092
load_models=all
install_py_dep_per_model=true
model_store=model-store
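
Each worker process loads its own copy of the model and, on GPU, its own CUDA context in host RAM, so I'm also considering pinning the worker count in the same file. As far as I know, the relevant key is:

# limit each model to a single worker process
default_workers_per_model=1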

That is one of the five models I'm serving on this server. My server's hardware specs:
CPU: Intel Core i5-10400F
GPU: RTX 3060 12GB VRAM
RAM: 8GB
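
For concrete numbers, this is how the per-process usage can be checked on Linux:

# resident set size (RSS, in KB) of the heaviest processes
ps -eo pid,rss,cmd --sort=-rss | head -n 10

# GPU memory held by each process
nvidia-smi --query-compute-apps=pid,used_memory --format=csv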

I've searched the GitHub repo for this issue, but only a few people have reported the same problem and no solution was posted. Has anyone run into this and found a fix?