TorchServe uses a lot of system RAM when run with GPU

Hello guys. I’m facing this issue when serving my model: TorchServe uses a lot of system RAM when run with a GPU, but when serving with the CPU it doesn’t. My models are in .ONNX format and I converted them to .MAR format. This is the handler file:

import onnxruntime
import torch
import cv2
import numpy as np

from ts.torch_handler.base_handler import BaseHandler
from PIL import Image
import os
import io
import urllib.request
from ultils_v8 import *

class FaceDetection(BaseHandler):
    """TorchServe custom handler for a YOLOv8 face-detection ONNX model.

    Runs inference through onnxruntime and returns detections (boxes,
    labels, scores) as JSON-serializable lists.
    """

    def __init__(self):
        self._context = None
        self.initialized = False
        self.session = None   # onnxruntime.InferenceSession, built in initialize()
        self.img = None       # last decoded RGB frame (np.ndarray); read by postprocess()
        self.blob = None      # last preprocessed input blob
        self.idx_frame = None
        self.count = 0

    def initialize(self, context):
        """Load the .onnx file shipped inside the .mar and create the ORT session.

        NOTE(review): the high *system* RAM usage you see with the GPU build
        is typically the CUDA execution provider's arena allocator plus
        pinned host buffers, not a handler bug. Consider passing provider
        options, e.g.
        ``("CUDAExecutionProvider", {"arena_extend_strategy": "kSameAsRequested", "gpu_mem_limit": ...})``
        — confirm against the ONNX Runtime CUDA EP documentation.
        """
        self._context = context
        self.manifest = context.manifest
        properties = context.system_properties
        model_dir = properties.get("model_dir")

        # Resolve the serialized model path from the archive manifest.
        serialized_file = self.manifest["model"]["serializedFile"]
        model_file_path = os.path.join(model_dir, serialized_file)

        # Prefer CUDA; onnxruntime falls back to CPU if it is unavailable.
        providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
        sess_options = onnxruntime.SessionOptions()
        self.session = onnxruntime.InferenceSession(
            model_file_path, sess_options=sess_options, providers=providers
        )  # FIX: the pasted code was missing this closing parenthesis
        self.initialized = True

    def preprocess(self, data):
        """Decode the request payload into the model's input array.

        The request's "data"/"body" field may be either a URL string or raw
        image bytes. Returns the numpy array produced by ``pre_process``
        (imported from ultils_v8).
        """
        data_get = data[0].get("data")
        if data_get is None:
            data_get = data[0].get("body")
        if isinstance(data_get, str):
            # FIX(reconstructed): the paste dropped this assignment — a URL
            # payload is downloaded and decoded with PIL. Confirm against the
            # original handler source.
            req = urllib.request.urlopen(data_get)
            image = Image.open(io.BytesIO(req.read()))
        else:
            # FIX(reconstructed): raw-bytes payload decoded with PIL.
            image = Image.open(io.BytesIO(data_get))
        raw_data = np.array(image)
        # Keep the original frame so postprocess() can rescale boxes to it.
        self.img = raw_data
        img = cv2.cvtColor(raw_data, cv2.COLOR_RGB2BGR)
        im = pre_process(img)
        return im.detach().numpy()

    def inference(self, blob):
        """Run the ONNX session on the preprocessed blob; return the first output."""
        self.blob = blob
        # FIX(reconstructed): the paste dropped "self.session.run(None, ...)".
        outputs = self.session.run(
            None, {self.session.get_inputs()[0].name: blob}
        )
        return outputs[0]

    def postprocess(self, preds):
        """Apply NMS, rescale boxes to the original image, build the JSON reply."""
        res = []
        preds = non_max_suppression(torch.from_numpy(np.asarray(preds)))[0]
        # Predictions are on the 640x640 letterboxed input; map back to the
        # original frame stored by preprocess().
        bbox = scale_boxes([640, 640], preds[:, :4], self.img.shape).round()
        score = preds[:, 4]
        cls = preds[:, 5]
        preds = preds.detach().numpy()
        bbox = bbox.detach().numpy()

        # FIX(reconstructed): the paste dropped the res.append({...}) wrapper
        # around these four key/value lines.
        res.append(
            {
                "output": preds.tolist(),
                "bbox": bbox.tolist(),
                "label": cls.tolist(),
                "score": score.tolist(),
            }
        )
        return [res]

# NOTE(review): from here on the post pastes a SECOND file — the TorchServe
# entry-point module. The FaceDetection class above presumably lives in
# handler_face_detect.py (verify the filename against your archive).
from handler_face_detect import FaceDetection

# Module-level singleton reused across requests by handle() below.
_service = FaceDetection()

def handle(data, context):
    if not _service.initialized:

    if data is None:
        return None

    data = _service.preprocess(data)
    data = _service.inference(data)
    data = _service.postprocess(data)

    return data file to generate .MAR file:

torch-model-archiver --model-name FaceDetection \
--version 1.0 \
--serialized-file ./yolov8n-face.onnx \
--extra-files ./ultils_v8.py,./handler_face_detect.py \
--handler ./handler_face_detect.py \
--export-path model-store -f

(Note: the forum paste dropped the filenames after `--extra-files` and `--handler`; the names above are reconstructed from the imports and may differ from the originals.)


That is one of the 5 models I chose to serve on my server. My server hardware specs:
CPU: I5-10400F

I have been searching the issues on the GitHub repo, but only a few people have the same issue as me, and none of them have a solution.