I have the following Torchserve handler on GCP, but I’m getting prediction failed:
%%writefile predictor/custom_handler.py
from ts.torch_handler.base_handler import BaseHandler
from transformers import AutoModelWithLMHead, AutoModelForCausalLM, AutoTokenizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import datetime
import logging
import json
import os
logger = logging.getLogger(__name__)
class TransformershHandler(BaseHandler):
"""
The handler takes an input string and returns the classification text
based on the serialized transformers checkpoint.
"""
def __init__(self):
super(TransformersHandler, self).__init__()
self.chat_history = []
self.chat_history_ids = None
def initialize(self, ctx):
""" Loads the model.pt file and initialized the model object.
Instantiates Tokenizer for preprocessor to use
Loads labels to name mapping file for post-processing inference response
"""
self.manifest = ctx.manifest
properties = ctx.system_properties
model_dir = properties.get("model_dir")
self.device = torch.device(
"cuda:" + str(properties.get("gpu_id")) if torch.cuda.is_available() else "cpu")
# Read model serialize/pt file
serialized_file = self.manifest["model"]["serializedFile"]
model_pt_path = os.path.join(model_dir, serialized_file)
if not os.path.isfile(model_pt_path):
raise RuntimeError(
"Missing the model.pt or pytorch_model.bin file")
# Load model
self.model = AutoModelWithLMHead.from_pretrained(model_dir)
self.model.to(self.device)
self.model.eval()
logger.debug(
'Transformer model from path {0} loaded successfully'.format(model_dir))
# Ensure to use the same tokenizer used during training
self.tokenizer = AutoTokenizer.from_pretrained(
'microsoft/DialoGPT-small')
def preprocess(self, data):
""" Preprocessing input request by tokenizing
Extend with your own preprocessing steps as needed
"""
user_message = data[0].get("data")
if user_message is None:
user_message = data[0].get("body")
#user_message = text.decode('utf-8')
####
####
logger.info("Received text: '%s'", user_message)
# Tokenize the texts
# encode the new user message to be used by our model
inputs = self.tokenizer.encode(
user_message + self.tokenizer.eos_token, return_tensors='pt')
# append the encoded message to the past history so the model is aware of past context
if self.chat_history_ids is not None:
inputs = torch.cat([self.chat_history_ids, inputs], dim=-1)
return inputs
def inference(self, inputs):
""" Predict the class of a text using a trained transformer model.
"""
self.chat_history_ids = self.model.generate(inputs,
pad_token_id=self.tokenizer.eos_token_id,
do_sample=True,
max_length=1000,
top_k=100,
top_p=0.8,
temperature=0.8,
)
decoded_message = self.tokenizer.decode(
self.chat_history_ids[:, inputs.shape[-1]:][0], skip_special_tokens=True)
return decoded_message
def postprocess(self, inference_output):
return inference_output
%%bash -s $APP_NAME
APP_NAME=$1
cat << EOF > ./predictor/Dockerfile
FROM pytorch/torchserve:latest-cpu
# install dependencies
RUN python3 -m pip install --upgrade pip
RUN pip3 install transformers
USER model-server
# copy model artifacts, custom handler and other dependencies
COPY ./custom_handler.py /home/model-server/
COPY ./model/config.json /home/model-server/
COPY ./model/eval_results.txt /home/model-server/
COPY ./model/merges.txt /home/model-server/
COPY ./model/pytorch_model.bin /home/model-server/
COPY ./model/special_tokens_map.json /home/model-server/
COPY ./model/tokenizer_config.json /home/model-server/
COPY ./model/tokenizer.json /home/model-server/
COPY ./model/training_args.bin /home/model-server/
COPY ./model/vocab.json /home/model-server/
#COPY ./model/$APP_NAME/ /home/model-server/
# create torchserve configuration file
USER root
RUN printf "\nservice_envelope=json" >> /home/model-server/config.properties
RUN printf "\ninference_address=http://0.0.0.0:7080" >> /home/model-server/config.properties
RUN printf "\nmanagement_address=http://0.0.0.0:7081" >> /home/model-server/config.properties
USER model-server
# expose health and prediction listener ports from the image
EXPOSE 7080
EXPOSE 7081
# create model archive file packaging model artifacts and dependencies
RUN torch-model-archiver -f \
--model-name=$APP_NAME \
--version=1.0 \
--serialized-file=/home/model-server/pytorch_model.bin \
--handler=/home/model-server/custom_handler.py \
--extra-files "/home/model-server/config.json,/home/model-server/tokenizer.json,/home/model-server/training_args.bin,/home/model-server/tokenizer_config.json,/home/model-server/special_tokens_map.json,/home/model-server/vocab.json,/home/model-server/merges.txt,/home/model-server/eval_results.txt" \
--export-path=/home/model-server/model-store
# run Torchserve HTTP serve to respond to prediction requests
CMD ["torchserve", \
"--start", \
"--ts-config=/home/model-server/config.properties", \
"--models", \
"$APP_NAME=$APP_NAME.mar", \
"--model-store", \
"/home/model-server/model-store"]
EOF
echo "Writing ./predictor/Dockerfile"
…
%%bash -s $APP_NAME
APP_NAME=$1
cat > ./predictor/instances.json <<END
{
“instances”: [
{
“data”: {
“message”: “Hello”
}
}
]
}
END
curl -s -X POST
-H “Content-Type: application/json; charset=utf-8”
-d @./predictor/instances.json
http://localhost:7080/predictions/$APP_NAME/
{
“code”: 503,
“type”: “InternalServerException”,
“message”: “Prediction failed”
}
What am I doing wrong please