Why is torchserve returning InternalServerException?

I have the following TorchServe handler on GCP, but I'm getting a "Prediction failed" error:

%%writefile predictor/custom_handler.py

from ts.torch_handler.base_handler import BaseHandler
from transformers import AutoModelWithLMHead, AutoModelForCausalLM, AutoTokenizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import datetime
import logging
import json
import os

logger = logging.getLogger(__name__)

class TransformersHandler(BaseHandler):
    """
    The handler takes an input string and returns the classification text
    based on the serialized transformers checkpoint.
    """

    def __init__(self):
        super(TransformersHandler, self).__init__()
        self.chat_history = []
        self.chat_history_ids = None

    def initialize(self, ctx):
        """ Loads the model.pt file and initialized the model object.
        Instantiates Tokenizer for preprocessor to use
        Loads labels to name mapping file for post-processing inference response
        self.manifest = ctx.manifest

        properties = ctx.system_properties
        model_dir = properties.get("model_dir")
        self.device = torch.device(
            "cuda:" + str(properties.get("gpu_id")) if torch.cuda.is_available() else "cpu")

        # Read model serialize/pt file
        serialized_file = self.manifest["model"]["serializedFile"]
        model_pt_path = os.path.join(model_dir, serialized_file)
        if not os.path.isfile(model_pt_path):
            raise RuntimeError(
                "Missing the model.pt or pytorch_model.bin file")

        # Load model
        self.model = AutoModelWithLMHead.from_pretrained(model_dir)
        logger.info(
            'Transformer model from path {0} loaded successfully'.format(model_dir))

        # Ensure to use the same tokenizer used during training
        self.tokenizer = AutoTokenizer.from_pretrained(model_dir)

    def preprocess(self, data):
        """ Preprocessing input request by tokenizing
            Extend with your own preprocessing steps as needed
        user_message = data[0].get("data")
        if user_message is None:
            user_message = data[0].get("body")
        #user_message = text.decode('utf-8')
        logger.info("Received text: '%s'", user_message)

        # Tokenize the texts
        # encode the new user message to be used by our model
        inputs = self.tokenizer.encode(
            user_message + self.tokenizer.eos_token, return_tensors='pt')

        # append the encoded message to the past history so the model is aware of past context
        if self.chat_history_ids is not None:
            inputs = torch.cat([self.chat_history_ids, inputs], dim=-1)

        return inputs

    def inference(self, inputs):
        """ Predict the class of a text using a trained transformer model.
        self.chat_history_ids = self.model.generate(inputs,
        decoded_message = self.tokenizer.decode(
            self.chat_history_ids[:, inputs.shape[-1]:][0], skip_special_tokens=True)

        return decoded_message

    def postprocess(self, inference_output):
        return inference_output

%%bash -s $APP_NAME

APP_NAME=$1

cat << EOF > ./predictor/Dockerfile

FROM pytorch/torchserve:latest-cpu

# install dependencies
RUN python3 -m pip install --upgrade pip
RUN pip3 install transformers

USER model-server

# copy model artifacts, custom handler and other dependencies
COPY ./custom_handler.py /home/model-server/
COPY ./model/config.json /home/model-server/
COPY ./model/eval_results.txt /home/model-server/
COPY ./model/merges.txt /home/model-server/
COPY ./model/pytorch_model.bin /home/model-server/
COPY ./model/special_tokens_map.json /home/model-server/
COPY ./model/tokenizer_config.json /home/model-server/
COPY ./model/tokenizer.json /home/model-server/
COPY ./model/training_args.bin /home/model-server/
COPY ./model/vocab.json /home/model-server/

#COPY ./model/$APP_NAME/ /home/model-server/

# create torchserve configuration file
USER root
RUN printf "\nservice_envelope=json" >> /home/model-server/config.properties
RUN printf "\ninference_address=" >> /home/model-server/config.properties
RUN printf "\nmanagement_address=" >> /home/model-server/config.properties
USER model-server

# expose health and prediction listener ports from the image
EXPOSE 7080
EXPOSE 7081

# create model archive file packaging model artifacts and dependencies
RUN torch-model-archiver -f \
  --model-name=$APP_NAME \
  --version=1.0 \
  --serialized-file=/home/model-server/pytorch_model.bin \
  --handler=/home/model-server/custom_handler.py \
  --extra-files "/home/model-server/config.json,/home/model-server/tokenizer.json,/home/model-server/training_args.bin,/home/model-server/tokenizer_config.json,/home/model-server/special_tokens_map.json,/home/model-server/vocab.json,/home/model-server/merges.txt,/home/model-server/eval_results.txt" \

# run Torchserve HTTP serve to respond to prediction requests
CMD ["torchserve", \
     "--start", \
     "--ts-config=/home/model-server/config.properties", \
     "--models", \
     "$APP_NAME=$APP_NAME.mar", \
     "--model-store", \

echo "Writing ./predictor/Dockerfile"

%%bash -s $APP_NAME

APP_NAME=$1

cat > ./predictor/instances.json <<END
{
  "instances": [
    {
      "data": {
        "message": "Hello"
      }
    }
  ]
}
END

curl -s -X POST \
  -H "Content-Type: application/json; charset=utf-8" \
  -d @./predictor/instances.json \
  http://localhost:7080/predictions/$APP_NAME

{
  "code": 503,
  "type": "InternalServerException",
  "message": "Prediction failed"
}

What am I doing wrong, please?

InternalServerException is a misleading name; it usually indicates an error in your own handler code. You can debug it by looking at logs/model_log.log in the same directory where you ran torchserve --start.
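
For example, a minimal way to surface the handler traceback, assuming the default TorchServe log layout (the logs/ directory is created relative to wherever torchserve --start ran; in a container built like the one above that is typically /home/model-server):

# Re-send the failing request, then inspect the worker log, which contains
# the Python traceback raised inside your handler:
tail -n 100 logs/model_log.log

# The frontend log records worker startup failures and the 503 responses:
tail -n 100 logs/ts_log.log

If the worker dies before it ever serves a request, initialize() is the usual culprit (a missing file in the .mar archive or an import error), and that traceback also lands in model_log.log.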