From ec3b99291e3c333afcfeae764beb4c8d22cb3c80 Mon Sep 17 00:00:00 2001
From: Sidharth Rajaram
Date: Tue, 27 Jun 2023 11:56:38 -0700
Subject: [PATCH] Handler for Instruction Embedding models (and a typo fix)
 (#2431)

* fixed arg parser link
* handler for instruction embedding models
* fixed some formatting, pylint fixes
* explain output and what to do with it
* spellcheck, formatting
---
 docs/internals.md                             |  2 +-
 examples/instruction_embedding/README.md      | 99 +++++++++++++++++++
 .../instructor_embedding_handler.py           | 37 +++++++
 .../instruction_embedding/requirements.txt    |  1 +
 ts_scripts/spellcheck_conf/wordlist.txt       |  2 +
 5 files changed, 140 insertions(+), 1 deletion(-)
 create mode 100644 examples/instruction_embedding/README.md
 create mode 100644 examples/instruction_embedding/instructor_embedding_handler.py
 create mode 100644 examples/instruction_embedding/requirements.txt

diff --git a/docs/internals.md b/docs/internals.md
index a82003ce9d..b27c00fba7 100644
--- a/docs/internals.md
+++ b/docs/internals.md
@@ -38,7 +38,7 @@ And backend is the Python code (most Pytorch specific stuff)

 ### Backend (Python)

-https://github.com/pytorch/serve/blob/master/ts/arg_parser.py#L64
+https://github.com/pytorch/serve/blob/master/ts/arg_parser.py

 * Arg parser controls config/not workflow and can also setup a model service worker with a custom socket

diff --git a/examples/instruction_embedding/README.md b/examples/instruction_embedding/README.md
new file mode 100644
index 0000000000..efb2041aa0
--- /dev/null
+++ b/examples/instruction_embedding/README.md
@@ -0,0 +1,99 @@
+# A TorchServe handler for Instructor Embedding models
+
+A simple handler that you can use to serve [Instructor Embedding models](https://instructor-embedding.github.io/) with TorchServe, supporting both single inference and batch inference.
+
+# Setup:
+
+**1.** [Download an Instructor model (e.g., Instructor-XL)](https://huggingface.co/hkunlp/instructor-xl/tree/main?clone=true) from HuggingFace into a model store directory of your choosing. Copy `instructor_embedding_handler.py` next to the newly downloaded directory that contains the model files.
+
+**2.** Create the .MAR model archive using [`torch-model-archiver`](https://github.com/pytorch/serve/blob/master/model-archiver/README.md):
+
+```bash
+torch-model-archiver --model-name <model-name> --version 1.0 --handler PATH/TO/instructor_embedding_handler.py --extra-files <path-to-model-directory> --serialized-file <path-to-model-directory>/pytorch_model.bin -f
+```
+
+**3.** Use [TorchServe](https://pytorch.org/serve/server.html) to start up the server and deploy the Instruction Embedding model you downloaded.
+
+**Note:** Instructor Embedding models are around 4 GB. By default, TorchServe will autoscale workers (each with a loaded copy of the model). [At present](https://github.com/pytorch/serve/issues/2432), if you have memory concerns, you have to use the [Management API](https://pytorch.org/serve/management_api.html) to bring up the server and deploy your model.
+
+# Performing Inference
+To perform inference for an instruction and corresponding sentence, use the following format for the request body:
+```text
+{
+    "inputs": [INSTRUCTION, SENTENCE]
+}
+```
+
+To perform batch inference, use the following format for the request body:
+```text
+{
+    "inputs": [
+        [INSTRUCTION_1, SENTENCE_1],
+        [INSTRUCTION_2, SENTENCE_2],
+        ...
+    ]
+}
+```
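+
+For example, either style of request body can be sent from Python with the `requests` library. The following is a minimal single-inference sketch, assuming TorchServe is running locally on the default inference port (8080) and the model was registered under the hypothetical name `instructor-xl`:
+
+```python
+import requests
+
+# Hypothetical model name; substitute whatever name you registered the model under.
+response = requests.post(
+    "http://localhost:8080/predictions/instructor-xl",
+    json={
+        "inputs": [
+            "Represent the Science title:",
+            "3D ActionSLAM: wearable person tracking in multi-floor environments",
+        ]
+    },
+)
+response.raise_for_status()
+embedding = response.json()  # embedding for this (instruction, sentence) pair
+```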
+
+## Example: Single Inference
+Request Endpoint: /predictions/<registered_model_name>
+
+Request Body:
+```json
+{
+    "inputs": ["Represent the Science title:", "3D ActionSLAM: wearable person tracking in multi-floor environments"]
+}
+```
+
+### Response:
+```yaml
+[
+  0.010738617740571499,
+  ...
+  0.10961631685495377
+]
+```
+
+## Example: Batch Inference
+Request Endpoint: /predictions/<registered_model_name>
+
+Request Body:
+```json
+{
+    "inputs": [
+        ["Represent the Science title:", "3D ActionSLAM: wearable person tracking in multi-floor environments"],
+        ["Represent the Medicine sentence for retrieving a duplicate sentence:", "Recent studies have suggested that statins, an established drug group in the prevention of cardiovascular mortality, could delay or prevent breast cancer recurrence but the effect on disease-specific mortality remains unclear."]
+    ]
+}
+```
+
+### Response:
+```yaml
+[
+  [
+    0.010738617740571499,
+    ...
+    0.10961631685495377
+  ],
+  [
+    0.014582153409719467,
+    ...
+    0.08006688207387924
+  ]
+]
+```
+
+**Note:** The request above performs batch inference on 2 distinct instruction/sentence pairs. The output is two embedding vectors, one for each (instruction, sentence) input pair:
+
+**The first input was:**
+["Represent the Science title:", "3D ActionSLAM: wearable person tracking in multi-floor environments"]
+
+**The second input was:**
+["Represent the Medicine sentence for retrieving a duplicate sentence:", "Recent studies have suggested that statins, an established drug group in the prevention of cardiovascular mortality, could delay or prevent breast cancer recurrence but the effect on disease-specific mortality remains unclear."]
+
+The response is a list of 2 embedding vectors (NumPy arrays converted with `.tolist()` so that they are JSON serializable), one for each of those inputs. The output vectors are quite long, so ellipses are used above for readability.
+
+# Then What?
+
+**Despite being slightly different under the hood compared to more traditional embedding models (e.g., Sentence Transformers), instruction embeddings can be used just like any other embeddings.** They are still just vector representations of your input text. The only difference is that the embedding vectors are *more fine-tuned* to the downstream task described by the instruction. To that end, these output embedding vectors can be stored or looked up in a vector database for [use cases](https://www.pinecone.io/learn/vector-embeddings-for-developers/#what-can-i-do-with-vector-embeddings) like semantic search, question answering, or long-term memory for large language models. Check out the [Instructor Embedding project page](https://instructor-embedding.github.io/) for more information.
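+
+As a small illustration of that point, here is a client-side sketch that batch-embeds two titles and compares them with cosine similarity. It again assumes a local server and the hypothetical model name `instructor-xl`; the second title is made up for the comparison:
+
+```python
+import numpy as np
+import requests
+
+resp = requests.post(
+    "http://localhost:8080/predictions/instructor-xl",  # hypothetical model name
+    json={
+        "inputs": [
+            ["Represent the Science title:", "3D ActionSLAM: wearable person tracking in multi-floor environments"],
+            ["Represent the Science title:", "Indoor localization using wearable sensors"],  # made-up title
+        ]
+    },
+)
+resp.raise_for_status()
+emb = np.array(resp.json())  # shape: (2, embedding_dim)
+
+# Cosine similarity between the two embedding vectors
+similarity = emb[0] @ emb[1] / (np.linalg.norm(emb[0]) * np.linalg.norm(emb[1]))
+print(f"cosine similarity: {similarity:.4f}")
+```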
diff --git a/examples/instruction_embedding/instructor_embedding_handler.py b/examples/instruction_embedding/instructor_embedding_handler.py
new file mode 100644
index 0000000000..2b29fc19eb
--- /dev/null
+++ b/examples/instruction_embedding/instructor_embedding_handler.py
@@ -0,0 +1,37 @@
+"""
+Handler class for Instruction Embedding models (https://instructor-embedding.github.io/)
+"""
+import logging
+
+from InstructorEmbedding import INSTRUCTOR
+
+from ts.torch_handler.base_handler import BaseHandler
+
+logger = logging.getLogger(__name__)
+
+
+class InstructorEmbeddingHandler(BaseHandler):
+    """
+    Handler class for Instruction Embedding models.
+    Refer to the README for how to use Instructor models and this handler.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.initialized = False
+        self.model = None
+
+    def initialize(self, context):
+        # Load the Instructor model from the unpacked model archive directory.
+        properties = context.system_properties
+        logger.info("Initializing Instructor Embedding model...")
+        model_dir = properties.get("model_dir")
+        self.model = INSTRUCTOR(model_dir)
+        self.initialized = True
+
+    def handle(self, data, context):
+        inputs = data[0].get("body").get("inputs")
+        if isinstance(inputs[0], str):
+            # single inference: wrap the (instruction, sentence) pair
+            # so it is treated as a batch of size one
+            inputs = [inputs]
+        pred_embeddings = self.model.encode(inputs)
+        # TorchServe expects a list with one response entry per request
+        return [pred_embeddings.tolist()]
diff --git a/examples/instruction_embedding/requirements.txt b/examples/instruction_embedding/requirements.txt
new file mode 100644
index 0000000000..3205c675e9
--- /dev/null
+++ b/examples/instruction_embedding/requirements.txt
@@ -0,0 +1 @@
+InstructorEmbedding
\ No newline at end of file
diff --git a/ts_scripts/spellcheck_conf/wordlist.txt b/ts_scripts/spellcheck_conf/wordlist.txt
index db5a380543..76141f1e83 100644
--- a/ts_scripts/spellcheck_conf/wordlist.txt
+++ b/ts_scripts/spellcheck_conf/wordlist.txt
@@ -1060,3 +1060,5 @@ AMI
 DLAMI
 XLA
 inferentia
+ActionSLAM
+statins