From 7b34c96b347885eee49902e7ed8dd80002d7dd4d Mon Sep 17 00:00:00 2001
From: lxning
Date: Wed, 11 Oct 2023 12:48:51 -0700
Subject: [PATCH 1/3] fmt

---
 .../llama2/custom_handler_code.py             | 140 ++++++++++++++++++
 1 file changed, 140 insertions(+)
 create mode 100644 examples/large_models/Huggingface_accelerate/llama2/custom_handler_code.py

diff --git a/examples/large_models/Huggingface_accelerate/llama2/custom_handler_code.py b/examples/large_models/Huggingface_accelerate/llama2/custom_handler_code.py
new file mode 100644
index 0000000000..d48c0cc593
--- /dev/null
+++ b/examples/large_models/Huggingface_accelerate/llama2/custom_handler_code.py
@@ -0,0 +1,140 @@
+import logging
+from abc import ABC
+
+import torch
+import transformers
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from ts.context import Context
+from ts.torch_handler.base_handler import BaseHandler
+
+logger = logging.getLogger(__name__)
+logger.info("Transformers version %s", transformers.__version__)
+
+
+class LlamaHandler(BaseHandler, ABC):
+    """
+    Transformers handler class for text generation with Llama 2.
+    """
+
+    def __init__(self):
+        super(LlamaHandler, self).__init__()
+        self.max_length = None
+        self.max_new_tokens = None
+        self.tokenizer = None
+        self.initialized = False
+
+    def initialize(self, ctx: Context):
+        """In this initialize function, the HF large model is loaded in 8-bit and
+        sharded across the available GPUs using HuggingFace Accelerate.
+        Args:
+            ctx (Context): The TorchServe context object, containing information
+            pertaining to the model artifacts and configuration parameters.
+        """
+        model_dir = ctx.system_properties.get("model_dir")
+        self.max_length = int(ctx.model_yaml_config["handler"]["max_length"])
+        self.max_new_tokens = int(ctx.model_yaml_config["handler"]["max_new_tokens"])
+        model_name = ctx.model_yaml_config["handler"]["model_name"]
+        model_path = f'{model_dir}/{ctx.model_yaml_config["handler"]["model_path"]}'
+        seed = int(ctx.model_yaml_config["handler"]["manual_seed"])
+        torch.manual_seed(seed)
+
+        logger.info("Model %s loading tokenizer", ctx.model_name)
+        # device_map="balanced" shards the model across all visible GPUs via
+        # accelerate; load_in_8bit quantization requires the bitsandbytes package.
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            device_map="balanced",
+            low_cpu_mem_usage=True,
+            torch_dtype=torch.float16,
+            load_in_8bit=True,
+            trust_remote_code=True,
+        )
+        if ctx.model_yaml_config["handler"]["fast_kernels"]:
+            from optimum.bettertransformer import BetterTransformer
+
+            try:
+                self.model = BetterTransformer.transform(self.model)
+            except RuntimeError:
+                logger.warning(
+                    "HuggingFace Optimum does not support this model. For the list of supported models, please refer to https://huggingface.co/docs/optimum/bettertransformer/overview"
+                )
+        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
+
+        logger.info("Model %s loaded successfully", ctx.model_name)
+        self.initialized = True
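+
+    # A sketch of the "handler" section this initialize() expects in the
+    # model's model-config.yaml. The keys mirror the ctx.model_yaml_config
+    # lookups above; the values are illustrative assumptions, not the
+    # example's shipped defaults:
+    #
+    #   handler:
+    #       model_name: "meta-llama/Llama-2-7b-hf"
+    #       model_path: "model/models--meta-llama--Llama-2-7b-hf/snapshots/<hash>"
+    #       max_length: 50
+    #       max_new_tokens: 50
+    #       manual_seed: 40
+    #       fast_kernels: false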
+
+    def preprocess(self, requests):
+        """
+        Basic text preprocessing: extracts the input text from each request in the
+        batch and tokenizes it.
+        Args:
+            requests (list): A list of dictionaries with a "data" or "body" field, each
+            containing the input text to be processed.
+        Returns:
+            tuple: A tuple with two tensors: the batch of input ids and the batch of
+            attention masks.
+        """
+        input_texts = [data.get("data") or data.get("body") for data in requests]
+        input_ids_batch, attention_mask_batch = [], []
+        for input_text in input_texts:
+            input_ids, attention_mask = self.encode_input_text(input_text)
+            input_ids_batch.append(input_ids)
+            attention_mask_batch.append(attention_mask)
+        # NOTE: inputs are tokenized without padding, so concatenating along dim 0
+        # assumes equal-length prompts (batch size 1 in the simplest case).
+        input_ids_batch = torch.cat(input_ids_batch, dim=0).to(self.model.device)
+        attention_mask_batch = torch.cat(attention_mask_batch, dim=0).to(
+            self.model.device
+        )
+        return input_ids_batch, attention_mask_batch
+
+    def encode_input_text(self, input_text):
+        """
+        Encodes a single input text using the tokenizer.
+        Args:
+            input_text (str): The input text to be encoded.
+        Returns:
+            tuple: A tuple with two tensors: the encoded input ids and the attention mask.
+        """
+        if isinstance(input_text, (bytes, bytearray)):
+            input_text = input_text.decode("utf-8")
+        logger.info("Received text: '%s'", input_text)
+        inputs = self.tokenizer.encode_plus(
+            input_text,
+            max_length=self.max_length,
+            padding=False,
+            add_special_tokens=True,
+            return_tensors="pt",
+            truncation=True,
+        )
+        input_ids = inputs["input_ids"]
+        attention_mask = inputs["attention_mask"]
+        return input_ids, attention_mask
+
+    def inference(self, input_batch):
+        """
+        Generates a text continuation for each prompt in the batch using the loaded
+        transformers checkpoint.
+        Args:
+            input_batch (tuple): A tuple with two tensors: the batch of input ids and the batch
+            of attention masks, as returned by the preprocess function.
+        Returns:
+            list: A list of strings with the generated text for each input in the batch.
+        """
+        input_ids_batch, attention_mask_batch = input_batch
+        outputs = self.model.generate(
+            input_ids_batch,
+            attention_mask=attention_mask_batch,
+            max_new_tokens=self.max_new_tokens,
+        )
+
+        inferences = self.tokenizer.batch_decode(
+            outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+
+        logger.info("Generated text: %s", inferences)
+        return inferences
+
+    def postprocess(self, inference_output):
+        """Post-processing function that converts the generated text into a
+        TorchServe-readable format.
+        Args:
+            inference_output (list): It contains the generated text for each input.
+        Returns:
+            (list): Returns a list of the predictions.
+        """
+        return inference_output
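+
+
+# Rough usage sketch (an assumption about deployment, not part of this file's
+# contract): with the model archive registered under the hypothetical name
+# "llama2", TorchServe's inference API can be exercised with:
+#
+#   echo "Hello, my name is" > prompt.txt
+#   curl http://localhost:8080/predictions/llama2 -T prompt.txt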

From 6241c75d41693418ae4b9a8c98ed04eb7f7a0d24 Mon Sep 17 00:00:00 2001
From: lxning
Date: Thu, 12 Oct 2023 18:14:44 -0700
Subject: [PATCH 2/3] update SECURITY.md

---
 SECURITY.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/SECURITY.md b/SECURITY.md
index 38d22373c6..1f424bcfa3 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -3,8 +3,8 @@
 ## Supported Versions

 | Version | Supported          |
-| ------- | ------------------ |
-| 0.8.2   | :white_check_mark: |
+|---------| ------------------ |
+| 0.9.0   | :white_check_mark: |

 ## How we do security

From e34a204f128b7d6d7af4a7f9462ceb4380aa554f Mon Sep 17 00:00:00 2001
From: Mark Saroufim
Date: Thu, 12 Oct 2023 19:22:17 -0700
Subject: [PATCH 3/3] Delete
 examples/large_models/Huggingface_accelerate/llama2/custom_handler_code.py

---
 .../llama2/custom_handler_code.py             | 140 ------------------
 1 file changed, 140 deletions(-)
 delete mode 100644 examples/large_models/Huggingface_accelerate/llama2/custom_handler_code.py

diff --git a/examples/large_models/Huggingface_accelerate/llama2/custom_handler_code.py b/examples/large_models/Huggingface_accelerate/llama2/custom_handler_code.py
deleted file mode 100644
index d48c0cc593..0000000000
--- a/examples/large_models/Huggingface_accelerate/llama2/custom_handler_code.py
+++ /dev/null
@@ -1,140 +0,0 @@
-import logging
-from abc import ABC
-
-import torch
-import transformers
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-from ts.context import Context
-from ts.torch_handler.base_handler import BaseHandler
-
-logger = logging.getLogger(__name__)
-logger.info("Transformers version %s", transformers.__version__)
-
-
-class LlamaHandler(BaseHandler, ABC):
-    """
-    Transformers handler class for text generation with Llama 2.
-    """
-
-    def __init__(self):
-        super(LlamaHandler, self).__init__()
-        self.max_length = None
-        self.max_new_tokens = None
-        self.tokenizer = None
-        self.initialized = False
-
-    def initialize(self, ctx: Context):
-        """In this initialize function, the HF large model is loaded in 8-bit and
-        sharded across the available GPUs using HuggingFace Accelerate.
-        Args:
-            ctx (Context): The TorchServe context object, containing information
-            pertaining to the model artifacts and configuration parameters.
-        """
-        model_dir = ctx.system_properties.get("model_dir")
-        self.max_length = int(ctx.model_yaml_config["handler"]["max_length"])
-        self.max_new_tokens = int(ctx.model_yaml_config["handler"]["max_new_tokens"])
-        model_name = ctx.model_yaml_config["handler"]["model_name"]
-        model_path = f'{model_dir}/{ctx.model_yaml_config["handler"]["model_path"]}'
-        seed = int(ctx.model_yaml_config["handler"]["manual_seed"])
-        torch.manual_seed(seed)
-
-        logger.info("Model %s loading tokenizer", ctx.model_name)
-        # device_map="balanced" shards the model across all visible GPUs via
-        # accelerate; load_in_8bit quantization requires the bitsandbytes package.
-        self.model = AutoModelForCausalLM.from_pretrained(
-            model_path,
-            device_map="balanced",
-            low_cpu_mem_usage=True,
-            torch_dtype=torch.float16,
-            load_in_8bit=True,
-            trust_remote_code=True,
-        )
-        if ctx.model_yaml_config["handler"]["fast_kernels"]:
-            from optimum.bettertransformer import BetterTransformer
-
-            try:
-                self.model = BetterTransformer.transform(self.model)
-            except RuntimeError:
-                logger.warning(
-                    "HuggingFace Optimum does not support this model. For the list of supported models, please refer to https://huggingface.co/docs/optimum/bettertransformer/overview"
-                )
-        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
-
-        logger.info("Model %s loaded successfully", ctx.model_name)
-        self.initialized = True
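-
-    # A sketch of the "handler" section this initialize() expects in the
-    # model's model-config.yaml. The keys mirror the ctx.model_yaml_config
-    # lookups above; the values are illustrative assumptions, not the
-    # example's shipped defaults:
-    #
-    #   handler:
-    #       model_name: "meta-llama/Llama-2-7b-hf"
-    #       model_path: "model/models--meta-llama--Llama-2-7b-hf/snapshots/<hash>"
-    #       max_length: 50
-    #       max_new_tokens: 50
-    #       manual_seed: 40
-    #       fast_kernels: false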
-
-    def preprocess(self, requests):
-        """
-        Basic text preprocessing: extracts the input text from each request in the
-        batch and tokenizes it.
-        Args:
-            requests (list): A list of dictionaries with a "data" or "body" field, each
-            containing the input text to be processed.
-        Returns:
-            tuple: A tuple with two tensors: the batch of input ids and the batch of
-            attention masks.
-        """
-        input_texts = [data.get("data") or data.get("body") for data in requests]
-        input_ids_batch, attention_mask_batch = [], []
-        for input_text in input_texts:
-            input_ids, attention_mask = self.encode_input_text(input_text)
-            input_ids_batch.append(input_ids)
-            attention_mask_batch.append(attention_mask)
-        # NOTE: inputs are tokenized without padding, so concatenating along dim 0
-        # assumes equal-length prompts (batch size 1 in the simplest case).
-        input_ids_batch = torch.cat(input_ids_batch, dim=0).to(self.model.device)
-        attention_mask_batch = torch.cat(attention_mask_batch, dim=0).to(
-            self.model.device
-        )
-        return input_ids_batch, attention_mask_batch
-
-    def encode_input_text(self, input_text):
-        """
-        Encodes a single input text using the tokenizer.
-        Args:
-            input_text (str): The input text to be encoded.
-        Returns:
-            tuple: A tuple with two tensors: the encoded input ids and the attention mask.
-        """
-        if isinstance(input_text, (bytes, bytearray)):
-            input_text = input_text.decode("utf-8")
-        logger.info("Received text: '%s'", input_text)
-        inputs = self.tokenizer.encode_plus(
-            input_text,
-            max_length=self.max_length,
-            padding=False,
-            add_special_tokens=True,
-            return_tensors="pt",
-            truncation=True,
-        )
-        input_ids = inputs["input_ids"]
-        attention_mask = inputs["attention_mask"]
-        return input_ids, attention_mask
-
-    def inference(self, input_batch):
-        """
-        Generates a text continuation for each prompt in the batch using the loaded
-        transformers checkpoint.
-        Args:
-            input_batch (tuple): A tuple with two tensors: the batch of input ids and the batch
-            of attention masks, as returned by the preprocess function.
-        Returns:
-            list: A list of strings with the generated text for each input in the batch.
-        """
-        input_ids_batch, attention_mask_batch = input_batch
-        outputs = self.model.generate(
-            input_ids_batch,
-            attention_mask=attention_mask_batch,
-            max_new_tokens=self.max_new_tokens,
-        )
-
-        inferences = self.tokenizer.batch_decode(
-            outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False
-        )
-
-        logger.info("Generated text: %s", inferences)
-        return inferences
-
-    def postprocess(self, inference_output):
-        """Post-processing function that converts the generated text into a
-        TorchServe-readable format.
-        Args:
-            inference_output (list): It contains the generated text for each input.
-        Returns:
-            (list): Returns a list of the predictions.
-        """
-        return inference_output
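-
-
-# Rough usage sketch (an assumption about deployment, not part of this file's
-# contract): with the model archive registered under the hypothetical name
-# "llama2", TorchServe's inference API can be exercised with:
-#
-#   echo "Hello, my name is" > prompt.txt
-#   curl http://localhost:8080/predictions/llama2 -T prompt.txt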