From 7b34c96b347885eee49902e7ed8dd80002d7dd4d Mon Sep 17 00:00:00 2001
From: lxning
Date: Wed, 11 Oct 2023 12:48:51 -0700
Subject: [PATCH 1/3] fmt

---
 .../llama2/custom_handler_code.py             | 140 ++++++++++++++++++
 1 file changed, 140 insertions(+)
 create mode 100644 examples/large_models/Huggingface_accelerate/llama2/custom_handler_code.py

diff --git a/examples/large_models/Huggingface_accelerate/llama2/custom_handler_code.py b/examples/large_models/Huggingface_accelerate/llama2/custom_handler_code.py
new file mode 100644
index 0000000000..d48c0cc593
--- /dev/null
+++ b/examples/large_models/Huggingface_accelerate/llama2/custom_handler_code.py
@@ -0,0 +1,140 @@
+import logging
+from abc import ABC
+
+import torch
+import transformers
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from ts.context import Context
+from ts.torch_handler.base_handler import BaseHandler
+
+logger = logging.getLogger(__name__)
+logger.info("Transformers version %s", transformers.__version__)
+
+
+class LlamaHandler(BaseHandler, ABC):
+    """
+    Transformers handler class for text generation with Llama 2.
+    """
+
+    def __init__(self):
+        super(LlamaHandler, self).__init__()
+        self.max_length = None
+        self.max_new_tokens = None
+        self.tokenizer = None
+        self.initialized = False
+
+    def initialize(self, ctx: Context):
+        """In this initialize function, the HF large model is loaded in 8-bit and
+        sharded across the available GPUs using HuggingFace Accelerate.
+        Args:
+            ctx (Context): The TorchServe context object, containing information
+            pertaining to the model artifacts and configuration parameters.
+        """
+        model_dir = ctx.system_properties.get("model_dir")
+        self.max_length = int(ctx.model_yaml_config["handler"]["max_length"])
+        self.max_new_tokens = int(ctx.model_yaml_config["handler"]["max_new_tokens"])
+        model_name = ctx.model_yaml_config["handler"]["model_name"]
+        model_path = f'{model_dir}/{ctx.model_yaml_config["handler"]["model_path"]}'
+        seed = int(ctx.model_yaml_config["handler"]["manual_seed"])
+        torch.manual_seed(seed)
+
+        logger.info("Model %s loading tokenizer", ctx.model_name)
+        # device_map="balanced" shards the model across all visible GPUs via
+        # accelerate; load_in_8bit quantization requires the bitsandbytes package.
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            device_map="balanced",
+            low_cpu_mem_usage=True,
+            torch_dtype=torch.float16,
+            load_in_8bit=True,
+            trust_remote_code=True,
+        )
+        if ctx.model_yaml_config["handler"]["fast_kernels"]:
+            from optimum.bettertransformer import BetterTransformer
+
+            try:
+                self.model = BetterTransformer.transform(self.model)
+            except RuntimeError:
+                logger.warning(
+                    "HuggingFace Optimum does not support this model. For the list of supported models, please refer to https://huggingface.co/docs/optimum/bettertransformer/overview"
+                )
+        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
+
+        logger.info("Model %s loaded successfully", ctx.model_name)
+        self.initialized = True
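+
+    # A sketch of the "handler" section this initialize() expects in the
+    # model's model-config.yaml. The keys mirror the ctx.model_yaml_config
+    # lookups above; the values are illustrative assumptions, not the
+    # example's shipped defaults:
+    #
+    #   handler:
+    #       model_name: "meta-llama/Llama-2-7b-hf"
+    #       model_path: "model/models--meta-llama--Llama-2-7b-hf/snapshots/<hash>"
+    #       max_length: 50
+    #       max_new_tokens: 50
+    #       manual_seed: 40
+    #       fast_kernels: false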
+
+    def preprocess(self, requests):
+        """
+        Basic text preprocessing: extracts the input text from each request in the
+        batch and tokenizes it.
+        Args:
+            requests (list): A list of dictionaries with a "data" or "body" field, each
+            containing the input text to be processed.
+        Returns:
+            tuple: A tuple with two tensors: the batch of input ids and the batch of
+            attention masks.
+        """
+        input_texts = [data.get("data") or data.get("body") for data in requests]
+        input_ids_batch, attention_mask_batch = [], []
+        for input_text in input_texts:
+            input_ids, attention_mask = self.encode_input_text(input_text)
+            input_ids_batch.append(input_ids)
+            attention_mask_batch.append(attention_mask)
+        # NOTE: inputs are tokenized without padding, so concatenating along dim 0
+        # assumes equal-length prompts (batch size 1 in the simplest case).
+        input_ids_batch = torch.cat(input_ids_batch, dim=0).to(self.model.device)
+        attention_mask_batch = torch.cat(attention_mask_batch, dim=0).to(
+            self.model.device
+        )
+        return input_ids_batch, attention_mask_batch
+
+    def encode_input_text(self, input_text):
+        """
+        Encodes a single input text using the tokenizer.
+        Args:
+            input_text (str): The input text to be encoded.
+        Returns:
+            tuple: A tuple with two tensors: the encoded input ids and the attention mask.
+        """
+        if isinstance(input_text, (bytes, bytearray)):
+            input_text = input_text.decode("utf-8")
+        logger.info("Received text: '%s'", input_text)
+        inputs = self.tokenizer.encode_plus(
+            input_text,
+            max_length=self.max_length,
+            padding=False,
+            add_special_tokens=True,
+            return_tensors="pt",
+            truncation=True,
+        )
+        input_ids = inputs["input_ids"]
+        attention_mask = inputs["attention_mask"]
+        return input_ids, attention_mask
+
+    def inference(self, input_batch):
+        """
+        Generates a text continuation for each prompt in the batch using the loaded
+        transformers checkpoint.
+        Args:
+            input_batch (tuple): A tuple with two tensors: the batch of input ids and the batch
+            of attention masks, as returned by the preprocess function.
+        Returns:
+            list: A list of strings with the generated text for each input in the batch.
+        """
+        input_ids_batch, attention_mask_batch = input_batch
+        outputs = self.model.generate(
+            input_ids_batch,
+            attention_mask=attention_mask_batch,
+            max_new_tokens=self.max_new_tokens,
+        )
+
+        inferences = self.tokenizer.batch_decode(
+            outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+
+        logger.info("Generated text: %s", inferences)
+        return inferences
+
+    def postprocess(self, inference_output):
+        """Post-processing function that converts the generated text into a
+        TorchServe-readable format.
+        Args:
+            inference_output (list): It contains the generated text for each input.
+        Returns:
+            (list): Returns a list of the predictions.
+        """
+        return inference_output
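+
+
+# Rough usage sketch (an assumption about deployment, not part of this file's
+# contract): with the model archive registered under the hypothetical name
+# "llama2", TorchServe's inference API can be exercised with:
+#
+#   echo "Hello, my name is" > prompt.txt
+#   curl http://localhost:8080/predictions/llama2 -T prompt.txt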

From 6241c75d41693418ae4b9a8c98ed04eb7f7a0d24 Mon Sep 17 00:00:00 2001
From: lxning
Date: Thu, 12 Oct 2023 18:14:44 -0700
Subject: [PATCH 2/3] update SECURITY.md

---
 SECURITY.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/SECURITY.md b/SECURITY.md
index 38d22373c6..1f424bcfa3 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -3,8 +3,8 @@
 ## Supported Versions

 | Version | Supported          |
-| ------- | ------------------ |
-| 0.8.2   | :white_check_mark: |
+|---------| ------------------ |
+| 0.9.0   | :white_check_mark: |

 ## How we do security

From e34a204f128b7d6d7af4a7f9462ceb4380aa554f Mon Sep 17 00:00:00 2001
From: Mark Saroufim
Date: Thu, 12 Oct 2023 19:22:17 -0700
Subject: [PATCH 3/3] Delete
 examples/large_models/Huggingface_accelerate/llama2/custom_handler_code.py

---
 .../llama2/custom_handler_code.py             | 140 ------------------
 1 file changed, 140 deletions(-)
 delete mode 100644 examples/large_models/Huggingface_accelerate/llama2/custom_handler_code.py

diff --git a/examples/large_models/Huggingface_accelerate/llama2/custom_handler_code.py b/examples/large_models/Huggingface_accelerate/llama2/custom_handler_code.py
deleted file mode 100644
index d48c0cc593..0000000000
--- a/examples/large_models/Huggingface_accelerate/llama2/custom_handler_code.py
+++ /dev/null
@@ -1,140 +0,0 @@
-import logging
-from abc import ABC
-
-import torch
-import transformers
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-from ts.context import Context
-from ts.torch_handler.base_handler import BaseHandler
-
-logger = logging.getLogger(__name__)
-logger.info("Transformers version %s", transformers.__version__)
-
-
-class LlamaHandler(BaseHandler, ABC):
-    """
-    Transformers handler class for text generation with Llama 2.
-    """
-
-    def __init__(self):
-        super(LlamaHandler, self).__init__()
-        self.max_length = None
-        self.max_new_tokens = None
-        self.tokenizer = None
-        self.initialized = False
-
-    def initialize(self, ctx: Context):
-        """In this initialize function, the HF large model is loaded in 8-bit and
-        sharded across the available GPUs using HuggingFace Accelerate.
-        Args:
-            ctx (Context): The TorchServe context object, containing information
-            pertaining to the model artifacts and configuration parameters.
-        """
-        model_dir = ctx.system_properties.get("model_dir")
-        self.max_length = int(ctx.model_yaml_config["handler"]["max_length"])
-        self.max_new_tokens = int(ctx.model_yaml_config["handler"]["max_new_tokens"])
-        model_name = ctx.model_yaml_config["handler"]["model_name"]
-        model_path = f'{model_dir}/{ctx.model_yaml_config["handler"]["model_path"]}'
-        seed = int(ctx.model_yaml_config["handler"]["manual_seed"])
-        torch.manual_seed(seed)
-
-        logger.info("Model %s loading tokenizer", ctx.model_name)
-        # device_map="balanced" shards the model across all visible GPUs via
-        # accelerate; load_in_8bit quantization requires the bitsandbytes package.
-        self.model = AutoModelForCausalLM.from_pretrained(
-            model_path,
-            device_map="balanced",
-            low_cpu_mem_usage=True,
-            torch_dtype=torch.float16,
-            load_in_8bit=True,
-            trust_remote_code=True,
-        )
-        if ctx.model_yaml_config["handler"]["fast_kernels"]:
-            from optimum.bettertransformer import BetterTransformer
-
-            try:
-                self.model = BetterTransformer.transform(self.model)
-            except RuntimeError:
-                logger.warning(
-                    "HuggingFace Optimum does not support this model. For the list of supported models, please refer to https://huggingface.co/docs/optimum/bettertransformer/overview"
-                )
-        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
-
-        logger.info("Model %s loaded successfully", ctx.model_name)
-        self.initialized = True
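-
-    # A sketch of the "handler" section this initialize() expects in the
-    # model's model-config.yaml. The keys mirror the ctx.model_yaml_config
-    # lookups above; the values are illustrative assumptions, not the
-    # example's shipped defaults:
-    #
-    #   handler:
-    #       model_name: "meta-llama/Llama-2-7b-hf"
-    #       model_path: "model/models--meta-llama--Llama-2-7b-hf/snapshots/<hash>"
-    #       max_length: 50
-    #       max_new_tokens: 50
-    #       manual_seed: 40
-    #       fast_kernels: false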
-
-    def preprocess(self, requests):
-        """
-        Basic text preprocessing: extracts the input text from each request in the
-        batch and tokenizes it.
-        Args:
-            requests (list): A list of dictionaries with a "data" or "body" field, each
-            containing the input text to be processed.
-        Returns:
-            tuple: A tuple with two tensors: the batch of input ids and the batch of
-            attention masks.
-        """
-        input_texts = [data.get("data") or data.get("body") for data in requests]
-        input_ids_batch, attention_mask_batch = [], []
-        for input_text in input_texts:
-            input_ids, attention_mask = self.encode_input_text(input_text)
-            input_ids_batch.append(input_ids)
-            attention_mask_batch.append(attention_mask)
-        # NOTE: inputs are tokenized without padding, so concatenating along dim 0
-        # assumes equal-length prompts (batch size 1 in the simplest case).
-        input_ids_batch = torch.cat(input_ids_batch, dim=0).to(self.model.device)
-        attention_mask_batch = torch.cat(attention_mask_batch, dim=0).to(
-            self.model.device
-        )
-        return input_ids_batch, attention_mask_batch
-
-    def encode_input_text(self, input_text):
-        """
-        Encodes a single input text using the tokenizer.
-        Args:
-            input_text (str): The input text to be encoded.
-        Returns:
-            tuple: A tuple with two tensors: the encoded input ids and the attention mask.
-        """
-        if isinstance(input_text, (bytes, bytearray)):
-            input_text = input_text.decode("utf-8")
-        logger.info("Received text: '%s'", input_text)
-        inputs = self.tokenizer.encode_plus(
-            input_text,
-            max_length=self.max_length,
-            padding=False,
-            add_special_tokens=True,
-            return_tensors="pt",
-            truncation=True,
-        )
-        input_ids = inputs["input_ids"]
-        attention_mask = inputs["attention_mask"]
-        return input_ids, attention_mask
-
-    def inference(self, input_batch):
-        """
-        Generates a text continuation for each prompt in the batch using the loaded
-        transformers checkpoint.
-        Args:
-            input_batch (tuple): A tuple with two tensors: the batch of input ids and the batch
-            of attention masks, as returned by the preprocess function.
-        Returns:
-            list: A list of strings with the generated text for each input in the batch.
-        """
-        input_ids_batch, attention_mask_batch = input_batch
-        outputs = self.model.generate(
-            input_ids_batch,
-            attention_mask=attention_mask_batch,
-            max_new_tokens=self.max_new_tokens,
-        )
-
-        inferences = self.tokenizer.batch_decode(
-            outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False
-        )
-
-        logger.info("Generated text: %s", inferences)
-        return inferences
-
-    def postprocess(self, inference_output):
-        """Post-processing function that converts the generated text into a
-        TorchServe-readable format.
-        Args:
-            inference_output (list): It contains the generated text for each input.
-        Returns:
-            (list): Returns a list of the predictions.
-        """
-        return inference_output
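-
-
-# Rough usage sketch (an assumption about deployment, not part of this file's
-# contract): with the model archive registered under the hypothetical name
-# "llama2", TorchServe's inference API can be exercised with:
-#
-#   echo "Hello, my name is" > prompt.txt
-#   curl http://localhost:8080/predictions/llama2 -T prompt.txt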