diff --git a/llama/llama-3-70b-chatqa/README.md b/llama/llama-3-70b-chatqa/README.md
new file mode 100644
index 00000000..388f9ce3
--- /dev/null
+++ b/llama/llama-3-70b-chatqa/README.md
@@ -0,0 +1,55 @@
+# Llama 3 70B ChatQA
+
+This is a [Truss](https://truss.baseten.co/) for Llama 3 70B ChatQA.
+
+## Usage
+
+This ChatQA model uses the standard `messages` list of role/content dictionaries used by most conversation-tuned LLMs, with an additional `context` parameter for passing in a single string of concatenated context.
+
+### API route: `predict`
+
+The `predict` route is the primary method for generating text completions based on a given conversation and context. It takes several parameters:
+
+- __messages__: A list of messages, each with a `role` and `content`, that you want the model to respond to.
+- __context__: The context string to ground the answer in.
+- __max_tokens__ (optional, default=512): The maximum number of tokens to generate, counting input tokens, up to a maximum of 8192.
+
+The API also supports the generation parameters forwarded to Hugging Face's `generate` method: `temperature`, `top_p`, `top_k`, `repetition_penalty`, `no_repeat_ngram_size`, `do_sample`, and `stream`.
+
+### Example usage
+
+You can invoke your deployed model via its REST API:
+
+```sh
+curl -X POST "https://app.baseten.co/model_versions/YOUR_MODEL_VERSION_ID/predict" \
+     -H "Content-Type: application/json" \
+     -H 'Authorization: Api-Key {YOUR_API_KEY}' \
+     -d '{
+           "messages": [{"role": "user", "content": "what is the percentage change of the net income from Q4 FY23 to Q4 FY24?"}],
+           "context": "NVIDIA (NASDAQ: NVDA) today reported revenue for the fourth quarter ended January 28, 2024, of $22.1 billion, up 22% from the previous quarter and up 265% from a year ago.\nFor the quarter, GAAP earnings per diluted share was $4.93, up 33% from the previous quarter and up 765% from a year ago. Non-GAAP earnings per diluted share was $5.16, up 28% from the previous quarter and up 486% from a year ago.\nQ4 Fiscal 2024 Summary\nGAAP\n| $ in millions, except earnings per share | Q4 FY24 | Q3 FY24 | Q4 FY23 | Q/Q | Y/Y |\n| Revenue | $22,103 | $18,120 | $6,051 | Up 22% | Up 265% |\n| Gross margin | 76.0% | 74.0% | 63.3% | Up 2.0 pts | Up 12.7 pts |\n| Operating expenses | $3,176 | $2,983 | $2,576 | Up 6% | Up 23% |\n| Operating income | $13,615 | $10,417 | $1,257 | Up 31% | Up 983% |\n| Net income | $12,285 | $9,243 | $1,414 | Up 33% | Up 769% |\n| Diluted earnings per share | $4.93 | $3.71 | $0.57 | Up 33% | Up 765% |"
+         }'
+```
+
+## Deployment
+
+First, clone this repository:
+
+```sh
+git clone https://github.com/basetenlabs/truss-examples/
+cd truss-examples/llama/llama-3-70b-chatqa
+```
+
+Before deployment:
+
+1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
+2. Install the latest version of Truss: `pip install --upgrade truss`
+
+With `llama-3-70b-chatqa` as your working directory, you can deploy the model with:
+
+```sh
+truss push
+```
+
+Paste your Baseten API key if prompted.
+
+For more information, see [Truss documentation](https://truss.baseten.co).
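The same request can also be made from Python. The following is a minimal sketch, assuming the `requests` library, the placeholder model version ID and API key from the curl example above, and `"stream": false` so the response arrives as a single JSON object:

```python
# Minimal sketch of calling the deployed ChatQA endpoint from Python.
# Assumes the `requests` library, the placeholder model version ID and
# API key from the curl example above, and an abridged context string.
import requests

resp = requests.post(
    "https://app.baseten.co/model_versions/YOUR_MODEL_VERSION_ID/predict",
    headers={"Authorization": "Api-Key YOUR_API_KEY"},
    json={
        "messages": [
            {
                "role": "user",
                "content": "what is the percentage change of the net income from Q4 FY23 to Q4 FY24?",
            }
        ],
        # Pass the full press-release text here; truncated for brevity.
        "context": "NVIDIA (NASDAQ: NVDA) today reported revenue for the fourth quarter ...",
        # Request a single JSON response instead of the default streamed output.
        "stream": False,
    },
)
print(resp.json()["output"])
```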
diff --git a/llama/llama-3-70b-chatqa/config.yaml b/llama/llama-3-70b-chatqa/config.yaml
new file mode 100644
index 00000000..fe644051
--- /dev/null
+++ b/llama/llama-3-70b-chatqa/config.yaml
@@ -0,0 +1,22 @@
+environment_variables: {}
+external_package_dirs: []
+model_metadata:
+  avatar_url: https://cdn.baseten.co/production/static/explore/meta.png
+  cover_image_url: https://cdn.baseten.co/production/static/explore/llama.png
+  repo_id: nvidia/Llama3-ChatQA-1.5-70B
+  tags:
+  - text-generation
+model_name: Llama3-ChatQA-1.5-70B
+python_version: py310
+model_cache:
+- repo_id: nvidia/Llama3-ChatQA-1.5-70B
+requirements:
+- accelerate
+- einops
+- transformers
+- torch
+resources:
+  accelerator: A100:2
+  use_gpu: true
+secrets: {}
+system_packages: []
diff --git a/llama/llama-3-70b-chatqa/model/__init__.py b/llama/llama-3-70b-chatqa/model/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/llama/llama-3-70b-chatqa/model/model.py b/llama/llama-3-70b-chatqa/model/model.py
new file mode 100644
index 00000000..6f126cad
--- /dev/null
+++ b/llama/llama-3-70b-chatqa/model/model.py
@@ -0,0 +1,124 @@
+from threading import Thread
+from typing import Dict
+
+import torch
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    GenerationConfig,
+    TextIteratorStreamer,
+)
+
+MODEL_NAME = "nvidia/Llama3-ChatQA-1.5-70B"
+MAX_LENGTH = 512
+TEMPERATURE = 1.0
+TOP_P = 0.95
+TOP_K = 40
+REPETITION_PENALTY = 1.0
+NO_REPEAT_NGRAM_SIZE = 0
+DO_SAMPLE = True
+DEFAULT_STREAM = True
+
+SYSTEM = "System: This is a chat between a user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions based on the context. The assistant should also indicate when the answer cannot be found in the context."
+INSTRUCTION = "Please give a full and complete answer for the question."
+
+
+def get_formatted_input(messages, context):
+    for item in messages:
+        if item["role"] == "user":
+            # Only apply this instruction to the first user turn
+            item["content"] = INSTRUCTION + " " + item["content"]
+            break
+
+    conversation = (
+        "\n\n".join(
+            [
+                "User: " + item["content"]
+                if item["role"] == "user"
+                else "Assistant: " + item["content"]
+                for item in messages
+            ]
+        )
+        + "\n\nAssistant:"
+    )
+    formatted_input = SYSTEM + "\n\n" + context + "\n\n" + conversation
+    return formatted_input
+
+
+class Model:
+    def __init__(self, **kwargs):
+        self.model = None
+        self.tokenizer = None
+        self._secrets = kwargs["secrets"]
+
+    def load(self):
+        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+        self.model = AutoModelForCausalLM.from_pretrained(
+            MODEL_NAME, device_map="auto", torch_dtype=torch.float16
+        )
+
+    def preprocess(self, request: dict):
+        terminators = [
+            self.tokenizer.eos_token_id,
+            self.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
+        ]
+        generate_args = {
+            "max_length": request.get("max_tokens", MAX_LENGTH),
+            "temperature": request.get("temperature", TEMPERATURE),
+            "top_p": request.get("top_p", TOP_P),
+            "top_k": request.get("top_k", TOP_K),
+            "repetition_penalty": request.get("repetition_penalty", REPETITION_PENALTY),
+            "no_repeat_ngram_size": request.get(
+                "no_repeat_ngram_size", NO_REPEAT_NGRAM_SIZE
+            ),
+            "do_sample": request.get("do_sample", DO_SAMPLE),
+            "use_cache": True,
+            "eos_token_id": terminators,
+            "pad_token_id": self.tokenizer.pad_token_id,
+        }
+        request["formatted_input"] = get_formatted_input(
+            request.pop("messages"), request.pop("context")
+        )
+        request["generate_args"] = generate_args
+        return request
+
+    def stream(self, input_ids: torch.Tensor, generation_args: dict):
+        streamer = TextIteratorStreamer(self.tokenizer)
+        generation_config = GenerationConfig(**generation_args)
+        generation_kwargs = {
+            "input_ids": input_ids,
+            "generation_config": generation_config,
+            "return_dict_in_generate": True,
+            "output_scores": True,
+            "max_new_tokens": generation_args["max_length"],
+            "streamer": streamer,
+        }
+
+        with torch.no_grad():
+            # Begin generation in a separate thread
+            thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
+            thread.start()
+
+            # Yield generated text as it becomes available
+            def inner():
+                for text in streamer:
+                    yield text
+                thread.join()
+
+            return inner()
+
+    def predict(self, request: Dict):
+        formatted_input = request.pop("formatted_input")
+        stream = request.pop("stream", DEFAULT_STREAM)
+        generation_args = request.pop("generate_args")
+
+        inputs = self.tokenizer(formatted_input, return_tensors="pt")
+        input_ids = inputs["input_ids"].to("cuda")
+
+        if stream:
+            return self.stream(input_ids, generation_args)
+
+        with torch.no_grad():
+            outputs = self.model.generate(input_ids=input_ids, **generation_args)
+            output_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+            return {"output": output_text}
diff --git a/llama/llama-3-8b-chatqa/README.md b/llama/llama-3-8b-chatqa/README.md
new file mode 100644
index 00000000..75053de7
--- /dev/null
+++ b/llama/llama-3-8b-chatqa/README.md
@@ -0,0 +1,55 @@
+# Llama 3 8B ChatQA
+
+This is a [Truss](https://truss.baseten.co/) for Llama 3 8B ChatQA.
+
+## Usage
+
+This ChatQA model uses the standard `messages` list of role/content dictionaries used by most conversation-tuned LLMs, with an additional `context` parameter for passing in a single string of concatenated context.
+
+### API route: `predict`
+
+The `predict` route is the primary method for generating text completions based on a given conversation and context. It takes several parameters:
+
+- __messages__: A list of messages, each with a `role` and `content`, that you want the model to respond to.
+- __context__: The context string to ground the answer in.
+- __max_tokens__ (optional, default=512): The maximum number of tokens to generate, counting input tokens, up to a maximum of 8192.
+
+The API also supports the generation parameters forwarded to Hugging Face's `generate` method: `temperature`, `top_p`, `top_k`, `repetition_penalty`, `no_repeat_ngram_size`, `do_sample`, and `stream`.
+
+### Example usage
+
+You can invoke your deployed model via its REST API:
+
+```sh
+curl -X POST "https://app.baseten.co/model_versions/YOUR_MODEL_VERSION_ID/predict" \
+     -H "Content-Type: application/json" \
+     -H 'Authorization: Api-Key {YOUR_API_KEY}' \
+     -d '{
+           "messages": [{"role": "user", "content": "what is the percentage change of the net income from Q4 FY23 to Q4 FY24?"}],
+           "context": "NVIDIA (NASDAQ: NVDA) today reported revenue for the fourth quarter ended January 28, 2024, of $22.1 billion, up 22% from the previous quarter and up 265% from a year ago.\nFor the quarter, GAAP earnings per diluted share was $4.93, up 33% from the previous quarter and up 765% from a year ago. Non-GAAP earnings per diluted share was $5.16, up 28% from the previous quarter and up 486% from a year ago.\nQ4 Fiscal 2024 Summary\nGAAP\n| $ in millions, except earnings per share | Q4 FY24 | Q3 FY24 | Q4 FY23 | Q/Q | Y/Y |\n| Revenue | $22,103 | $18,120 | $6,051 | Up 22% | Up 265% |\n| Gross margin | 76.0% | 74.0% | 63.3% | Up 2.0 pts | Up 12.7 pts |\n| Operating expenses | $3,176 | $2,983 | $2,576 | Up 6% | Up 23% |\n| Operating income | $13,615 | $10,417 | $1,257 | Up 31% | Up 983% |\n| Net income | $12,285 | $9,243 | $1,414 | Up 33% | Up 769% |\n| Diluted earnings per share | $4.93 | $3.71 | $0.57 | Up 33% | Up 765% |"
+         }'
+```
+
+## Deployment
+
+First, clone this repository:
+
+```sh
+git clone https://github.com/basetenlabs/truss-examples/
+cd truss-examples/llama/llama-3-8b-chatqa
+```
+
+Before deployment:
+
+1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
+2. Install the latest version of Truss: `pip install --upgrade truss`
+
+With `llama-3-8b-chatqa` as your working directory, you can deploy the model with:
+
+```sh
+truss push
+```
+
+Paste your Baseten API key if prompted.
+
+For more information, see [Truss documentation](https://truss.baseten.co).
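Since `stream` defaults to true in `model/model.py`, the output can also be consumed incrementally. The following is a minimal sketch, assuming the `requests` library, the placeholder model version ID and API key from the curl example above, and that the serving layer forwards the model's generator as a chunked text response:

```python
# Minimal sketch of consuming the streamed ChatQA output incrementally.
# Assumes the `requests` library, the placeholder model version ID and
# API key from the curl example above, and that the generator returned
# by the model is forwarded as a chunked text response.
import requests

with requests.post(
    "https://app.baseten.co/model_versions/YOUR_MODEL_VERSION_ID/predict",
    headers={"Authorization": "Api-Key YOUR_API_KEY"},
    json={
        "messages": [
            {
                "role": "user",
                "content": "what is the percentage change of the net income from Q4 FY23 to Q4 FY24?",
            }
        ],
        # Pass the full press-release text here; truncated for brevity.
        "context": "NVIDIA (NASDAQ: NVDA) today reported revenue for the fourth quarter ...",
        "stream": True,
    },
    stream=True,
) as resp:
    # Print each chunk of generated text as it arrives.
    for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="", flush=True)
```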
diff --git a/llama/llama-3-8b-chatqa/config.yaml b/llama/llama-3-8b-chatqa/config.yaml
new file mode 100644
index 00000000..7bb14f70
--- /dev/null
+++ b/llama/llama-3-8b-chatqa/config.yaml
@@ -0,0 +1,22 @@
+environment_variables: {}
+external_package_dirs: []
+model_metadata:
+  avatar_url: https://cdn.baseten.co/production/static/explore/meta.png
+  cover_image_url: https://cdn.baseten.co/production/static/explore/llama.png
+  repo_id: nvidia/Llama3-ChatQA-1.5-8B
+  tags:
+  - text-generation
+model_name: Llama3-ChatQA-1.5-8B
+python_version: py310
+model_cache:
+- repo_id: nvidia/Llama3-ChatQA-1.5-8B
+requirements:
+- accelerate
+- einops
+- transformers
+- torch
+resources:
+  accelerator: A100
+  use_gpu: true
+secrets: {}
+system_packages: []
diff --git a/llama/llama-3-8b-chatqa/model/__init__.py b/llama/llama-3-8b-chatqa/model/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/llama/llama-3-8b-chatqa/model/model.py b/llama/llama-3-8b-chatqa/model/model.py
new file mode 100644
index 00000000..196bc7c5
--- /dev/null
+++ b/llama/llama-3-8b-chatqa/model/model.py
@@ -0,0 +1,124 @@
+from threading import Thread
+from typing import Dict
+
+import torch
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    GenerationConfig,
+    TextIteratorStreamer,
+)
+
+MODEL_NAME = "nvidia/Llama3-ChatQA-1.5-8B"
+MAX_LENGTH = 512
+TEMPERATURE = 1.0
+TOP_P = 0.95
+TOP_K = 40
+REPETITION_PENALTY = 1.0
+NO_REPEAT_NGRAM_SIZE = 0
+DO_SAMPLE = True
+DEFAULT_STREAM = True
+
+SYSTEM = "System: This is a chat between a user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions based on the context. The assistant should also indicate when the answer cannot be found in the context."
+INSTRUCTION = "Please give a full and complete answer for the question."
+
+
+def get_formatted_input(messages, context):
+    for item in messages:
+        if item["role"] == "user":
+            # Only apply this instruction to the first user turn
+            item["content"] = INSTRUCTION + " " + item["content"]
+            break
+
+    conversation = (
+        "\n\n".join(
+            [
+                "User: " + item["content"]
+                if item["role"] == "user"
+                else "Assistant: " + item["content"]
+                for item in messages
+            ]
+        )
+        + "\n\nAssistant:"
+    )
+    formatted_input = SYSTEM + "\n\n" + context + "\n\n" + conversation
+    return formatted_input
+
+
+class Model:
+    def __init__(self, **kwargs):
+        self.model = None
+        self.tokenizer = None
+        self._secrets = kwargs["secrets"]
+
+    def load(self):
+        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+        self.model = AutoModelForCausalLM.from_pretrained(
+            MODEL_NAME, device_map="auto", torch_dtype=torch.float16
+        )
+
+    def preprocess(self, request: dict):
+        terminators = [
+            self.tokenizer.eos_token_id,
+            self.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
+        ]
+        generate_args = {
+            "max_length": request.get("max_tokens", MAX_LENGTH),
+            "temperature": request.get("temperature", TEMPERATURE),
+            "top_p": request.get("top_p", TOP_P),
+            "top_k": request.get("top_k", TOP_K),
+            "repetition_penalty": request.get("repetition_penalty", REPETITION_PENALTY),
+            "no_repeat_ngram_size": request.get(
+                "no_repeat_ngram_size", NO_REPEAT_NGRAM_SIZE
+            ),
+            "do_sample": request.get("do_sample", DO_SAMPLE),
+            "use_cache": True,
+            "eos_token_id": terminators,
+            "pad_token_id": self.tokenizer.pad_token_id,
+        }
+        request["formatted_input"] = get_formatted_input(
+            request.pop("messages"), request.pop("context")
+        )
+        request["generate_args"] = generate_args
+        return request
+
+    def stream(self, input_ids: torch.Tensor, generation_args: dict):
+        streamer = TextIteratorStreamer(self.tokenizer)
+        generation_config = GenerationConfig(**generation_args)
+        generation_kwargs = {
+            "input_ids": input_ids,
+            "generation_config": generation_config,
+            "return_dict_in_generate": True,
+            "output_scores": True,
+            "max_new_tokens": generation_args["max_length"],
+            "streamer": streamer,
+        }
+
+        with torch.no_grad():
+            # Begin generation in a separate thread
+            thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
+            thread.start()
+
+            # Yield generated text as it becomes available
+            def inner():
+                for text in streamer:
+                    yield text
+                thread.join()
+
+            return inner()
+
+    def predict(self, request: Dict):
+        formatted_input = request.pop("formatted_input")
+        stream = request.pop("stream", DEFAULT_STREAM)
+        generation_args = request.pop("generate_args")
+
+        inputs = self.tokenizer(formatted_input, return_tensors="pt")
+        input_ids = inputs["input_ids"].to("cuda")
+
+        if stream:
+            return self.stream(input_ids, generation_args)
+
+        with torch.no_grad():
+            outputs = self.model.generate(input_ids=input_ids, **generation_args)
+            output_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+            return {"output": output_text}
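For reference, the prompt string the model ultimately sees can be inspected by calling the `get_formatted_input` helper above directly. The following is a minimal sketch with an abridged context, assuming the helper is importable from `model/model.py` (which requires the module's `torch` and `transformers` imports to be installed):

```python
# Minimal sketch of the prompt produced by get_formatted_input, using an
# abridged context; assumes model/model.py is importable and that its
# torch/transformers dependencies are installed.
from model.model import get_formatted_input

messages = [
    {
        "role": "user",
        "content": "what is the percentage change of the net income from Q4 FY23 to Q4 FY24?",
    }
]
context = "Net income was $12,285 million in Q4 FY24 and $1,414 million in Q4 FY23."

print(get_formatted_input(messages, context))
# Prints, roughly:
#
# System: This is a chat between a user and an artificial intelligence assistant. ...
#
# Net income was $12,285 million in Q4 FY24 and $1,414 million in Q4 FY23.
#
# User: Please give a full and complete answer for the question. what is the
# percentage change of the net income from Q4 FY23 to Q4 FY24?
#
# Assistant:
```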