From d4e7db2f9b7811f7963f1e76bb1313d7610d0a66 Mon Sep 17 00:00:00 2001 From: stanley-nod Date: Wed, 24 Jan 2024 23:49:40 -0800 Subject: [PATCH 1/6] Auto e2e benchmarker. --- .gitignore | 3 + .../README.md | 0 .../custom_models/llama_benchmark/__init__.py | 0 .../benchmark.mlir | 0 .../benchmark_forward.mlir | 0 .../benchmark_module.py | 0 .../llama_benchmark/e2e/README.md | 65 +++++++++ .../llama_benchmark/e2e/__init__.py | 0 .../e2e/benchmark_prompts.json | 7 + .../llama_benchmark/e2e/llm_e2e_benchmark.py | 127 ++++++++++++++++++ .../stateless_llama_benchmark.py | 0 .../custom_models/llm_runner.py | 49 ++++--- .../tests/benchmark_prompt_test.json | 4 + .../tests/stateless_llama_test.py | 43 +++++- 14 files changed, 281 insertions(+), 17 deletions(-) rename python/turbine_models/custom_models/{llama-benchmark => llama_benchmark}/README.md (100%) create mode 100644 python/turbine_models/custom_models/llama_benchmark/__init__.py rename python/turbine_models/custom_models/{llama-benchmark => llama_benchmark}/benchmark.mlir (100%) rename python/turbine_models/custom_models/{llama-benchmark => llama_benchmark}/benchmark_forward.mlir (100%) rename python/turbine_models/custom_models/{llama-benchmark => llama_benchmark}/benchmark_module.py (100%) create mode 100644 python/turbine_models/custom_models/llama_benchmark/e2e/README.md create mode 100644 python/turbine_models/custom_models/llama_benchmark/e2e/__init__.py create mode 100644 python/turbine_models/custom_models/llama_benchmark/e2e/benchmark_prompts.json create mode 100644 python/turbine_models/custom_models/llama_benchmark/e2e/llm_e2e_benchmark.py rename python/turbine_models/custom_models/{llama-benchmark => llama_benchmark}/stateless_llama_benchmark.py (100%) create mode 100644 python/turbine_models/tests/benchmark_prompt_test.json diff --git a/.gitignore b/.gitignore index d85c8598b..663e9d93e 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,6 @@ wheelhouse *.safetensors *.gguf *.vmfb + +#Benchmark artifacts +benchmark_e2e_results.json diff --git a/python/turbine_models/custom_models/llama-benchmark/README.md b/python/turbine_models/custom_models/llama_benchmark/README.md similarity index 100% rename from python/turbine_models/custom_models/llama-benchmark/README.md rename to python/turbine_models/custom_models/llama_benchmark/README.md diff --git a/python/turbine_models/custom_models/llama_benchmark/__init__.py b/python/turbine_models/custom_models/llama_benchmark/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/python/turbine_models/custom_models/llama-benchmark/benchmark.mlir b/python/turbine_models/custom_models/llama_benchmark/benchmark.mlir similarity index 100% rename from python/turbine_models/custom_models/llama-benchmark/benchmark.mlir rename to python/turbine_models/custom_models/llama_benchmark/benchmark.mlir diff --git a/python/turbine_models/custom_models/llama-benchmark/benchmark_forward.mlir b/python/turbine_models/custom_models/llama_benchmark/benchmark_forward.mlir similarity index 100% rename from python/turbine_models/custom_models/llama-benchmark/benchmark_forward.mlir rename to python/turbine_models/custom_models/llama_benchmark/benchmark_forward.mlir diff --git a/python/turbine_models/custom_models/llama-benchmark/benchmark_module.py b/python/turbine_models/custom_models/llama_benchmark/benchmark_module.py similarity index 100% rename from python/turbine_models/custom_models/llama-benchmark/benchmark_module.py rename to 
python/turbine_models/custom_models/llama_benchmark/benchmark_module.py diff --git a/python/turbine_models/custom_models/llama_benchmark/e2e/README.md b/python/turbine_models/custom_models/llama_benchmark/e2e/README.md new file mode 100644 index 000000000..6432b9e60 --- /dev/null +++ b/python/turbine_models/custom_models/llama_benchmark/e2e/README.md @@ -0,0 +1,65 @@ +# Instructions + +Clone and install SHARK-Turbine +``` +git clone https://github.com/nod-ai/SHARK-Turbine.git +cd SHARK-Turbine +python -m venv turbine_venv && source turbine_venv/bin/activate + +pip install --upgrade -r requirements.txt +pip install --upgrade -e .[torch-cpu-nightly,testing] +pip install --upgrade -r turbine-models-requirements.txt +``` + +## Compiling LLMs +Note: Make sure to replace "your_token" with your actual hf_auth_token in all the commands. + +Now, you can generate the quantized weight file with +``` +python python/turbine_models/gen_external_params/gen_external_params.py --hf_auth_token=your_token +``` +The model weights will then be saved in the current directory as `Llama_2_7b_chat_hf_f16_int4.safetensors`. + +To compile llama to a vmfb +``` +python python/turbine_models/custom_models/stateless_llama.py --compile_to=vmfb --hf_auth_token=your_token --external_weights="safetensors" --quantization="int4" --precision="f16" +``` +By default the vmfb will be saved as `Llama_2_7b_chat_hf.vmfb`. + +## Benchmarking LLMs e2e +To run a benchmark with the default benchmark dataset, run: +``` +python python/turbine_models/custom_models/llama_benchmark/e2e/llm_e2e_benchmark.py --vmfb_path=/path/to/Llama_2_7b_chat_hf.vmfb --external_weight_path=Llama_2_7b_chat_hf_f16_int4.safetensors --device=vulkan --hf_auth_token=your_hf_token +``` +You can specify a path to the dataset using: `--benchmark_dataset_path=/path/to/dataset.json` +You can specify where to store the results using: `--benchmark_output_path=/path/to/output.json` + +## Benchmarking Dataset + +To set up a dataset, you need a json file with a list of entries, each containing these attributes: +1. id : number identifying the example (int) +2. system_prompt : System prompt to align the LLM (str) +3. user_prompt : Query example from the user (str) +4. num_iterations : number of times to run/benchmark the particular example (int) +5. num_tokens_to_generate : number of tokens to generate for the example (int) + +Here is a sample: +```json +[ + {"id" : 0, + "system_prompt": "[INST] <>\nBe concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n <>\n\n", + "user_prompt" : "what is the capital of canada?", + "num_iterations": 8, + "num_tokens_to_generate": 20} +] +``` + +The default dataset in `benchmark_prompts.json` contains examples that SHARK-1.0 traditionally measures. We also added some data common in MLPerf, which uses data from open-orca. In the future, we should add more of the open-orca data to run benchmarks with. + +## Benchmarking Output + +The output json will have similar attributes to the dataset, plus the measured benchmark results. It adds these attributes: +1. prefill_tokens : number of tokens run during the prefill stage (int) +2. prefill_speed(tok/s) : Number of tokens in the initial input divided by the time to complete prefill (float) +3. decoded_tokens : number of tokens decoded during the decode stage. 
(int) +4. decode_speed(tok/s) : Average speed of decoding per token for this example, averaged over the number of iterations. (float) diff --git a/python/turbine_models/custom_models/llama_benchmark/e2e/__init__.py b/python/turbine_models/custom_models/llama_benchmark/e2e/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/python/turbine_models/custom_models/llama_benchmark/e2e/benchmark_prompts.json b/python/turbine_models/custom_models/llama_benchmark/e2e/benchmark_prompts.json new file mode 100644 index 000000000..9c8c72346 --- /dev/null +++ b/python/turbine_models/custom_models/llama_benchmark/e2e/benchmark_prompts.json @@ -0,0 +1,7 @@ +[ + {"id" : 0, "system_prompt": "hi", "user_prompt" : "", "num_iterations": 5, "num_tokens_to_generate": 512}, + {"id" : 1, "system_prompt": "[INST] <>\nBe concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n <>\n\n", "user_prompt" : "what is the capital of canada?", "num_iterations": 8, "num_tokens_to_generate": 20}, + {"id" : 2, "system_prompt": "[INST] <>\nBe concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n <>\n\n", "user_prompt" : "what is the capital of canada and what is famouse for", "num_iterations": 5, "num_tokens_to_generate": 255}, + {"id" : 3, "system_prompt": "[INST] <>\nYou are an AI assistant. You will be given a task. You must generate a detailed and long answer.\n <>\n\n", "user_prompt" : "Generate an approximately fifteen-word sentence that describes all this data: Midsummer House eatType restaurant; Midsummer House food Chinese; Midsummer House priceRange moderate; Midsummer House customer rating 3 out of 5; Midsummer House near All Bar One", "num_iterations": 5, "num_tokens_to_generate": 50}, + {"id" : 4, "system_prompt": "[INST] <>\nYou are an AI assistant that helps people find information. \n <>\n\n", "user_prompt" : "James runs a TV show and there are 5 main characters and 4 minor characters. He pays the minor characters $15,000 each episode. He paid the major characters three times as much. How much does he pay per episode? 
Let's be accurate as possible.", "num_iterations": 5, "num_tokens_to_generate": 255} +] \ No newline at end of file diff --git a/python/turbine_models/custom_models/llama_benchmark/e2e/llm_e2e_benchmark.py b/python/turbine_models/custom_models/llama_benchmark/e2e/llm_e2e_benchmark.py new file mode 100644 index 000000000..f7974dbe3 --- /dev/null +++ b/python/turbine_models/custom_models/llama_benchmark/e2e/llm_e2e_benchmark.py @@ -0,0 +1,127 @@ +import argparse +from turbine_models.model_runner import vmfbRunner +from transformers import AutoTokenizer +from iree import runtime as ireert +import torch +import time +from turbine_models.custom_models.llm_optimizations.streaming_llm.modify_llama import ( + enable_llama_pos_shift_attention, +) +from turbine_models.custom_models.llm_runner import parser, SharkLLM +import os +import json + +parser.add_argument( + "--benchmark_dataset_path", + type=str, + default=f"{os.path.dirname(os.path.realpath(__file__))}/benchmark_prompts.json", + help="path to benchmarking dataset", +) +parser.add_argument( + "--benchmark_output_path", + type=str, + default=f"{os.getcwd()}/benchmark_e2e_results.json", + help="path to write benchmark results", +) + + +B_INST, E_INST = "[INST]", "[/INST]" + + +def append_user_prompt(history, input_prompt): + if len(input_prompt) == 0: + return history + user_prompt = f"{B_INST} {input_prompt} {E_INST}" + history += user_prompt + return history + + +def load_dataset(dataset_path): + dataset = [] + with open(dataset_path) as f: + dataset = json.load(f) + if len(dataset) <= 0: + raise ValueError("Dataset is empty, or did not read dataset correctly.") + return dataset + + +def run_llm_benchmark( + device, + vmfb_path, + hf_model_name, + hf_auth_token, + external_weight_path, + dataset_path, + output_path, + streaming_llm=False, +): + # TODO: Support streamingLLM benchmarking, need streamingLLM to be able to reset history/seq_len to 0. 
+ if streaming_llm: + raise ValueError("Streaming LLM currently not supported for benchmarking.") + tokenizer = AutoTokenizer.from_pretrained( + hf_model_name, + use_fast=False, + token=hf_auth_token, + ) + dataset = load_dataset(dataset_path) + result_dicts = [] + llm = SharkLLM( + device=device, + vmfb_path=vmfb_path, + external_weight_path=external_weight_path, + streaming_llm=streaming_llm, + ) + for data in dataset: + llm.set_min_token(data["num_tokens_to_generate"]) + llm.set_max_token(data["num_tokens_to_generate"]) + running_token_decode_count = 0 + running_token_decode_time = 0.0 + running_token_prefill_count = 0 + running_token_prefill_time = 0.0 + for _ in range(data["num_iterations"]): + prompt = data["system_prompt"] + prompt = append_user_prompt(prompt, data["user_prompt"]) + initial_input = tokenizer(prompt, return_tensors="pt") + example_input_id = initial_input.input_ids + result = llm.generate(example_input_id) + bot_response = tokenizer.decode(result, skip_special_tokens=True) + running_token_decode_count += llm.last_num_tokens_decoded + running_token_decode_time += llm.last_prompt_decode_time + running_token_prefill_count += llm.last_num_tokens_prefill + running_token_prefill_time += llm.last_prefill_time + prefill_tokens = running_token_prefill_count / data["num_iterations"] + prefill_speed = running_token_prefill_count / running_token_prefill_time + decoded_tokens = running_token_decode_count / data["num_iterations"] - 1 + decode_speed = running_token_decode_count / running_token_decode_time + result_dicts.append( + { + "prompt_id": data["id"], + "system_prompt": data["system_prompt"], + "user_prompt": data["user_prompt"], + "prefill_tokens": prefill_tokens, + "prefill_speed(tok/s)": prefill_speed, + "decoded_tokens": decoded_tokens, + "decode_speed(tok/s)": decode_speed, + "num_iterations": data["num_iterations"], + "response": bot_response, + } + ) + with open(output_path, "w") as f: + json_results = json.dumps(result_dicts, indent=2) + f.write(json_results) + return output_path + + +if __name__ == "__main__": + args = parser.parse_args() + print("generating turbine output: ") + turbine_output_file = run_llm_benchmark( + args.device, + args.vmfb_path, + args.hf_model_name, + args.hf_auth_token, + args.external_weight_path, + args.benchmark_dataset_path, + args.benchmark_output_path, + args.streaming_llm, + ) diff --git a/python/turbine_models/custom_models/llama-benchmark/stateless_llama_benchmark.py b/python/turbine_models/custom_models/llama_benchmark/stateless_llama_benchmark.py similarity index 100% rename from python/turbine_models/custom_models/llama-benchmark/stateless_llama_benchmark.py rename to python/turbine_models/custom_models/llama_benchmark/stateless_llama_benchmark.py diff --git a/python/turbine_models/custom_models/llm_runner.py b/python/turbine_models/custom_models/llm_runner.py index 7632d1e65..19fa58a9e 100644 --- a/python/turbine_models/custom_models/llm_runner.py +++ b/python/turbine_models/custom_models/llm_runner.py @@ -104,6 +104,18 @@ def __init__(self, device, vmfb_path, external_weight_path, streaming_llm=False) self.last_prompt = None self.streaming_llm = streaming_llm self.prev_token_len = 0 + self.min_token = 0 + self.max_token = 1024 + self.last_prefill_time = -1.0 + self.last_prompt_decode_time = -1.0 + self.last_num_tokens_decoded = -1 + self.last_num_tokens_prefill = -1 + + def set_min_token(self, min_token): + self.min_token = min_token + + def set_max_token(self, max_token): + self.max_token = max_token def format_out(self, 
results): return torch.tensor(results.to_host()[0][0]) @@ -125,25 +137,23 @@ def generate(self, input_ids): input_ids = input_ids[:, token_slice:] inputs = [ireert.asdevicearray(self.runner.config.device, input_ids)] if self.first_input or not self.streaming_llm: - s = time.time() + prefill_start_time = time.time() results = self.model["run_initialize"](*inputs) # example_input_id - e = time.time() - print( - f"num_tokens: {token_len}, time_taken={e-s}, tok/second:{token_len/(e-s)}" - ) + prefill_end_time = time.time() + self.last_num_tokens_prefill = token_len + self.last_prefill_time = prefill_end_time - prefill_start_time token_len += 1 self.first_input = False else: - s = time.time() + prefill_start_time = time.time() results = self.model["run_cached_initialize"](*inputs) # example_input_id - e = time.time() - print( - f"Cached num_tokens: {token_len}, time_taken={e-s}, tok/second:{token_len/(e-s)}" - ) + prefill_end_time = time.time() + self.last_num_tokens_prefill = token_len + self.last_prefill_time = prefill_end_time - prefill_start_time token_len += 1 - s = time.time() + decode_start_time = time.time() turbine_results.append(self.format_out(results)) - while self.format_out(results) != 2: + for _ in range(self.max_token): if self.streaming_llm and self.model["get_seq_step"]() > 600: print("Evicting cache space!") self.model["evict_kvcache_space"]() @@ -151,11 +161,12 @@ def generate(self, input_ids): # uncomment to see tokens as they are emitted # print(f"turbine: {tokenizer.decode(self.format_out(results))}") turbine_results.append(self.format_out(results)) - e = time.time() + if self.format_out(results) == 2 and len(turbine_results) >= self.min_token: + break + decode_end_time = time.time() decoded_tokens = len(turbine_results) - print( - f"Decode num_tokens: {decoded_tokens}, time_taken={e-s}, tok/second:{decoded_tokens/(e-s)}" - ) + self.last_prompt_decode_time = decode_end_time - decode_start_time + self.last_num_tokens_decoded = decoded_tokens self.prev_token_len = token_len + decoded_tokens return turbine_results @@ -196,6 +207,12 @@ def run_llm( result = llm.generate(example_input_id) bot_response = tokenizer.decode(result, skip_special_tokens=True) print(f"\nBOT: {bot_response}\n") + print( + f"Prefill num_tokens : {llm.last_num_tokens_prefill}, time_taken: {llm.last_prefill_time}, tok/second: {llm.last_num_tokens_prefill/llm.last_prefill_time}" + ) + print( + f"Decode num_tokens : {llm.last_num_tokens_decoded}, time_taken: {llm.last_prompt_decode_time}, tok/second: {llm.last_num_tokens_decoded/llm.last_prompt_decode_time}" + ) prompt = append_bot_prompt(prompt, bot_response) diff --git a/python/turbine_models/tests/benchmark_prompt_test.json b/python/turbine_models/tests/benchmark_prompt_test.json new file mode 100644 index 000000000..9dd85e26d --- /dev/null +++ b/python/turbine_models/tests/benchmark_prompt_test.json @@ -0,0 +1,4 @@ +[ + {"id" : 0, "system_prompt": "[INST] <>\nBe concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n <>\n\n", "user_prompt" : "what is the capital of canada?", "num_iterations": 8, "num_tokens_to_generate": 20}, + {"id" : 1, "system_prompt": "[INST] <>\nYou are an AI assistant. You will be given a task. 
You must generate a detailed and long answer.\n <>\n\n", "user_prompt" : "Generate an approximately fifteen-word sentence that describes all this data: Midsummer House eatType restaurant; Midsummer House food Chinese; Midsummer House priceRange moderate; Midsummer House customer rating 3 out of 5; Midsummer House near All Bar One", "num_iterations": 5, "num_tokens_to_generate": 50} +] \ No newline at end of file diff --git a/python/turbine_models/tests/stateless_llama_test.py b/python/turbine_models/tests/stateless_llama_test.py index fc5bc9cd2..f4d703557 100644 --- a/python/turbine_models/tests/stateless_llama_test.py +++ b/python/turbine_models/tests/stateless_llama_test.py @@ -9,11 +9,12 @@ import os import unittest import difflib +import json os.environ["TORCH_LOGS"] = "dynamic" from shark_turbine.aot import * from turbine_models.custom_models import llm_runner - +from turbine_models.custom_models.llama_benchmark.e2e import llm_e2e_benchmark from turbine_models.gen_external_params.gen_external_params import ( gen_external_params, ) @@ -88,6 +89,46 @@ def test_vmfb_comparison(self): ) check_output_string(torch_str, turbine_str) + def test_benchmark_vmfb(self): + llama.export_transformer_model( + hf_model_name="Trelis/Llama-2-7b-chat-hf-function-calling-v2", + hf_auth_token=None, + compile_to="vmfb", + external_weights="safetensors", + # external_weight_file="Llama-2-7b-chat-hf-function-calling-v2_f16_int4.safetensors", Do not export weights because this doesn't get quantized + quantization=quantization, + precision=precision, + device="llvm-cpu", + target_triple="host", + ) + test_dataset_path = "python/turbine_models/tests/benchmark_prompt_test.json" + test_output_path = "benchmark_e2e_results.json" + benchmark_result_path = llm_e2e_benchmark.run_llm_benchmark( + "local-task", + "Llama_2_7b_chat_hf_function_calling_v2.vmfb", + "Trelis/Llama-2-7b-chat-hf-function-calling-v2", + None, + f"Llama_2_7b_chat_hf_function_calling_v2_{precision}_{quantization}.safetensors", + test_dataset_path, + test_output_path, + ) + benchmark_result = [] + with open(benchmark_result_path) as f: + benchmark_result = json.load(f) + if len(benchmark_result) <= 0: + raise ValueError("Dataset is empty, or did not read dataset correctly.") + # Test result for prompt #1 + assert(benchmark_result[0]["decoded_tokens"] == 20) + assert(benchmark_result[0]["num_iterations"] == 8) + assert(benchmark_result[0]["decode_speed(tok/s)"] > 0) + assert(benchmark_result[0]["prefill_speed(tok/s)"] > 0) + # Test result for prompt #2 + assert(benchmark_result[1]["decoded_tokens"] == 50) + assert(benchmark_result[1]["num_iterations"] == 5) + assert(benchmark_result[1]["decode_speed(tok/s)"] > 0) + assert(benchmark_result[1]["prefill_speed(tok/s)"] > 0) + + def test_streaming_vmfb_comparison(self): """ Similar test to above but for streaming-LLM. From 09f030da534e0372da8f80cbf5407a54c1cfb671 Mon Sep 17 00:00:00 2001 From: stanley-nod Date: Thu, 25 Jan 2024 13:13:54 -0800 Subject: [PATCH 2/6] fix black lint. 
--- .../tests/stateless_llama_test.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/python/turbine_models/tests/stateless_llama_test.py b/python/turbine_models/tests/stateless_llama_test.py index f4d703557..0c55105ae 100644 --- a/python/turbine_models/tests/stateless_llama_test.py +++ b/python/turbine_models/tests/stateless_llama_test.py @@ -118,16 +118,15 @@ def test_benchmark_vmfb(self): if len(benchmark_result) <= 0: raise ValueError("Dataset is empty, or did not read dataset correctly.") # Test result for prompt #1 - assert(benchmark_result[0]["decoded_tokens"] == 20) - assert(benchmark_result[0]["num_iterations"] == 8) - assert(benchmark_result[0]["decode_speed(tok/s)"] > 0) - assert(benchmark_result[0]["prefill_speed(tok/s)"] > 0) + assert benchmark_result[0]["decoded_tokens"] == 20 + assert benchmark_result[0]["num_iterations"] == 8 + assert benchmark_result[0]["decode_speed(tok/s)"] > 0 + assert benchmark_result[0]["prefill_speed(tok/s)"] > 0 # Test result for prompt #2 - assert(benchmark_result[1]["decoded_tokens"] == 50) - assert(benchmark_result[1]["num_iterations"] == 5) - assert(benchmark_result[1]["decode_speed(tok/s)"] > 0) - assert(benchmark_result[1]["prefill_speed(tok/s)"] > 0) - + assert benchmark_result[1]["decoded_tokens"] == 50 + assert benchmark_result[1]["num_iterations"] == 5 + assert benchmark_result[1]["decode_speed(tok/s)"] > 0 + assert benchmark_result[1]["prefill_speed(tok/s)"] > 0 def test_streaming_vmfb_comparison(self): """ From a44c364ef143ae85016fdd3be1e356dd3564b19a Mon Sep 17 00:00:00 2001 From: stanley-nod Date: Thu, 25 Jan 2024 15:40:15 -0800 Subject: [PATCH 3/6] Reuse vmfb when possible. --- .../tests/stateless_llama_test.py | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/python/turbine_models/tests/stateless_llama_test.py b/python/turbine_models/tests/stateless_llama_test.py index 0c55105ae..5510da12f 100644 --- a/python/turbine_models/tests/stateless_llama_test.py +++ b/python/turbine_models/tests/stateless_llama_test.py @@ -90,17 +90,19 @@ def test_vmfb_comparison(self): check_output_string(torch_str, turbine_str) def test_benchmark_vmfb(self): - llama.export_transformer_model( - hf_model_name="Trelis/Llama-2-7b-chat-hf-function-calling-v2", - hf_auth_token=None, - compile_to="vmfb", - external_weights="safetensors", - # external_weight_file="Llama-2-7b-chat-hf-function-calling-v2_f16_int4.safetensors", Do not export weights because this doesn't get quantized - quantization=quantization, - precision=precision, - device="llvm-cpu", - target_triple="host", - ) + vmfb_name = "Llama_2_7b_chat_hf_function_calling_v2.vmfb" + if not os.path.isfile(vmfb_name): + llama.export_transformer_model( + hf_model_name="Trelis/Llama-2-7b-chat-hf-function-calling-v2", + hf_auth_token=None, + compile_to="vmfb", + external_weights="safetensors", + # external_weight_file="Llama-2-7b-chat-hf-function-calling-v2_f16_int4.safetensors", Do not export weights because this doesn't get quantized + quantization=quantization, + precision=precision, + device="llvm-cpu", + target_triple="host", + ) test_dataset_path = "python/turbine_models/tests/benchmark_prompt_test.json" test_output_path = "benchmark_e2e_results.json" benchmark_result_path = llm_e2e_benchmark.run_llm_benchmark( From 47a817a0043ebdb8fbf68ce7535615473ba44929 Mon Sep 17 00:00:00 2001 From: stanley-nod Date: Thu, 25 Jan 2024 16:24:31 -0800 Subject: [PATCH 4/6] Reduce num iteration for benchmark test. 
--- python/turbine_models/tests/benchmark_prompt_test.json | 4 ++-- python/turbine_models/tests/stateless_llama_test.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/turbine_models/tests/benchmark_prompt_test.json b/python/turbine_models/tests/benchmark_prompt_test.json index 9dd85e26d..5cc388161 100644 --- a/python/turbine_models/tests/benchmark_prompt_test.json +++ b/python/turbine_models/tests/benchmark_prompt_test.json @@ -1,4 +1,4 @@ [ - {"id" : 0, "system_prompt": "[INST] <>\nBe concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n <>\n\n", "user_prompt" : "what is the capital of canada?", "num_iterations": 8, "num_tokens_to_generate": 20}, - {"id" : 1, "system_prompt": "[INST] <>\nYou are an AI assistant. You will be given a task. You must generate a detailed and long answer.\n <>\n\n", "user_prompt" : "Generate an approximately fifteen-word sentence that describes all this data: Midsummer House eatType restaurant; Midsummer House food Chinese; Midsummer House priceRange moderate; Midsummer House customer rating 3 out of 5; Midsummer House near All Bar One", "num_iterations": 5, "num_tokens_to_generate": 50} + {"id" : 0, "system_prompt": "[INST] <>\nBe concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n <>\n\n", "user_prompt" : "what is the capital of canada?", "num_iterations": 2, "num_tokens_to_generate": 20}, + {"id" : 1, "system_prompt": "[INST] <>\nYou are an AI assistant. You will be given a task. You must generate a detailed and long answer.\n <>\n\n", "user_prompt" : "Generate an approximately fifteen-word sentence that describes all this data: Midsummer House eatType restaurant; Midsummer House food Chinese; Midsummer House priceRange moderate; Midsummer House customer rating 3 out of 5; Midsummer House near All Bar One", "num_iterations": 1, "num_tokens_to_generate": 50} ] \ No newline at end of file diff --git a/python/turbine_models/tests/stateless_llama_test.py b/python/turbine_models/tests/stateless_llama_test.py index 5510da12f..aaa696e37 100644 --- a/python/turbine_models/tests/stateless_llama_test.py +++ b/python/turbine_models/tests/stateless_llama_test.py @@ -121,12 +121,12 @@ def test_benchmark_vmfb(self): raise ValueError("Dataset is empty, or did not read dataset correctly.") # Test result for prompt #1 assert benchmark_result[0]["decoded_tokens"] == 20 - assert benchmark_result[0]["num_iterations"] == 8 + assert benchmark_result[0]["num_iterations"] == 2 assert benchmark_result[0]["decode_speed(tok/s)"] > 0 assert benchmark_result[0]["prefill_speed(tok/s)"] > 0 # Test result for prompt #2 assert benchmark_result[1]["decoded_tokens"] == 50 - assert benchmark_result[1]["num_iterations"] == 5 + assert benchmark_result[1]["num_iterations"] == 1 assert benchmark_result[1]["decode_speed(tok/s)"] > 0 assert benchmark_result[1]["prefill_speed(tok/s)"] > 0 From a05eff525fba713b9b01fa94ff32983de070cbd1 Mon Sep 17 00:00:00 2001 From: stanley-nod Date: Thu, 25 Jan 2024 19:31:17 -0800 Subject: [PATCH 5/6] Further reduce to bare num tokens and iteration for fn test. 
--- python/turbine_models/tests/benchmark_prompt_test.json | 4 ++-- python/turbine_models/tests/stateless_llama_test.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/turbine_models/tests/benchmark_prompt_test.json b/python/turbine_models/tests/benchmark_prompt_test.json index 5cc388161..f97b8187f 100644 --- a/python/turbine_models/tests/benchmark_prompt_test.json +++ b/python/turbine_models/tests/benchmark_prompt_test.json @@ -1,4 +1,4 @@ [ - {"id" : 0, "system_prompt": "[INST] <>\nBe concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n <>\n\n", "user_prompt" : "what is the capital of canada?", "num_iterations": 2, "num_tokens_to_generate": 20}, - {"id" : 1, "system_prompt": "[INST] <>\nYou are an AI assistant. You will be given a task. You must generate a detailed and long answer.\n <>\n\n", "user_prompt" : "Generate an approximately fifteen-word sentence that describes all this data: Midsummer House eatType restaurant; Midsummer House food Chinese; Midsummer House priceRange moderate; Midsummer House customer rating 3 out of 5; Midsummer House near All Bar One", "num_iterations": 1, "num_tokens_to_generate": 50} + {"id" : 0, "system_prompt": "[INST] <>\nBe concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n <>\n\n", "user_prompt" : "what is the capital of canada?", "num_iterations": 2, "num_tokens_to_generate": 10}, + {"id" : 1, "system_prompt": "[INST] <>\nYou are an AI assistant. You will be given a task. You must generate a detailed and long answer.\n <>\n\n", "user_prompt" : "Generate an approximately fifteen-word sentence that describes all this data: Midsummer House eatType restaurant; Midsummer House food Chinese; Midsummer House priceRange moderate; Midsummer House customer rating 3 out of 5; Midsummer House near All Bar One", "num_iterations": 1, "num_tokens_to_generate": 20} ] \ No newline at end of file diff --git a/python/turbine_models/tests/stateless_llama_test.py b/python/turbine_models/tests/stateless_llama_test.py index aaa696e37..c94a755de 100644 --- a/python/turbine_models/tests/stateless_llama_test.py +++ b/python/turbine_models/tests/stateless_llama_test.py @@ -120,12 +120,12 @@ def test_benchmark_vmfb(self): if len(benchmark_result) <= 0: raise ValueError("Dataset is empty, or did not read dataset correctly.") # Test result for prompt #1 - assert benchmark_result[0]["decoded_tokens"] == 20 + assert benchmark_result[0]["decoded_tokens"] == 10 assert benchmark_result[0]["num_iterations"] == 2 assert benchmark_result[0]["decode_speed(tok/s)"] > 0 assert benchmark_result[0]["prefill_speed(tok/s)"] > 0 # Test result for prompt #2 - assert benchmark_result[1]["decoded_tokens"] == 50 + assert benchmark_result[1]["decoded_tokens"] == 20 assert benchmark_result[1]["num_iterations"] == 1 assert benchmark_result[1]["decode_speed(tok/s)"] > 0 assert benchmark_result[1]["prefill_speed(tok/s)"] > 0 From 41e1d8dcf827444f8ec260c1c296da77aeeedfc4 Mon Sep 17 00:00:00 2001 From: stanley-nod Date: Thu, 25 Jan 2024 20:51:37 -0800 Subject: [PATCH 6/6] Fix to num iters. 
--- python/turbine_models/tests/benchmark_prompt_test.json | 2 +- python/turbine_models/tests/stateless_llama_test.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/turbine_models/tests/benchmark_prompt_test.json b/python/turbine_models/tests/benchmark_prompt_test.json index f97b8187f..39d34fd3b 100644 --- a/python/turbine_models/tests/benchmark_prompt_test.json +++ b/python/turbine_models/tests/benchmark_prompt_test.json @@ -1,4 +1,4 @@ [ {"id" : 0, "system_prompt": "[INST] <>\nBe concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n <>\n\n", "user_prompt" : "what is the capital of canada?", "num_iterations": 2, "num_tokens_to_generate": 10}, - {"id" : 1, "system_prompt": "[INST] <>\nYou are an AI assistant. You will be given a task. You must generate a detailed and long answer.\n <>\n\n", "user_prompt" : "Generate an approximately fifteen-word sentence that describes all this data: Midsummer House eatType restaurant; Midsummer House food Chinese; Midsummer House priceRange moderate; Midsummer House customer rating 3 out of 5; Midsummer House near All Bar One", "num_iterations": 1, "num_tokens_to_generate": 20} + {"id" : 1, "system_prompt": "[INST] <>\nYou are an AI assistant. You will be given a task. You must generate a detailed and long answer.\n <>\n\n", "user_prompt" : "Generate an approximately fifteen-word sentence that describes all this data: Midsummer House eatType restaurant; Midsummer House food Chinese; Midsummer House priceRange moderate; Midsummer House customer rating 3 out of 5; Midsummer House near All Bar One", "num_iterations": 1, "num_tokens_to_generate": 25} ] \ No newline at end of file diff --git a/python/turbine_models/tests/stateless_llama_test.py b/python/turbine_models/tests/stateless_llama_test.py index c94a755de..4a264473d 100644 --- a/python/turbine_models/tests/stateless_llama_test.py +++ b/python/turbine_models/tests/stateless_llama_test.py @@ -125,7 +125,7 @@ def test_benchmark_vmfb(self): assert benchmark_result[0]["decode_speed(tok/s)"] > 0 assert benchmark_result[0]["prefill_speed(tok/s)"] > 0 # Test result for prompt #2 - assert benchmark_result[1]["decoded_tokens"] == 20 + assert benchmark_result[1]["decoded_tokens"] == 25 assert benchmark_result[1]["num_iterations"] == 1 assert benchmark_result[1]["decode_speed(tok/s)"] > 0 assert benchmark_result[1]["prefill_speed(tok/s)"] > 0
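For reference, the e2e README in this series lists the attributes added to the benchmark output. A single entry in the `benchmark_e2e_results.json` written by `run_llm_benchmark` would look roughly like the sketch below; the keys match those emitted by the script, but the values are purely illustrative, not measured results:

```json
[
  {
    "prompt_id": 0,
    "system_prompt": "[INST] <>\nBe concise. ...\n <>\n\n",
    "user_prompt": "what is the capital of canada?",
    "prefill_tokens": 130.0,
    "prefill_speed(tok/s)": 250.0,
    "decoded_tokens": 20.0,
    "decode_speed(tok/s)": 12.5,
    "num_iterations": 8,
    "response": "The capital of Canada is Ottawa."
  }
]
```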