diff --git a/.gitignore b/.gitignore index d85c8598b..663e9d93e 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,6 @@ wheelhouse *.safetensors *.gguf *.vmfb + +#Benchmark artifacts +benchmark_e2e_results.json diff --git a/python/turbine_models/custom_models/llama-benchmark/README.md b/python/turbine_models/custom_models/llama_benchmark/README.md similarity index 100% rename from python/turbine_models/custom_models/llama-benchmark/README.md rename to python/turbine_models/custom_models/llama_benchmark/README.md diff --git a/python/turbine_models/custom_models/llama_benchmark/__init__.py b/python/turbine_models/custom_models/llama_benchmark/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/python/turbine_models/custom_models/llama-benchmark/benchmark.mlir b/python/turbine_models/custom_models/llama_benchmark/benchmark.mlir similarity index 100% rename from python/turbine_models/custom_models/llama-benchmark/benchmark.mlir rename to python/turbine_models/custom_models/llama_benchmark/benchmark.mlir diff --git a/python/turbine_models/custom_models/llama-benchmark/benchmark_forward.mlir b/python/turbine_models/custom_models/llama_benchmark/benchmark_forward.mlir similarity index 100% rename from python/turbine_models/custom_models/llama-benchmark/benchmark_forward.mlir rename to python/turbine_models/custom_models/llama_benchmark/benchmark_forward.mlir diff --git a/python/turbine_models/custom_models/llama-benchmark/benchmark_module.py b/python/turbine_models/custom_models/llama_benchmark/benchmark_module.py similarity index 100% rename from python/turbine_models/custom_models/llama-benchmark/benchmark_module.py rename to python/turbine_models/custom_models/llama_benchmark/benchmark_module.py diff --git a/python/turbine_models/custom_models/llama_benchmark/e2e/README.md b/python/turbine_models/custom_models/llama_benchmark/e2e/README.md new file mode 100644 index 000000000..6432b9e60 --- /dev/null +++ b/python/turbine_models/custom_models/llama_benchmark/e2e/README.md @@ -0,0 +1,65 @@ +# Instructions + +Clone and install SHARK-Turbine +``` +git clone https://github.com/nod-ai/SHARK-Turbine.git +cd SHARK-Turbine +python -m venv turbine_venv && source turbine_venv/bin/activate + +pip install --upgrade -r requirements.txt +pip install --upgrade -e .[torch-cpu-nightly,testing] +pip install --upgrade -r turbine-models-requirements.txt +``` + +## Compiling LLMs +Note: Make sure to replace "your_token" with your actual hf_auth_token for all the commands. + +Now, you can generate the quantized weight file with +``` +python python/turbine_models/gen_external_params/gen_external_params.py --hf_auth_token=your_token +``` +The model weights will then be saved in the current directory as `Llama_2_7b_chat_hf_f16_int4.safetensors`. + +To compile to vmfb for llama +``` +python python/turbine_models/custom_models/stateless_llama.py --compile_to=vmfb --hf_auth_token=your_token --external_weights="safetensors" --quantization="int4" --precision="f16" +``` +By default the vmfb will be saved as `Llama_2_7b_chat_hf.vmfb`. 
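+
+As a quick sanity check before benchmarking, you can chat with the compiled model using the interactive runner; with this change, `llm_runner.py` also prints prefill and decode statistics for each turn. The invocation below is a suggested sketch that assumes the shared `llm_runner.py` flags also used by the benchmark script (adjust the device, paths, and token as needed):
+```
+python python/turbine_models/custom_models/llm_runner.py --vmfb_path=Llama_2_7b_chat_hf.vmfb --external_weight_path=Llama_2_7b_chat_hf_f16_int4.safetensors --device=vulkan --hf_auth_token=your_token
+```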
+
+## Benchmarking LLMs e2e
+To run the benchmark with the default benchmark dataset, just run:
+```
+python python/turbine_models/custom_models/llama_benchmark/e2e/llm_e2e_benchmark.py --vmfb_path=/path/to/Llama_2_7b_chat_hf.vmfb --external_weight_path=Llama_2_7b_chat_hf_f16_int4.safetensors --device=vulkan --hf_auth_token=your_hf_token
+```
+You can point the benchmark at a custom dataset with: `--benchmark_dataset_path=/path/to/dataset.json`
+You can choose where the results are written with: `--benchmark_output_path=/path/to/output.json`
+
+## Benchmarking Dataset
+
+To set up a dataset, provide a JSON file containing a list of entries with these attributes:
+1. id : number identifying the example (int)
+2. system_prompt : system prompt used to align the LLM (str)
+3. user_prompt : example user query (str)
+4. num_iterations : number of times to run/benchmark this particular example (int)
+5. num_tokens_to_generate : number of tokens to generate for this example (int)
+
+Here is a sample:
+```json
+[
+    {"id" : 0,
+     "system_prompt": "[INST] <>\nBe concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n <>\n\n",
+     "user_prompt" : "what is the capital of canada?",
+     "num_iterations": 8,
+     "num_tokens_to_generate": 20}
+]
+```
+
+The default dataset in `benchmark_prompts.json` contains the examples that SHARK-1.0 traditionally measures. It also includes prompts similar to those used in MLPerf, which draws on the Open Orca dataset; in the future, we should add more Open Orca data to the benchmark.
+
+## Benchmarking Output
+
+The output JSON carries the same attributes as the input dataset, plus the measured results. The additional attributes are:
+1. prefill_tokens : number of tokens processed during the prefill stage, averaged over the iterations (float)
+2. prefill_speed(tok/s) : number of initial input tokens divided by the time taken to complete prefill (float)
+3. decoded_tokens : number of tokens generated during the decode stage, averaged over the iterations (float)
+4. decode_speed(tok/s) : average decode speed in tokens per second for this example, averaged over the iterations (float)
diff --git a/python/turbine_models/custom_models/llama_benchmark/e2e/__init__.py b/python/turbine_models/custom_models/llama_benchmark/e2e/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/python/turbine_models/custom_models/llama_benchmark/e2e/benchmark_prompts.json b/python/turbine_models/custom_models/llama_benchmark/e2e/benchmark_prompts.json
new file mode 100644
index 000000000..9c8c72346
--- /dev/null
+++ b/python/turbine_models/custom_models/llama_benchmark/e2e/benchmark_prompts.json
@@ -0,0 +1,7 @@
+[
+  {"id" : 0, "system_prompt": "hi", "user_prompt" : "", "num_iterations": 5, "num_tokens_to_generate": 512},
+  {"id" : 1, "system_prompt": "[INST] <>\nBe concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n <>\n\n", "user_prompt" : "what is the capital of canada?", "num_iterations": 8, "num_tokens_to_generate": 20},
+  {"id" : 2, "system_prompt": "[INST] <>\nBe concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n <>\n\n", "user_prompt" : "what is the capital of canada and what is it famous for", "num_iterations": 5, "num_tokens_to_generate": 255},
+  {"id" : 3, "system_prompt": "[INST] <>\nYou are an AI assistant. You will be given a task. You must generate a detailed and long answer.\n <>\n\n", "user_prompt" : "Generate an approximately fifteen-word sentence that describes all this data: Midsummer House eatType restaurant; Midsummer House food Chinese; Midsummer House priceRange moderate; Midsummer House customer rating 3 out of 5; Midsummer House near All Bar One", "num_iterations": 5, "num_tokens_to_generate": 50},
+  {"id" : 4, "system_prompt": "[INST] <>\nYou are an AI assistant that helps people find information. \n <>\n\n", "user_prompt" : "James runs a TV show and there are 5 main characters and 4 minor characters. He pays the minor characters $15,000 each episode. He paid the major characters three times as much. How much does he pay per episode? Let's be accurate as possible.", "num_iterations": 5, "num_tokens_to_generate": 255}
+]
\ No newline at end of file
diff --git a/python/turbine_models/custom_models/llama_benchmark/e2e/llm_e2e_benchmark.py b/python/turbine_models/custom_models/llama_benchmark/e2e/llm_e2e_benchmark.py
new file mode 100644
index 000000000..f7974dbe3
--- /dev/null
+++ b/python/turbine_models/custom_models/llama_benchmark/e2e/llm_e2e_benchmark.py
@@ -0,0 +1,127 @@
+import argparse
+from turbine_models.model_runner import vmfbRunner
+from transformers import AutoTokenizer
+from iree import runtime as ireert
+import torch
+import time
+from turbine_models.custom_models.llm_optimizations.streaming_llm.modify_llama import (
+    enable_llama_pos_shift_attention,
+)
+from turbine_models.custom_models.llm_runner import parser, SharkLLM
+import os
+import json
+
+parser.add_argument(
+    "--benchmark_dataset_path",
+    type=str,
+    default=f"{os.path.dirname(os.path.realpath(__file__))}/benchmark_prompts.json",
+    help="path to benchmarking dataset",
+)
+parser.add_argument(
+    "--benchmark_output_path",
+    type=str,
+    default=f"{os.getcwd()}/benchmark_e2e_results.json",
+    help="path to write benchmark results to",
+)
+
+
+B_INST, E_INST = "[INST]", "[/INST]"
+
+
+def append_user_prompt(history, input_prompt):
+    if len(input_prompt) == 0:
+        return history
+    user_prompt = f"{B_INST} {input_prompt} {E_INST}"
+    history += user_prompt
+    return history
+
+
+def load_dataset(dataset_path):
+    dataset = []
+    with open(dataset_path) as f:
+        dataset = json.load(f)
+    if len(dataset) <= 0:
+        raise ValueError("Dataset is empty, or did not read dataset correctly.")
+    return dataset
+
+
+def run_llm_benchmark(
+    device,
+    vmfb_path,
+    hf_model_name,
+    hf_auth_token,
+    external_weight_path,
+    dataset_path,
+    output_path,
+    streaming_llm=False,
+):
+    # TODO: Support streamingLLM benchmarking; requires streamingLLM to be able to reset history/seq_len to 0.
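+    # For each dataset entry: run `num_iterations` generations, accumulate the
+    # prefill/decode token counts and timings recorded on the SharkLLM instance,
+    # then report per-iteration averages and tokens/second in the results JSON.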
+ if streaming_llm: + raise ValueError("Streaming LLM currently not supported for benchmarking.") + tokenizer = AutoTokenizer.from_pretrained( + hf_model_name, + use_fast=False, + token=hf_auth_token, + ) + dataset = load_dataset(dataset_path) + result_dicts = [] + llm = SharkLLM( + device=device, + vmfb_path=vmfb_path, + external_weight_path=external_weight_path, + streaming_llm=streaming_llm, + ) + for data in dataset: + llm.set_min_token(data["num_tokens_to_generate"]) + llm.set_max_token(data["num_tokens_to_generate"]) + running_token_decode_count = 0 + running_token_decode_time = 0.0 + running_token_prefill_count = 0 + running_token_prefill_time = 0.0 + for _ in range(data["num_iterations"]): + prompt = data["system_prompt"] + prompt = append_user_prompt(prompt, data["user_prompt"]) + initial_input = tokenizer(prompt, return_tensors="pt") + example_input_id = initial_input.input_ids + result = llm.generate(example_input_id) + bot_response = tokenizer.decode(result, skip_special_tokens=True) + running_token_decode_count += llm.last_num_tokens_decoded + running_token_decode_time += llm.last_prompt_decode_time + running_token_prefill_count += llm.last_num_tokens_prefill + running_token_prefill_time += llm.last_prefill_time + prefill_tokens = running_token_prefill_count / data["num_iterations"] + prefill_speed = running_token_prefill_count / running_token_prefill_time + decoded_tokens = running_token_decode_count / data["num_iterations"] - 1 + decode_speed = running_token_decode_count / running_token_decode_time + result_dicts.append( + { + "prompt_id": data["id"], + "system_prompt": data["system_prompt"], + "user_prompt": data["user_prompt"], + "prefill_tokens": prefill_tokens, + "prefill_speed(tok/s)": prefill_speed, + "decoded_tokens": decoded_tokens, + "decode_speed(tok/s)": decode_speed, + "num_iterations": data["num_iterations"], + "response": bot_response, + } + ) + with open(output_path, "w") as f: + json_results = json.dumps(result_dicts, indent=2) + f.write(json_results) + return output_path + + +if __name__ == "__main__": + args = parser.parse_args() + print("generating turbine output: ") + turbine_output_file = run_llm_benchmark( + args.device, + args.vmfb_path, + args.hf_model_name, + args.hf_auth_token, + args.external_weight_path, + args.benchmark_dataset_path, + args.benchmark_output_path, + args.streaming_llm, + ) diff --git a/python/turbine_models/custom_models/llama-benchmark/stateless_llama_benchmark.py b/python/turbine_models/custom_models/llama_benchmark/stateless_llama_benchmark.py similarity index 100% rename from python/turbine_models/custom_models/llama-benchmark/stateless_llama_benchmark.py rename to python/turbine_models/custom_models/llama_benchmark/stateless_llama_benchmark.py diff --git a/python/turbine_models/custom_models/llm_runner.py b/python/turbine_models/custom_models/llm_runner.py index 7632d1e65..19fa58a9e 100644 --- a/python/turbine_models/custom_models/llm_runner.py +++ b/python/turbine_models/custom_models/llm_runner.py @@ -104,6 +104,18 @@ def __init__(self, device, vmfb_path, external_weight_path, streaming_llm=False) self.last_prompt = None self.streaming_llm = streaming_llm self.prev_token_len = 0 + self.min_token = 0 + self.max_token = 1024 + self.last_prefill_time = -1.0 + self.last_prompt_decode_time = -1.0 + self.last_num_tokens_decoded = -1 + self.last_num_tokens_prefill = -1 + + def set_min_token(self, min_token): + self.min_token = min_token + + def set_max_token(self, max_token): + self.max_token = max_token def format_out(self, 
results): return torch.tensor(results.to_host()[0][0]) @@ -125,25 +137,23 @@ def generate(self, input_ids): input_ids = input_ids[:, token_slice:] inputs = [ireert.asdevicearray(self.runner.config.device, input_ids)] if self.first_input or not self.streaming_llm: - s = time.time() + prefill_start_time = time.time() results = self.model["run_initialize"](*inputs) # example_input_id - e = time.time() - print( - f"num_tokens: {token_len}, time_taken={e-s}, tok/second:{token_len/(e-s)}" - ) + prefill_end_time = time.time() + self.last_num_tokens_prefill = token_len + self.last_prefill_time = prefill_end_time - prefill_start_time token_len += 1 self.first_input = False else: - s = time.time() + prefill_start_time = time.time() results = self.model["run_cached_initialize"](*inputs) # example_input_id - e = time.time() - print( - f"Cached num_tokens: {token_len}, time_taken={e-s}, tok/second:{token_len/(e-s)}" - ) + prefill_end_time = time.time() + self.last_num_tokens_prefill = token_len + self.last_prefill_time = prefill_end_time - prefill_start_time token_len += 1 - s = time.time() + decode_start_time = time.time() turbine_results.append(self.format_out(results)) - while self.format_out(results) != 2: + for _ in range(self.max_token): if self.streaming_llm and self.model["get_seq_step"]() > 600: print("Evicting cache space!") self.model["evict_kvcache_space"]() @@ -151,11 +161,12 @@ def generate(self, input_ids): # uncomment to see tokens as they are emitted # print(f"turbine: {tokenizer.decode(self.format_out(results))}") turbine_results.append(self.format_out(results)) - e = time.time() + if self.format_out(results) == 2 and len(turbine_results) >= self.min_token: + break + decode_end_time = time.time() decoded_tokens = len(turbine_results) - print( - f"Decode num_tokens: {decoded_tokens}, time_taken={e-s}, tok/second:{decoded_tokens/(e-s)}" - ) + self.last_prompt_decode_time = decode_end_time - decode_start_time + self.last_num_tokens_decoded = decoded_tokens self.prev_token_len = token_len + decoded_tokens return turbine_results @@ -196,6 +207,12 @@ def run_llm( result = llm.generate(example_input_id) bot_response = tokenizer.decode(result, skip_special_tokens=True) print(f"\nBOT: {bot_response}\n") + print( + f"Prefill num_tokens : {llm.last_num_tokens_prefill}, time_taken: {llm.last_prefill_time}, tok/second: {llm.last_num_tokens_prefill/llm.last_prefill_time}" + ) + print( + f"Decode num_tokens : {llm.last_num_tokens_decoded}, time_taken: {llm.last_prompt_decode_time}, tok/second: {llm.last_num_tokens_decoded/llm.last_prompt_decode_time}" + ) prompt = append_bot_prompt(prompt, bot_response) diff --git a/python/turbine_models/tests/benchmark_prompt_test.json b/python/turbine_models/tests/benchmark_prompt_test.json new file mode 100644 index 000000000..39d34fd3b --- /dev/null +++ b/python/turbine_models/tests/benchmark_prompt_test.json @@ -0,0 +1,4 @@ +[ + {"id" : 0, "system_prompt": "[INST] <>\nBe concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n <>\n\n", "user_prompt" : "what is the capital of canada?", "num_iterations": 2, "num_tokens_to_generate": 10}, + {"id" : 1, "system_prompt": "[INST] <>\nYou are an AI assistant. You will be given a task. 
You must generate a detailed and long answer.\n <>\n\n", "user_prompt" : "Generate an approximately fifteen-word sentence that describes all this data: Midsummer House eatType restaurant; Midsummer House food Chinese; Midsummer House priceRange moderate; Midsummer House customer rating 3 out of 5; Midsummer House near All Bar One", "num_iterations": 1, "num_tokens_to_generate": 25}
+]
\ No newline at end of file
diff --git a/python/turbine_models/tests/stateless_llama_test.py b/python/turbine_models/tests/stateless_llama_test.py
index fc5bc9cd2..4a264473d 100644
--- a/python/turbine_models/tests/stateless_llama_test.py
+++ b/python/turbine_models/tests/stateless_llama_test.py
@@ -9,11 +9,12 @@
 import os
 import unittest
 import difflib
+import json
 
 os.environ["TORCH_LOGS"] = "dynamic"
 from shark_turbine.aot import *
 from turbine_models.custom_models import llm_runner
-
+from turbine_models.custom_models.llama_benchmark.e2e import llm_e2e_benchmark
 from turbine_models.gen_external_params.gen_external_params import (
     gen_external_params,
 )
@@ -88,6 +89,47 @@ def test_vmfb_comparison(self):
         )
         check_output_string(torch_str, turbine_str)
 
+    def test_benchmark_vmfb(self):
+        vmfb_name = "Llama_2_7b_chat_hf_function_calling_v2.vmfb"
+        if not os.path.isfile(vmfb_name):
+            llama.export_transformer_model(
+                hf_model_name="Trelis/Llama-2-7b-chat-hf-function-calling-v2",
+                hf_auth_token=None,
+                compile_to="vmfb",
+                external_weights="safetensors",
+                # external_weight_file="Llama-2-7b-chat-hf-function-calling-v2_f16_int4.safetensors", Do not export weights because this doesn't get quantized
+                quantization=quantization,
+                precision=precision,
+                device="llvm-cpu",
+                target_triple="host",
+            )
+        test_dataset_path = "python/turbine_models/tests/benchmark_prompt_test.json"
+        test_output_path = "benchmark_e2e_results.json"
+        benchmark_result_path = llm_e2e_benchmark.run_llm_benchmark(
+            "local-task",
+            vmfb_name,
+            "Trelis/Llama-2-7b-chat-hf-function-calling-v2",
+            None,
+            f"Llama_2_7b_chat_hf_function_calling_v2_{precision}_{quantization}.safetensors",
+            test_dataset_path,
+            test_output_path,
+        )
+        benchmark_result = []
+        with open(benchmark_result_path) as f:
+            benchmark_result = json.load(f)
+        if len(benchmark_result) <= 0:
+            raise ValueError("Benchmark results are empty, or the results file was not read correctly.")
+        # Test result for prompt #1
+        assert benchmark_result[0]["decoded_tokens"] == 10
+        assert benchmark_result[0]["num_iterations"] == 2
+        assert benchmark_result[0]["decode_speed(tok/s)"] > 0
+        assert benchmark_result[0]["prefill_speed(tok/s)"] > 0
+        # Test result for prompt #2
+        assert benchmark_result[1]["decoded_tokens"] == 25
+        assert benchmark_result[1]["num_iterations"] == 1
+        assert benchmark_result[1]["decode_speed(tok/s)"] > 0
+        assert benchmark_result[1]["prefill_speed(tok/s)"] > 0
+
     def test_streaming_vmfb_comparison(self):
         """
         Similar test to above but for streaming-LLM.