From d4e7db2f9b7811f7963f1e76bb1313d7610d0a66 Mon Sep 17 00:00:00 2001 From: stanley-nod Date: Wed, 24 Jan 2024 23:49:40 -0800 Subject: [PATCH 1/6] Auto e2e benchmarker. --- .gitignore | 3 + .../README.md | 0 .../custom_models/llama_benchmark/__init__.py | 0 .../benchmark.mlir | 0 .../benchmark_forward.mlir | 0 .../benchmark_module.py | 0 .../llama_benchmark/e2e/README.md | 65 +++++++++ .../llama_benchmark/e2e/__init__.py | 0 .../e2e/benchmark_prompts.json | 7 + .../llama_benchmark/e2e/llm_e2e_benchmark.py | 127 ++++++++++++++++++ .../stateless_llama_benchmark.py | 0 .../custom_models/llm_runner.py | 49 ++++--- .../tests/benchmark_prompt_test.json | 4 + .../tests/stateless_llama_test.py | 43 +++++- 14 files changed, 281 insertions(+), 17 deletions(-) rename python/turbine_models/custom_models/{llama-benchmark => llama_benchmark}/README.md (100%) create mode 100644 python/turbine_models/custom_models/llama_benchmark/__init__.py rename python/turbine_models/custom_models/{llama-benchmark => llama_benchmark}/benchmark.mlir (100%) rename python/turbine_models/custom_models/{llama-benchmark => llama_benchmark}/benchmark_forward.mlir (100%) rename python/turbine_models/custom_models/{llama-benchmark => llama_benchmark}/benchmark_module.py (100%) create mode 100644 python/turbine_models/custom_models/llama_benchmark/e2e/README.md create mode 100644 python/turbine_models/custom_models/llama_benchmark/e2e/__init__.py create mode 100644 python/turbine_models/custom_models/llama_benchmark/e2e/benchmark_prompts.json create mode 100644 python/turbine_models/custom_models/llama_benchmark/e2e/llm_e2e_benchmark.py rename python/turbine_models/custom_models/{llama-benchmark => llama_benchmark}/stateless_llama_benchmark.py (100%) create mode 100644 python/turbine_models/tests/benchmark_prompt_test.json diff --git a/.gitignore b/.gitignore index d85c8598b..663e9d93e 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,6 @@ wheelhouse *.safetensors *.gguf *.vmfb + +#Benchmark artifacts +benchmark_e2e_results.json diff --git a/python/turbine_models/custom_models/llama-benchmark/README.md b/python/turbine_models/custom_models/llama_benchmark/README.md similarity index 100% rename from python/turbine_models/custom_models/llama-benchmark/README.md rename to python/turbine_models/custom_models/llama_benchmark/README.md diff --git a/python/turbine_models/custom_models/llama_benchmark/__init__.py b/python/turbine_models/custom_models/llama_benchmark/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/python/turbine_models/custom_models/llama-benchmark/benchmark.mlir b/python/turbine_models/custom_models/llama_benchmark/benchmark.mlir similarity index 100% rename from python/turbine_models/custom_models/llama-benchmark/benchmark.mlir rename to python/turbine_models/custom_models/llama_benchmark/benchmark.mlir diff --git a/python/turbine_models/custom_models/llama-benchmark/benchmark_forward.mlir b/python/turbine_models/custom_models/llama_benchmark/benchmark_forward.mlir similarity index 100% rename from python/turbine_models/custom_models/llama-benchmark/benchmark_forward.mlir rename to python/turbine_models/custom_models/llama_benchmark/benchmark_forward.mlir diff --git a/python/turbine_models/custom_models/llama-benchmark/benchmark_module.py b/python/turbine_models/custom_models/llama_benchmark/benchmark_module.py similarity index 100% rename from python/turbine_models/custom_models/llama-benchmark/benchmark_module.py rename to 
python/turbine_models/custom_models/llama_benchmark/benchmark_module.py diff --git a/python/turbine_models/custom_models/llama_benchmark/e2e/README.md b/python/turbine_models/custom_models/llama_benchmark/e2e/README.md new file mode 100644 index 000000000..6432b9e60 --- /dev/null +++ b/python/turbine_models/custom_models/llama_benchmark/e2e/README.md @@ -0,0 +1,65 @@ +# Instructions + +Clone and install SHARK-Turbine +``` +git clone https://github.com/nod-ai/SHARK-Turbine.git +cd SHARK-Turbine +python -m venv turbine_venv && source turbine_venv/bin/activate + +pip install --upgrade -r requirements.txt +pip install --upgrade -e .[torch-cpu-nightly,testing] +pip install --upgrade -r turbine-models-requirements.txt +``` + +## Compiling LLMs +Note: Make sure to replace "your_token" with your actual hf_auth_token in all the commands. + +Now, you can generate the quantized weight file with +``` +python python/turbine_models/gen_external_params/gen_external_params.py --hf_auth_token=your_token +``` +The model weights will then be saved in the current directory as `Llama_2_7b_chat_hf_f16_int4.safetensors`. + +To compile llama to a vmfb +``` +python python/turbine_models/custom_models/stateless_llama.py --compile_to=vmfb --hf_auth_token=your_token --external_weights="safetensors" --quantization="int4" --precision="f16" +``` +By default the vmfb will be saved as `Llama_2_7b_chat_hf.vmfb`. + +## Benchmarking LLMs e2e +To run a benchmark with the default benchmark dataset, run: +``` +python python/turbine_models/custom_models/llama_benchmark/e2e/llm_e2e_benchmark.py --vmfb_path=/path/to/Llama_2_7b_chat_hf.vmfb --external_weight_path=Llama_2_7b_chat_hf_f16_int4.safetensors --device=vulkan --hf_auth_token=your_hf_token +``` +You can specify a path to the dataset using: `--benchmark_dataset_path=/path/to/dataset.json` +You can specify where to store the results using: `--benchmark_output_path=/path/to/output.json` + +## Benchmarking Dataset + +To set up a dataset, you need a json file with a list of entries, each containing these attributes: +1. id : number identifying the example (int) +2. system_prompt : System prompt to align the LLM (str) +3. user_prompt : Query example from the user (str) +4. num_iterations : number of times to run/benchmark the particular example (int) +5. num_tokens_to_generate : number of tokens to generate for the example (int) + +Here is a sample: +```json +[ + {"id" : 0, + "system_prompt": "[INST] <>\nBe concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n <>\n\n", + "user_prompt" : "what is the capital of canada?", + "num_iterations": 8, + "num_tokens_to_generate": 20} +] +``` + +The default dataset in `benchmark_prompts.json` contains examples that SHARK-1.0 traditionally measures. We also added some data common in MLPerf, which uses data from open-orca. In the future, we should add more of the open-orca data to run benchmarks with. + +## Benchmarking Output + +The output json will have similar attributes to the dataset, plus the measured benchmark results. It adds these attributes: +1. prefill_tokens : number of tokens run during the prefill stage (int) +2. prefill_speed(tok/s) : Number of tokens in the initial input divided by the time to complete prefill (float) +3. decoded_tokens : number of tokens decoded during the decode stage. 
(int) +4. decode_speed(tok/s) : Average speed of decoding per token for this example, averaged over the number of iterations. (float) diff --git a/python/turbine_models/custom_models/llama_benchmark/e2e/__init__.py b/python/turbine_models/custom_models/llama_benchmark/e2e/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/python/turbine_models/custom_models/llama_benchmark/e2e/benchmark_prompts.json b/python/turbine_models/custom_models/llama_benchmark/e2e/benchmark_prompts.json new file mode 100644 index 000000000..9c8c72346 --- /dev/null +++ b/python/turbine_models/custom_models/llama_benchmark/e2e/benchmark_prompts.json @@ -0,0 +1,7 @@ +[ + {"id" : 0, "system_prompt": "hi", "user_prompt" : "", "num_iterations": 5, "num_tokens_to_generate": 512}, + {"id" : 1, "system_prompt": "[INST] <>\nBe concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n <>\n\n", "user_prompt" : "what is the capital of canada?", "num_iterations": 8, "num_tokens_to_generate": 20}, + {"id" : 2, "system_prompt": "[INST] <>\nBe concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n <>\n\n", "user_prompt" : "what is the capital of canada and what is famouse for", "num_iterations": 5, "num_tokens_to_generate": 255}, + {"id" : 3, "system_prompt": "[INST] <>\nYou are an AI assistant. You will be given a task. You must generate a detailed and long answer.\n <>\n\n", "user_prompt" : "Generate an approximately fifteen-word sentence that describes all this data: Midsummer House eatType restaurant; Midsummer House food Chinese; Midsummer House priceRange moderate; Midsummer House customer rating 3 out of 5; Midsummer House near All Bar One", "num_iterations": 5, "num_tokens_to_generate": 50}, + {"id" : 4, "system_prompt": "[INST] <>\nYou are an AI assistant that helps people find information. \n <>\n\n", "user_prompt" : "James runs a TV show and there are 5 main characters and 4 minor characters. He pays the minor characters $15,000 each episode. He paid the major characters three times as much. How much does he pay per episode? 
Let's be accurate as possible.", "num_iterations": 5, "num_tokens_to_generate": 255} +] \ No newline at end of file diff --git a/python/turbine_models/custom_models/llama_benchmark/e2e/llm_e2e_benchmark.py b/python/turbine_models/custom_models/llama_benchmark/e2e/llm_e2e_benchmark.py new file mode 100644 index 000000000..f7974dbe3 --- /dev/null +++ b/python/turbine_models/custom_models/llama_benchmark/e2e/llm_e2e_benchmark.py @@ -0,0 +1,127 @@ +import argparse +from turbine_models.model_runner import vmfbRunner +from transformers import AutoTokenizer +from iree import runtime as ireert +import torch +import time +from turbine_models.custom_models.llm_optimizations.streaming_llm.modify_llama import ( + enable_llama_pos_shift_attention, +) +from turbine_models.custom_models.llm_runner import parser, SharkLLM +import os +import json + +parser.add_argument( + "--benchmark_dataset_path", + type=str, + default=f"{os.path.dirname(os.path.realpath(__file__))}/benchmark_prompts.json", + help="path to benchmarking dataset", +) +parser.add_argument( + "--benchmark_output_path", + type=str, + default=f"{os.getcwd()}/benchmark_e2e_results.json", + help="path to write benchmark results", +) + + +B_INST, E_INST = "[INST]", "[/INST]" + + +def append_user_prompt(history, input_prompt): + if len(input_prompt) == 0: + return history + user_prompt = f"{B_INST} {input_prompt} {E_INST}" + history += user_prompt + return history + + +def load_dataset(dataset_path): + dataset = [] + with open(dataset_path) as f: + dataset = json.load(f) + if len(dataset) <= 0: + raise ValueError("Dataset is empty, or did not read dataset correctly.") + return dataset + + +def run_llm_benchmark( + device, + vmfb_path, + hf_model_name, + hf_auth_token, + external_weight_path, + dataset_path, + output_path, + streaming_llm=False, +): + # TODO: Support streamingLLM benchmarking, need streamingLLM to be able to reset history/seq_len to 0. 
+ if streaming_llm: + raise ValueError("Streaming LLM currently not supported for benchmarking.") + tokenizer = AutoTokenizer.from_pretrained( + hf_model_name, + use_fast=False, + token=hf_auth_token, + ) + dataset = load_dataset(dataset_path) + result_dicts = [] + llm = SharkLLM( + device=device, + vmfb_path=vmfb_path, + external_weight_path=external_weight_path, + streaming_llm=streaming_llm, + ) + for data in dataset: + llm.set_min_token(data["num_tokens_to_generate"]) + llm.set_max_token(data["num_tokens_to_generate"]) + running_token_decode_count = 0 + running_token_decode_time = 0.0 + running_token_prefill_count = 0 + running_token_prefill_time = 0.0 + for _ in range(data["num_iterations"]): + prompt = data["system_prompt"] + prompt = append_user_prompt(prompt, data["user_prompt"]) + initial_input = tokenizer(prompt, return_tensors="pt") + example_input_id = initial_input.input_ids + result = llm.generate(example_input_id) + bot_response = tokenizer.decode(result, skip_special_tokens=True) + running_token_decode_count += llm.last_num_tokens_decoded + running_token_decode_time += llm.last_prompt_decode_time + running_token_prefill_count += llm.last_num_tokens_prefill + running_token_prefill_time += llm.last_prefill_time + prefill_tokens = running_token_prefill_count / data["num_iterations"] + prefill_speed = running_token_prefill_count / running_token_prefill_time + decoded_tokens = running_token_decode_count / data["num_iterations"] - 1 + decode_speed = running_token_decode_count / running_token_decode_time + result_dicts.append( + { + "prompt_id": data["id"], + "system_prompt": data["system_prompt"], + "user_prompt": data["user_prompt"], + "prefill_tokens": prefill_tokens, + "prefill_speed(tok/s)": prefill_speed, + "decoded_tokens": decoded_tokens, + "decode_speed(tok/s)": decode_speed, + "num_iterations": data["num_iterations"], + "response": bot_response, + } + ) + with open(output_path, "w") as f: + json_results = json.dumps(result_dicts, indent=2) + f.write(json_results) + return output_path + + +if __name__ == "__main__": + args = parser.parse_args() + print("generating turbine output: ") + turbine_output_file = run_llm_benchmark( + args.device, + args.vmfb_path, + args.hf_model_name, + args.hf_auth_token, + args.external_weight_path, + args.benchmark_dataset_path, + args.benchmark_output_path, + args.streaming_llm, + ) diff --git a/python/turbine_models/custom_models/llama-benchmark/stateless_llama_benchmark.py b/python/turbine_models/custom_models/llama_benchmark/stateless_llama_benchmark.py similarity index 100% rename from python/turbine_models/custom_models/llama-benchmark/stateless_llama_benchmark.py rename to python/turbine_models/custom_models/llama_benchmark/stateless_llama_benchmark.py diff --git a/python/turbine_models/custom_models/llm_runner.py b/python/turbine_models/custom_models/llm_runner.py index 7632d1e65..19fa58a9e 100644 --- a/python/turbine_models/custom_models/llm_runner.py +++ b/python/turbine_models/custom_models/llm_runner.py @@ -104,6 +104,18 @@ def __init__(self, device, vmfb_path, external_weight_path, streaming_llm=False) self.last_prompt = None self.streaming_llm = streaming_llm self.prev_token_len = 0 + self.min_token = 0 + self.max_token = 1024 + self.last_prefill_time = -1.0 + self.last_prompt_decode_time = -1.0 + self.last_num_tokens_decoded = -1 + self.last_num_tokens_prefill = -1 + + def set_min_token(self, min_token): + self.min_token = min_token + + def set_max_token(self, max_token): + self.max_token = max_token def format_out(self, 
results): return torch.tensor(results.to_host()[0][0]) @@ -125,25 +137,23 @@ def generate(self, input_ids): input_ids = input_ids[:, token_slice:] inputs = [ireert.asdevicearray(self.runner.config.device, input_ids)] if self.first_input or not self.streaming_llm: - s = time.time() + prefill_start_time = time.time() results = self.model["run_initialize"](*inputs) # example_input_id - e = time.time() - print( - f"num_tokens: {token_len}, time_taken={e-s}, tok/second:{token_len/(e-s)}" - ) + prefill_end_time = time.time() + self.last_num_tokens_prefill = token_len + self.last_prefill_time = prefill_end_time - prefill_start_time token_len += 1 self.first_input = False else: - s = time.time() + prefill_start_time = time.time() results = self.model["run_cached_initialize"](*inputs) # example_input_id - e = time.time() - print( - f"Cached num_tokens: {token_len}, time_taken={e-s}, tok/second:{token_len/(e-s)}" - ) + prefill_end_time = time.time() + self.last_num_tokens_prefill = token_len + self.last_prefill_time = prefill_end_time - prefill_start_time token_len += 1 - s = time.time() + decode_start_time = time.time() turbine_results.append(self.format_out(results)) - while self.format_out(results) != 2: + for _ in range(self.max_token): if self.streaming_llm and self.model["get_seq_step"]() > 600: print("Evicting cache space!") self.model["evict_kvcache_space"]() @@ -151,11 +161,12 @@ def generate(self, input_ids): # uncomment to see tokens as they are emitted # print(f"turbine: {tokenizer.decode(self.format_out(results))}") turbine_results.append(self.format_out(results)) - e = time.time() + if self.format_out(results) == 2 and len(turbine_results) >= self.min_token: + break + decode_end_time = time.time() decoded_tokens = len(turbine_results) - print( - f"Decode num_tokens: {decoded_tokens}, time_taken={e-s}, tok/second:{decoded_tokens/(e-s)}" - ) + self.last_prompt_decode_time = decode_end_time - decode_start_time + self.last_num_tokens_decoded = decoded_tokens self.prev_token_len = token_len + decoded_tokens return turbine_results @@ -196,6 +207,12 @@ def run_llm( result = llm.generate(example_input_id) bot_response = tokenizer.decode(result, skip_special_tokens=True) print(f"\nBOT: {bot_response}\n") + print( + f"Prefill num_tokens : {llm.last_num_tokens_prefill}, time_taken: {llm.last_prefill_time}, tok/second: {llm.last_num_tokens_prefill/llm.last_prefill_time}" + ) + print( + f"Decode num_tokens : {llm.last_num_tokens_decoded}, time_taken: {llm.last_prompt_decode_time}, tok/second: {llm.last_num_tokens_decoded/llm.last_prompt_decode_time}" + ) prompt = append_bot_prompt(prompt, bot_response) diff --git a/python/turbine_models/tests/benchmark_prompt_test.json b/python/turbine_models/tests/benchmark_prompt_test.json new file mode 100644 index 000000000..9dd85e26d --- /dev/null +++ b/python/turbine_models/tests/benchmark_prompt_test.json @@ -0,0 +1,4 @@ +[ + {"id" : 0, "system_prompt": "[INST] <>\nBe concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n <>\n\n", "user_prompt" : "what is the capital of canada?", "num_iterations": 8, "num_tokens_to_generate": 20}, + {"id" : 1, "system_prompt": "[INST] <>\nYou are an AI assistant. You will be given a task. 
You must generate a detailed and long answer.\n <>\n\n", "user_prompt" : "Generate an approximately fifteen-word sentence that describes all this data: Midsummer House eatType restaurant; Midsummer House food Chinese; Midsummer House priceRange moderate; Midsummer House customer rating 3 out of 5; Midsummer House near All Bar One", "num_iterations": 5, "num_tokens_to_generate": 50} +] \ No newline at end of file diff --git a/python/turbine_models/tests/stateless_llama_test.py b/python/turbine_models/tests/stateless_llama_test.py index fc5bc9cd2..f4d703557 100644 --- a/python/turbine_models/tests/stateless_llama_test.py +++ b/python/turbine_models/tests/stateless_llama_test.py @@ -9,11 +9,12 @@ import os import unittest import difflib +import json os.environ["TORCH_LOGS"] = "dynamic" from shark_turbine.aot import * from turbine_models.custom_models import llm_runner - +from turbine_models.custom_models.llama_benchmark.e2e import llm_e2e_benchmark from turbine_models.gen_external_params.gen_external_params import ( gen_external_params, ) @@ -88,6 +89,46 @@ def test_vmfb_comparison(self): ) check_output_string(torch_str, turbine_str) + def test_benchmark_vmfb(self): + llama.export_transformer_model( + hf_model_name="Trelis/Llama-2-7b-chat-hf-function-calling-v2", + hf_auth_token=None, + compile_to="vmfb", + external_weights="safetensors", + # external_weight_file="Llama-2-7b-chat-hf-function-calling-v2_f16_int4.safetensors", Do not export weights because this doesn't get quantized + quantization=quantization, + precision=precision, + device="llvm-cpu", + target_triple="host", + ) + test_dataset_path = "python/turbine_models/tests/benchmark_prompt_test.json" + test_output_path = "benchmark_e2e_results.json" + benchmark_result_path = llm_e2e_benchmark.run_llm_benchmark( + "local-task", + "Llama_2_7b_chat_hf_function_calling_v2.vmfb", + "Trelis/Llama-2-7b-chat-hf-function-calling-v2", + None, + f"Llama_2_7b_chat_hf_function_calling_v2_{precision}_{quantization}.safetensors", + test_dataset_path, + test_output_path, + ) + benchmark_result = [] + with open(benchmark_result_path) as f: + benchmark_result = json.load(f) + if len(benchmark_result) <= 0: + raise ValueError("Dataset is empty, or did not read dataset correctly.") + # Test result for prompt #1 + assert(benchmark_result[0]["decoded_tokens"] == 20) + assert(benchmark_result[0]["num_iterations"] == 8) + assert(benchmark_result[0]["decode_speed(tok/s)"] > 0) + assert(benchmark_result[0]["prefill_speed(tok/s)"] > 0) + # Test result for prompt #2 + assert(benchmark_result[1]["decoded_tokens"] == 50) + assert(benchmark_result[1]["num_iterations"] == 5) + assert(benchmark_result[1]["decode_speed(tok/s)"] > 0) + assert(benchmark_result[1]["prefill_speed(tok/s)"] > 0) + + def test_streaming_vmfb_comparison(self): """ Similar test to above but for streaming-LLM. From 09f030da534e0372da8f80cbf5407a54c1cfb671 Mon Sep 17 00:00:00 2001 From: stanley-nod Date: Thu, 25 Jan 2024 13:13:54 -0800 Subject: [PATCH 2/6] fix black lint. 
--- .../tests/stateless_llama_test.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/python/turbine_models/tests/stateless_llama_test.py b/python/turbine_models/tests/stateless_llama_test.py index f4d703557..0c55105ae 100644 --- a/python/turbine_models/tests/stateless_llama_test.py +++ b/python/turbine_models/tests/stateless_llama_test.py @@ -118,16 +118,15 @@ def test_benchmark_vmfb(self): if len(benchmark_result) <= 0: raise ValueError("Dataset is empty, or did not read dataset correctly.") # Test result for prompt #1 - assert(benchmark_result[0]["decoded_tokens"] == 20) - assert(benchmark_result[0]["num_iterations"] == 8) - assert(benchmark_result[0]["decode_speed(tok/s)"] > 0) - assert(benchmark_result[0]["prefill_speed(tok/s)"] > 0) + assert benchmark_result[0]["decoded_tokens"] == 20 + assert benchmark_result[0]["num_iterations"] == 8 + assert benchmark_result[0]["decode_speed(tok/s)"] > 0 + assert benchmark_result[0]["prefill_speed(tok/s)"] > 0 # Test result for prompt #2 - assert(benchmark_result[1]["decoded_tokens"] == 50) - assert(benchmark_result[1]["num_iterations"] == 5) - assert(benchmark_result[1]["decode_speed(tok/s)"] > 0) - assert(benchmark_result[1]["prefill_speed(tok/s)"] > 0) - + assert benchmark_result[1]["decoded_tokens"] == 50 + assert benchmark_result[1]["num_iterations"] == 5 + assert benchmark_result[1]["decode_speed(tok/s)"] > 0 + assert benchmark_result[1]["prefill_speed(tok/s)"] > 0 def test_streaming_vmfb_comparison(self): """ From a44c364ef143ae85016fdd3be1e356dd3564b19a Mon Sep 17 00:00:00 2001 From: stanley-nod Date: Thu, 25 Jan 2024 15:40:15 -0800 Subject: [PATCH 3/6] Reuse vmfb when possible. --- .../tests/stateless_llama_test.py | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/python/turbine_models/tests/stateless_llama_test.py b/python/turbine_models/tests/stateless_llama_test.py index 0c55105ae..5510da12f 100644 --- a/python/turbine_models/tests/stateless_llama_test.py +++ b/python/turbine_models/tests/stateless_llama_test.py @@ -90,17 +90,19 @@ def test_vmfb_comparison(self): check_output_string(torch_str, turbine_str) def test_benchmark_vmfb(self): - llama.export_transformer_model( - hf_model_name="Trelis/Llama-2-7b-chat-hf-function-calling-v2", - hf_auth_token=None, - compile_to="vmfb", - external_weights="safetensors", - # external_weight_file="Llama-2-7b-chat-hf-function-calling-v2_f16_int4.safetensors", Do not export weights because this doesn't get quantized - quantization=quantization, - precision=precision, - device="llvm-cpu", - target_triple="host", - ) + vmfb_name = "Llama_2_7b_chat_hf_function_calling_v2.vmfb" + if not os.path.isfile(vmfb_name): + llama.export_transformer_model( + hf_model_name="Trelis/Llama-2-7b-chat-hf-function-calling-v2", + hf_auth_token=None, + compile_to="vmfb", + external_weights="safetensors", + # external_weight_file="Llama-2-7b-chat-hf-function-calling-v2_f16_int4.safetensors", Do not export weights because this doesn't get quantized + quantization=quantization, + precision=precision, + device="llvm-cpu", + target_triple="host", + ) test_dataset_path = "python/turbine_models/tests/benchmark_prompt_test.json" test_output_path = "benchmark_e2e_results.json" benchmark_result_path = llm_e2e_benchmark.run_llm_benchmark( From 47a817a0043ebdb8fbf68ce7535615473ba44929 Mon Sep 17 00:00:00 2001 From: stanley-nod Date: Thu, 25 Jan 2024 16:24:31 -0800 Subject: [PATCH 4/6] Reduce num iteration for benchmark test. 
--- python/turbine_models/tests/benchmark_prompt_test.json | 4 ++-- python/turbine_models/tests/stateless_llama_test.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/turbine_models/tests/benchmark_prompt_test.json b/python/turbine_models/tests/benchmark_prompt_test.json index 9dd85e26d..5cc388161 100644 --- a/python/turbine_models/tests/benchmark_prompt_test.json +++ b/python/turbine_models/tests/benchmark_prompt_test.json @@ -1,4 +1,4 @@ [ - {"id" : 0, "system_prompt": "[INST] <>\nBe concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n <>\n\n", "user_prompt" : "what is the capital of canada?", "num_iterations": 8, "num_tokens_to_generate": 20}, - {"id" : 1, "system_prompt": "[INST] <>\nYou are an AI assistant. You will be given a task. You must generate a detailed and long answer.\n <>\n\n", "user_prompt" : "Generate an approximately fifteen-word sentence that describes all this data: Midsummer House eatType restaurant; Midsummer House food Chinese; Midsummer House priceRange moderate; Midsummer House customer rating 3 out of 5; Midsummer House near All Bar One", "num_iterations": 5, "num_tokens_to_generate": 50} + {"id" : 0, "system_prompt": "[INST] <>\nBe concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n <>\n\n", "user_prompt" : "what is the capital of canada?", "num_iterations": 2, "num_tokens_to_generate": 20}, + {"id" : 1, "system_prompt": "[INST] <>\nYou are an AI assistant. You will be given a task. You must generate a detailed and long answer.\n <>\n\n", "user_prompt" : "Generate an approximately fifteen-word sentence that describes all this data: Midsummer House eatType restaurant; Midsummer House food Chinese; Midsummer House priceRange moderate; Midsummer House customer rating 3 out of 5; Midsummer House near All Bar One", "num_iterations": 1, "num_tokens_to_generate": 50} ] \ No newline at end of file diff --git a/python/turbine_models/tests/stateless_llama_test.py b/python/turbine_models/tests/stateless_llama_test.py index 5510da12f..aaa696e37 100644 --- a/python/turbine_models/tests/stateless_llama_test.py +++ b/python/turbine_models/tests/stateless_llama_test.py @@ -121,12 +121,12 @@ def test_benchmark_vmfb(self): raise ValueError("Dataset is empty, or did not read dataset correctly.") # Test result for prompt #1 assert benchmark_result[0]["decoded_tokens"] == 20 - assert benchmark_result[0]["num_iterations"] == 8 + assert benchmark_result[0]["num_iterations"] == 2 assert benchmark_result[0]["decode_speed(tok/s)"] > 0 assert benchmark_result[0]["prefill_speed(tok/s)"] > 0 # Test result for prompt #2 assert benchmark_result[1]["decoded_tokens"] == 50 - assert benchmark_result[1]["num_iterations"] == 5 + assert benchmark_result[1]["num_iterations"] == 1 assert benchmark_result[1]["decode_speed(tok/s)"] > 0 assert benchmark_result[1]["prefill_speed(tok/s)"] > 0 From a05eff525fba713b9b01fa94ff32983de070cbd1 Mon Sep 17 00:00:00 2001 From: stanley-nod Date: Thu, 25 Jan 2024 19:31:17 -0800 Subject: [PATCH 5/6] Further reduce to bare num tokens and iteration for fn test. 
--- python/turbine_models/tests/benchmark_prompt_test.json | 4 ++-- python/turbine_models/tests/stateless_llama_test.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/turbine_models/tests/benchmark_prompt_test.json b/python/turbine_models/tests/benchmark_prompt_test.json index 5cc388161..f97b8187f 100644 --- a/python/turbine_models/tests/benchmark_prompt_test.json +++ b/python/turbine_models/tests/benchmark_prompt_test.json @@ -1,4 +1,4 @@ [ - {"id" : 0, "system_prompt": "[INST] <>\nBe concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n <>\n\n", "user_prompt" : "what is the capital of canada?", "num_iterations": 2, "num_tokens_to_generate": 20}, - {"id" : 1, "system_prompt": "[INST] <>\nYou are an AI assistant. You will be given a task. You must generate a detailed and long answer.\n <>\n\n", "user_prompt" : "Generate an approximately fifteen-word sentence that describes all this data: Midsummer House eatType restaurant; Midsummer House food Chinese; Midsummer House priceRange moderate; Midsummer House customer rating 3 out of 5; Midsummer House near All Bar One", "num_iterations": 1, "num_tokens_to_generate": 50} + {"id" : 0, "system_prompt": "[INST] <>\nBe concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n <>\n\n", "user_prompt" : "what is the capital of canada?", "num_iterations": 2, "num_tokens_to_generate": 10}, + {"id" : 1, "system_prompt": "[INST] <>\nYou are an AI assistant. You will be given a task. You must generate a detailed and long answer.\n <>\n\n", "user_prompt" : "Generate an approximately fifteen-word sentence that describes all this data: Midsummer House eatType restaurant; Midsummer House food Chinese; Midsummer House priceRange moderate; Midsummer House customer rating 3 out of 5; Midsummer House near All Bar One", "num_iterations": 1, "num_tokens_to_generate": 20} ] \ No newline at end of file diff --git a/python/turbine_models/tests/stateless_llama_test.py b/python/turbine_models/tests/stateless_llama_test.py index aaa696e37..c94a755de 100644 --- a/python/turbine_models/tests/stateless_llama_test.py +++ b/python/turbine_models/tests/stateless_llama_test.py @@ -120,12 +120,12 @@ def test_benchmark_vmfb(self): if len(benchmark_result) <= 0: raise ValueError("Dataset is empty, or did not read dataset correctly.") # Test result for prompt #1 - assert benchmark_result[0]["decoded_tokens"] == 20 + assert benchmark_result[0]["decoded_tokens"] == 10 assert benchmark_result[0]["num_iterations"] == 2 assert benchmark_result[0]["decode_speed(tok/s)"] > 0 assert benchmark_result[0]["prefill_speed(tok/s)"] > 0 # Test result for prompt #2 - assert benchmark_result[1]["decoded_tokens"] == 50 + assert benchmark_result[1]["decoded_tokens"] == 20 assert benchmark_result[1]["num_iterations"] == 1 assert benchmark_result[1]["decode_speed(tok/s)"] > 0 assert benchmark_result[1]["prefill_speed(tok/s)"] > 0 From 41e1d8dcf827444f8ec260c1c296da77aeeedfc4 Mon Sep 17 00:00:00 2001 From: stanley-nod Date: Thu, 25 Jan 2024 20:51:37 -0800 Subject: [PATCH 6/6] Fix to num iters. 
--- python/turbine_models/tests/benchmark_prompt_test.json | 2 +- python/turbine_models/tests/stateless_llama_test.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/turbine_models/tests/benchmark_prompt_test.json b/python/turbine_models/tests/benchmark_prompt_test.json index f97b8187f..39d34fd3b 100644 --- a/python/turbine_models/tests/benchmark_prompt_test.json +++ b/python/turbine_models/tests/benchmark_prompt_test.json @@ -1,4 +1,4 @@ [ {"id" : 0, "system_prompt": "[INST] <>\nBe concise. You are a helpful, respectful and honest assistant. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n <>\n\n", "user_prompt" : "what is the capital of canada?", "num_iterations": 2, "num_tokens_to_generate": 10}, - {"id" : 1, "system_prompt": "[INST] <>\nYou are an AI assistant. You will be given a task. You must generate a detailed and long answer.\n <>\n\n", "user_prompt" : "Generate an approximately fifteen-word sentence that describes all this data: Midsummer House eatType restaurant; Midsummer House food Chinese; Midsummer House priceRange moderate; Midsummer House customer rating 3 out of 5; Midsummer House near All Bar One", "num_iterations": 1, "num_tokens_to_generate": 20} + {"id" : 1, "system_prompt": "[INST] <>\nYou are an AI assistant. You will be given a task. You must generate a detailed and long answer.\n <>\n\n", "user_prompt" : "Generate an approximately fifteen-word sentence that describes all this data: Midsummer House eatType restaurant; Midsummer House food Chinese; Midsummer House priceRange moderate; Midsummer House customer rating 3 out of 5; Midsummer House near All Bar One", "num_iterations": 1, "num_tokens_to_generate": 25} ] \ No newline at end of file diff --git a/python/turbine_models/tests/stateless_llama_test.py b/python/turbine_models/tests/stateless_llama_test.py index c94a755de..4a264473d 100644 --- a/python/turbine_models/tests/stateless_llama_test.py +++ b/python/turbine_models/tests/stateless_llama_test.py @@ -125,7 +125,7 @@ def test_benchmark_vmfb(self): assert benchmark_result[0]["decode_speed(tok/s)"] > 0 assert benchmark_result[0]["prefill_speed(tok/s)"] > 0 # Test result for prompt #2 - assert benchmark_result[1]["decoded_tokens"] == 20 + assert benchmark_result[1]["decoded_tokens"] == 25 assert benchmark_result[1]["num_iterations"] == 1 assert benchmark_result[1]["decode_speed(tok/s)"] > 0 assert benchmark_result[1]["prefill_speed(tok/s)"] > 0
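For reference, the e2e README in this series lists the attributes added to the benchmark output. A single entry in the `benchmark_e2e_results.json` written by `run_llm_benchmark` would look roughly like the sketch below; the keys match those emitted by the script, but the values are purely illustrative, not measured results:

```json
[
  {
    "prompt_id": 0,
    "system_prompt": "[INST] <>\nBe concise. ...\n <>\n\n",
    "user_prompt": "what is the capital of canada?",
    "prefill_tokens": 130.0,
    "prefill_speed(tok/s)": 250.0,
    "decoded_tokens": 20.0,
    "decode_speed(tok/s)": 12.5,
    "num_iterations": 8,
    "response": "The capital of Canada is Ottawa."
  }
]
```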