Commit

Add llama2 benchmark
Dan-wanna-M committed Aug 13, 2024
1 parent 6c4963c commit c0bc50e
Showing 5 changed files with 62 additions and 26 deletions.
3 changes: 3 additions & 0 deletions benchmarks/result.md
@@ -25,6 +25,9 @@ Default vllm settings are used.
| Llama3-8B(bf16) | address_json | 40.72 | 42.02 | 0.76 |
| Llama3-8B(bf16) | linkedlist_json | 40.57 | 41.95 | 0.81 |
| Llama3-8B(bf16) | order_json | 40.10 | 41.56 | 0.88 |
| Llama2-7B(fp16) | address_json | 46.66 | 47.65 | 0.45 |
| Llama2-7B(fp16) | linkedlist_json | 46.55 | 47.68 | 0.51 |
| Llama2-7B(fp16) | order_json | 45.76 | 46.81 | 0.49 |
## Exllamav2
Default exllamav2 settings are used.

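The last column of these rows appears to be the per-token overhead of constrained decoding in milliseconds, i.e. the difference between the reciprocal throughputs. A quick sanity check (only the tps values come from the table; the interpretation of the column is an assumption):

```python
# Per-token overhead implied by a table row: 1/constrained_tps - 1/unconstrained_tps,
# converted to milliseconds. Values are the Llama2-7B address_json row above.
constrained_tps, unconstrained_tps = 46.66, 47.65
overhead_ms = (1 / constrained_tps - 1 / unconstrained_tps) * 1000
print(f"{overhead_ms:.2f} ms/token")  # prints 0.45, matching the last column
```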
60 changes: 44 additions & 16 deletions benchmarks/vllm_json.py
@@ -1,20 +1,23 @@
import gc
import json
import typing
from timeit import timeit

import torch
from formatron.grammar_generators.json_generator import JsonGenerator
from vllm import LLM, SamplingParams
from vllm.distributed import destroy_model_parallel, destroy_distributed_environment

from utils import Order, log
from utils import LinkedList
from utils import BenchResult, Context
from utils import Address
from formatter import FormatterBuilder
from integrations.vllm import create_formatters_logits_processor, FormattersLogitsProcessor
from utils import Address
from utils import BenchResult, Context
from utils import LinkedList
from utils import Order, log


def execute():
prompts = [
f"{system_prompt}{inputs[context.index]}<|eot_id|><|start_header_id|>assistant<|end_header_id|>",
f"{system_prompt}{inputs[context.index]}{tail}",
]
context.index+=1
outputs = llm.generate(prompts, sampling_params)
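
The change in `execute` swaps the hard-coded Llama-3 assistant header for a `tail` variable, so the same function can drive both chat templates set up in `__main__` below. A minimal sketch of the pattern (the constants come from the script; the helper itself is illustrative):

```python
# Prompt assembly is template-agnostic: the model-specific pieces live in two
# globals, system_prompt and tail, swapped whenever a new model is loaded.
LLAMA3_TAIL = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
LLAMA2_TAIL = "[/INST]"

def build_prompt(system_prompt: str, user_input: str, tail: str) -> str:
    # Equivalent to the f-string in execute(): f"{system_prompt}{user_input}{tail}"
    return f"{system_prompt}{user_input}{tail}"
```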
@@ -29,6 +32,7 @@ def get_vllm_address():
f = FormatterBuilder()
f.append_line(f"```json\n{f.schema(Address, JsonGenerator(), capture_name='json')}```")
logits_processor = create_formatters_logits_processor(llm, [f])
print(llm.get_tokenizer().vocab_size)
sampling_params = SamplingParams(temperature=0.8, top_p=0.95,max_tokens=100, logits_processors=[logits_processor])
return sampling_params

@@ -65,22 +69,46 @@ def bench(result:BenchResult, context:Context,func, bench_name:str, f):
result.t2 = context.tokens
log(bench_name, result, f)

if __name__ == "__main__":
system_prompt = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful AI assistant for information extraction.<|eot_id|><|start_header_id|>user<|end_header_id|>

Extract information into json format: """
llm = LLM(model="NurtureAI/Meta-Llama-3-8B-Instruct-32k", max_model_len=4096)
sampling_params = get_vllm_address()
if __name__ == "__main__":
data = BenchResult(0, 0, 0, 0)
context = Context(0, 0)
inputs = json.load(open("address.json"))["sentences"]
with open("vllm_json_bench.txt", "w") as f:
bench(data, context,execute, "vllm_address", f)
system_prompt = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful AI assistant for information extraction.<|eot_id|><|start_header_id|>user<|end_header_id|>
Extract information into json format: """
tail = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
llm = LLM(model="NurtureAI/Meta-Llama-3-8B-Instruct-32k", max_model_len=4096)
inputs = json.load(open("address.json"))["sentences"]
sampling_params = get_vllm_address()
bench(data, context, execute, "llama3_8b_vllm_address", f)
sampling_params = get_vllm_linkedlist()
inputs = json.load(open("linkedlist.json"))["sentences"]
bench(data, context, execute, "llama3_8b_linkedlist", f)
sampling_params = get_vllm_order()
inputs = json.load(open("orders.json"))["orders"]
bench(data, context, execute, "llama3_8b_orders", f)
destroy_model_parallel()
destroy_distributed_environment()
del llm.llm_engine.model_executor
del llm
gc.collect()
torch.cuda.empty_cache()
system_prompt = """[INST]
You are a helpful AI assistant for information extraction.
Extract information into json format: """
tail = "[/INST]"
llm = LLM(model="daryl149/llama-2-7b-chat-hf", max_model_len=2048)
inputs = json.load(open("address.json"))["sentences"]
sampling_params = get_vllm_address()
bench(data, context,execute, "llama2_7b_vllm_address", f)
sampling_params = get_vllm_linkedlist()
inputs = json.load(open("linkedlist.json"))["sentences"]
bench(data, context, execute, "linkedlist", f)
bench(data, context, execute, "llama2_7b_linkedlist", f)
sampling_params = get_vllm_order()
inputs = json.load(open("orders.json"))["orders"]
bench(data, context, execute, "orders", f)
bench(data, context, execute, "llama2_7b_orders", f)
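
Because the script now loads a second model in the same process, it first has to tear the Llama-3 engine down completely; the sequence above (destroy vLLM's distributed state, drop the engine's model executor, then garbage-collect and empty the CUDA cache) is the usual way to reclaim GPU memory from an in-process vLLM engine. Wrapped as a helper for clarity (the calls are the ones in the diff; the wrapper is illustrative):

```python
import gc

import torch
from vllm.distributed import destroy_model_parallel, destroy_distributed_environment

def release_vllm_engine(llm) -> None:
    # Free the GPU memory held by an in-process vLLM engine so another
    # LLM(...) can be constructed afterwards. The caller should also drop
    # its own reference (del llm) after this returns.
    destroy_model_parallel()
    destroy_distributed_environment()
    del llm.llm_engine.model_executor
    gc.collect()
    torch.cuda.empty_cache()
```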
18 changes: 12 additions & 6 deletions benchmarks/vllm_json_bench.txt
@@ -1,6 +1,12 @@
vllm_address generated 1032 tokens with 40.715563518001495 tps (with warm up)
vllm_address unconstrained generated 1500 tokens with 42.01920504622742 tps
linkedlist generated 1262 tokens with 40.568280943625346 tps (with warm up)
linkedlist unconstrained generated 1337 tokens with 41.946869871332254 tps
orders generated 4271 tokens with 40.09933827742811 tps (with warm up)
orders unconstrained generated 4589 tokens with 41.564089177085926 tps
llama3_8b_vllm_address generated 1085 tokens with 40.811995725723065 tps (with warm up)
llama3_8b_vllm_address unconstrained generated 1526 tokens with 41.99365646425085 tps
llama3_8b_linkedlist generated 1252 tokens with 40.58724636960033 tps (with warm up)
llama3_8b_linkedlist unconstrained generated 1389 tokens with 41.977055386201535 tps
llama3_8b_orders generated 4266 tokens with 40.109222217293876 tps (with warm up)
llama3_8b_orders unconstrained generated 4595 tokens with 41.5902565932466 tps
llama2_7b_vllm_address generated 1571 tokens with 46.65812840219695 tps (with warm up)
llama2_7b_vllm_address unconstrained generated 1748 tokens with 47.65099657371631 tps
llama2_7b_linkedlist generated 1648 tokens with 46.55372314285624 tps (with warm up)
llama2_7b_linkedlist unconstrained generated 1917 tokens with 47.682641280112215 tps
llama2_7b_orders generated 5120 tokens with 45.76177824108742 tps (with warm up)
llama2_7b_orders unconstrained generated 5114 tokens with 46.81601673468687 tps
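
These raw figures are what the rounded result.md rows are derived from: the six lines with the old names (vllm_address, linkedlist, orders) are the removed ones, and they match the Llama-3 rows in result.md exactly, while the llama3_8b_* lines above come from a slightly different rerun. A small script that reconstructs rows in the result.md format, assuming only the line format visible above:

```python
import re

# Each benchmark produces a constrained line ("... tps (with warm up)") and an
# "unconstrained" line; pair them up and derive the per-token overhead in ms.
pattern = re.compile(r"(\S+)( unconstrained)? generated \d+ tokens with ([\d.]+) tps")

rows: dict[str, dict[str, float]] = {}
with open("vllm_json_bench.txt") as fh:
    for line in fh:
        m = pattern.match(line)
        if m:
            name, unconstrained, tps = m.group(1), m.group(2), float(m.group(3))
            rows.setdefault(name, {})["free" if unconstrained else "constrained"] = tps

for name, tps in rows.items():
    overhead_ms = (1 / tps["constrained"] - 1 / tps["free"]) * 1000
    print(f"| {name} | {tps['constrained']:.2f} | {tps['free']:.2f} | {overhead_ms:.2f} |")
```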
6 changes: 3 additions & 3 deletions src/formatron/integrations/transformers.py
@@ -2,14 +2,14 @@
This module integrates the transformers library by providing convenience utilities.
"""
import collections
import time
import typing

import kbnf
from transformers import LogitsProcessor, PreTrainedTokenizerBase, LogitsProcessorList

from config import EngineGenerationConfig
from formatter import Formatter, FormatterBuilder
from integrations._utils import get_original_characters
from transformers import LogitsProcessor, PreTrainedTokenizerBase, LogitsProcessorList


def create_engine_vocabulary(tokenizer: PreTrainedTokenizerBase) -> kbnf.Vocabulary:
@@ -77,7 +77,7 @@ def __call__(self, input_ids, scores):
if self._last_input_id_length is None: # First iteration
self._last_input_id_length = input_ids.shape[1]
for formatter, config, prompt in zip(self._formatters, self.configs, input_ids):
if config.reset_at_beginning and formatter.is_completed():
if config.reset_at_beginning:
formatter.reset()
if config.read_prompt:
for token in prompt:
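The one-line change in `__call__` is presumably a behavioural fix rather than a cleanup: previously a formatter whose last generation stopped early (so `is_completed()` was still false) was never reset and would resume from stale state on the next call. The two versions side by side, with comments (excerpted from the diff, not standalone code):

```python
# Before: only reset formatters that had run to completion, so an aborted
# generation (e.g. max_new_tokens reached mid-schema) left stale state behind.
if config.reset_at_beginning and formatter.is_completed():
    formatter.reset()

# After: reset_at_beginning now does what its name says, unconditionally.
if config.reset_at_beginning:
    formatter.reset()
```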
1 change: 0 additions & 1 deletion src/formatron/integrations/vllm.py
@@ -5,7 +5,6 @@
import typing

import kbnf
import torch
from vllm import LLM

from config import EngineGenerationConfig
