Speed up huggingface integrations & update benchmarks with newest kbnf
Dan-wanna-M committed Aug 18, 2024
1 parent ed43077 commit 3d48a0e
Showing 10 changed files with 149 additions and 76 deletions.
49 changes: 43 additions & 6 deletions benchmarks/exllamav2_json.py
@@ -1,6 +1,8 @@
import gc
import json
from timeit import timeit

import torch
from exllamav2 import ExLlamaV2, ExLlamaV2Config, ExLlamaV2Cache, ExLlamaV2Tokenizer
from exllamav2.generator import ExLlamaV2DynamicGenerator, ExLlamaV2Sampler
from formatron.formatter import FormatterBuilder
@@ -26,6 +28,20 @@ def create_exllamav2_6bpw_llama3_8b():
)
return generator

def create_exllamav2_4bpw_llama2_7b():
model_dir = "../tests/local_assets/Llama-2-7b-chat-hf-4.0-bpw-exl2/"
config = ExLlamaV2Config(model_dir)
model = ExLlamaV2(config)
cache = ExLlamaV2Cache(model, max_seq_len=65536, lazy=True)
model.load_autosplit(cache, progress=True)
tokenizer = ExLlamaV2Tokenizer(config)
generator = ExLlamaV2DynamicGenerator(
model=model,
cache=cache,
tokenizer=tokenizer,
)
return generator

def get_address_filter():
f = FormatterBuilder()
f.append_line(f"{f.schema(Address, JsonGenerator(), capture_name='json')}")
@@ -76,6 +92,7 @@ def bench(result:BenchResult, context:Context,func, bench_name:str, f):
context.index = 0
context.tokens = 0
context.filters = None
settings.disallow_tokens(generator.tokenizer, [generator.tokenizer.eos_token_id])
result.s2 = (timeit(func, number=len(inputs)))
result.t2 = context.tokens
log(bench_name, result, f)
@@ -89,19 +106,39 @@ def bench(result:BenchResult, context:Context,func, bench_name:str, f):
Extract information into json format: """
data = BenchResult(0, 0, 0, 0)
context = Context(0, 0)
generator = create_exllamav2_6bpw_llama3_8b()
settings = ExLlamaV2Sampler.Settings()
settings.disallow_tokens(generator.tokenizer, [generator.tokenizer.eos_token_id])
with open("exllamav2_json.txt", "w") as f:
generator = create_exllamav2_6bpw_llama3_8b()
settings = ExLlamaV2Sampler.Settings()
inputs = json.load(open("address.json"))["sentences"]
context.filters = [get_address_filter()]
max_new_tokens = 100
bench(data, context, execute, "address_json_exllamav2", f)
bench(data, context, execute, "llama3_8b_6pw_exl2_address_json_exllamav2", f)
settings = ExLlamaV2Sampler.Settings()
context.filters = [get_linkedlist_filter()]
inputs = json.load(open("linkedlist.json"))["sentences"]
max_new_tokens = 32
bench(data, context, execute, "linkedlist_json_exllamav2", f)
bench(data, context, execute, "llama3_8b_6pw_exl2_linkedlist_json_exllamav2", f)
settings = ExLlamaV2Sampler.Settings()
context.filters = [get_order_filter()]
inputs = json.load(open("orders.json"))["orders"]
max_new_tokens = 160
bench(data, context, execute, "llama3_8b_6pw_exl2_orders_json_exllamav2", f)
del generator
gc.collect()
torch.cuda.empty_cache()
generator = create_exllamav2_4bpw_llama2_7b()
settings = ExLlamaV2Sampler.Settings()
inputs = json.load(open("address.json"))["sentences"]
context.filters = [get_address_filter()]
max_new_tokens = 120
bench(data, context, execute, "llama2_7b_4pw_exl2_address_json_exllamav2", f)
settings = ExLlamaV2Sampler.Settings()
context.filters = [get_linkedlist_filter()]
inputs = json.load(open("linkedlist.json"))["sentences"]
max_new_tokens = 15
bench(data, context, execute, "llama2_7b_4pw_exl2_linkedlist_json_exllamav2", f)
settings = ExLlamaV2Sampler.Settings()
context.filters = [get_order_filter()]
inputs = json.load(open("orders.json"))["orders"]
max_new_tokens = 160
bench(data, context, execute, "orders_json_exllamav2", f)
bench(data, context, execute, "llama2_7b_4pw_exl2_orders_json_exllamav2", f)
18 changes: 12 additions & 6 deletions benchmarks/exllamav2_json.txt
@@ -1,6 +1,12 @@
address_json_exllamav2 generated 1937 tokens with 82.8928200006119 tps (with warm up)
address_json_exllamav2 unconstrained generated 2000 tokens with 84.3793451234579 tps
linkedlist_json_exllamav2 generated 558 tokens with 80.73082629409335 tps (with warm up)
linkedlist_json_exllamav2 unconstrained generated 640 tokens with 88.08959849567171 tps
orders_json_exllamav2 generated 2976 tokens with 84.07997233781526 tps (with warm up)
orders_json_exllamav2 unconstrained generated 3200 tokens with 91.83025447237793 tps
llama3_8b_6pw_exl2_address_json_exllamav2 generated 1937 tokens with 81.76457267212113 tps (with warm up)
llama3_8b_6pw_exl2_address_json_exllamav2 unconstrained generated 2000 tokens with 91.93855585432294 tps
llama3_8b_6pw_exl2_linkedlist_json_exllamav2 generated 567 tokens with 73.72004132348941 tps (with warm up)
llama3_8b_6pw_exl2_linkedlist_json_exllamav2 unconstrained generated 640 tokens with 92.92655429712437 tps
llama3_8b_6pw_exl2_orders_json_exllamav2 generated 2976 tokens with 79.10910035605352 tps (with warm up)
llama3_8b_6pw_exl2_orders_json_exllamav2 unconstrained generated 3200 tokens with 93.46945772542723 tps
llama2_7b_4pw_exl2_address_json_exllamav2 generated 2400 tokens with 123.7077165970634 tps (with warm up)
llama2_7b_4pw_exl2_address_json_exllamav2 unconstrained generated 2400 tokens with 133.37570270534903 tps
llama2_7b_4pw_exl2_linkedlist_json_exllamav2 generated 250 tokens with 80.04987619935734 tps (with warm up)
llama2_7b_4pw_exl2_linkedlist_json_exllamav2 unconstrained generated 300 tokens with 132.19982863147897 tps
llama2_7b_4pw_exl2_orders_json_exllamav2 generated 3136 tokens with 117.27953013576354 tps (with warm up)
llama2_7b_4pw_exl2_orders_json_exllamav2 unconstrained generated 3200 tokens with 129.65265959777014 tps
22 changes: 15 additions & 7 deletions benchmarks/result.md
@@ -33,16 +33,24 @@ Default exllamav2 setting are used.

| model | schema | constrained(with warm-up) / tps | unconstrained / tps | overhead per token / ms |
|------------------------|-----------------|---------------------------------|---------------------|-------------------------|
| Llama3-8B(6.0bpw-exl2) | address_json | 82.89 | 84.38 | 0.21 |
| Llama3-8B(6.0bpw-exl2) | linkedlist_json | 80.73 | 88.09 | 1.03 |
| Llama3-8B(6.0bpw-exl2) | order_json | 84.08 | 91.83 | 1.00 |
| Llama3-8B(6.0bpw-exl2) | address_json | 81.76 | 91.94 | 1.36 |
| Llama3-8B(6.0bpw-exl2) | linkedlist_json | 73.73 | 92.93 | 2.82 |
| Llama3-8B(6.0bpw-exl2) | order_json | 79.11 | 93.47 | 1.96 |
| Llama2-7B(4.0bpw-exl2) | address_json | 123.71 | 133.38 | 0.55 |
| Llama2-7B(4.0bpw-exl2) | linkedlist_json | 80.05 | 132.20 | 4.90 |
| Llama2-7B(4.0bpw-exl2) | order_json | 117.28 | 129.65 | 0.82 |
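
The "overhead per token / ms" column appears to be the difference in per-token latency between constrained and unconstrained decoding. A quick sanity check under that assumption (the helper below is illustrative, not part of the benchmark scripts):

```python
def overhead_ms(constrained_tps: float, unconstrained_tps: float) -> float:
    # Per-token latency difference between constrained and unconstrained decoding, in ms.
    return 1000.0 * (1.0 / constrained_tps - 1.0 / unconstrained_tps)

# First Llama3-8B row above: ~1.35 ms, matching the reported 1.36 within rounding.
print(round(overhead_ms(81.76, 91.94), 2))
```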

## Transformers
Default transformers settings with flash attention v2 enabled.

The mysterious performance drop in the huggingface integration is very interesting.
The same implementation in `mask_logits` just appears to be vastly inefficient.
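
For context, a logits-masking step in a huggingface `LogitsProcessor` typically boils down to something like the generic sketch below. This is an illustration only, not Formatron's actual `mask_logits` implementation:

```python
import torch

def mask_logits_sketch(logits: torch.Tensor, allowed_token_ids: list[int]) -> torch.Tensor:
    # Generic constrained-decoding mask: keep allowed tokens, send everything else to -inf.
    mask = torch.full_like(logits, float("-inf"))
    mask[..., allowed_token_ids] = 0.0
    return logits + mask
```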

| model | schema | constrained(with warm-up) / tps | unconstrained / tps | overhead per token / ms |
|-----------------|-----------------|---------------------------------|---------------------|-------------------------|
| Llama3-8B(bf16) | address_json | 27.73 | 32.07 | 4.88 |
| Llama3-8B(bf16) | linkedlist_json | 26.11 | 32.13 | 7.17 |
| Llama3-8B(bf16) | order_json | 26.36 | 31.66 | 6.35 |
| Llama3-8B(bf16) | address_json | 37.42 | 38.76 | 0.91 |
| Llama3-8B(bf16) | linkedlist_json | 37.14 | 38.72 | 1.09 |
| Llama3-8B(bf16) | order_json | 36.79 | 38.16 | 0.97 |
| Llama2-7B(fp16) | address_json | 41.34 | 42.22 | 0.50 |
| Llama2-7B(fp16) | linkedlist_json | 40.97 | 42.00 | 0.60 |
| Llama2-7B(fp16) | order_json | 39.74 | 40.60 | 0.54 |
56 changes: 44 additions & 12 deletions benchmarks/transformers_json.py
@@ -1,3 +1,4 @@
import gc
import json
from timeit import timeit

@@ -21,6 +22,15 @@ def get_llama3_8b_tokenizer_and_model():
model.generation_config.pad_token_id = tokenizer.eos_token_id
return model, tokenizer

def get_llama2_7b_tokenizer_and_model():
model = AutoModelForCausalLM.from_pretrained("togethercomputer/LLaMA-2-7B-32K",
device_map="cuda",
torch_dtype=torch.float16,
attn_implementation="flash_attention_2")
tokenizer = AutoTokenizer.from_pretrained("togethercomputer/LLaMA-2-7B-32K")
model.generation_config.pad_token_id = tokenizer.eos_token_id
return model, tokenizer

def get_address_schema():
f = FormatterBuilder()
f.append_line(f"{f.schema(Address, JsonGenerator(), capture_name='json')}")
@@ -38,7 +48,7 @@ def get_order_schema():

def execute():
prompts = [
f"{system_prompt}{inputs[context.index]}<|eot_id|><|start_header_id|>assistant<|end_header_id|>",
f"{system_prompt}{inputs[context.index]}{tail}",
]
prompts = tokenizer(prompts, return_tensors='pt').to(model.device)
input_len = prompts.input_ids.shape[-1]
@@ -68,27 +78,49 @@ def bench(result:BenchResult, context:Context,func, bench_name:str, f):
log(bench_name, result, f)

if __name__ == "__main__":
system_prompt = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful AI assistant for information extraction.<|eot_id|><|start_header_id|>user<|end_header_id|>
Extract information into json format: """


data = BenchResult(0, 0, 0, 0)
context = Context(0, 0)
with open("transformers_json.txt", "w") as f:
model, tokenizer = get_llama3_8b_tokenizer_and_model()
with torch.no_grad():
system_prompt = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful AI assistant for information extraction.<|eot_id|><|start_header_id|>user<|end_header_id|>
Extract information into json format: """
tail = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
model, tokenizer = get_llama3_8b_tokenizer_and_model()
model.eval()
max_new_tokens = 50
inputs = json.load(open("address.json"))["sentences"]
logits_processor = get_address_schema()
bench(data,context,execute, "llama3_8b_address_json", f)
inputs = json.load(open("linkedlist.json"))["sentences"]
logits_processor = get_linkedlist_schema()
max_new_tokens = 200
bench(data,context,execute, "llama3_8b_linkedlist_json", f)
inputs = json.load(open("orders.json"))["orders"]
logits_processor = get_order_schema()
bench(data, context, execute, "llama3_8b_order_json", f)
system_prompt = """[INST]
You are a helpful AI assistant for information extraction.
Extract information into json format: """
tail = "[/INST]"
del model
del tokenizer
gc.collect()
torch.cuda.empty_cache()
model, tokenizer = get_llama2_7b_tokenizer_and_model()
model.eval()
max_new_tokens = 50
inputs = json.load(open("address.json"))["sentences"]
logits_processor = get_address_schema()
bench(data,context,execute, "address_json", f)
bench(data, context, execute, "llama2_7b_address_json", f)
max_new_tokens = 30
inputs = json.load(open("linkedlist.json"))["sentences"]
logits_processor = get_linkedlist_schema()
max_new_tokens = 200
bench(data,context,execute, "linkedlist_json", f)
bench(data, context, execute, "llama2_7b_linkedlist_json", f)
inputs = json.load(open("orders.json"))["orders"]
logits_processor = get_order_schema()
bench(data, context, execute, "order_json", f)
bench(data, context, execute, "llama2_7b_order_json", f)
18 changes: 12 additions & 6 deletions benchmarks/transformers_json.txt
@@ -1,6 +1,12 @@
address_json generated 803 tokens with 27.73133365441571 tps (with warm up)
address_json unconstrained generated 1000 tokens with 32.06941835850326 tps
linkedlist_json generated 1051 tokens with 26.10733210624505 tps (with warm up)
linkedlist_json unconstrained generated 1225 tokens with 32.128672426385975 tps
order_json generated 3466 tokens with 26.364491518684723 tps (with warm up)
order_json unconstrained generated 3969 tokens with 31.66292436079728 tps
llama3_8b_address_json generated 786 tokens with 37.36371589602811 tps (with warm up)
llama3_8b_address_json unconstrained generated 1000 tokens with 38.69880490679053 tps
llama3_8b_linkedlist_json generated 887 tokens with 37.120954838498776 tps (with warm up)
llama3_8b_linkedlist_json unconstrained generated 1236 tokens with 38.697844784161475 tps
llama3_8b_order_json generated 3505 tokens with 36.78628437278762 tps (with warm up)
llama3_8b_order_json unconstrained generated 3969 tokens with 38.14263146423028 tps
llama2_7b_address_json generated 984 tokens with 41.33470448687917 tps (with warm up)
llama2_7b_address_json unconstrained generated 1000 tokens with 42.21328193562087 tps
llama2_7b_linkedlist_json generated 3068 tokens with 41.05214473703741 tps (with warm up)
llama2_7b_linkedlist_json unconstrained generated 4000 tokens with 42.01075161949615 tps
llama2_7b_order_json generated 3987 tokens with 39.76025691683701 tps (with warm up)
llama2_7b_order_json unconstrained generated 4000 tokens with 40.5808187425456 tps
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -3,7 +3,7 @@ requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"
[project]
name = "formatron"
version = "0.1.3"
version = "0.2.0"
authors = [
{name = "Xintong Sun", email = "[email protected]"},
]
17 changes: 4 additions & 13 deletions src/formatron/formatter.py
@@ -123,9 +123,6 @@ def mask_logits(self, logits) -> typing.Any:
def get_allowed_tokens_since_last_computation(self) -> typing.Sequence[int]:
return self._engine.get_allowed_token_ids_from_last_computation()

def get_tokens_to_finish_since_last_computation(self) -> typing.Sequence[int]:
return self._engine.get_token_ids_to_finish_from_last_computation()

def is_completed(self) -> bool:
return self._engine.is_finished()

@@ -302,28 +299,22 @@ def schema(self, schema: typing.Type[schemas.schema.Schema],
lambda nonterminal: grammar_generator.generate(schema, nonterminal))

def str(self, *, stop: typing.Union[str, list[str]] = None,
not_contain: typing.Union[str, list[str], None] = None,
capture_name: typing.Optional[str] = None) -> RegexExtractor:
"""
Create a string extractor.
:param stop: The strings for the extractors to stop at. They will be included in text generation and extraction.
:param not_contain: The strings that should not be included in the generation.
They will not be included in the generation and extraction.
:param capture_name: The capture name of the extractor, or `None` if the extractor does not capture.
:return: The string extractor.
"""
stop = [stop] if isinstance(stop, str) else stop or []
not_contain = [not_contain] if isinstance(not_contain, str) else not_contain or []
nonterminal = self._create_nonterminal(capture_name, "str")
if not stop and not not_contain:
if not stop:
capture_regex = ".*"
nonterminal_regex = "#'.*'"
else:
capture_regex = f".*?(?:{'|'.join(map(re.escape, stop))})"
excepted = f"{nonterminal}_excepted"
end = f"({'|'.join(map(repr, stop))})" if stop else ""
nonterminal_regex = f"except!({excepted}){end}"
self._rules.append(f"{excepted} ::= {' | '.join(map(repr, stop + not_contain))};")
backslash = '\\'
capture_regex = f".*?(?:{'|'.join([i.replace(backslash, backslash*2) for i in map(re.escape, stop)])})"
nonterminal_regex = f"#e'{capture_regex}'"
self._rules.append(f"{nonterminal} ::= {nonterminal_regex};")
self._nonterminal_to_extractor[nonterminal] = RegexExtractor(capture_regex, capture_name, nonterminal)
return self._nonterminal_to_extractor[nonterminal]
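
A minimal usage sketch of the updated `str()` extractor: per the diff above, a stop string is now compiled into a single `#e'...'` regex rule and the `not_contain` parameter is no longer accepted. This assumes `str()` extractors can be embedded in `append_line` f-strings the same way `schema()` is used in the benchmark scripts; the names here are illustrative:

```python
from formatron.formatter import FormatterBuilder

f = FormatterBuilder()
# Stop at the first newline; the stop string is included in generation and extraction.
title = f.str(stop="\n", capture_name="title")
f.append_line(f"Title: {title}")
```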
15 changes: 7 additions & 8 deletions src/formatron/integrations/exllamav2.py
@@ -19,9 +19,8 @@ def create_engine_vocabulary(tokenizer: ExLlamaV2Tokenizer) -> kbnf.Vocabulary:
assert hasattr(tokenizer.tokenizer_model, "vocab"), (f"tokenizer({tokenizer})"
f" with tokenizer_model({tokenizer.tokenizer_model})"
f" does not have vocab attribute!")
vocab = tokenizer.get_id_to_piece_list(include_special_tokens=True)
new_vocab = {v: i for i, v in enumerate(vocab)}
new_vocab = get_original_characters(new_vocab)
vocab = {tokenizer.tokenizer_model.id_to_piece(i): i for i in range(tokenizer.tokenizer_model.vocab_size())}
new_vocab = get_original_characters(vocab)
return kbnf.Vocabulary({v: kbnf.Token(k) for k, v in new_vocab.items()},
{k: v for k, v in enumerate(vocab)})

@@ -50,7 +49,6 @@ def __init__(self, model, tokenizer, formatter: Formatter,
config = EngineGenerationConfig()
self._config = config
self._pass_tokens = set()
self._end_tokens = set()

def clone(self, c=None) -> "FormatterFilter":
if c is None:
@@ -61,7 +59,6 @@ def clone(self, c=None) -> "FormatterFilter":
c._formatter = copy(self._formatter) # formatter does not have mutable public state anyway
c._config = deepcopy(self._config)
c._pass_tokens = self._pass_tokens
c._end_tokens = self._end_tokens
return c

def begin(self, prefix_str: str) -> None:
@@ -75,15 +72,17 @@ def reset(self)->None:
self._formatter.reset()

def feed(self, token: int):
if self._formatter.is_completed():
return None
self._formatter.accept_token(token)

def next(self) -> typing.Tuple[typing.Set[int], typing.Set[int]]:
if self._formatter.is_completed():
return {self.tokenizer.eos_token_id}, set()
self._formatter.compute_allowed_tokens()
self._pass_tokens.clear()
self._end_tokens.clear()
self._pass_tokens.update(self._formatter.get_allowed_tokens_since_last_computation())
self._end_tokens.update(self._formatter.get_tokens_to_finish_since_last_computation())
return self._pass_tokens, self._end_tokens
return self._pass_tokens, set()

@property
def formatter_captures(self) -> dict[str, typing.Any]: