Speed up huggingface integrations & update benchmarks with newest kbnf
Dan-wanna-M committed Aug 18, 2024
1 parent ed43077 commit 3d48a0e
Showing 10 changed files with 149 additions and 76 deletions.
49 changes: 43 additions & 6 deletions benchmarks/exllamav2_json.py
@@ -1,6 +1,8 @@
import gc
import json
from timeit import timeit

import torch
from exllamav2 import ExLlamaV2, ExLlamaV2Config, ExLlamaV2Cache, ExLlamaV2Tokenizer
from exllamav2.generator import ExLlamaV2DynamicGenerator, ExLlamaV2Sampler
from formatron.formatter import FormatterBuilder
@@ -26,6 +28,20 @@ def create_exllamav2_6bpw_llama3_8b():
)
return generator

def create_exllamav2_4bpw_llama2_7b():
model_dir = "../tests/local_assets/Llama-2-7b-chat-hf-4.0-bpw-exl2/"
config = ExLlamaV2Config(model_dir)
model = ExLlamaV2(config)
cache = ExLlamaV2Cache(model, max_seq_len=65536, lazy=True)
model.load_autosplit(cache, progress=True)
tokenizer = ExLlamaV2Tokenizer(config)
generator = ExLlamaV2DynamicGenerator(
model=model,
cache=cache,
tokenizer=tokenizer,
)
return generator

def get_address_filter():
f = FormatterBuilder()
f.append_line(f"{f.schema(Address, JsonGenerator(), capture_name='json')}")
@@ -76,6 +92,7 @@ def bench(result:BenchResult, context:Context,func, bench_name:str, f):
context.index = 0
context.tokens = 0
context.filters = None
settings.disallow_tokens(generator.tokenizer, [generator.tokenizer.eos_token_id])
result.s2 = (timeit(func, number=len(inputs)))
result.t2 = context.tokens
log(bench_name, result, f)
@@ -89,19 +106,39 @@ def bench(result:BenchResult, context:Context,func, bench_name:str, f):
Extract information into json format: """
data = BenchResult(0, 0, 0, 0)
context = Context(0, 0)
generator = create_exllamav2_6bpw_llama3_8b()
settings = ExLlamaV2Sampler.Settings()
settings.disallow_tokens(generator.tokenizer, [generator.tokenizer.eos_token_id])
with open("exllamav2_json.txt", "w") as f:
generator = create_exllamav2_6bpw_llama3_8b()
settings = ExLlamaV2Sampler.Settings()
inputs = json.load(open("address.json"))["sentences"]
context.filters = [get_address_filter()]
max_new_tokens = 100
bench(data, context, execute, "address_json_exllamav2", f)
bench(data, context, execute, "llama3_8b_6pw_exl2_address_json_exllamav2", f)
settings = ExLlamaV2Sampler.Settings()
context.filters = [get_linkedlist_filter()]
inputs = json.load(open("linkedlist.json"))["sentences"]
max_new_tokens = 32
bench(data, context, execute, "linkedlist_json_exllamav2", f)
bench(data, context, execute, "llama3_8b_6pw_exl2_linkedlist_json_exllamav2", f)
settings = ExLlamaV2Sampler.Settings()
context.filters = [get_order_filter()]
inputs = json.load(open("orders.json"))["orders"]
max_new_tokens = 160
bench(data, context, execute, "llama3_8b_6pw_exl2_orders_json_exllamav2", f)
del generator
gc.collect()
torch.cuda.empty_cache()
generator = create_exllamav2_4bpw_llama2_7b()
settings = ExLlamaV2Sampler.Settings()
inputs = json.load(open("address.json"))["sentences"]
context.filters = [get_address_filter()]
max_new_tokens = 120
bench(data, context, execute, "llama2_7b_4pw_exl2_address_json_exllamav2", f)
settings = ExLlamaV2Sampler.Settings()
context.filters = [get_linkedlist_filter()]
inputs = json.load(open("linkedlist.json"))["sentences"]
max_new_tokens = 15
bench(data, context, execute, "llama2_7b_4pw_exl2_linkedlist_json_exllamav2", f)
settings = ExLlamaV2Sampler.Settings()
context.filters = [get_order_filter()]
inputs = json.load(open("orders.json"))["orders"]
max_new_tokens = 160
bench(data, context, execute, "orders_json_exllamav2", f)
bench(data, context, execute, "llama2_7b_4pw_exl2_orders_json_exllamav2", f)
18 changes: 12 additions & 6 deletions benchmarks/exllamav2_json.txt
@@ -1,6 +1,12 @@
address_json_exllamav2 generated 1937 tokens with 82.8928200006119 tps (with warm up)
address_json_exllamav2 unconstrained generated 2000 tokens with 84.3793451234579 tps
linkedlist_json_exllamav2 generated 558 tokens with 80.73082629409335 tps (with warm up)
linkedlist_json_exllamav2 unconstrained generated 640 tokens with 88.08959849567171 tps
orders_json_exllamav2 generated 2976 tokens with 84.07997233781526 tps (with warm up)
orders_json_exllamav2 unconstrained generated 3200 tokens with 91.83025447237793 tps
llama3_8b_6pw_exl2_address_json_exllamav2 generated 1937 tokens with 81.76457267212113 tps (with warm up)
llama3_8b_6pw_exl2_address_json_exllamav2 unconstrained generated 2000 tokens with 91.93855585432294 tps
llama3_8b_6pw_exl2_linkedlist_json_exllamav2 generated 567 tokens with 73.72004132348941 tps (with warm up)
llama3_8b_6pw_exl2_linkedlist_json_exllamav2 unconstrained generated 640 tokens with 92.92655429712437 tps
llama3_8b_6pw_exl2_orders_json_exllamav2 generated 2976 tokens with 79.10910035605352 tps (with warm up)
llama3_8b_6pw_exl2_orders_json_exllamav2 unconstrained generated 3200 tokens with 93.46945772542723 tps
llama2_7b_4pw_exl2_address_json_exllamav2 generated 2400 tokens with 123.7077165970634 tps (with warm up)
llama2_7b_4pw_exl2_address_json_exllamav2 unconstrained generated 2400 tokens with 133.37570270534903 tps
llama2_7b_4pw_exl2_linkedlist_json_exllamav2 generated 250 tokens with 80.04987619935734 tps (with warm up)
llama2_7b_4pw_exl2_linkedlist_json_exllamav2 unconstrained generated 300 tokens with 132.19982863147897 tps
llama2_7b_4pw_exl2_orders_json_exllamav2 generated 3136 tokens with 117.27953013576354 tps (with warm up)
llama2_7b_4pw_exl2_orders_json_exllamav2 unconstrained generated 3200 tokens with 129.65265959777014 tps
22 changes: 15 additions & 7 deletions benchmarks/result.md
@@ -33,16 +33,24 @@ Default exllamav2 setting are used.

| model | schema | constrained(with warm-up) / tps | unconstrained / tps | overhead per token / ms |
|------------------------|-----------------|---------------------------------|---------------------|-------------------------|
| Llama3-8B(6.0bpw-exl2) | address_json | 82.89 | 84.38 | 0.21 |
| Llama3-8B(6.0bpw-exl2) | linkedlist_json | 80.73 | 88.09 | 1.03 |
| Llama3-8B(6.0bpw-exl2) | order_json | 84.08 | 91.83 | 1.00 |
| Llama3-8B(6.0bpw-exl2) | address_json | 81.76 | 91.94 | 1.36 |
| Llama3-8B(6.0bpw-exl2) | linkedlist_json | 73.73 | 92.93 | 2.82 |
| Llama3-8B(6.0bpw-exl2) | order_json | 79.11 | 93.47 | 1.96 |
| Llama2-7B(4.0bpw-exl2) | address_json | 123.71 | 133.38 | 0.55 |
| Llama2-7B(4.0bpw-exl2) | linkedlist_json | 80.05 | 132.20 | 4.90 |
| Llama2-7B(4.0bpw-exl2) | order_json | 117.28 | 129.65 | 0.82 |
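
The "overhead per token / ms" column appears to be the difference in per-token latency between constrained and unconstrained decoding. A quick sanity check under that assumption (the helper below is illustrative, not part of the benchmark scripts):

```python
def overhead_ms(constrained_tps: float, unconstrained_tps: float) -> float:
    # Per-token latency difference between constrained and unconstrained decoding, in ms.
    return 1000.0 * (1.0 / constrained_tps - 1.0 / unconstrained_tps)

# First Llama3-8B row above: ~1.35 ms, matching the reported 1.36 within rounding.
print(round(overhead_ms(81.76, 91.94), 2))
```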

## Transformers
Default transformers settings with flash attention v2 enabled.

The mysterious performance drop in the huggingface integration is very interesting.
The same implementation in `mask_logits` just appears to be vastly inefficient.
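
For context, a logits-masking step in a huggingface `LogitsProcessor` typically boils down to something like the generic sketch below. This is an illustration only, not Formatron's actual `mask_logits` implementation:

```python
import torch

def mask_logits_sketch(logits: torch.Tensor, allowed_token_ids: list[int]) -> torch.Tensor:
    # Generic constrained-decoding mask: keep allowed tokens, send everything else to -inf.
    mask = torch.full_like(logits, float("-inf"))
    mask[..., allowed_token_ids] = 0.0
    return logits + mask
```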

| model | schema | constrained(with warm-up) / tps | unconstrained / tps | overhead per token / ms |
|-----------------|-----------------|---------------------------------|---------------------|-------------------------|
| Llama3-8B(bf16) | address_json | 27.73 | 32.07 | 4.88 |
| Llama3-8B(bf16) | linkedlist_json | 26.11 | 32.13 | 7.17 |
| Llama3-8B(bf16) | order_json | 26.36 | 31.66 | 6.35 |
| Llama3-8B(bf16) | address_json | 37.42 | 38.76 | 0.91 |
| Llama3-8B(bf16) | linkedlist_json | 37.14 | 38.72 | 1.09 |
| Llama3-8B(bf16) | order_json | 36.79 | 38.16 | 0.97 |
| Llama2-7B(fp16) | address_json | 41.34 | 42.22 | 0.50 |
| Llama2-7B(fp16) | linkedlist_json | 40.97 | 42.00 | 0.60 |
| Llama2-7B(fp16) | order_json | 39.74 | 40.60 | 0.54 |
56 changes: 44 additions & 12 deletions benchmarks/transformers_json.py
@@ -1,3 +1,4 @@
import gc
import json
from timeit import timeit

@@ -21,6 +22,15 @@ def get_llama3_8b_tokenizer_and_model():
model.generation_config.pad_token_id = tokenizer.eos_token_id
return model, tokenizer

def get_llama2_7b_tokenizer_and_model():
model = AutoModelForCausalLM.from_pretrained("togethercomputer/LLaMA-2-7B-32K",
device_map="cuda",
torch_dtype=torch.float16,
attn_implementation="flash_attention_2")
tokenizer = AutoTokenizer.from_pretrained("togethercomputer/LLaMA-2-7B-32K")
model.generation_config.pad_token_id = tokenizer.eos_token_id
return model, tokenizer

def get_address_schema():
f = FormatterBuilder()
f.append_line(f"{f.schema(Address, JsonGenerator(), capture_name='json')}")
@@ -38,7 +48,7 @@ def get_order_schema():

def execute():
prompts = [
f"{system_prompt}{inputs[context.index]}<|eot_id|><|start_header_id|>assistant<|end_header_id|>",
f"{system_prompt}{inputs[context.index]}{tail}",
]
prompts = tokenizer(prompts, return_tensors='pt').to(model.device)
input_len = prompts.input_ids.shape[-1]
@@ -68,27 +78,49 @@ def bench(result:BenchResult, context:Context,func, bench_name:str, f):
log(bench_name, result, f)

if __name__ == "__main__":
system_prompt = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful AI assistant for information extraction.<|eot_id|><|start_header_id|>user<|end_header_id|>
Extract information into json format: """


data = BenchResult(0, 0, 0, 0)
context = Context(0, 0)
with open("transformers_json.txt", "w") as f:
model, tokenizer = get_llama3_8b_tokenizer_and_model()
with torch.no_grad():
system_prompt = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful AI assistant for information extraction.<|eot_id|><|start_header_id|>user<|end_header_id|>
Extract information into json format: """
tail = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
model, tokenizer = get_llama3_8b_tokenizer_and_model()
model.eval()
max_new_tokens = 50
inputs = json.load(open("address.json"))["sentences"]
logits_processor = get_address_schema()
bench(data,context,execute, "llama3_8b_address_json", f)
inputs = json.load(open("linkedlist.json"))["sentences"]
logits_processor = get_linkedlist_schema()
max_new_tokens = 200
bench(data,context,execute, "llama3_8b_linkedlist_json", f)
inputs = json.load(open("orders.json"))["orders"]
logits_processor = get_order_schema()
bench(data, context, execute, "llama3_8b_order_json", f)
system_prompt = """[INST]
You are a helpful AI assistant for information extraction.
Extract information into json format: """
tail = "[/INST]"
del model
del tokenizer
gc.collect()
torch.cuda.empty_cache()
model, tokenizer = get_llama2_7b_tokenizer_and_model()
model.eval()
max_new_tokens = 50
inputs = json.load(open("address.json"))["sentences"]
logits_processor = get_address_schema()
bench(data,context,execute, "address_json", f)
bench(data, context, execute, "llama2_7b_address_json", f)
max_new_tokens = 30
inputs = json.load(open("linkedlist.json"))["sentences"]
logits_processor = get_linkedlist_schema()
max_new_tokens = 200
bench(data,context,execute, "linkedlist_json", f)
bench(data, context, execute, "llama2_7b_linkedlist_json", f)
inputs = json.load(open("orders.json"))["orders"]
logits_processor = get_order_schema()
bench(data, context, execute, "order_json", f)
bench(data, context, execute, "llama2_7b_order_json", f)
18 changes: 12 additions & 6 deletions benchmarks/transformers_json.txt
@@ -1,6 +1,12 @@
address_json generated 803 tokens with 27.73133365441571 tps (with warm up)
address_json unconstrained generated 1000 tokens with 32.06941835850326 tps
linkedlist_json generated 1051 tokens with 26.10733210624505 tps (with warm up)
linkedlist_json unconstrained generated 1225 tokens with 32.128672426385975 tps
order_json generated 3466 tokens with 26.364491518684723 tps (with warm up)
order_json unconstrained generated 3969 tokens with 31.66292436079728 tps
llama3_8b_address_json generated 786 tokens with 37.36371589602811 tps (with warm up)
llama3_8b_address_json unconstrained generated 1000 tokens with 38.69880490679053 tps
llama3_8b_linkedlist_json generated 887 tokens with 37.120954838498776 tps (with warm up)
llama3_8b_linkedlist_json unconstrained generated 1236 tokens with 38.697844784161475 tps
llama3_8b_order_json generated 3505 tokens with 36.78628437278762 tps (with warm up)
llama3_8b_order_json unconstrained generated 3969 tokens with 38.14263146423028 tps
llama2_7b_address_json generated 984 tokens with 41.33470448687917 tps (with warm up)
llama2_7b_address_json unconstrained generated 1000 tokens with 42.21328193562087 tps
llama2_7b_linkedlist_json generated 3068 tokens with 41.05214473703741 tps (with warm up)
llama2_7b_linkedlist_json unconstrained generated 4000 tokens with 42.01075161949615 tps
llama2_7b_order_json generated 3987 tokens with 39.76025691683701 tps (with warm up)
llama2_7b_order_json unconstrained generated 4000 tokens with 40.5808187425456 tps
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -3,7 +3,7 @@ requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"
[project]
name = "formatron"
version = "0.1.3"
version = "0.2.0"
authors = [
{name = "Xintong Sun", email = "[email protected]"},
]
17 changes: 4 additions & 13 deletions src/formatron/formatter.py
@@ -123,9 +123,6 @@ def mask_logits(self, logits) -> typing.Any:
def get_allowed_tokens_since_last_computation(self) -> typing.Sequence[int]:
return self._engine.get_allowed_token_ids_from_last_computation()

def get_tokens_to_finish_since_last_computation(self) -> typing.Sequence[int]:
return self._engine.get_token_ids_to_finish_from_last_computation()

def is_completed(self) -> bool:
return self._engine.is_finished()

@@ -302,28 +299,22 @@ def schema(self, schema: typing.Type[schemas.schema.Schema],
lambda nonterminal: grammar_generator.generate(schema, nonterminal))

def str(self, *, stop: typing.Union[str, list[str]] = None,
not_contain: typing.Union[str, list[str], None] = None,
capture_name: typing.Optional[str] = None) -> RegexExtractor:
"""
Create a string extractor.
:param stop: The strings for the extractors to stop at. They will be included in text generation and extraction.
:param not_contain: The strings that should not be included in the generation.
They will not be included in the generation and extraction.
:param capture_name: The capture name of the extractor, or `None` if the extractor does not capture.
:return: The string extractor.
"""
stop = [stop] if isinstance(stop, str) else stop or []
not_contain = [not_contain] if isinstance(not_contain, str) else not_contain or []
nonterminal = self._create_nonterminal(capture_name, "str")
if not stop and not not_contain:
if not stop:
capture_regex = ".*"
nonterminal_regex = "#'.*'"
else:
capture_regex = f".*?(?:{'|'.join(map(re.escape, stop))})"
excepted = f"{nonterminal}_excepted"
end = f"({'|'.join(map(repr, stop))})" if stop else ""
nonterminal_regex = f"except!({excepted}){end}"
self._rules.append(f"{excepted} ::= {' | '.join(map(repr, stop + not_contain))};")
backslash = '\\'
capture_regex = f".*?(?:{'|'.join([i.replace(backslash, backslash*2) for i in map(re.escape, stop)])})"
nonterminal_regex = f"#e'{capture_regex}'"
self._rules.append(f"{nonterminal} ::= {nonterminal_regex};")
self._nonterminal_to_extractor[nonterminal] = RegexExtractor(capture_regex, capture_name, nonterminal)
return self._nonterminal_to_extractor[nonterminal]
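
A minimal usage sketch of the updated `str()` extractor: per the diff above, a stop string is now compiled into a single `#e'...'` regex rule and the `not_contain` parameter is no longer accepted. This assumes `str()` extractors can be embedded in `append_line` f-strings the same way `schema()` is used in the benchmark scripts; the names here are illustrative:

```python
from formatron.formatter import FormatterBuilder

f = FormatterBuilder()
# Stop at the first newline; the stop string is included in generation and extraction.
title = f.str(stop="\n", capture_name="title")
f.append_line(f"Title: {title}")
```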
15 changes: 7 additions & 8 deletions src/formatron/integrations/exllamav2.py
@@ -19,9 +19,8 @@ def create_engine_vocabulary(tokenizer: ExLlamaV2Tokenizer) -> kbnf.Vocabulary:
assert hasattr(tokenizer.tokenizer_model, "vocab"), (f"tokenizer({tokenizer})"
f" with tokenizer_model({tokenizer.tokenizer_model})"
f" does not have vocab attribute!")
vocab = tokenizer.get_id_to_piece_list(include_special_tokens=True)
new_vocab = {v: i for i, v in enumerate(vocab)}
new_vocab = get_original_characters(new_vocab)
vocab = {tokenizer.tokenizer_model.id_to_piece(i): i for i in range(tokenizer.tokenizer_model.vocab_size())}
new_vocab = get_original_characters(vocab)
return kbnf.Vocabulary({v: kbnf.Token(k) for k, v in new_vocab.items()},
{k: v for k, v in enumerate(vocab)})

@@ -50,7 +49,6 @@ def __init__(self, model, tokenizer, formatter: Formatter,
config = EngineGenerationConfig()
self._config = config
self._pass_tokens = set()
self._end_tokens = set()

def clone(self, c=None) -> "FormatterFilter":
if c is None:
@@ -61,7 +59,6 @@ def clone(self, c=None) -> "FormatterFilter":
c._formatter = copy(self._formatter) # formatter does not have mutable public state anyway
c._config = deepcopy(self._config)
c._pass_tokens = self._pass_tokens
c._end_tokens = self._end_tokens
return c

def begin(self, prefix_str: str) -> None:
@@ -75,15 +72,17 @@ def reset(self)->None:
self._formatter.reset()

def feed(self, token: int):
if self._formatter.is_completed():
return None
self._formatter.accept_token(token)

def next(self) -> typing.Tuple[typing.Set[int], typing.Set[int]]:
if self._formatter.is_completed():
return {self.tokenizer.eos_token_id}, set()
self._formatter.compute_allowed_tokens()
self._pass_tokens.clear()
self._end_tokens.clear()
self._pass_tokens.update(self._formatter.get_allowed_tokens_since_last_computation())
self._end_tokens.update(self._formatter.get_tokens_to_finish_since_last_computation())
return self._pass_tokens, self._end_tokens
return self._pass_tokens, set()

@property
def formatter_captures(self) -> dict[str, typing.Any]: