Commit

Add llama2 benchmark
Dan-wanna-M committed Aug 13, 2024
1 parent 6c4963c commit c0bc50e
Showing 5 changed files with 62 additions and 26 deletions.
3 changes: 3 additions & 0 deletions benchmarks/result.md
@@ -25,6 +25,9 @@ Default vllm settings are used.
| Llama3-8B(bf16) | address_json | 40.72 | 42.02 | 0.76 |
| Llama3-8B(bf16) | linkedlist_json | 40.57 | 41.95 | 0.81 |
| Llama3-8B(bf16) | order_json | 40.10 | 41.56 | 0.88 |
| Llama2-7B(fp16) | address_json | 46.66 | 47.65 | 0.45 |
| Llama2-7B(fp16) | linkedlist_json | 46.55 | 47.68 | 0.51 |
| Llama2-7B(fp16) | order_json | 45.76 | 46.81 | 0.49 |
## Exllamav2
Default exllamav2 settings are used.

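The last column of these rows appears to be the per-token overhead of constrained decoding in milliseconds, i.e. the difference between the reciprocal throughputs. A quick sanity check (only the tps values come from the table; the interpretation of the column is an assumption):

```python
# Per-token overhead implied by a table row: 1/constrained_tps - 1/unconstrained_tps,
# converted to milliseconds. Values are the Llama2-7B address_json row above.
constrained_tps, unconstrained_tps = 46.66, 47.65
overhead_ms = (1 / constrained_tps - 1 / unconstrained_tps) * 1000
print(f"{overhead_ms:.2f} ms/token")  # prints 0.45, matching the last column
```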
60 changes: 44 additions & 16 deletions benchmarks/vllm_json.py
@@ -1,20 +1,23 @@
import gc
import json
import typing
from timeit import timeit

import torch
from formatron.grammar_generators.json_generator import JsonGenerator
from vllm import LLM, SamplingParams
from vllm.distributed import destroy_model_parallel, destroy_distributed_environment

from utils import Order, log
from utils import LinkedList
from utils import BenchResult, Context
from utils import Address
from formatter import FormatterBuilder
from integrations.vllm import create_formatters_logits_processor, FormattersLogitsProcessor
from utils import Address
from utils import BenchResult, Context
from utils import LinkedList
from utils import Order, log


def execute():
prompts = [
f"{system_prompt}{inputs[context.index]}<|eot_id|><|start_header_id|>assistant<|end_header_id|>",
f"{system_prompt}{inputs[context.index]}{tail}",
]
context.index+=1
outputs = llm.generate(prompts, sampling_params)
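
The change in `execute` swaps the hard-coded Llama-3 assistant header for a `tail` variable, so the same function can drive both chat templates set up in `__main__` below. A minimal sketch of the pattern (the constants come from the script; the helper itself is illustrative):

```python
# Prompt assembly is template-agnostic: the model-specific pieces live in two
# globals, system_prompt and tail, swapped whenever a new model is loaded.
LLAMA3_TAIL = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
LLAMA2_TAIL = "[/INST]"

def build_prompt(system_prompt: str, user_input: str, tail: str) -> str:
    # Equivalent to the f-string in execute(): f"{system_prompt}{user_input}{tail}"
    return f"{system_prompt}{user_input}{tail}"
```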
@@ -29,6 +32,7 @@ def get_vllm_address():
f = FormatterBuilder()
f.append_line(f"```json\n{f.schema(Address, JsonGenerator(), capture_name='json')}```")
logits_processor = create_formatters_logits_processor(llm, [f])
print(llm.get_tokenizer().vocab_size)
sampling_params = SamplingParams(temperature=0.8, top_p=0.95,max_tokens=100, logits_processors=[logits_processor])
return sampling_params

@@ -65,22 +69,46 @@ def bench(result:BenchResult, context:Context,func, bench_name:str, f):
result.t2 = context.tokens
log(bench_name, result, f)

if __name__ == "__main__":
system_prompt = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful AI assistant for information extraction.<|eot_id|><|start_header_id|>user<|end_header_id|>

Extract information into json format: """
llm = LLM(model="NurtureAI/Meta-Llama-3-8B-Instruct-32k", max_model_len=4096)
sampling_params = get_vllm_address()
if __name__ == "__main__":
data = BenchResult(0, 0, 0, 0)
context = Context(0, 0)
inputs = json.load(open("address.json"))["sentences"]
with open("vllm_json_bench.txt", "w") as f:
bench(data, context,execute, "vllm_address", f)
system_prompt = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful AI assistant for information extraction.<|eot_id|><|start_header_id|>user<|end_header_id|>
Extract information into json format: """
tail = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
llm = LLM(model="NurtureAI/Meta-Llama-3-8B-Instruct-32k", max_model_len=4096)
inputs = json.load(open("address.json"))["sentences"]
sampling_params = get_vllm_address()
bench(data, context, execute, "llama3_8b_vllm_address", f)
sampling_params = get_vllm_linkedlist()
inputs = json.load(open("linkedlist.json"))["sentences"]
bench(data, context, execute, "llama3_8b_linkedlist", f)
sampling_params = get_vllm_order()
inputs = json.load(open("orders.json"))["orders"]
bench(data, context, execute, "llama3_8b_orders", f)
destroy_model_parallel()
destroy_distributed_environment()
del llm.llm_engine.model_executor
del llm
gc.collect()
torch.cuda.empty_cache()
system_prompt = """[INST]
You are a helpful AI assistant for information extraction.
Extract information into json format: """
tail = "[/INST]"
llm = LLM(model="daryl149/llama-2-7b-chat-hf", max_model_len=2048)
inputs = json.load(open("address.json"))["sentences"]
sampling_params = get_vllm_address()
bench(data, context,execute, "llama2_7b_vllm_address", f)
sampling_params = get_vllm_linkedlist()
inputs = json.load(open("linkedlist.json"))["sentences"]
bench(data, context, execute, "linkedlist", f)
bench(data, context, execute, "llama2_7b_linkedlist", f)
sampling_params = get_vllm_order()
inputs = json.load(open("orders.json"))["orders"]
bench(data, context, execute, "orders", f)
bench(data, context, execute, "llama2_7b_orders", f)
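
Because the script now loads a second model in the same process, it first has to tear the Llama-3 engine down completely; the sequence above (destroy vLLM's distributed state, drop the engine's model executor, then garbage-collect and empty the CUDA cache) is the usual way to reclaim GPU memory from an in-process vLLM engine. Wrapped as a helper for clarity (the calls are the ones in the diff; the wrapper is illustrative):

```python
import gc

import torch
from vllm.distributed import destroy_model_parallel, destroy_distributed_environment

def release_vllm_engine(llm) -> None:
    # Free the GPU memory held by an in-process vLLM engine so another
    # LLM(...) can be constructed afterwards. The caller should also drop
    # its own reference (del llm) after this returns.
    destroy_model_parallel()
    destroy_distributed_environment()
    del llm.llm_engine.model_executor
    gc.collect()
    torch.cuda.empty_cache()
```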
18 changes: 12 additions & 6 deletions benchmarks/vllm_json_bench.txt
@@ -1,6 +1,12 @@
vllm_address generated 1032 tokens with 40.715563518001495 tps (with warm up)
vllm_address unconstrained generated 1500 tokens with 42.01920504622742 tps
linkedlist generated 1262 tokens with 40.568280943625346 tps (with warm up)
linkedlist unconstrained generated 1337 tokens with 41.946869871332254 tps
orders generated 4271 tokens with 40.09933827742811 tps (with warm up)
orders unconstrained generated 4589 tokens with 41.564089177085926 tps
llama3_8b_vllm_address generated 1085 tokens with 40.811995725723065 tps (with warm up)
llama3_8b_vllm_address unconstrained generated 1526 tokens with 41.99365646425085 tps
llama3_8b_linkedlist generated 1252 tokens with 40.58724636960033 tps (with warm up)
llama3_8b_linkedlist unconstrained generated 1389 tokens with 41.977055386201535 tps
llama3_8b_orders generated 4266 tokens with 40.109222217293876 tps (with warm up)
llama3_8b_orders unconstrained generated 4595 tokens with 41.5902565932466 tps
llama2_7b_vllm_address generated 1571 tokens with 46.65812840219695 tps (with warm up)
llama2_7b_vllm_address unconstrained generated 1748 tokens with 47.65099657371631 tps
llama2_7b_linkedlist generated 1648 tokens with 46.55372314285624 tps (with warm up)
llama2_7b_linkedlist unconstrained generated 1917 tokens with 47.682641280112215 tps
llama2_7b_orders generated 5120 tokens with 45.76177824108742 tps (with warm up)
llama2_7b_orders unconstrained generated 5114 tokens with 46.81601673468687 tps
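
These raw figures are what the rounded result.md rows are derived from: the six lines with the old names (vllm_address, linkedlist, orders) are the removed ones, and they match the Llama-3 rows in result.md exactly, while the llama3_8b_* lines above come from a slightly different rerun. A small script that reconstructs rows in the result.md format, assuming only the line format visible above:

```python
import re

# Each benchmark produces a constrained line ("... tps (with warm up)") and an
# "unconstrained" line; pair them up and derive the per-token overhead in ms.
pattern = re.compile(r"(\S+)( unconstrained)? generated \d+ tokens with ([\d.]+) tps")

rows: dict[str, dict[str, float]] = {}
with open("vllm_json_bench.txt") as fh:
    for line in fh:
        m = pattern.match(line)
        if m:
            name, unconstrained, tps = m.group(1), m.group(2), float(m.group(3))
            rows.setdefault(name, {})["free" if unconstrained else "constrained"] = tps

for name, tps in rows.items():
    overhead_ms = (1 / tps["constrained"] - 1 / tps["free"]) * 1000
    print(f"| {name} | {tps['constrained']:.2f} | {tps['free']:.2f} | {overhead_ms:.2f} |")
```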
6 changes: 3 additions & 3 deletions src/formatron/integrations/transformers.py
@@ -2,14 +2,14 @@
This module integrates the transformers library by providing convenience utilities.
"""
import collections
import time
import typing

import kbnf
from transformers import LogitsProcessor, PreTrainedTokenizerBase, LogitsProcessorList

from config import EngineGenerationConfig
from formatter import Formatter, FormatterBuilder
from integrations._utils import get_original_characters
from transformers import LogitsProcessor, PreTrainedTokenizerBase, LogitsProcessorList


def create_engine_vocabulary(tokenizer: PreTrainedTokenizerBase) -> kbnf.Vocabulary:
@@ -77,7 +77,7 @@ def __call__(self, input_ids, scores):
if self._last_input_id_length is None: # First iteration
self._last_input_id_length = input_ids.shape[1]
for formatter, config, prompt in zip(self._formatters, self.configs, input_ids):
if config.reset_at_beginning and formatter.is_completed():
if config.reset_at_beginning:
formatter.reset()
if config.read_prompt:
for token in prompt:
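The one-line change in `__call__` is presumably a behavioural fix rather than a cleanup: previously a formatter whose last generation stopped early (so `is_completed()` was still false) was never reset and would resume from stale state on the next call. The two versions side by side, with comments (excerpted from the diff, not standalone code):

```python
# Before: only reset formatters that had run to completion, so an aborted
# generation (e.g. max_new_tokens reached mid-schema) left stale state behind.
if config.reset_at_beginning and formatter.is_completed():
    formatter.reset()

# After: reset_at_beginning now does what its name says, unconditionally.
if config.reset_at_beginning:
    formatter.reset()
```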
1 change: 0 additions & 1 deletion src/formatron/integrations/vllm.py
@@ -5,7 +5,6 @@
import typing

import kbnf
import torch
from vllm import LLM

from config import EngineGenerationConfig
