From f98d37cdf66ce9492c42d768dabb3cdef2e8a2a7 Mon Sep 17 00:00:00 2001
From: Hugo Larcher
Date: Fri, 27 Sep 2024 18:55:40 +0200
Subject: [PATCH] fix: Fix extra meta

---
 README.md   | 65 +++++++++++++++++++++++++++++++++--------------------
 src/main.rs | 21 ++++++++---------
 2 files changed, 52 insertions(+), 34 deletions(-)

diff --git a/README.md b/README.md
index 8da638b..0a2b0d8 100644
--- a/README.md
+++ b/README.md
@@ -3,27 +3,28 @@
 A lightweight benchmarking tool for LLM inference servers.
 Benchmarks using constant arrival rate or constant virtual user count.
 
-
-
 ![ui.png](assets/ui.png)
 
 ## Table of contents
+
 * [Text Generation Inference benchmarking tool](#text-generation-inference-benchmarking-tool)
-  * [Table of contents](#table-of-contents)
-  * [TODO](#todo)
-  * [Get started](#get-started)
-  * [Run a benchmark](#run-a-benchmark)
-  * [Configure your benchmark](#configure-your-benchmark)
-  * [Benchmark mode](#benchmark-mode)
-  * [Dataset configuration](#dataset-configuration)
-  * [Prompt configuration](#prompt-configuration)
-  * [Development](#development)
-  * [Frequently Asked Questions](#frequently-asked-questions)
+    * [Table of contents](#table-of-contents)
+    * [TODO](#todo)
+    * [Get started](#get-started)
+    * [Run a benchmark](#run-a-benchmark)
+    * [Configure your benchmark](#configure-your-benchmark)
+    * [Benchmark mode](#benchmark-mode)
+    * [Dataset configuration](#dataset-configuration)
+    * [Prompt configuration](#prompt-configuration)
+    * [Development](#development)
+    * [Frequently Asked Questions](#frequently-asked-questions)
+
 ## TODO
+
 - [X] Customizable token count and variance
 - [ ] Check results
 - [X] Allow for system prompts for prefix caching
@@ -31,7 +32,6 @@ Benchmarks using constant arrival rate or constant virtual user count.
 - [ ] Push results to Optimum benchmark backend
 - [X] Script to generate plots from results
 
-
 ## Get started
 
 ### Run a benchmark
@@ -57,8 +57,7 @@ $ docker run \
         --decode-options "num_tokens=50,max_tokens=60,min_tokens=40,variance=10"
 ```
 
-Results will be saved in `results.json` in current directory.
-
+Results will be saved in JSON format in the current directory.
 
 ### Configure your benchmark
 
@@ -68,18 +67,20 @@ In default mode, tool runs a `sweep` benchmark. It first runs a throughput test
 sweeps on QPS values up to the maximum throughput.
 
 Available modes:
+
 - `sweep`: runs a sweep benchmark
 - `rate`: runs a benchmark at a fixed request rate
 - `throughput`: runs a benchmark at a fixed throughput (constant VUs)
 
-
 #### Dataset configuration
 
 Prompts are sampled for a Hugging Face dataset file, using a [subset of ShareGPT
-as default](https://huggingface.co/datasets/hlarcher/share_gpt_small). You can specify a different dataset file using the
+as default](https://huggingface.co/datasets/hlarcher/share_gpt_small). You can specify a different dataset file using
+the
 `--dataset` and `--dataset-file` option.
 
 Dataset is expected to be JSON with the following format:
+
 ```json
 [
   {
@@ -94,6 +95,7 @@ Dataset is expected to be JSON with the following format:
 ```
 
 To benchmark with prefix caching, you can use a system prompt that will be sent with each request from a discussion.
+
 ```json
 [
   {
@@ -111,8 +113,8 @@ To benchmark with prefix caching, you can use a system prompt that will be sent
 ]
 ```
 
-
 #### Prompt configuration
+
 For consistent results you can configure the token count and variance. The tool will sample prompts with the specified
 values, sampling token counts from a normal distribution with the specified variance.
@@ -120,22 +122,37 @@ values, sampling token counts from a normal distribution with the specified vari
 --prompt-options "num_tokens=50,max_tokens=60,min_tokens=40,variance=10"
 ```
 
+### Decode options
+
+You can also configure the decoding options for the model. The tool will sample decoding options with the specified
+values, sampling token counts from a normal distribution with the specified variance.
+
+```shell
+--decode-options "num_tokens=50,max_tokens=60,min_tokens=40,variance=10"
+```
+
 ## Development
 
 You need [Rust](https://rustup.rs/) installed to build the benchmarking tool.
+
 ```shell
 $ make build
 ```
 
-
 ## Frequently Asked Questions
+
 * **What's the difference between constant arrival rate and constant virtual user count?**
-    * **Constant virtual user count** means that the number of virtual users is fixed. Each virtual user can send a single requests and waits for server response. It's basically simulating a fixed number of users querying the server.
-    * **Constant arrival rate** means that the rate of requests is fixed and the number of virtual users is adjusted to maintain that rate. Queries hit the server independently of responses performances.
+    * **Constant virtual user count** means that the number of virtual users is fixed. Each virtual user can send a
+      single request and then waits for the server response. It basically simulates a fixed number of users querying
+      the server.
+    * **Constant arrival rate** means that the rate of requests is fixed and the number of virtual users is adjusted to
+      maintain that rate. Queries hit the server independently of response performance.
 
-    **Constant virtual user count** is a closed loop model where the server's response time dictates the number of iterations. **Constant arrival rate** is an open-loop model more representative of real-life workloads.
+    **Constant virtual user count** is a closed-loop model where the server's response time dictates the number of
+    iterations. **Constant arrival rate** is an open-loop model more representative of real-life workloads.
 
 * **What is the influence of CUDA graphs?**
-CUDA graphs are used to optimize the GPU usage by minimizing the overhead of launching kernels. This can lead to better performance in some cases, but can also lead to worse performance in others.
-If your CUDA graphs are not evenly distributed, you may see a performance drop at some request rates as batch size may fall in a bigger CUDA graph batch size leading to a lost of compute due to excessive padding.
\ No newline at end of file
+  CUDA graphs are used to optimize GPU usage by minimizing the overhead of launching kernels. This can lead to
+  better performance in some cases, but can also lead to worse performance in others.
+  If your CUDA graphs are not evenly distributed, you may see a performance drop at some request rates, as the batch
+  size may fall into a bigger CUDA graph batch size, leading to a loss of compute due to excessive padding.
\ No newline at end of file
diff --git a/src/main.rs b/src/main.rs
index b9099b4..afccb0a 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -81,11 +81,10 @@ struct Args {
     /// File to use in the Dataset
     #[clap(default_value = "share_gpt_filtered_small.json", long, env)]
     dataset_file: String,
-    /// Extra metadata to include in the benchmark results file.
+    /// Extra metadata to include in the benchmark results file as comma-separated key=value pairs.
    /// It can be, for example, used to include information about the configuration of the
    /// benched server.
-    /// Can be specified multiple times.
-    /// Example: --extra-meta key1=value1 --extra-meta key2=value2
+    /// Example: --extra-meta "key1=value1,key2=value2"
     #[clap(long, env, value_parser(parse_key_val))]
     extra_meta: Option<HashMap<String, String>>,
 }
@@ -102,15 +101,17 @@ fn parse_url(s: &str) -> Result {
 }
 
 fn parse_key_val(s: &str) -> Result<HashMap<String, String>, Error> {
-    let key_value = s.split("=").collect::<Vec<&str>>();
-    if key_value.len() % 2 != 0 {
-        return Err(Error::new(InvalidValue));
-    }
     let mut key_val_map = HashMap::new();
-    for i in 0..key_value.len() / 2 {
-        key_val_map.insert(key_value[i * 2].to_string(), key_value[i * 2 + 1].to_string());
+    let items = s.split(",").collect::<Vec<&str>>();
+    for item in items.iter() {
+        let key_value = item.split("=").collect::<Vec<&str>>();
+        if key_value.len() % 2 != 0 {
+            return Err(Error::new(InvalidValue));
+        }
+        for i in 0..key_value.len() / 2 {
+            key_val_map.insert(key_value[i * 2].to_string(), key_value[i * 2 + 1].to_string());
+        }
     }
-
     Ok(key_val_map)
 }
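For reference, below is a minimal standalone sketch of the comma-separated `key=value` parsing that this patch documents for `--extra-meta "key1=value1,key2=value2"`. It is not the crate's code: it uses `split_once` and a plain `String` error instead of the `Error::new(InvalidValue)` path in `src/main.rs`, and the function name, metadata keys (`gpu`, `tp`), and values are illustrative only.

```rust
use std::collections::HashMap;

/// Parse a comma-separated list of `key=value` pairs, e.g. "key1=value1,key2=value2".
/// Everything after the first '=' in an item is kept as the value.
fn parse_key_val(s: &str) -> Result<HashMap<String, String>, String> {
    let mut map = HashMap::new();
    for item in s.split(',') {
        match item.split_once('=') {
            // Accept "key=value" with a non-empty key; trim stray whitespace.
            Some((key, value)) if !key.trim().is_empty() => {
                map.insert(key.trim().to_string(), value.trim().to_string());
            }
            // Reject items with no '=' or with an empty key.
            _ => return Err(format!("invalid key=value pair: {item:?}")),
        }
    }
    Ok(map)
}

fn main() {
    // Hypothetical metadata describing the benched server.
    let meta = parse_key_val("gpu=H100,tp=2").unwrap();
    assert_eq!(meta.get("gpu").map(String::as_str), Some("H100"));
    assert_eq!(meta.get("tp").map(String::as_str), Some("2"));

    // Malformed input is rejected instead of being silently dropped.
    assert!(parse_key_val("gpu=H100,tp").is_err());
    println!("{meta:?}");
}
```

One difference worth noting: `split_once` keeps any further `=` characters as part of the value (handy for values such as URLs with query strings), whereas the index-based loop in the patch would read an item like `a=b=c=d` as two separate pairs.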