diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
index a05226aa6..8ae1109ee 100644
--- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
+++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
@@ -114,6 +114,7 @@ async def send_request(
     top_k: int,
     tokenizer: PreTrainedTokenizerBase,
     sax_model: str,
+    model: str,
 ) -> None:
   """Sends request to server."""
   request_start_time = time.time()
@@ -121,6 +122,7 @@ async def send_request(
   headers = {"User-Agent": "Benchmark Client"}
   if backend == "vllm":
     pload = {
+        "model": model,
         "prompt": prompt,
         "n": 1,
         "best_of": best_of,
@@ -179,7 +181,7 @@ async def send_request(
   elif backend == "jetstream":
     pload = {
         "prompt": prompt,
-        "max_tokens": 1,
+        "max_tokens": output_len,
     }
   else:
     raise ValueError(f"Unknown backend: {backend}")
@@ -219,9 +221,8 @@ async def send_request(
       output_token_ids = tokenizer(output["generated_text"]).input_ids
       output_len = len(output_token_ids)
     elif backend == "vllm":
-      total_token_ids = tokenizer(output["text"][0]).input_ids
-      new_total_len = len(total_token_ids)
-      output_len = new_total_len - prompt_len
+      output_token_ids = tokenizer(output["choices"][0]["text"]).input_ids
+      output_len = len(output_token_ids)
     elif backend == "jetstream":
       output_token_ids = tokenizer(output["response"]).input_ids
       output_len = len(output_token_ids)
@@ -240,6 +241,7 @@ async def benchmark(
    top_k: int,
    tokenizer: PreTrainedTokenizerBase,
    sax_model: str,
+    model: str,
 ) -> None:
   """Runs benchmark with asynchronous requests."""
   tasks: List[asyncio.Task] = []
@@ -257,6 +259,7 @@ async def benchmark(
            top_k,
            tokenizer,
            sax_model,
+            model,
        )
    )
    tasks.append(task)
@@ -268,7 +271,13 @@ def main(args: argparse.Namespace):
  random.seed(args.seed)
  np.random.seed(args.seed)
 
-  api_url = f"http://{args.host}:{args.port}/{args.endpoint}"
+  endpoint = (
+      "v1/completions"
+      if args.backend == "vllm"
+      else args.endpoint
+  )
+
+  api_url = f"http://{args.host}:{args.port}/{endpoint}"
   tokenizer = AutoTokenizer.from_pretrained(
       args.tokenizer, trust_remote_code=args.trust_remote_code
   )
@@ -293,6 +302,7 @@ def main(args: argparse.Namespace):
          args.top_k,
          tokenizer,
          args.sax_model,
+          args.model,
      )
  )
  benchmark_end_time = time.time()
@@ -388,6 +398,11 @@ def main(args: argparse.Namespace):
  parser.add_argument("--host", type=str, default="localhost")
  parser.add_argument("--port", type=int, default=7080)
  parser.add_argument("--dataset", type=str, help="Path to the dataset.")
+  parser.add_argument(
+      "--model",
+      type=str,
+      help="Name of the model.",
+  )
  parser.add_argument(
      "--tokenizer",
      type=str,
diff --git a/benchmarks/benchmark/tools/profile-generator/container/latency_throughput_curve.sh b/benchmarks/benchmark/tools/profile-generator/container/latency_throughput_curve.sh
index 9c9e5ccf5..033391830 100644
--- a/benchmarks/benchmark/tools/profile-generator/container/latency_throughput_curve.sh
+++ b/benchmarks/benchmark/tools/profile-generator/container/latency_throughput_curve.sh
@@ -23,5 +23,8 @@ for request_rate in $(echo $REQUEST_RATES | tr ',' ' '); do
   # TODO: Check if profile already exists, if so then skip
   timestamp=$(date +"%Y-%m-%d_%H-%M-%S")
   output_file="latency-profile-${timestamp}.txt"
-  python3 benchmark_serving.py --host="$IP" --port="$PORT" --dataset=ShareGPT_V3_unfiltered_cleaned_split.json --tokenizer="$TOKENIZER" --request-rate=$request_rate --backend="$BACKEND" --num-prompts=$((request_rate * 30)) --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH > $output_file
-done
\ No newline at end of file
+  python3 benchmark_serving.py --host="$IP" --port="$PORT" --model="$TOKENIZER" --dataset=ShareGPT_V3_unfiltered_cleaned_split.json --tokenizer="$TOKENIZER" --request-rate=$request_rate --backend="$BACKEND" --num-prompts=$((request_rate * 30)) --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH > $output_file
+  cat $output_file
+  sleep 5 # wait 5 seconds before next run
+done
+
diff --git a/benchmarks/benchmark/tools/profile-generator/sample.tfvars b/benchmarks/benchmark/tools/profile-generator/sample.tfvars
index dea00ad56..de2be79ec 100644
--- a/benchmarks/benchmark/tools/profile-generator/sample.tfvars
+++ b/benchmarks/benchmark/tools/profile-generator/sample.tfvars
@@ -29,7 +29,8 @@ output_bucket = "your_project_id-benchmark-output-b
 k8s_hf_secret = "hf-token"
 
 # Benchmark configuration for Locust Docker accessing inference server
-request_rates = [5, 10, 15, 20]
+request_rates     = [5, 10, 15, 20]
+artifact_registry = "your_project_artifact_registry"
 
 # Model server configuration information
 targets = {
@@ -39,4 +40,4 @@ targets = {
     service_port = "your_model_service_service_port"
     tokenizer = "your_tokenizer"
   }
-}
+}
\ No newline at end of file
diff --git a/benchmarks/inference-server/vllm/manifest-templates/vllm.tftpl b/benchmarks/inference-server/vllm/manifest-templates/vllm.tftpl
index 993f577b4..169ec4356 100644
--- a/benchmarks/inference-server/vllm/manifest-templates/vllm.tftpl
+++ b/benchmarks/inference-server/vllm/manifest-templates/vllm.tftpl
@@ -51,7 +51,7 @@ spec:
       - name: vllm
         ports:
           - containerPort: 80
-        image: "vllm/vllm-openai:v0.3.3"
+        image: "vllm/vllm-openai:v0.5.5"
         command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
         args: ["--model", "${model_id}", "--tensor-parallel-size", "${gpu_count}", "--port", "80", "--swap-space", "${swap_space}", "--disable-log-requests"]
         env:
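
The patch switches the vLLM backend to the OpenAI-compatible v1/completions route: the payload must now name the served model, and the completion text comes back under choices[0]["text"], which is re-tokenized to get the output length. The snippet below is only an illustrative sketch of that request/response contract, not code from the patch; the host, port, prompt, and model name are placeholder assumptions.

# Standalone sketch of the completions contract the benchmark now relies on.
import requests
from transformers import AutoTokenizer

MODEL = "your-served-model-name"  # placeholder: whatever vLLM was launched with via --model
API_URL = "http://localhost:7080/v1/completions"  # placeholder host/port

tokenizer = AutoTokenizer.from_pretrained(MODEL)

pload = {
    "model": MODEL,  # required by the OpenAI-compatible route
    "prompt": "Explain what a latency-throughput curve is.",
    "n": 1,
    "best_of": 1,
    "temperature": 0.0,
    "max_tokens": 128,
    "stream": False,
}

resp = requests.post(API_URL, headers={"User-Agent": "Benchmark Client"}, json=pload)
resp.raise_for_status()
output = resp.json()

# The completion text lives under choices[0]["text"]; output length is just
# the token count of that text, with no prompt-length subtraction needed.
completion = output["choices"][0]["text"]
output_len = len(tokenizer(completion).input_ids)
print(f"generated {output_len} tokens")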