diff --git a/benchmarks/benchmark/tools/profile-generator/README.md b/benchmarks/benchmark/tools/profile-generator/README.md
index 6d136bab9..e8375e7ed 100644
--- a/benchmarks/benchmark/tools/profile-generator/README.md
+++ b/benchmarks/benchmark/tools/profile-generator/README.md
@@ -1,17 +1,18 @@
 # AI on GKE Benchmark Latency Profile Generator
 
-* [AI on GKE Benchmark Latency Profile Generator](#ai-on-gke-benchmark-latency-profile-generator)
-  * [Overview](#overview)
-  * [Instructions](#instructions)
-    * [Step 1: create output bucket](#step-1--create-output-bucket)
-    * [Step 2: create and give service account access to write to output gcs bucket](#step-2--create-and-give-service-account-access-to-write-to-output-gcs-bucket)
-    * [Step 3: create artifact repository for automated Latency Profile Generator docker build](#step-3--create-artifact-repository-for-automated-latency-profile-generator-docker-build)
-    * [Step 4: create and configure terraform.tfvars](#step-4--create-and-configure-terraformtfvars)
-      * [[optional] set-up credentials config with kubeconfig](#optional-set-up-credentials-config-with-kubeconfig)
-      * [[optional] set up secret token in Secret Manager](#optional-set-up-secret-token-in-secret-manager)
-    * [Step 6: terraform initialize, plan and apply](#step-6--terraform-initialize-plan-and-apply)
-    * [Inputs](#inputs)
+- [AI on GKE Benchmark Latency Profile Generator](#ai-on-gke-benchmark-latency-profile-generator)
+  - [Overview](#overview)
+  - [Instructions](#instructions)
+    - [Step 1: create output bucket](#step-1-create-output-bucket)
+    - [Step 2: create and give service account access to write to output gcs bucket](#step-2-create-and-give-service-account-access-to-write-to-output-gcs-bucket)
+      - [\[optional\] give service account access to read Cloud Monitoring metrics](#optional-give-service-account-access-to-read-cloud-monitoring-metrics)
+    - [Step 3: create artifact repository for automated Latency Profile Generator docker build](#step-3-create-artifact-repository-for-automated-latency-profile-generator-docker-build)
+    - [Step 4: create and configure terraform.tfvars](#step-4-create-and-configure-terraformtfvars)
+      - [\[optional\] set-up credentials config with kubeconfig](#optional-set-up-credentials-config-with-kubeconfig)
+      - [\[optional\] set up secret token in Secret Manager](#optional-set-up-secret-token-in-secret-manager)
+    - [Step 5: login to gcloud](#step-5-login-to-gcloud)
+    - [Step 6: terraform initialize, plan and apply](#step-6-terraform-initialize-plan-and-apply)
 
 ## Overview
 
@@ -62,6 +63,15 @@
 Your kubernetes service account will inherit the reader permissions.
 
 You will set the `latency_profile_kubernetes_service_account` in your `terraform.tfvars` to the kubernetes service account name.
 
+#### [optional] give service account access to read Cloud Monitoring metrics
+
+If `scrape_server_metrics` is set to `true`, you will need to give the service account access to read
+Cloud Monitoring metrics. You can do so with the following command:
+
+```
+gcloud projects add-iam-policy-binding $PROJECT_ID --member=serviceAccount:$GOOGLE_SERVICE_ACCOUNT@$PROJECT_ID.iam.gserviceaccount.com --role=roles/monitoring.viewer
+```
+
 ### Step 3: create artifact repository for automated Latency Profile Generator docker build
 
 The latency profile generator rebuilds the docker file on each terraform apply
diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
index 5f521058b..21c64517d 100644
--- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
+++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
@@ -10,9 +10,13 @@
 from datetime import datetime
 import json
 import random
+import requests
 import time
 from typing import AsyncGenerator, List, Tuple
 
+import google.auth
+import google.auth.transport.requests
+
 import aiohttp
 import numpy as np
 from transformers import AutoTokenizer
@@ -302,6 +306,60 @@ def save_json_results(args: argparse.Namespace, benchmark_result):
   with open(file_name, "w", encoding="utf-8") as outfile:
     json.dump(final_json, outfile)
 
+def metrics_to_scrape(backend: str) -> List[str]:
+  if backend == "vllm":
+    return ["vllm:gpu_cache_usage_perc", "vllm:num_requests_waiting"]
+  elif backend == "jetstream":
+    return ["jetstream_slots_used_percentage", "jetstream_prefill_backlog_size"]
+  else:
+    return []
+
+def print_metrics(metrics: List[str], duration: float, backend: str):
+  # Creates a credentials object from the default service account file.
+  # Assumes the script has Application Default Credentials set up, ref:
+  # https://googleapis.dev/python/google-auth/latest/user-guide.html#application-default-credentials
+  credentials, project_id = google.auth.default()
+  # Prepare an authentication request used to refresh the access token
+  auth_req = google.auth.transport.requests.Request()
+
+  all_metric_results = {}
+
+  for metric in metrics:
+    print("Metric Name: %s" % (metric))
+    metric_results = {}
+    # Query each metric over the last `duration` seconds, scoped to the backend's
+    # PodMonitoring resource, which is assumed to be named "<backend>-podmonitoring"
+    queries = {
+      "Mean": "avg_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
+      "Median": "quantile_over_time(0.5, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
+      "Min": "min_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
+      "Max": "max_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
+      "P90": "quantile_over_time(0.9, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
+      "P99": "quantile_over_time(0.99, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
+    }
+    for query_name, query in queries.items():
+      # Refresh the access token before each query
+      credentials.refresh(auth_req)
+
+      # Issue the query against the Cloud Monitoring PromQL API
+      url = 'https://monitoring.googleapis.com/v1/projects/%s/location/global/prometheus/api/v1/query' % (project_id)
+      headers_api = {'Authorization': 'Bearer ' + credentials.token}
+      params = {'query': query}
+      query_response = requests.get(url=url, headers=headers_api, params=params)
+      # Handle the response; parse the body as JSON only on HTTP success
+      if query_response.ok:
+        response = query_response.json()
+        if response["status"] == "success" and response["data"]["result"]:
+          result_value = response["data"]["result"][0]["value"][1]
+          metric_results[query_name] = result_value
+          print("%s: %s" % (query_name, result_value))
+        else:
+          print("Cloud Monitoring PromQL Error: %s" % (response.get("error", "empty result set")))
+      else:
+        print("HTTP Error: %s" % (query_response.text))
+    all_metric_results[metric] = metric_results
+  return all_metric_results
+
 def main(args: argparse.Namespace):
   print(args)
@@ -420,6 +478,10 @@ def main(args: argparse.Namespace):
   )
   benchmark_result['avg_output_len'] = avg_output_len
 
+  if args.scrape_server_metrics:
+    server_metrics = print_metrics(metrics_to_scrape(args.backend), benchmark_time, args.backend)
+    benchmark_result['server_metrics'] = server_metrics
+
   if args.save_json_results:
     save_json_results(args, benchmark_result)
@@ -545,5 +607,10 @@
           " the form of a string."
       ),
   )
+  parser.add_argument(
+      "--scrape-server-metrics",
+      action="store_true",
+      help="Whether to scrape server metrics.",
+  )
   cmd_args = parser.parse_args()
   main(cmd_args)
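Each statistic above is produced by one PromQL range query against the Cloud Monitoring API. For debugging, a single query can be reproduced from a workstation roughly as follows — a sketch only: the metric, the 120-second window, and the project are illustrative, and `gcloud` must be authenticated as an account with `roles/monitoring.viewer`:

```
# Mean vLLM GPU cache usage over a 120-second window, as print_metrics would query it
curl --get \
  -H "Authorization: Bearer $(gcloud auth print-access-token)" \
  --data-urlencode "query=avg_over_time(vllm:gpu_cache_usage_perc{job='vllm-podmonitoring'}[120s])" \
  "https://monitoring.googleapis.com/v1/projects/$PROJECT_ID/location/global/prometheus/api/v1/query"
```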
diff --git a/benchmarks/benchmark/tools/profile-generator/container/latency_throughput_curve.sh b/benchmarks/benchmark/tools/profile-generator/container/latency_throughput_curve.sh
index 033391830..1437e4814 100644
--- a/benchmarks/benchmark/tools/profile-generator/container/latency_throughput_curve.sh
+++ b/benchmarks/benchmark/tools/profile-generator/container/latency_throughput_curve.sh
@@ -19,11 +19,17 @@ export IP=$IP
 
 huggingface-cli login --token "$HF_TOKEN" --add-to-git-credential
 
+PYTHON="python3"
 for request_rate in $(echo $REQUEST_RATES | tr ',' ' '); do
   # TODO: Check if profile already exists, if so then skip
   timestamp=$(date +"%Y-%m-%d_%H-%M-%S")
   output_file="latency-profile-${timestamp}.txt"
-  python3 benchmark_serving.py --host="$IP" --port="$PORT" --model="$TOKENIZER" --dataset=ShareGPT_V3_unfiltered_cleaned_split.json --tokenizer="$TOKENIZER" --request-rate=$request_rate --backend="$BACKEND" --num-prompts=$((request_rate * 30)) --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH > $output_file
+  # Rebuild the option list on every iteration so flags don't accumulate across runs
+  PYTHON_OPTS="benchmark_serving.py --host=$IP --port=$PORT --model=$TOKENIZER --dataset=ShareGPT_V3_unfiltered_cleaned_split.json --tokenizer=$TOKENIZER --request-rate=$request_rate --backend=$BACKEND --num-prompts=$((request_rate * 30)) --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH"
+  if [[ "$SCRAPE_SERVER_METRICS" = "true" ]]; then
+    PYTHON_OPTS="$PYTHON_OPTS --scrape-server-metrics"
+  fi
+  $PYTHON $PYTHON_OPTS > $output_file
   cat $output_file
   sleep 5 # wait 5 seconds before next run
 done
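For a single pass of the loop with `REQUEST_RATES=5` and scraping enabled, the composed command line expands to something like the following (the host, port, model, and length values are illustrative; the flags come from the script above):

```
python3 benchmark_serving.py --host=10.0.0.1 --port=7080 --model=meta-llama/Llama-2-7b-hf \
  --dataset=ShareGPT_V3_unfiltered_cleaned_split.json --tokenizer=meta-llama/Llama-2-7b-hf \
  --request-rate=5 --backend=vllm --num-prompts=150 \
  --max-input-length=256 --max-output-length=256 --scrape-server-metrics
```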
diff --git a/benchmarks/benchmark/tools/profile-generator/container/requirements.txt b/benchmarks/benchmark/tools/profile-generator/container/requirements.txt
index b477c334b..e9fbf52c2 100644
--- a/benchmarks/benchmark/tools/profile-generator/container/requirements.txt
+++ b/benchmarks/benchmark/tools/profile-generator/container/requirements.txt
@@ -34,4 +34,6 @@ pydantic >= 2.0  # Required for OpenAI server.
 aioprometheus[starlette]
 pynvml == 11.5.0
 accelerate
-aiohttp
\ No newline at end of file
+aiohttp
+google-auth
+requests
\ No newline at end of file
diff --git a/benchmarks/benchmark/tools/profile-generator/main.tf b/benchmarks/benchmark/tools/profile-generator/main.tf
index 0fb5c3b2b..d82e31daa 100644
--- a/benchmarks/benchmark/tools/profile-generator/main.tf
+++ b/benchmarks/benchmark/tools/profile-generator/main.tf
@@ -77,4 +77,5 @@ module "latency-profile" {
   k8s_hf_secret               = var.k8s_hf_secret
   hugging_face_secret         = var.hugging_face_secret
   hugging_face_secret_version = var.hugging_face_secret_version
+  scrape_server_metrics       = var.scrape_server_metrics
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf
index 694e8c324..74d36a59d 100644
--- a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf
+++ b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf
@@ -61,5 +61,6 @@ resource "kubernetes_manifest" "latency-profile-generator" {
     hugging_face_token_secret_list = local.hugging_face_token_secret == null ? [] : [local.hugging_face_token_secret]
     k8s_hf_secret_list             = var.k8s_hf_secret == null ? [] : [var.k8s_hf_secret]
     output_bucket                  = var.output_bucket
+    scrape_server_metrics          = var.scrape_server_metrics
   }))
 }
\ No newline at end of file
diff --git a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl
index ba75c3ed1..0fd763a19 100644
--- a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl
+++ b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl
@@ -34,6 +34,8 @@ spec:
           value: ${request_rates}
         - name: OUTPUT_BUCKET
           value: ${output_bucket}
+        - name: SCRAPE_SERVER_METRICS
+          value: "${scrape_server_metrics}"
 %{ for hugging_face_token_secret in hugging_face_token_secret_list ~}
         - name: HF_TOKEN
           valueFrom:
diff --git a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/variables.tf b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/variables.tf
index a5dec1259..73bc93c6c 100644
--- a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/variables.tf
+++ b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/variables.tf
@@ -153,3 +153,9 @@ variable "hugging_face_secret_version" {
   nullable    = true
   default     = null
 }
+
+variable "scrape_server_metrics" {
+  description = "Whether to scrape server metrics."
+  type        = bool
+  default     = false
+}
\ No newline at end of file
diff --git a/benchmarks/benchmark/tools/profile-generator/variables.tf b/benchmarks/benchmark/tools/profile-generator/variables.tf
index 26dd77d85..999c6cd95 100644
--- a/benchmarks/benchmark/tools/profile-generator/variables.tf
+++ b/benchmarks/benchmark/tools/profile-generator/variables.tf
@@ -144,4 +144,10 @@ variable "targets" {
     tokenizer = string
   })
   })
+}
+
+variable "scrape_server_metrics" {
+  description = "Whether to scrape server metrics."
+  type        = bool
+  default     = false
 }
\ No newline at end of file
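Taken together, the feature stays off by default: the top-level `scrape_server_metrics` variable flows through the module and manifest template into the container's `SCRAPE_SERVER_METRICS` environment variable. It can be switched on by adding `scrape_server_metrics = true` to `terraform.tfvars` (Step 4), or for a one-off run without editing the file:

```
# Illustrative: enable server-metric scraping for a single apply
terraform apply -var="scrape_server_metrics=true"
```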