From d730ea36d609490ab166d2f7f644b3ce69f84873 Mon Sep 17 00:00:00 2001
From: Anna Pendleton
Date: Thu, 12 Sep 2024 00:54:31 +0000
Subject: [PATCH] Add flag and add metrics to json output

---
 .../tools/profile-generator/README.md         | 32 ++++++++-----
 .../container/benchmark_serving.py            | 45 +++++++++++--------
 .../container/latency_throughput_curve.sh     |  2 +-
 .../benchmark/tools/profile-generator/main.tf |  1 +
 .../modules/latency-profile/main.tf           |  1 +
 .../latency-profile-generator.yaml.tpl        |  2 +
 .../modules/latency-profile/variables.tf      |  6 +++
 .../tools/profile-generator/variables.tf      |  6 +++
 8 files changed, 64 insertions(+), 31 deletions(-)

diff --git a/benchmarks/benchmark/tools/profile-generator/README.md b/benchmarks/benchmark/tools/profile-generator/README.md
index 6d136bab9..e8375e7ed 100644
--- a/benchmarks/benchmark/tools/profile-generator/README.md
+++ b/benchmarks/benchmark/tools/profile-generator/README.md
@@ -1,17 +1,18 @@
 # AI on GKE Benchmark Latency Profile Generator
 
-* [AI on GKE Benchmark Latency Profile Generator](#ai-on-gke-benchmark-latency-profile-generator)
-  * [Overview](#overview)
-  * [Instructions](#instructions)
-    * [Step 1: create output bucket](#step-1--create-output-bucket)
-    * [Step 2: create and give service account access to write to output gcs bucket](#step-2--create-and-give-service-account-access-to-write-to-output-gcs-bucket)
-    * [Step 3: create artifact repository for automated Latency Profile Generator docker build](#step-3--create-artifact-repository-for-automated-latency-profile-generator-docker-build)
-    * [Step 4: create and configure terraform.tfvars](#step-4--create-and-configure-terraformtfvars)
-      * [[optional] set-up credentials config with kubeconfig](#optional-set-up-credentials-config-with-kubeconfig)
-      * [[optional] set up secret token in Secret Manager](#optional-set-up-secret-token-in-secret-manager)
-    * [Step 6: terraform initialize, plan and apply](#step-6--terraform-initialize-plan-and-apply)
-    * [Inputs](#inputs)
+- [AI on GKE Benchmark Latency Profile Generator](#ai-on-gke-benchmark-latency-profile-generator)
+  - [Overview](#overview)
+  - [Instructions](#instructions)
+    - [Step 1: create output bucket](#step-1-create-output-bucket)
+    - [Step 2: create and give service account access to write to output gcs bucket](#step-2-create-and-give-service-account-access-to-write-to-output-gcs-bucket)
+      - [\[optional\] give service account access to read Cloud Monitoring metrics](#optional-give-service-account-access-to-read-cloud-monitoring-metrics)
+    - [Step 3: create artifact repository for automated Latency Profile Generator docker build](#step-3-create-artifact-repository-for-automated-latency-profile-generator-docker-build)
+    - [Step 4: create and configure terraform.tfvars](#step-4-create-and-configure-terraformtfvars)
+      - [\[optional\] set-up credentials config with kubeconfig](#optional-set-up-credentials-config-with-kubeconfig)
+      - [\[optional\] set up secret token in Secret Manager](#optional-set-up-secret-token-in-secret-manager)
+    - [Step 5: login to gcloud](#step-5-login-to-gcloud)
+    - [Step 6: terraform initialize, plan and apply](#step-6-terraform-initialize-plan-and-apply)
 
 ## Overview
 
@@ -62,6 +63,15 @@ Your kubernetes service account will inherit the reader permissions.
 You will set the `latency_profile_kubernetes_service_account` in your
 `terraform.tfvars` to the kubernetes service account name.
 
+#### [optional] give service account access to read Cloud Monitoring metrics
+
+If `scrape_server_metrics` is set to `true`, you will need to give the service account access to read
+the Cloud Monitoring metrics. You can do so with the following command:
+
+```
+gcloud projects add-iam-policy-binding $PROJECT_ID --member=serviceAccount:$GOOGLE_SERVICE_ACCOUNT@$PROJECT_ID.iam.gserviceaccount.com --role=roles/monitoring.viewer
+```
+
 ### Step 3: create artifact repository for automated Latency Profile Generator docker build
 
 The latency profile generator rebuilds the docker file on each terraform apply
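Between the IAM binding above and the scraping code below, the moving part is Application Default Credentials: `benchmark_serving.py` resolves credentials with `google.auth.default()` and expects the pod's service account to carry `roles/monitoring.viewer`. A minimal sketch (not part of the patch) for sanity-checking that ADC resolves before running a benchmark:

```
import google.auth
import google.auth.transport.requests

# Resolve Application Default Credentials; inside the pod this should pick up
# the workload-identity-linked service account bound via the gcloud command above.
credentials, project_id = google.auth.default()

# Force a token refresh to prove the credentials can actually mint access tokens.
credentials.refresh(google.auth.transport.requests.Request())

print("project: %s" % project_id)
print("token acquired: %s" % (credentials.token is not None))
```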
diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
index 96cd9df00..0498f7f4f 100644
--- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
+++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
@@ -314,7 +314,7 @@ def metrics_to_scrape(backend: str) -> List[str]:
   else:
     return []
 
-def print_metrics(metrics: List[str], duration: str, backend: str) -> None:
+def print_metrics(metrics: List[str], duration: float, backend: str):
   # Creates a credentials object from the default service account file
   # Assumes that script has appropriate default credentials set up, ref:
   # https://googleapis.dev/python/google-auth/latest/user-guide.html#application-default-credentials
@@ -322,18 +322,20 @@ def print_metrics(metrics: List[str], duration: str, backend: str) -> None:
   # Prepare an authentication request - helps format the request auth token
   auth_req = google.auth.transport.requests.Request()
 
+  all_metric_results = {}
+
   for metric in metrics:
     print("Metric Name: %s" % (metric))
-
+    metric_results = {}
     # Queries scrape all metrics collected from the last $DURATION seconds from the backend's related
     # podmonitoring spec assumed to be named "$BACKEND-podmonitoring"
     queries = {
-      "Mean": "avg_over_time(%s{job='%s-podmonitoring'}[%ss])" % (metric, backend, duration),
-      "Median": "quantile_over_time(0.5, %s{job='%s-podmonitoring'}[%ss])" % (metric, backend, duration),
-      "Min": "min_over_time(%s{job='%s-podmonitoring'}[%ss])" % (metric, backend, duration),
-      "Max": "max_over_time(%s{job='%s-podmonitoring'}[%ss])" % (metric, backend, duration),
-      "P90": "quantile_over_time(0.9, %s{job='%s-podmonitoring'}[%ss])" % (metric, backend, duration),
-      "P99": "quantile_over_time(0.99, %s{job='%s-podmonitoring'}[%ss])" % (metric, backend, duration),
+      "Mean": "avg_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
+      "Median": "quantile_over_time(0.5, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
+      "Min": "min_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
+      "Max": "max_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
+      "P90": "quantile_over_time(0.9, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
+      "P99": "quantile_over_time(0.99, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
     }
     for query_name, query in queries.items():
       # Request refresh tokens
@@ -343,16 +345,20 @@ def print_metrics(metrics: List[str], duration: str, backend: str) -> None:
       url='https://monitoring.googleapis.com/v1/projects/%s/location/global/prometheus/api/v1/query' % (project_id)
       headers_api = {'Authorization': 'Bearer ' + credentials.token}
       params = {'query': query}
-      response = requests.get(url=url, headers=headers_api, params=params)
+      request_post = requests.get(url=url, headers=headers_api, params=params)
 
       # handle response
-      if response.ok:
+      if request_post.ok:
+        response = request_post.json()
         if response["status"] == "success":
+          metric_results[query_name] = response["data"]["result"][0]["value"][1]
           print("%s: %s" % (query_name, response["data"]["result"][0]["value"][1]))
         else:
           print("Cloud Monitoring PromQL Error: %s" % (response["error"]))
       else:
-        print("HTTP Error: %s" % (response.text))
+        print("HTTP Error: %s" % (request_post.text))
+    all_metric_results[metric] = metric_results
+  return all_metric_results
 
 
 def main(args: argparse.Namespace):
@@ -472,14 +478,9 @@ def main(args: argparse.Namespace):
   )
   benchmark_result['avg_output_len'] = avg_output_len
 
-  '''
-  TODO: Add flag for enabling model server scraping
-  Scrape and print model server metrics
-  1. map model server to metrics list
-  2. loop through metrics list, call the same promql queries on each metric, print out the data received
-  '''
-  metrics = metrics_to_scrape(args.backend)
-  print_metrics(metrics, benchmark_time, args.backend)
+  if args.scrape_server_metrics:
+    server_metrics = print_metrics(metrics_to_scrape(args.backend), benchmark_time, args.backend)
+    benchmark_result['server_metrics'] = server_metrics
 
   if args.save_json_results:
     save_json_results(args, benchmark_result)
@@ -606,5 +607,11 @@ def main(args: argparse.Namespace):
           " the form of a string."
       ),
   )
+  parser.add_argument(
+      "--scrape-server-metrics",
+      type=lambda v: str(v).lower() in ("true", "1", "t", "yes"),
+      default=False,
+      help="Whether to scrape server metrics. Accepts true/false strings.",
+  )
   cmd_args = parser.parse_args()
   main(cmd_args)
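One subtlety in the new `--scrape-server-metrics` flag: argparse's `type=bool` does not parse booleans -- any non-empty string, including `"False"`, is truthy -- which is why the argument is defined with an explicit string-to-boolean parser. A small sketch of the pitfall and the fix (the `str_to_bool` name is illustrative, not from the patch):

```
import argparse

def str_to_bool(v: str) -> bool:
    # Common truthy spellings parse to True; everything else (incl. "false") to False.
    return str(v).lower() in ("true", "1", "t", "yes")

parser = argparse.ArgumentParser()
parser.add_argument("--scrape-server-metrics", type=str_to_bool, default=False)

print(bool("False"))  # True -- the type=bool pitfall: non-empty strings are truthy
print(parser.parse_args(["--scrape-server-metrics=false"]).scrape_server_metrics)  # False
print(parser.parse_args(["--scrape-server-metrics=true"]).scrape_server_metrics)   # True
```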
diff --git a/benchmarks/benchmark/tools/profile-generator/container/latency_throughput_curve.sh b/benchmarks/benchmark/tools/profile-generator/container/latency_throughput_curve.sh
index 033391830..fedf69c27 100644
--- a/benchmarks/benchmark/tools/profile-generator/container/latency_throughput_curve.sh
+++ b/benchmarks/benchmark/tools/profile-generator/container/latency_throughput_curve.sh
@@ -23,7 +23,7 @@ for request_rate in $(echo $REQUEST_RATES | tr ',' ' '); do
   # TODO: Check if profile already exists, if so then skip
   timestamp=$(date +"%Y-%m-%d_%H-%M-%S")
   output_file="latency-profile-${timestamp}.txt"
-  python3 benchmark_serving.py --host="$IP" --port="$PORT" --model="$TOKENIZER" --dataset=ShareGPT_V3_unfiltered_cleaned_split.json --tokenizer="$TOKENIZER" --request-rate=$request_rate --backend="$BACKEND" --num-prompts=$((request_rate * 30)) --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH > $output_file
+  python3 benchmark_serving.py --host="$IP" --port="$PORT" --model="$TOKENIZER" --dataset=ShareGPT_V3_unfiltered_cleaned_split.json --tokenizer="$TOKENIZER" --request-rate=$request_rate --backend="$BACKEND" --num-prompts=$((request_rate * 30)) --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH --scrape-server-metrics=$SCRAPE_SERVER_METRICS > $output_file
   cat $output_file
   sleep 5 # wait 5 seconds before next run
 done

diff --git a/benchmarks/benchmark/tools/profile-generator/main.tf b/benchmarks/benchmark/tools/profile-generator/main.tf
index 0fb5c3b2b..d82e31daa 100644
--- a/benchmarks/benchmark/tools/profile-generator/main.tf
+++ b/benchmarks/benchmark/tools/profile-generator/main.tf
@@ -77,4 +77,5 @@ module "latency-profile" {
   k8s_hf_secret               = var.k8s_hf_secret
   hugging_face_secret         = var.hugging_face_secret
   hugging_face_secret_version = var.hugging_face_secret_version
+  scrape_server_metrics       = var.scrape_server_metrics
 }
\ No newline at end of file
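With the flag enabled, the scraped statistics are attached to the saved JSON results under a `server_metrics` key, keyed by metric name and then by statistic (Mean, Median, Min, Max, P90, P99), each a string value returned by PromQL. A hedged sketch of how a consumer might read that output (the file name is illustrative; `save_json_results` chooses the real path):

```
import json

# Illustrative path -- save_json_results() determines the actual file name.
with open("benchmark-result.json") as f:
    result = json.load(f)

# server_metrics maps each scraped metric to its summary statistics.
for metric, stats in result.get("server_metrics", {}).items():
    print(metric)
    for stat_name, value in stats.items():
        print("  %s: %s" % (stat_name, value))
```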
diff --git a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf
index 694e8c324..74d36a59d 100644
--- a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf
+++ b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf
@@ -61,5 +61,6 @@ resource "kubernetes_manifest" "latency-profile-generator" {
     hugging_face_token_secret_list = local.hugging_face_token_secret == null ? [] : [local.hugging_face_token_secret]
     k8s_hf_secret_list             = var.k8s_hf_secret == null ? [] : [var.k8s_hf_secret]
     output_bucket                  = var.output_bucket
+    scrape_server_metrics          = var.scrape_server_metrics
   }))
 }
\ No newline at end of file

diff --git a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl
index ba75c3ed1..0fd763a19 100644
--- a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl
+++ b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl
@@ -34,6 +34,8 @@ spec:
           value: ${request_rates}
         - name: OUTPUT_BUCKET
           value: ${output_bucket}
+        - name: SCRAPE_SERVER_METRICS
+          value: "${scrape_server_metrics}"
 %{ for hugging_face_token_secret in hugging_face_token_secret_list ~}
         - name: HF_TOKEN
           valueFrom:

diff --git a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/variables.tf b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/variables.tf
index a5dec1259..73bc93c6c 100644
--- a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/variables.tf
+++ b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/variables.tf
@@ -153,3 +153,9 @@ variable "hugging_face_secret_version" {
   nullable    = true
   default     = null
 }
+
+variable "scrape_server_metrics" {
+  description = "Whether to scrape server metrics."
+  type        = bool
+  default     = false
+}
\ No newline at end of file

diff --git a/benchmarks/benchmark/tools/profile-generator/variables.tf b/benchmarks/benchmark/tools/profile-generator/variables.tf
index 26dd77d85..999c6cd95 100644
--- a/benchmarks/benchmark/tools/profile-generator/variables.tf
+++ b/benchmarks/benchmark/tools/profile-generator/variables.tf
@@ -144,4 +144,10 @@ variable "targets" {
     tokenizer = string
   })
 })
+}
+
+variable "scrape_server_metrics" {
+  description = "Whether to scrape server metrics."
+  type        = bool
+  default     = false
 }
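Taken together, the Terraform diffs above plumb the new boolean end to end: `terraform.tfvars` -> module variable -> `SCRAPE_SERVER_METRICS` env var (quoted in the template so the manifest env value is a string) -> shell flag -> argparse. A rough, illustrative sketch of the final handoff as the container sees it (other required benchmark flags omitted for brevity):

```
import os

# The template renders the terraform bool as the string "true" or "false";
# latency_throughput_curve.sh forwards it verbatim to the Python flag.
scrape = os.environ.get("SCRAPE_SERVER_METRICS", "false")
cmd = "python3 benchmark_serving.py --scrape-server-metrics=%s" % scrape
print(cmd)  # other required flags (--host, --port, ...) omitted
```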