From d730ea36d609490ab166d2f7f644b3ce69f84873 Mon Sep 17 00:00:00 2001
From: Anna Pendleton
Date: Thu, 12 Sep 2024 00:54:31 +0000
Subject: [PATCH] Add flag and add metrics to json output

---
 .../tools/profile-generator/README.md         | 32 ++++++++-----
 .../container/benchmark_serving.py            | 45 +++++++++++--------
 .../container/latency_throughput_curve.sh     |  2 +-
 .../benchmark/tools/profile-generator/main.tf |  1 +
 .../modules/latency-profile/main.tf           |  1 +
 .../latency-profile-generator.yaml.tpl        |  2 +
 .../modules/latency-profile/variables.tf      |  6 +++
 .../tools/profile-generator/variables.tf      |  6 +++
 8 files changed, 64 insertions(+), 31 deletions(-)

diff --git a/benchmarks/benchmark/tools/profile-generator/README.md b/benchmarks/benchmark/tools/profile-generator/README.md
index 6d136bab9..e8375e7ed 100644
--- a/benchmarks/benchmark/tools/profile-generator/README.md
+++ b/benchmarks/benchmark/tools/profile-generator/README.md
@@ -1,17 +1,18 @@
 # AI on GKE Benchmark Latency Profile Generator
 
-* [AI on GKE Benchmark Latency Profile Generator](#ai-on-gke-benchmark-latency-profile-generator)
-  * [Overview](#overview)
-  * [Instructions](#instructions)
-    * [Step 1: create output bucket](#step-1--create-output-bucket)
-    * [Step 2: create and give service account access to write to output gcs bucket](#step-2--create-and-give-service-account-access-to-write-to-output-gcs-bucket)
-    * [Step 3: create artifact repository for automated Latency Profile Generator docker build](#step-3--create-artifact-repository-for-automated-latency-profile-generator-docker-build)
-    * [Step 4: create and configure terraform.tfvars](#step-4--create-and-configure-terraformtfvars)
-      * [[optional] set-up credentials config with kubeconfig](#optional-set-up-credentials-config-with-kubeconfig)
-      * [[optional] set up secret token in Secret Manager](#optional-set-up-secret-token-in-secret-manager)
-    * [Step 6: terraform initialize, plan and apply](#step-6--terraform-initialize-plan-and-apply)
-    * [Inputs](#inputs)
+- [AI on GKE Benchmark Latency Profile Generator](#ai-on-gke-benchmark-latency-profile-generator)
+  - [Overview](#overview)
+  - [Instructions](#instructions)
+    - [Step 1: create output bucket](#step-1-create-output-bucket)
+    - [Step 2: create and give service account access to write to output gcs bucket](#step-2-create-and-give-service-account-access-to-write-to-output-gcs-bucket)
+      - [\[optional\] give service account access to read Cloud Monitoring metrics](#optional-give-service-account-access-to-read-cloud-monitoring-metrics)
+    - [Step 3: create artifact repository for automated Latency Profile Generator docker build](#step-3-create-artifact-repository-for-automated-latency-profile-generator-docker-build)
+    - [Step 4: create and configure terraform.tfvars](#step-4-create-and-configure-terraformtfvars)
+      - [\[optional\] set-up credentials config with kubeconfig](#optional-set-up-credentials-config-with-kubeconfig)
+      - [\[optional\] set up secret token in Secret Manager](#optional-set-up-secret-token-in-secret-manager)
+    - [Step 5: login to gcloud](#step-5-login-to-gcloud)
+    - [Step 6: terraform initialize, plan and apply](#step-6-terraform-initialize-plan-and-apply)
 
 ## Overview
 
@@ -62,6 +63,15 @@ Your kubernetes service account will inherit the reader permissions.
 You will set the `latency_profile_kubernetes_service_account` in your
 `terraform.tfvars` to the kubernetes service account name.
 
+#### [optional] give service account access to read Cloud Monitoring metrics
+
+If `scrape_server_metrics` is set to `true`, you will need to give the service account access to read
+the Cloud Monitoring metrics. You can do so with the following command:
+
+```
+gcloud projects add-iam-policy-binding $PROJECT_ID --member=serviceAccount:$GOOGLE_SERVICE_ACCOUNT@$PROJECT_ID.iam.gserviceaccount.com --role=roles/monitoring.viewer
+```
+
 ### Step 3: create artifact repository for automated Latency Profile Generator docker build
 
 The latency profile generator rebuilds the docker file on each terraform apply
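Between the IAM binding above and the scraping code below, the moving part is Application Default Credentials: `benchmark_serving.py` resolves credentials with `google.auth.default()` and expects the pod's service account to carry `roles/monitoring.viewer`. A minimal sketch (not part of the patch) for sanity-checking that ADC resolves before running a benchmark:

```
import google.auth
import google.auth.transport.requests

# Resolve Application Default Credentials; inside the pod this should pick up
# the workload-identity-linked service account bound via the gcloud command above.
credentials, project_id = google.auth.default()

# Force a token refresh to prove the credentials can actually mint access tokens.
credentials.refresh(google.auth.transport.requests.Request())

print("project: %s" % project_id)
print("token acquired: %s" % (credentials.token is not None))
```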
diff --git a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
index 96cd9df00..0498f7f4f 100644
--- a/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
+++ b/benchmarks/benchmark/tools/profile-generator/container/benchmark_serving.py
@@ -314,7 +314,7 @@ def metrics_to_scrape(backend: str) -> List[str]:
   else:
     return []
 
-def print_metrics(metrics: List[str], duration: str, backend: str) -> None:
+def print_metrics(metrics: List[str], duration: float, backend: str):
   # Creates a credentials object from the default service account file
   # Assumes that script has appropriate default credentials set up, ref:
   # https://googleapis.dev/python/google-auth/latest/user-guide.html#application-default-credentials
@@ -322,18 +322,20 @@ def print_metrics(metrics: List[str], duration: str, backend: str) -> None:
   # Prepare an authentication request - helps format the request auth token
   auth_req = google.auth.transport.requests.Request()
 
+  all_metric_results = {}
+
   for metric in metrics:
     print("Metric Name: %s" % (metric))
-
+    metric_results = {}
     # Queries scrape all metrics collected from the last $DURATION seconds from the backend's related
     # podmonitoring spec assumed to be named "$BACKEND-podmonitoring"
     queries = {
-      "Mean": "avg_over_time(%s{job='%s-podmonitoring'}[%ss])" % (metric, backend, duration),
-      "Median": "quantile_over_time(0.5, %s{job='%s-podmonitoring'}[%ss])" % (metric, backend, duration),
-      "Min": "min_over_time(%s{job='%s-podmonitoring'}[%ss])" % (metric, backend, duration),
-      "Max": "max_over_time(%s{job='%s-podmonitoring'}[%ss])" % (metric, backend, duration),
-      "P90": "quantile_over_time(0.9, %s{job='%s-podmonitoring'}[%ss])" % (metric, backend, duration),
-      "P99": "quantile_over_time(0.99, %s{job='%s-podmonitoring'}[%ss])" % (metric, backend, duration),
+      "Mean": "avg_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
+      "Median": "quantile_over_time(0.5, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
+      "Min": "min_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
+      "Max": "max_over_time(%s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
+      "P90": "quantile_over_time(0.9, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
+      "P99": "quantile_over_time(0.99, %s{job='%s-podmonitoring'}[%.0fs])" % (metric, backend, duration),
     }
     for query_name, query in queries.items():
       # Request refresh tokens
@@ -343,16 +345,20 @@ def print_metrics(metrics: List[str], duration: str, backend: str) -> None:
       url='https://monitoring.googleapis.com/v1/projects/%s/location/global/prometheus/api/v1/query' % (project_id)
       headers_api = {'Authorization': 'Bearer ' + credentials.token}
       params = {'query': query}
-      response = requests.get(url=url, headers=headers_api, params=params)
+      request_post = requests.get(url=url, headers=headers_api, params=params)
 
       # handle response
-      if response.ok:
+      if request_post.ok:
+        response = request_post.json()
         if response["status"] == "success":
+          metric_results[query_name] = response["data"]["result"][0]["value"][1]
           print("%s: %s" % (query_name, response["data"]["result"][0]["value"][1]))
         else:
           print("Cloud Monitoring PromQL Error: %s" % (response["error"]))
       else:
-        print("HTTP Error: %s" % (response.text))
+        print("HTTP Error: %s" % (request_post.text))
+    all_metric_results[metric] = metric_results
+  return all_metric_results
 
 
 def main(args: argparse.Namespace):
@@ -472,14 +478,9 @@ def main(args: argparse.Namespace):
   )
   benchmark_result['avg_output_len'] = avg_output_len
 
-  '''
-  TODO: Add flag for enabling model server scraping
-  Scrape and print model server metrics
-  1. map model server to metrics list
-  2. loop through metrics list, call the same promql queries on each metric, print out the data received
-  '''
-  metrics = metrics_to_scrape(args.backend)
-  print_metrics(metrics, benchmark_time, args.backend)
+  if args.scrape_server_metrics:
+    server_metrics = print_metrics(metrics_to_scrape(args.backend), benchmark_time, args.backend)
+    benchmark_result['server_metrics'] = server_metrics
 
   if args.save_json_results:
     save_json_results(args, benchmark_result)
@@ -606,5 +607,11 @@ def main(args: argparse.Namespace):
           " the form of a string."
       ),
   )
+  parser.add_argument(
+      "--scrape-server-metrics",
+      type=lambda v: str(v).lower() in ("true", "1", "t", "yes"),
+      default=False,
+      help="Whether to scrape server metrics. Accepts true/false strings.",
+  )
   cmd_args = parser.parse_args()
   main(cmd_args)
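One subtlety in the new `--scrape-server-metrics` flag: argparse's `type=bool` does not parse booleans -- any non-empty string, including `"False"`, is truthy -- which is why the argument is defined with an explicit string-to-boolean parser. A small sketch of the pitfall and the fix (the `str_to_bool` name is illustrative, not from the patch):

```
import argparse

def str_to_bool(v: str) -> bool:
    # Common truthy spellings parse to True; everything else (incl. "false") to False.
    return str(v).lower() in ("true", "1", "t", "yes")

parser = argparse.ArgumentParser()
parser.add_argument("--scrape-server-metrics", type=str_to_bool, default=False)

print(bool("False"))  # True -- the type=bool pitfall: non-empty strings are truthy
print(parser.parse_args(["--scrape-server-metrics=false"]).scrape_server_metrics)  # False
print(parser.parse_args(["--scrape-server-metrics=true"]).scrape_server_metrics)   # True
```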
diff --git a/benchmarks/benchmark/tools/profile-generator/container/latency_throughput_curve.sh b/benchmarks/benchmark/tools/profile-generator/container/latency_throughput_curve.sh
index 033391830..fedf69c27 100644
--- a/benchmarks/benchmark/tools/profile-generator/container/latency_throughput_curve.sh
+++ b/benchmarks/benchmark/tools/profile-generator/container/latency_throughput_curve.sh
@@ -23,7 +23,7 @@ for request_rate in $(echo $REQUEST_RATES | tr ',' ' '); do
   # TODO: Check if profile already exists, if so then skip
   timestamp=$(date +"%Y-%m-%d_%H-%M-%S")
   output_file="latency-profile-${timestamp}.txt"
-  python3 benchmark_serving.py --host="$IP" --port="$PORT" --model="$TOKENIZER" --dataset=ShareGPT_V3_unfiltered_cleaned_split.json --tokenizer="$TOKENIZER" --request-rate=$request_rate --backend="$BACKEND" --num-prompts=$((request_rate * 30)) --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH > $output_file
+  python3 benchmark_serving.py --host="$IP" --port="$PORT" --model="$TOKENIZER" --dataset=ShareGPT_V3_unfiltered_cleaned_split.json --tokenizer="$TOKENIZER" --request-rate=$request_rate --backend="$BACKEND" --num-prompts=$((request_rate * 30)) --max-input-length=$INPUT_LENGTH --max-output-length=$OUTPUT_LENGTH --scrape-server-metrics=$SCRAPE_SERVER_METRICS > $output_file
   cat $output_file
   sleep 5 # wait 5 seconds before next run
 done

diff --git a/benchmarks/benchmark/tools/profile-generator/main.tf b/benchmarks/benchmark/tools/profile-generator/main.tf
index 0fb5c3b2b..d82e31daa 100644
--- a/benchmarks/benchmark/tools/profile-generator/main.tf
+++ b/benchmarks/benchmark/tools/profile-generator/main.tf
@@ -77,4 +77,5 @@ module "latency-profile" {
   k8s_hf_secret               = var.k8s_hf_secret
   hugging_face_secret         = var.hugging_face_secret
   hugging_face_secret_version = var.hugging_face_secret_version
+  scrape_server_metrics       = var.scrape_server_metrics
 }
\ No newline at end of file
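With the flag enabled, the scraped statistics are attached to the saved JSON results under a `server_metrics` key, keyed by metric name and then by statistic (Mean, Median, Min, Max, P90, P99), each a string value returned by PromQL. A hedged sketch of how a consumer might read that output (the file name is illustrative; `save_json_results` chooses the real path):

```
import json

# Illustrative path -- save_json_results() determines the actual file name.
with open("benchmark-result.json") as f:
    result = json.load(f)

# server_metrics maps each scraped metric to its summary statistics.
for metric, stats in result.get("server_metrics", {}).items():
    print(metric)
    for stat_name, value in stats.items():
        print("  %s: %s" % (stat_name, value))
```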
diff --git a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf
index 694e8c324..74d36a59d 100644
--- a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf
+++ b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/main.tf
@@ -61,5 +61,6 @@ resource "kubernetes_manifest" "latency-profile-generator" {
     hugging_face_token_secret_list = local.hugging_face_token_secret == null ? [] : [local.hugging_face_token_secret]
     k8s_hf_secret_list             = var.k8s_hf_secret == null ? [] : [var.k8s_hf_secret]
     output_bucket                  = var.output_bucket
+    scrape_server_metrics          = var.scrape_server_metrics
   }))
 }
\ No newline at end of file

diff --git a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl
index ba75c3ed1..0fd763a19 100644
--- a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl
+++ b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl
@@ -34,6 +34,8 @@ spec:
           value: ${request_rates}
         - name: OUTPUT_BUCKET
           value: ${output_bucket}
+        - name: SCRAPE_SERVER_METRICS
+          value: "${scrape_server_metrics}"
 %{ for hugging_face_token_secret in hugging_face_token_secret_list ~}
         - name: HF_TOKEN
           valueFrom:

diff --git a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/variables.tf b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/variables.tf
index a5dec1259..73bc93c6c 100644
--- a/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/variables.tf
+++ b/benchmarks/benchmark/tools/profile-generator/modules/latency-profile/variables.tf
@@ -153,3 +153,9 @@ variable "hugging_face_secret_version" {
   nullable    = true
   default     = null
 }
+
+variable "scrape_server_metrics" {
+  description = "Whether to scrape server metrics."
+  type        = bool
+  default     = false
+}
\ No newline at end of file

diff --git a/benchmarks/benchmark/tools/profile-generator/variables.tf b/benchmarks/benchmark/tools/profile-generator/variables.tf
index 26dd77d85..999c6cd95 100644
--- a/benchmarks/benchmark/tools/profile-generator/variables.tf
+++ b/benchmarks/benchmark/tools/profile-generator/variables.tf
@@ -144,4 +144,10 @@ variable "targets" {
     tokenizer = string
   })
 })
+}
+
+variable "scrape_server_metrics" {
+  description = "Whether to scrape server metrics."
+  type        = bool
+  default     = false
 }
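Taken together, the Terraform diffs above plumb the new boolean end to end: `terraform.tfvars` -> module variable -> `SCRAPE_SERVER_METRICS` env var (quoted in the template so the manifest env value is a string) -> shell flag -> argparse. A rough, illustrative sketch of the final handoff as the container sees it (other required benchmark flags omitted for brevity):

```
import os

# The template renders the terraform bool as the string "true" or "false";
# latency_throughput_curve.sh forwards it verbatim to the Python flag.
scrape = os.environ.get("SCRAPE_SERVER_METRICS", "false")
cmd = "python3 benchmark_serving.py --scrape-server-metrics=%s" % scrape
print(cmd)  # other required flags (--host, --port, ...) omitted
```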