Add server metrics promql scraping #804

Merged · 3 commits · Sep 12, 2024

Changes from 1 commit
@@ -10,9 +10,13 @@
from datetime import datetime
import json
import random
import requests
import time
from typing import AsyncGenerator, List, Tuple

import google.auth
import google.auth.transport.requests

import aiohttp
import numpy as np
from transformers import AutoTokenizer
@@ -302,6 +306,54 @@ def save_json_results(args: argparse.Namespace, benchmark_result):
  with open(file_name, "w", encoding="utf-8") as outfile:
    json.dump(final_json, outfile)

def metrics_to_scrape(backend: str) -> List[str]:
  if backend == "vllm":
    return ["vllm:gpu_cache_usage_perc", "vllm:num_requests_waiting"]
  elif backend == "jetstream":
    return ["jetstream_slots_used_percentage", "jetstream_prefill_backlog_size"]
  else:
    return []
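These metric names get substituted into the aggregation templates in `print_metrics` below. As a quick illustration (the 120-second window is a placeholder standing in for the measured benchmark duration), the first vLLM metric produces a query string like this:

```python
# Illustration only: the window value is a placeholder; in the script it comes
# from the measured benchmark duration passed to print_metrics().
metric, backend, duration = "vllm:gpu_cache_usage_perc", "vllm", "120"
query = "avg_over_time(%s{job='%s-podmonitoring'}[%ss])" % (metric, backend, duration)
print(query)  # avg_over_time(vllm:gpu_cache_usage_perc{job='vllm-podmonitoring'}[120s])
```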

def print_metrics(metrics: List[str], duration: str, backend: str) -> None:
  # Create a credentials object from the default service account file.
  # Assumes the script has Application Default Credentials set up, ref:
  # https://googleapis.dev/python/google-auth/latest/user-guide.html#application-default-credentials
  credentials, project_id = google.auth.default()
  # Prepare an authentication request - helps format the request auth token
  auth_req = google.auth.transport.requests.Request()

  for metric in metrics:
    print("Metric Name: %s" % (metric))

    # Each query aggregates the metric over the last $DURATION seconds, as scraped via the
    # backend's PodMonitoring spec, which is assumed to be named "$BACKEND-podmonitoring".
    queries = {
      "Mean": "avg_over_time(%s{job='%s-podmonitoring'}[%ss])" % (metric, backend, duration),
      "Median": "quantile_over_time(0.5, %s{job='%s-podmonitoring'}[%ss])" % (metric, backend, duration),
      "Min": "min_over_time(%s{job='%s-podmonitoring'}[%ss])" % (metric, backend, duration),
      "Max": "max_over_time(%s{job='%s-podmonitoring'}[%ss])" % (metric, backend, duration),
      "P90": "quantile_over_time(0.9, %s{job='%s-podmonitoring'}[%ss])" % (metric, backend, duration),
      "P99": "quantile_over_time(0.99, %s{job='%s-podmonitoring'}[%ss])" % (metric, backend, duration),
    }
    for query_name, query in queries.items():
      # Refresh the access token before each request
      credentials.refresh(auth_req)

      # Build the Cloud Monitoring PromQL query request
      url = 'https://monitoring.googleapis.com/v1/projects/%s/location/global/prometheus/api/v1/query' % (project_id)
      headers_api = {'Authorization': 'Bearer ' + credentials.token}
      params = {'query': query}
      response = requests.get(url=url, headers=headers_api, params=params)

      # Handle the response; the endpoint returns a Prometheus HTTP API JSON body
      if response.ok:
        body = response.json()
        if body["status"] == "success":
          print("%s: %s" % (query_name, body["data"]["result"][0]["value"][1]))
        else:
          print("Cloud Monitoring PromQL Error: %s" % (body["error"]))
      else:
        print("HTTP Error: %s" % (response.text))


def main(args: argparse.Namespace):
  print(args)
@@ -420,6 +472,15 @@ def main(args: argparse.Namespace):
  )
  benchmark_result['avg_output_len'] = avg_output_len

  '''
  TODO: Add a flag for enabling model server scraping.
  Scrape and print model server metrics:
    1. Map the model server backend to its list of metrics.
    2. Loop through the metrics list, run the same PromQL queries on each metric, and print the returned data.
  '''
  metrics = metrics_to_scrape(args.backend)
  print_metrics(metrics, benchmark_time, args.backend)

  if args.save_json_results:
    save_json_results(args, benchmark_result)
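On the TODO above about gating the scraping behind a flag: a minimal sketch of what that could look like, assuming a hypothetical `--scrape-server-metrics` argparse flag (the flag name is an assumption, not part of this change):

```python
import argparse

# Hypothetical flag; the name --scrape-server-metrics is an assumption, not part of this PR.
parser = argparse.ArgumentParser()
parser.add_argument("--backend", type=str, default="vllm")
parser.add_argument(
    "--scrape-server-metrics",
    action="store_true",
    help="Scrape and print model server metrics via Cloud Monitoring PromQL.",
)
args = parser.parse_args()

if args.scrape_server_metrics:
    # In main() the guarded block would be the two calls added in this diff:
    #   metrics = metrics_to_scrape(args.backend)
    #   print_metrics(metrics, benchmark_time, args.backend)
    print("Server metric scraping enabled for backend: %s" % args.backend)
```

Either way, the scraping path needs Application Default Credentials available to `google.auth.default()`, for example via `gcloud auth application-default login` locally or Workload Identity on a GKE cluster.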

@@ -34,4 +34,5 @@ pydantic >= 2.0 # Required for OpenAI server.
aioprometheus[starlette]
pynvml == 11.5.0
accelerate
aiohttp
google-auth