Skip to content

Commit

Permalink
WIP: replace base64 Hugging Face token with Secret Manager secret/version variables, drop google_service_account and pipeline_config from latency-profile, and add profile-generator wrapper module
Browse files Browse the repository at this point in the history
  • Loading branch information
Bslabe123 committed Aug 16, 2024
1 parent 44143e7 commit 91fe581
Show file tree
Hide file tree
Showing 9 changed files with 387 additions and 80 deletions.
8 changes: 6 additions & 2 deletions benchmarks/benchmark/tools/latency-profile/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,11 @@ locals {
? "${path.module}/manifest-templates"
: pathexpand(var.templates_path)
)
hugging_face_token_secret = (
var.hugging_face_secret == null || var.hugging_face_secret_version == null
? null
: "${var.hugging_face_secret}/versions/${var.hugging_face_secret_version}"
)

all_manifests = flatten([for manifest_file in local.templates :
[for data in split("---", templatefile(manifest_file, {
Expand All @@ -38,13 +43,12 @@ locals {
inference_server_framework = var.inference_server_framework
ksa = var.ksa
latency_profile_kubernetes_service_account = var.latency_profile_kubernetes_service_account
google_service_account = var.google_service_account
max_num_prompts = var.max_num_prompts
max_output_len = var.max_output_len
max_prompt_len = var.max_prompt_len
request_rates = join(",", [for number in var.request_rates : tostring(number)])
tokenizer = var.tokenizer
hugging_face_token_b64 = var.hugging_face_token_b64
hugging_face_token_secret_list = local.hugging_face_token_secret == null ? [] : [local.hugging_face_token_secret]
k8s_hf_secret_list = var.k8s_hf_secret == null ? [] : [var.k8s_hf_secret]
output_bucket = var.output_bucket
})) : data]
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,20 @@ spec:
value: ${request_rates}
- name: OUTPUT_BUCKET
value: ${output_bucket}
%{ for hugging_face_token_secret in hugging_face_token_secret_list ~}
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token
key: HF_TOKEN
%{ endfor ~}
%{ for hf_token in k8s_hf_secret_list ~}
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token
key: HF_TOKEN
%{ endfor ~}
nodeSelector:
cloud.google.com/gke-accelerator: nvidia-l4 # nvidia-h100-80gb, nvidia-l4
iam.gke.io/gke-metadata-server-enabled: "true"

This file was deleted.

This file was deleted.

70 changes: 10 additions & 60 deletions benchmarks/benchmark/tools/latency-profile/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -143,13 +143,6 @@ variable "latency_profile_kubernetes_service_account" {
default = "sample-runner-ksa"
}

# Google Service Account tied to the benchmark runner's Kubernetes SA
# (presumably via Workload Identity — confirm against the binding resource).
# An empty string means "no GSA"; the value itself may never be null.
variable "google_service_account" {
  description = "Google Service Account bound to the kubernetes service account"
  type        = string
  nullable    = false
  default     = ""
}

// TODO: add validation to make k8s_hf_secret & hugging_face_secret mutually exclusive once terraform is updated with: https://discuss.hashicorp.com/t/experiment-feedback-input-variable-validation-can-cross-reference-other-objects/66644
variable "k8s_hf_secret" {
description = "Name of secret for huggingface token; stored in k8s "
Expand All @@ -158,59 +151,16 @@ variable "k8s_hf_secret" {
default = null
}

# Name of the Secret Manager secret holding the Hugging Face token.
# Replaces the former hugging_face_token_b64 variable; combined with
# hugging_face_secret_version to form "<secret>/versions/<version>".
# Both must be non-null for the Secret Manager path to be used.
variable "hugging_face_secret" {
  description = "name of the kubectl huggingface secret token; stored in Secret Manager. Security considerations: https://kubernetes.io/docs/concepts/security/secrets-good-practices/"
  type        = string
  nullable    = true
  default     = null
}

# Full benchmark sweep definition: every model / model-server / accelerator
# combination to profile, plus the allow-lists (valid_models and
# valid_accelerators) that the two validation blocks check entries against.
variable "pipeline_config" {
  description = "All combinations of model/model_server/accelerators to benchmark"
  type = object({
    valid_models       = list(string) # allow-list enforced by the model validation below
    valid_accelerators = list(string) # allow-list enforced by the accelerator validation below
    request_rates      = list(number)

    config = list(object({
      model_server = string # Model server name
      model_server_configs = list(object({
        models = list(string) # model name
        model_configs = list(object({
          accelerators = list(string) # Accelerator name
          accelerator_configs = list(object({
            accelerator_count = number # Number of accelerators
          }))
        }))
      }))
    }))
  })

  # Every accelerator named anywhere in config[*] must appear in
  # valid_accelerators; alltrue() over the nested comprehensions walks
  # config -> model_server_configs -> model_configs -> accelerators.
  validation {
    condition = alltrue([
      for cfg in var.pipeline_config.config : alltrue([
        for model_server_config in cfg.model_server_configs : (
          alltrue([
            for model_config in model_server_config.model_configs :
            alltrue([for accelerator in model_config.accelerators :
              contains(var.pipeline_config.valid_accelerators, accelerator)])
          ])
        )
      ])
    ])
    error_message = "Each accelerator must be in the valid_accelerators list."
  }

  # Every model named in config[*].model_server_configs[*].models must
  # appear in valid_models.
  validation {
    condition = alltrue([
      for cfg in var.pipeline_config.config : alltrue([
        for model_server_config in cfg.model_server_configs : (
          alltrue([
            for model in model_server_config.models :
            contains(var.pipeline_config.valid_models, model)
          ])
        )
      ])
    ])
    error_message = "Each model must be in the valid_models list."
  }
}
# Which version of the Secret Manager secret to read. When this or
# hugging_face_secret is null (both default to null), the Secret Manager
# token path is disabled entirely.
variable "hugging_face_secret_version" {
  description = "Secret version in Secret Manager"
  type        = string
  default     = null
  nullable    = true
}
39 changes: 39 additions & 0 deletions benchmarks/benchmark/tools/profile-generator/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
/**
* Copyright 2024 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

# Thin wrapper: the profile generator currently delegates the whole run to
# the shared latency-profile module, forwarding its own inputs one-to-one.
module "latency-profile" {
  source = "../latency-profile"

  # Cluster / project wiring.
  credentials_config = var.credentials_config
  project_id         = var.project_id
  namespace          = var.namespace
  ksa                = var.ksa
  templates_path     = var.templates_path
  artifact_registry  = var.artifact_registry

  # Target inference server.
  inference_server_service      = var.inference_server_service
  inference_server_service_port = var.inference_server_service_port
  inference_server_framework    = var.inference_server_framework

  # Benchmark shape and outputs.
  max_num_prompts = var.max_num_prompts
  max_output_len  = var.max_output_len
  max_prompt_len  = var.max_prompt_len
  request_rates   = var.request_rates
  tokenizer       = var.tokenizer
  output_bucket   = var.output_bucket

  # Identity and Hugging Face token sources.
  latency_profile_kubernetes_service_account = var.latency_profile_kubernetes_service_account
  k8s_hf_secret               = var.k8s_hf_secret
  hugging_face_secret         = var.hugging_face_secret
  hugging_face_secret_version = var.hugging_face_secret_version
}
105 changes: 105 additions & 0 deletions benchmarks/benchmark/tools/profile-generator/sample.tfvars
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
/**
* Copyright 2024 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

# Cluster credentials: use the local kubeconfig rather than inline secrets.
credentials_config = {
  kubeconfig = {
    path = "~/.kube/config"
  }
}

project_id = "tpu-vm-gke-testing"


# Latency profile generator service configuration
artifact_registry                          = "us-central1-docker.pkg.dev/tpu-vm-gke-testing/ai-benchmark"
inference_server_service                   = "maxengine-server" # inference server service name
inference_server_service_port              = 8000
latency_profile_kubernetes_service_account = "prom-frontend-sa"
output_bucket                              = "tpu-vm-gke-testing-benchmark-output-bucket"
k8s_hf_secret                              = "hf-token" # Kubernetes secret holding the Hugging Face token

# Benchmark configuration for Locust Docker accessing inference server
inference_server_framework = "jetstream"
tokenizer                  = "google/gemma-7b"
request_rates              = [5, 10, 15, 20] # requests/sec stages for this run

# Sweep definition: which model / model-server / accelerator combinations
# are eligible to be profiled.
profiles = {
  # Allow-list of model names usable in config[*].model_server_configs[*].models.
  valid_models = [
    "gemma2-2b",
    "gemma2-9b",
    "gemma2-27b",
    "llama3-8b",
    "llama3-70b",
    "llama3-405b"
  ]
  # Allow-list of accelerator types usable in per-model accelerator lists.
  valid_accelerators = [
    "tpu-v4-podslice",
    "tpu-v5-lite-podslice",
    "tpu-v5p-slice",
    "nvidia-a100-80gb",
    "nvidia-h100-80gb",
    "nvidia-l4"
  ]
  request_rates = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

  # One entry per model server. model_configs is left empty in this sample —
  # presumably no per-accelerator overrides are wanted here; TODO confirm.
  config = [{
    model_server = "Jetstream"
    model_server_configs = [{
      # Jetstream entry lists only the Gemma models.
      models = [
        "gemma2-2b",
        "gemma2-9b",
        "gemma2-27b"
      ]
      model_configs = []
    }]
  }, {
    model_server = "vllm"
    model_server_configs = [{
      models = [
        "gemma2-2b",
        "gemma2-9b",
        "gemma2-27b",
        "llama3-8b",
        "llama3-70b",
        "llama3-405b"
      ]
      model_configs = []
    }]
  }, {
    model_server = "tgi"
    model_server_configs = [{
      models = [
        "gemma2-2b",
        "gemma2-9b",
        "gemma2-27b",
        "llama3-8b",
        "llama3-70b",
        "llama3-405b"
      ]
      model_configs = []
    }]
  }, {
    model_server = "tensorrt-llm"
    model_server_configs = [{
      # TensorRT-LLM entry lists only the Llama models.
      models = [
        "llama3-8b",
        "llama3-70b",
        "llama3-405b"
      ]
      model_configs = []
    }]
  }]
}
Loading

0 comments on commit 91fe581

Please sign in to comment.