diff --git a/benchmarks/benchmark/tools/latency-profile/main.tf b/benchmarks/benchmark/tools/latency-profile/main.tf
index 3906c6417..be95ee59e 100644
--- a/benchmarks/benchmark/tools/latency-profile/main.tf
+++ b/benchmarks/benchmark/tools/latency-profile/main.tf
@@ -28,6 +28,11 @@ locals {
     ? "${path.module}/manifest-templates"
     : pathexpand(var.templates_path)
   )
+  hugging_face_token_secret = (
+    var.hugging_face_secret == null || var.hugging_face_secret_version == null
+    ? null
+    : "${var.hugging_face_secret}/versions/${var.hugging_face_secret_version}"
+  )
 
   all_manifests = flatten([for manifest_file in local.templates :
     [for data in split("---", templatefile(manifest_file, {
@@ -38,13 +43,12 @@
       inference_server_framework                 = var.inference_server_framework
       ksa                                        = var.ksa
       latency_profile_kubernetes_service_account = var.latency_profile_kubernetes_service_account
-      google_service_account                     = var.google_service_account
       max_num_prompts                            = var.max_num_prompts
       max_output_len                             = var.max_output_len
       max_prompt_len                             = var.max_prompt_len
       request_rates                              = join(",", [for number in var.request_rates : tostring(number)])
       tokenizer                                  = var.tokenizer
-      hugging_face_token_b64                     = var.hugging_face_token_b64
+      hugging_face_token_secret_list             = local.hugging_face_token_secret == null ? [] : [local.hugging_face_token_secret]
       k8s_hf_secret_list                         = var.k8s_hf_secret == null ? [] : [var.k8s_hf_secret]
       output_bucket                              = var.output_bucket
     })) : data]
diff --git a/benchmarks/benchmark/tools/latency-profile/manifest-templates/hf-secret.yaml.tpl b/benchmarks/benchmark/tools/latency-profile/manifest-templates/hf-secret.yaml.tpl
deleted file mode 100644
index d61629ee4..000000000
--- a/benchmarks/benchmark/tools/latency-profile/manifest-templates/hf-secret.yaml.tpl
+++ /dev/null
@@ -1,7 +0,0 @@
-apiVersion: v1
-kind: Secret
-metadata:
-  name: hf-token
-  namespace: ${namespace}
-data:
-  HF_TOKEN: ${hugging_face_token_b64}
\ No newline at end of file
diff --git a/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl b/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl
index c3fb03b83..72e5773a2 100644
--- a/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl
+++ b/benchmarks/benchmark/tools/latency-profile/manifest-templates/latency-profile-generator.yaml.tpl
@@ -41,11 +41,20 @@ spec:
           value: ${request_rates}
         - name: OUTPUT_BUCKET
           value: ${output_bucket}
+%{ for hugging_face_token_secret in hugging_face_token_secret_list ~}
         - name: HF_TOKEN
           valueFrom:
             secretKeyRef:
               name: hf-token
               key: HF_TOKEN
+%{ endfor ~}
+%{ for hf_token in k8s_hf_secret_list ~}
+        - name: HF_TOKEN
+          valueFrom:
+            secretKeyRef:
+              name: ${hf_token}
+              key: HF_TOKEN
+%{ endfor ~}
       nodeSelector:
         cloud.google.com/gke-accelerator: nvidia-l4 # nvidia-h100-80gb, nvidia-l4
         iam.gke.io/gke-metadata-server-enabled: "true"
\ No newline at end of file
diff --git a/benchmarks/benchmark/tools/latency-profile/manifest-templates/namespace.yaml.tpl b/benchmarks/benchmark/tools/latency-profile/manifest-templates/namespace.yaml.tpl
deleted file mode 100644
index 95e400737..000000000
--- a/benchmarks/benchmark/tools/latency-profile/manifest-templates/namespace.yaml.tpl
+++ /dev/null
@@ -1,4 +0,0 @@
-apiVersion: v1
-kind: Namespace
-metadata:
-  name: ${namespace}
\ No newline at end of file
diff --git a/benchmarks/benchmark/tools/latency-profile/manifest-templates/service-account.yaml.tpl b/benchmarks/benchmark/tools/latency-profile/manifest-templates/service-account.yaml.tpl
deleted file mode 100644
index 02eff324f..000000000
--- a/benchmarks/benchmark/tools/latency-profile/manifest-templates/service-account.yaml.tpl
+++ /dev/null
@@ -1,7 +0,0 @@
-apiVersion: v1
-kind: ServiceAccount
-metadata:
-  name: ${latency_profile_kubernetes_service_account}
-  namespace: ${namespace}
-  annotations:
-    iam.gke.io/gcp-service-account: "${google_service_account}@tpu-vm-gke-testing.iam.gserviceaccount.com"
\ No newline at end of file
diff --git a/benchmarks/benchmark/tools/latency-profile/variables.tf b/benchmarks/benchmark/tools/latency-profile/variables.tf
index 896934cf0..935a5bab8 100644
--- a/benchmarks/benchmark/tools/latency-profile/variables.tf
+++ b/benchmarks/benchmark/tools/latency-profile/variables.tf
@@ -143,13 +143,6 @@ variable "latency_profile_kubernetes_service_account" {
   default     = "sample-runner-ksa"
 }
 
-variable "google_service_account" {
-  description = "Google Service Account bound to the kubernetes service account"
-  type        = string
-  default     = ""
-  nullable    = false
-}
-
 // TODO: add validation to make k8s_hf_secret & hugging_face_secret mutually exclusive once terraform is updated with: https://discuss.hashicorp.com/t/experiment-feedback-input-variable-validation-can-cross-reference-other-objects/66644
 variable "k8s_hf_secret" {
   description = "Name of secret for huggingface token; stored in k8s "
@@ -158,59 +151,16 @@ variable "k8s_hf_secret" {
   default     = null
 }
 
-variable "hugging_face_token_b64" {
-  description = "Base 64 encoded hugging face token; stored in Secret Manager. Security considerations: https://kubernetes.io/docs/concepts/security/secrets-good-practices/"
+variable "hugging_face_secret" {
+  description = "Name of the Secret Manager secret holding the Hugging Face token. Security considerations: https://kubernetes.io/docs/concepts/security/secrets-good-practices/"
   type        = string
-  nullable    = false
+  nullable    = true
+  default     = null
 }
 
-variable "pipeline_config" {
-  description = "All combinations of model/model_server/accelerators to benchmark"
-  type = object({
-    valid_models       = list(string)
-    valid_accelerators = list(string)
-    request_rates      = list(number)
-
-    config = list(object({
-      model_server = string # Model server name
-      model_server_configs = list(object({
-        models = list(string) # model name
-        model_configs = list(object({
-          accelerators = list(string) # Accelerator name
-          accelerator_configs = list(object({
-            accelerator_count = number # Number of accelerators
-          }))
-        }))
-      }))
-    }))
-  })
-
-  validation {
-    condition = alltrue([
-      for cfg in var.pipeline_config.config : alltrue([
-        for model_server_config in cfg.model_server_configs : (
-          alltrue([
-            for model_config in model_server_config.model_configs :
-            alltrue([for accelerator in model_config.accelerators :
-            contains(var.pipeline_config.valid_accelerators, accelerator)])
-          ])
-        )
-      ])
-    ])
-    error_message = "Each accelerator must be in the valid_accelerators list."
-  }
-
-  validation {
-    condition = alltrue([
-      for cfg in var.pipeline_config.config : alltrue([
-        for model_server_config in cfg.model_server_configs : (
-          alltrue([
-            for model in model_server_config.models :
-            contains(var.pipeline_config.valid_models, model)
-          ])
-        )
-      ])
-    ])
-    error_message = "Each model must be in the valid_models list."
-  }
-}
\ No newline at end of file
+variable "hugging_face_secret_version" {
+  description = "Secret version in Secret Manager"
+  type        = string
+  nullable    = true
+  default     = null
+}
diff --git a/benchmarks/benchmark/tools/profile-generator/main.tf b/benchmarks/benchmark/tools/profile-generator/main.tf
index e69de29bb..e8ea340a2 100644
--- a/benchmarks/benchmark/tools/profile-generator/main.tf
+++ b/benchmarks/benchmark/tools/profile-generator/main.tf
@@ -0,0 +1,39 @@
+/**
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+module "latency-profile" {
+  source = "../latency-profile"
+
+  credentials_config                         = var.credentials_config
+  namespace                                  = var.namespace
+  project_id                                 = var.project_id
+  ksa                                        = var.ksa
+  templates_path                             = var.templates_path
+  artifact_registry                          = var.artifact_registry
+  inference_server_service                   = var.inference_server_service
+  inference_server_service_port              = var.inference_server_service_port
+  inference_server_framework                 = var.inference_server_framework
+  max_num_prompts                            = var.max_num_prompts
+  max_output_len                             = var.max_output_len
+  max_prompt_len                             = var.max_prompt_len
+  request_rates                              = var.request_rates
+  tokenizer                                  = var.tokenizer
+  output_bucket                              = var.output_bucket
+  latency_profile_kubernetes_service_account = var.latency_profile_kubernetes_service_account
+  k8s_hf_secret                              = var.k8s_hf_secret
+  hugging_face_secret                        = var.hugging_face_secret
+  hugging_face_secret_version                = var.hugging_face_secret_version
+}
\ No newline at end of file
diff --git a/benchmarks/benchmark/tools/profile-generator/sample.tfvars b/benchmarks/benchmark/tools/profile-generator/sample.tfvars
new file mode 100644
index 000000000..907b76eef
--- /dev/null
+++ b/benchmarks/benchmark/tools/profile-generator/sample.tfvars
@@ -0,0 +1,105 @@
+/**
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+credentials_config = {
+  kubeconfig = {
+    path = "~/.kube/config"
+  }
+}
+
+project_id = "tpu-vm-gke-testing"
+
+
+# Latency profile generator service configuration
+artifact_registry                           = "us-central1-docker.pkg.dev/tpu-vm-gke-testing/ai-benchmark"
+inference_server_service                    = "maxengine-server" # inference server service name
+inference_server_service_port               = 8000
+latency_profile_kubernetes_service_account  = "prom-frontend-sa"
+output_bucket                               = "tpu-vm-gke-testing-benchmark-output-bucket"
+k8s_hf_secret                               = "hf-token"
+
+# Benchmark configuration for the latency profile generator accessing the inference server
+inference_server_framework = "jetstream"
+tokenizer                  = "google/gemma-7b"
+request_rates              = [5, 10, 15, 20]
+
+profiles = {
+  valid_models = [
+    "gemma2-2b",
+    "gemma2-9b",
+    "gemma2-27b",
+    "llama3-8b",
+    "llama3-70b",
+    "llama3-405b"
+  ]
+  valid_accelerators = [
+    "tpu-v4-podslice",
+    "tpu-v5-lite-podslice",
+    "tpu-v5p-slice",
+    "nvidia-a100-80gb",
+    "nvidia-h100-80gb",
+    "nvidia-l4"
+  ]
+  request_rates = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
+
+  config = [{
+    model_server = "Jetstream"
+    model_server_configs = [{
+      models = [
+        "gemma2-2b",
+        "gemma2-9b",
+        "gemma2-27b"
+      ]
+      model_configs = []
+    }]
+  }, {
+    model_server = "vllm"
+    model_server_configs = [{
+      models = [
+        "gemma2-2b",
+        "gemma2-9b",
+        "gemma2-27b",
+        "llama3-8b",
+        "llama3-70b",
+        "llama3-405b"
+      ]
+      model_configs = []
+    }]
+  }, {
+    model_server = "tgi"
+    model_server_configs = [{
+      models = [
+        "gemma2-2b",
+        "gemma2-9b",
+        "gemma2-27b",
+        "llama3-8b",
+        "llama3-70b",
+        "llama3-405b"
+      ]
+      model_configs = []
+    }]
+  }, {
+    model_server = "tensorrt-llm"
+    model_server_configs = [{
+      models = [
+        "llama3-8b",
+        "llama3-70b",
+        "llama3-405b"
+      ]
+      model_configs = []
+    }]
+  }]
+}
\ No newline at end of file
diff --git a/benchmarks/benchmark/tools/profile-generator/variables.tf b/benchmarks/benchmark/tools/profile-generator/variables.tf
index e69de29bb..dbbfddc1a 100644
--- a/benchmarks/benchmark/tools/profile-generator/variables.tf
+++ b/benchmarks/benchmark/tools/profile-generator/variables.tf
@@ -0,0 +1,218 @@
+/**
+ * Copyright 2024 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+variable "credentials_config" {
+  description = "Configure how Terraform authenticates to the cluster."
+  type = object({
+    fleet_host = optional(string)
+    kubeconfig = optional(object({
+      context = optional(string)
+      path    = optional(string, "~/.kube/config")
+    }))
+  })
+  nullable = false
+  validation {
+    condition = (
+      (var.credentials_config.fleet_host != null) !=
+      (var.credentials_config.kubeconfig != null)
+    )
+    error_message = "Exactly one of fleet host or kubeconfig must be set."
+  }
+}
+
+variable "namespace" {
+  description = "Namespace used for model and benchmarking deployments."
+  type        = string
+  nullable    = false
+  default     = "default"
+}
+
+variable "project_id" {
+  description = "Project id of existing or created project."
+  type        = string
+  nullable    = false
+}
+
+
+variable "ksa" {
+  description = "Kubernetes Service Account used for workload."
+  type        = string
+  nullable    = false
+  default     = "default"
+}
+
+variable "templates_path" {
+  description = "Path where manifest templates will be read from. Set to null to use the default manifests"
+  type        = string
+  default     = null
+}
+
+variable "artifact_registry" {
+  description = "Artifact registry for storing Latency Profile Generator container."
+  type        = string
+  default     = null
+}
+
+variable "inference_server_service" {
+  description = "Name of the inference server service"
+  type        = string
+  nullable    = false
+}
+
+variable "inference_server_service_port" {
+  description = "Port of the inference server service"
+  type        = number
+  nullable    = false
+}
+
+variable "inference_server_framework" {
+  description = "Benchmark server configuration for inference server framework. Can be one of: vllm, tgi, tensorrt_llm_triton, sax, jetstream"
+  type        = string
+  nullable    = false
+  default     = "tgi"
+  validation {
+    condition     = var.inference_server_framework == "vllm" || var.inference_server_framework == "tgi" || var.inference_server_framework == "tensorrt_llm_triton" || var.inference_server_framework == "sax" || var.inference_server_framework == "jetstream"
+    error_message = "The inference_server_framework must be one of: vllm, tgi, tensorrt_llm_triton, sax, or jetstream."
+  }
+}
+
+variable "max_num_prompts" {
+  description = "Benchmark server configuration for max number of prompts."
+  type        = number
+  default     = 1000
+  validation {
+    condition     = var.max_num_prompts > 0
+    error_message = "The max_num_prompts value must be greater than 0."
+  }
+}
+
+variable "max_output_len" {
+  description = "Benchmark server configuration for max output length."
+  type        = number
+  default     = 256
+  validation {
+    condition     = var.max_output_len > 4
+    error_message = "The max_output_len value must be greater than 4. TGI framework throws an error for too short of sequences."
+  }
+}
+
+variable "max_prompt_len" {
+  description = "Benchmark server configuration for max prompt length."
+  type        = number
+  default     = 256
+  validation {
+    condition     = var.max_prompt_len > 4
+    error_message = "The max_prompt_len value must be greater than 4. TGI framework throws an error for too short of sequences."
+  }
+}
+
+variable "request_rates" {
+  description = "List of request rates (requests per second) to benchmark at."
+  type        = list(number)
+  default     = [1, 2]
+  nullable    = false
+}
+
+variable "tokenizer" {
+  description = "Benchmark server configuration for tokenizer."
+  type        = string
+  nullable    = false
+  default     = "tiiuae/falcon-7b"
+}
+
+variable "output_bucket" {
+  description = "Bucket name for storing results"
+  type        = string
+}
+
+variable "latency_profile_kubernetes_service_account" {
+  description = "Kubernetes Service Account to be used for the latency profile generator tool"
+  type        = string
+  default     = "sample-runner-ksa"
+}
+
+// TODO: add validation to make k8s_hf_secret & hugging_face_secret mutually exclusive once terraform is updated with: https://discuss.hashicorp.com/t/experiment-feedback-input-variable-validation-can-cross-reference-other-objects/66644
+variable "k8s_hf_secret" {
+  description = "Name of secret for Hugging Face token; stored in k8s."
+  type        = string
+  nullable    = true
+  default     = null
+}
+
+variable "hugging_face_secret" {
+  description = "Name of the Secret Manager secret holding the Hugging Face token. Security considerations: https://kubernetes.io/docs/concepts/security/secrets-good-practices/"
+  type        = string
+  nullable    = true
+  default     = null
+}
+
+variable "hugging_face_secret_version" {
+  description = "Secret version in Secret Manager"
+  type        = string
+  nullable    = true
+  default     = null
+}
+
+variable "profiles" {
+  description = "Model servers to benchmark"
+  type = object({
+    valid_models       = list(string)
+    valid_accelerators = list(string)
+    request_rates      = list(number)
+
+    config = list(object({
+      model_server = string # Model server name
+      model_server_configs = list(object({
+        models = list(string) # model name
+        model_configs = list(object({
+          accelerators = list(string) # Accelerator name
+          accelerator_configs = list(object({
+            accelerator_count = number # Number of accelerators
+          }))
+        }))
+      }))
+    }))
+  })
+
+  validation {
+    condition = alltrue([
+      for cfg in var.profiles.config : alltrue([
+        for model_server_config in cfg.model_server_configs : (
+          alltrue([
+            for model_config in model_server_config.model_configs :
+            alltrue([for accelerator in model_config.accelerators :
+            contains(var.profiles.valid_accelerators, accelerator)])
+          ])
+        )
+      ])
+    ])
+    error_message = "Each accelerator must be in the valid_accelerators list."
+  }
+
+  validation {
+    condition = alltrue([
+      for cfg in var.profiles.config : alltrue([
+        for model_server_config in cfg.model_server_configs : (
+          alltrue([
+            for model in model_server_config.models :
+            contains(var.profiles.valid_models, model)
+          ])
+        )
+      ])
+    ])
+    error_message = "Each model must be in the valid_models list."
+  }
+}
\ No newline at end of file
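
Note on wiring the new variables: the Hugging Face token can now come either from an existing Kubernetes secret (k8s_hf_secret) or from Secret Manager (hugging_face_secret plus hugging_face_secret_version). latency-profile/main.tf joins the latter pair into a "<secret>/versions/<version>" path, and the manifest template only emits an HF_TOKEN env var when the corresponding list is non-empty. A minimal sketch of the Secret Manager path; the project and secret names below are illustrative assumptions, not values taken from this change:

    # terraform.tfvars -- illustrative sketch only, not part of this diff.
    # local.hugging_face_token_secret in latency-profile/main.tf resolves this to
    # "projects/my-project/secrets/hf-token-secret/versions/1".
    hugging_face_secret         = "projects/my-project/secrets/hf-token-secret"
    hugging_face_secret_version = "1"

Until the TODO'd mutual-exclusion validation lands, exactly one of k8s_hf_secret or hugging_face_secret should be set; setting both would render two HF_TOKEN entries into the generated manifest.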