diff --git a/.gitignore b/.gitignore index 20c55e372..876114dc7 100644 --- a/.gitignore +++ b/.gitignore @@ -35,3 +35,4 @@ default.tfstate.backup terraform.tfstate* terraform.tfvars tfplan +.vscode/ diff --git a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py index cbc269f9d..99f617ab8 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py +++ b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py @@ -333,7 +333,7 @@ class GrpcBenchmarkUser(GrpcUser): def grpc_infer(self): prompt = get_random_prompt(self) request = jetstream_pb2.DecodeRequest( - text_content=jetstream_pb2.DecodeRequest.TextContent(text=request.prompt), + text_content=jetstream_pb2.DecodeRequest.TextContent(text=prompt), priority=0, max_tokens=model_params["max_output_len"], ) diff --git a/benchmarks/inference-server/jetstream/jetstream.yaml b/benchmarks/inference-server/jetstream/jetstream.yaml index b8bb42f98..e63a7d4a0 100644 --- a/benchmarks/inference-server/jetstream/jetstream.yaml +++ b/benchmarks/inference-server/jetstream/jetstream.yaml @@ -18,7 +18,7 @@ spec: cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice containers: - name: maxengine-server - image: us-docker.pkg.dev/cloud-tpu-images/inference/maxengine-server:v0.2.0 + image: us-docker.pkg.dev/cloud-tpu-images/inference/maxengine-server:v0.2.2 args: - model_name=gemma-7b - tokenizer_path=assets/tokenizer.gemma @@ -32,6 +32,8 @@ spec: - scan_layers=false - weight_dtype=bfloat16 - load_parameters_path=gs://GEMMA_BUCKET_NAME/final/unscanned/gemma_7b-it/0/checkpoints/0/items + - attention=dot_product + - prometheus_port=9100 ports: - containerPort: 9000 resources: @@ -40,7 +42,7 @@ spec: limits: google.com/tpu: 4 - name: jetstream-http - image: us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-http:v0.2.0 + image: 
us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-http:v0.2.2 ports: - containerPort: 8000 --- diff --git a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/README.md b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/README.md index eca76bc18..5a2d6418b 100644 --- a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/README.md +++ b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/README.md @@ -121,9 +121,11 @@ Completed unscanning checkpoint to gs://BUCKET_NAME/final/unscanned/gemma_7b-it/ ## Deploy Maxengine Server and HTTP Server -In this example, we will deploy a Maxengine server targeting Gemma-7b model. You can use the provided Maxengine server and HTTP server images already in `deployment.yaml` or [build your own](#optionals). +Next, deploy a Maxengine server hosting the Gemma-7b model. You can use the provided Maxengine server and HTTP server images or [build your own](#build-and-upload-maxengine-server-image). Depending on your needs and constraints you can elect to deploy either via Terraform or via Kubectl. -Add desired overrides to your yaml file by editing the `args` in `deployment.yaml`. You can reference the [MaxText base config file](https://github.com/google/maxtext/blob/main/MaxText/configs/base.yml) on what values can be overridden. +### Deploy via Kubectl + +First navigate to the `./kubectl` directory. Add desired overrides to your yaml file by editing the `args` in `deployment.yaml`. You can reference the [MaxText base config file](https://github.com/google/maxtext/blob/main/MaxText/configs/base.yml) on what values can be overridden. In the manifest, ensure the value of the BUCKET_NAME is the name of the Cloud Storage bucket that was used when converting your checkpoint. 
@@ -147,7 +149,55 @@ Deploy the manifest file for the Maxengine server and HTTP server: kubectl apply -f deployment.yaml ``` -## Verify the deployment +### Deploy via Terraform + +Navigate to the `./terraform` directory and run the standard [`terraform init`](https://developer.hashicorp.com/terraform/cli/commands/init). The deployment requires some inputs; an example `sample-terraform.tfvars` is provided as a starting point. Run `cp sample-terraform.tfvars terraform.tfvars` and modify the resulting `terraform.tfvars` as needed. Finally, run `terraform apply` to apply these resources to your cluster. + +#### (optional) Enable Horizontal Pod Autoscaling via Terraform + +Applying the following resources to your cluster will enable autoscaling with custom metrics: + - PodMonitoring: For scraping metrics and exporting them to Google Cloud Monitoring. + - Custom Metrics Stackdriver Adapter (CMSA): For enabling your HPA objects to read metrics from the Google Cloud Monitoring API. + - [Horizontal Pod Autoscaler (HPA)](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/): For reading metrics and setting the `maxengine-server` deployment's replica count accordingly. + +These components require a few more inputs; rerunning the [prior step](#deploy-via-terraform) with these set will deploy them. The following input conditions should be satisfied: `custom_metrics_enabled` should be `true` and `metrics_port`, `hpa_type`, `hpa_averagevalue_target`, `hpa_min_replicas`, `hpa_max_replicas` should all be set. + + Note that only one HPA resource will be created. 
For those who want to scale based on multiple metrics, we recommend using the following template to apply more HPA resources: + +``` +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: jetstream-hpa +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: maxengine-server + minReplicas: <YOUR_MIN_REPLICAS> + maxReplicas: <YOUR_MAX_REPLICAS> + metrics: + - type: Pods + pods: + metric: + name: prometheus.googleapis.com|<YOUR_METRIC_NAME>|gauge + target: + type: AverageValue + averageValue: <YOUR_TARGET_VALUE> +``` + +If you would like to probe the metrics manually, `curl` your maxengine-server container on whatever metrics port you set and you should see something similar to the following: + +``` +# HELP jetstream_prefill_backlog_size Size of prefill queue +# TYPE jetstream_prefill_backlog_size gauge +jetstream_prefill_backlog_size{id="<SOME-HOSTNAME-HERE>"} 0.0 +# HELP jetstream_slots_used_percentage The percentage of decode slots currently being used +# TYPE jetstream_slots_used_percentage gauge +jetstream_slots_used_percentage{id="<SOME-HOSTNAME-HERE>",idx="0"} 0.04166666666666663 +``` + +### Verify the deployment Wait for the containers to finish creating: ``` @@ -199,7 +249,7 @@ The output should be similar to the following: } ``` -## Optionals +## Other optional steps ### Build and upload Maxengine Server image Build the Maxengine Server from [here](../maxengine-server) and upload to your project @@ -223,7 +273,7 @@ docker push gcr.io/${PROJECT_ID}/jetstream/maxtext/jetstream-http:latest The Jetstream HTTP server is great for initial testing and validating end-to-end requests and responses. If you would like to interact directly with the Maxengine server directly for use cases such as [benchmarking](https://github.com/google/JetStream/tree/main/benchmarks), you can do so by following the Jetstream benchmarking setup and applying the `deployment.yaml` manifest file and interacting with the Jetstream gRPC server at port 9000. 
``` -kubectl apply -f deployment.yaml +kubectl apply -f kubectl/deployment.yaml kubectl port-forward svc/jetstream-svc 9000:9000 ``` diff --git a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/deployment.yaml b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/kubectl/deployment.yaml similarity index 97% rename from tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/deployment.yaml rename to tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/kubectl/deployment.yaml index f95e88dd4..c261bc5ff 100644 --- a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/deployment.yaml +++ b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/kubectl/deployment.yaml @@ -17,7 +17,7 @@ spec: cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice containers: - name: maxengine-server - image: us-docker.pkg.dev/cloud-tpu-images/inference/maxengine-server:v0.2.0 + image: us-docker.pkg.dev/cloud-tpu-images/inference/maxengine-server:v0.2.2 imagePullPolicy: Always securityContext: privileged: true @@ -34,6 +34,7 @@ spec: - scan_layers=false - weight_dtype=bfloat16 - load_parameters_path=gs://BUCKET_NAME/final/unscanned/gemma_7b-it/0/checkpoints/0/items + - attention=dot_product - prometheus_port=9100 ports: - containerPort: 9000 @@ -64,4 +65,3 @@ spec: name: jetstream-grpc port: 9000 targetPort: 9000 - diff --git a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/custom-metrics-stackdriver-adapter/README.md b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/custom-metrics-stackdriver-adapter/README.md new file mode 100644 index 000000000..855e55be4 --- /dev/null +++ b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/custom-metrics-stackdriver-adapter/README.md @@ -0,0 +1,26 @@ +# 
Custom Metrics Stackdriver Adapter + +Adapted from https://raw.githubusercontent.com/GoogleCloudPlatform/k8s-stackdriver/master/custom-metrics-stackdriver-adapter/deploy/production/adapter_new_resource_model.yaml + +## Usage + +To use this module, include it from your main terraform config, i.e.: + +``` +module "custom_metrics_stackdriver_adapter" { + source = "./path/to/custom-metrics-stackdriver-adapter" +} +``` + +For a workload identity enabled cluster, some additional configuration is +needed: + +``` +module "custom_metrics_stackdriver_adapter" { + source = "./path/to/custom-metrics-stackdriver-adapter" + workload_identity = { + enabled = true + project_id = "" + } +} +``` \ No newline at end of file diff --git a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/custom-metrics-stackdriver-adapter/main.tf b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/custom-metrics-stackdriver-adapter/main.tf new file mode 100644 index 000000000..3ecb5f674 --- /dev/null +++ b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/custom-metrics-stackdriver-adapter/main.tf @@ -0,0 +1,291 @@ +resource "kubernetes_namespace_v1" "custom-metrics" { + metadata { + name = "custom-metrics" + } +} + +resource "kubernetes_service_account_v1" "custom-metrics-stackdriver-adapter-no-wi" { + count = var.workload_identity.enabled ? 0 : 1 + metadata { + name = "custom-metrics-stackdriver-adapter" + namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name + } +} + +resource "kubernetes_service_account_v1" "custom-metrics-stackdriver-adapter-wi" { + count = var.workload_identity.enabled ? 
1 : 0 + metadata { + name = "custom-metrics-stackdriver-adapter" + namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name + annotations = { + "iam.gke.io/gcp-service-account" = google_service_account.cmsa-sa[0].email + } + } +} + +resource "kubernetes_cluster_role_binding_v1" "custom-metrics-system-auth-delegator" { + metadata { + name = "custom-metrics:system:auth-delegator" + } + role_ref { + api_group = "rbac.authorization.k8s.io" + kind = "ClusterRole" + name = "system:auth-delegator" + } + subject { + kind = "ServiceAccount" + name = (var.workload_identity.enabled + ? kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-wi[0].metadata[0].name + : kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-no-wi[0].metadata[0].name + ) + namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name + } +} + +resource "kubernetes_role_binding_v1" "custom-metrics-auth-reader" { + metadata { + name = "custom-metrics-auth-reader" + namespace = "kube-system" + } + role_ref { + api_group = "rbac.authorization.k8s.io" + kind = "Role" + name = "extension-apiserver-authentication-reader" + } + subject { + kind = "ServiceAccount" + name = (var.workload_identity.enabled + ? 
kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-wi[0].metadata[0].name + : kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-no-wi[0].metadata[0].name + ) + namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name + } +} + +resource "kubernetes_cluster_role_v1" "custom-metrics-resource-reader" { + metadata { + name = "custom-metrics-resource-reader" + } + rule { + api_groups = [""] + resources = ["pods", "nodes", "nodes/stats"] + verbs = ["get", "list", "watch"] + } +} + +resource "kubernetes_cluster_role_binding_v1" "custom-metrics-resource-reader" { + metadata { + name = "custom-metrics-resource-reader" + } + role_ref { + api_group = "rbac.authorization.k8s.io" + kind = "ClusterRole" + name = kubernetes_cluster_role_v1.custom-metrics-resource-reader.metadata[0].name + } + subject { + kind = "ServiceAccount" + name = (var.workload_identity.enabled + ? kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-wi[0].metadata[0].name + : kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-no-wi[0].metadata[0].name + ) + namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name + } +} + +resource "kubernetes_deployment_v1" "custom-metrics-stackdriver-adapter" { + metadata { + name = "custom-metrics-stackdriver-adapter" + namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name + labels = { + run = "custom-metrics-stackdriver-adapter" + k8s-app = "custom-metrics-stackdriver-adapter" + } + } + spec { + replicas = 1 + + selector { + match_labels = { + run = "custom-metrics-stackdriver-adapter" + k8s-app = "custom-metrics-stackdriver-adapter" + } + } + + template { + metadata { + labels = { + run = "custom-metrics-stackdriver-adapter" + k8s-app = "custom-metrics-stackdriver-adapter" + "kubernetes.io/cluster-service" = "true" + } + } + + spec { + service_account_name = (var.workload_identity.enabled + ? 
kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-wi[0].metadata[0].name + : kubernetes_service_account_v1.custom-metrics-stackdriver-adapter-no-wi[0].metadata[0].name + ) + + container { + image = "gcr.io/gke-release/custom-metrics-stackdriver-adapter:v0.14.2-gke.0" + image_pull_policy = "Always" + name = "pod-custom-metrics-stackdriver-adapter" + command = ["/adapter", "--use-new-resource-model=true", "--fallback-for-container-metrics=true"] + resources { + limits = { + cpu = "250m" + memory = "200Mi" + } + requests = { + cpu = "250m" + memory = "200Mi" + } + } + } + } + } + } +} + +resource "kubernetes_service_v1" "custom-metrics-stackdriver-adapter" { + metadata { + name = "custom-metrics-stackdriver-adapter" + namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name + labels = { + run = "custom-metrics-stackdriver-adapter" + k8s-app = "custom-metrics-stackdriver-adapter" + "kubernetes.io/cluster-service" = "true" + "kubernetes.io/name" = "Adapter" + } + } + spec { + selector = { + run = "custom-metrics-stackdriver-adapter" + k8s-app = "custom-metrics-stackdriver-adapter" + } + port { + port = 443 + protocol = "TCP" + target_port = 443 + } + type = "ClusterIP" + } +} + +resource "kubernetes_api_service_v1" "v1beta1-custom-metrics-k8s-io" { + metadata { + name = "v1beta1.custom.metrics.k8s.io" + } + spec { + insecure_skip_tls_verify = true + group = "custom.metrics.k8s.io" + group_priority_minimum = 100 + version_priority = 100 + service { + name = kubernetes_service_v1.custom-metrics-stackdriver-adapter.metadata[0].name + namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name + } + version = "v1beta1" + } +} + +resource "kubernetes_api_service_v1" "v1beta2-custom-metrics-k8s-io" { + metadata { + name = "v1beta2.custom.metrics.k8s.io" + } + spec { + insecure_skip_tls_verify = true + group = "custom.metrics.k8s.io" + group_priority_minimum = 100 + version_priority = 200 + service { + name = 
kubernetes_service_v1.custom-metrics-stackdriver-adapter.metadata[0].name + namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name + } + version = "v1beta2" + } +} + +resource "kubernetes_api_service_v1" "v1beta1-external-metrics-k8s-io" { + metadata { + name = "v1beta1.external.metrics.k8s.io" + } + spec { + insecure_skip_tls_verify = true + group = "external.metrics.k8s.io" + group_priority_minimum = 100 + version_priority = 100 + service { + name = kubernetes_service_v1.custom-metrics-stackdriver-adapter.metadata[0].name + namespace = kubernetes_namespace_v1.custom-metrics.metadata[0].name + } + version = "v1beta1" + } +} + +resource "kubernetes_cluster_role_binding_v1" "external-metrics-reader" { + metadata { + name = "external-metrics-reader" + } + role_ref { + api_group = "rbac.authorization.k8s.io" + kind = "ClusterRole" + name = "external-metrics-reader" + } + subject { + kind = "ServiceAccount" + name = "horizontal-pod-autoscaler" + namespace = "kube-system" + } +} + + +# If workload identity is enabled, extra steps are required. We need to: +# - create a service account +# - grant it the monitoring.viewer IAM role +# - bind it to the workload identity user for the cmsa +# - annotate the cmsa service account (done above) + +resource "google_service_account" "cmsa-sa" { + count = var.workload_identity.enabled ? 1 : 0 + account_id = "cmsa-sa" + project = var.workload_identity.project_id +} + +# Equivalent to: +# gcloud projects add-iam-policy-binding PROJECT_ID \ +# --member=serviceAccount:cmsa-sa@PROJECT_ID.iam.gserviceaccount.com \ +# --role=roles/monitoring.viewer +resource "google_project_iam_member" "cmsa-project-binding-monitoring-viewer" { + count = var.workload_identity.enabled ? 
1 : 0 + project = var.workload_identity.project_id + role = "roles/monitoring.viewer" + # iam_member is additive, like the gcloud command above; iam_binding is + # authoritative and would remove every other member of this role. + member = "serviceAccount:${google_service_account.cmsa-sa[0].email}" +} + +# Equivalent to: +# gcloud projects add-iam-policy-binding PROJECT_ID \ +# --member=serviceAccount:cmsa-sa@PROJECT_ID.iam.gserviceaccount.com \ +# --role=roles/iam.serviceAccountTokenCreator +resource "google_project_iam_member" "cmsa-project-binding-sa-token-creator" { + count = var.workload_identity.enabled ? 1 : 0 + project = var.workload_identity.project_id + role = "roles/iam.serviceAccountTokenCreator" + # iam_member is additive, like the gcloud command above; iam_binding is + # authoritative and would remove every other member of this role. + member = "serviceAccount:${google_service_account.cmsa-sa[0].email}" +} + +# Equivalent to: +# gcloud iam service-accounts add-iam-policy-binding \ +# --role roles/iam.workloadIdentityUser \ +# --member "serviceAccount:PROJECT_ID.svc.id.goog[custom-metrics/custom-metrics-stackdriver-adapter]" \ +# cmsa-sa@PROJECT_ID.iam.gserviceaccount.com +resource "google_service_account_iam_member" "cmsa-bind-to-gsa" { + count = var.workload_identity.enabled ? 
1 : 0 + service_account_id = google_service_account.cmsa-sa[0].name + role = "roles/iam.workloadIdentityUser" + member = "serviceAccount:${var.workload_identity.project_id}.svc.id.goog[custom-metrics/custom-metrics-stackdriver-adapter]" +} diff --git a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/custom-metrics-stackdriver-adapter/variables.tf b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/custom-metrics-stackdriver-adapter/variables.tf new file mode 100644 index 000000000..392a845ba --- /dev/null +++ b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/custom-metrics-stackdriver-adapter/variables.tf @@ -0,0 +1,32 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +variable "workload_identity" { + type = object({ + enabled = bool + project_id = optional(string) + }) + default = { + enabled = false + } + validation { + condition = ( + (var.workload_identity.enabled && var.workload_identity.project_id != null) + || (!var.workload_identity.enabled) + ) + error_message = "A project_id must be specified if workload_identity_enabled is set." 
+ } +} diff --git a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/hpa-templates/hpa.jetstream.yaml.tftpl b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/hpa-templates/hpa.jetstream.yaml.tftpl new file mode 100644 index 000000000..7bf9bd0fd --- /dev/null +++ b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/hpa-templates/hpa.jetstream.yaml.tftpl @@ -0,0 +1,31 @@ +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: jetstream-hpa + namespace: ${namespace} +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: maxengine-server + minReplicas: ${hpa_min_replicas} + maxReplicas: ${hpa_max_replicas} + metrics: +%{ if length(regexall("jetstream_.*", hpa_type)) > 0 } + - type: Pods + pods: + metric: + name: prometheus.googleapis.com|${hpa_type}|gauge + target: + type: AverageValue + averageValue: ${hpa_averagevalue_target} +%{ else } + - type: Pods + pods: + metric: + name: kubernetes.io|node|accelerator|memory_used + target: + type: AverageValue + averageValue: ${hpa_averagevalue_target} +%{ endif } + diff --git a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/main.tf b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/main.tf new file mode 100644 index 000000000..d7c267423 --- /dev/null +++ b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/main.tf @@ -0,0 +1,58 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +locals { + hpa_cpu_template = "${path.module}/hpa-templates/hpa.cpu.yaml.tftpl" + hpa_jetstream_template = "${path.module}/hpa-templates/hpa.jetstream.yaml.tftpl" + jetstream_podmonitoring = "${path.module}/monitoring-templates/jetstream-podmonitoring.yaml.tftpl" +} + +module "custom_metrics_stackdriver_adapter" { + count = var.custom_metrics_enabled ? 1 : 0 + source = "./custom-metrics-stackdriver-adapter" + workload_identity = { + enabled = true + project_id = var.project_id + } +} + +module "maxengine" { + count = 1 + source = "./maxengine" + bucket_name = var.bucket_name + metrics_port = var.metrics_port + maxengine_server_image = var.maxengine_server_image + jetstream_http_server_image = var.jetstream_http_server_image +} + +resource "kubernetes_manifest" "tgi-pod-monitoring" { + count = var.custom_metrics_enabled && var.metrics_port != null ? 1 : 0 + manifest = yamldecode(templatefile(local.jetstream_podmonitoring, { + namespace = var.namespace + metrics_port = try(var.metrics_port, -1) + })) +} + +resource "kubernetes_manifest" "hpa_custom_metric" { + count = var.custom_metrics_enabled && var.hpa_type != null && var.hpa_averagevalue_target != null ? 
1 : 0 # HPA reads its metric through the adapter, so require it plus an explicit hpa_type and target. + manifest = yamldecode(templatefile(local.hpa_jetstream_template, { + namespace = var.namespace + hpa_type = try(var.hpa_type, "") + hpa_averagevalue_target = try(var.hpa_averagevalue_target, 1) + hpa_min_replicas = var.hpa_min_replicas + hpa_max_replicas = var.hpa_max_replicas + })) +} diff --git a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/maxengine/main.tf b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/maxengine/main.tf new file mode 100644 index 000000000..3b92ab790 --- /dev/null +++ b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/maxengine/main.tf @@ -0,0 +1,108 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +resource "kubernetes_deployment" "deployment_maxengine_server" { + metadata { + name = "maxengine-server" + } + spec { + replicas = 2 + selector { + match_labels = { + app = "maxengine-server" + } + } + template { + metadata { + labels = { + app = "maxengine-server" + } + } + spec { + container { + args = [ + "model_name=gemma-7b", + "tokenizer_path=assets/tokenizer.gemma", + "per_device_batch_size=4", + "max_prefill_predict_length=1024", + "max_target_length=2048", + "async_checkpointing=false", + "ici_fsdp_parallelism=1", + "ici_autoregressive_parallelism=-1", + "ici_tensor_parallelism=1", + "scan_layers=false", + "weight_dtype=bfloat16", + format("load_parameters_path=gs://%s/final/unscanned/gemma_7b-it/0/checkpoints/0/items", var.bucket_name), + "attention=dot_product", + var.metrics_port != null ? format("prometheus_port=%d", var.metrics_port) : "", + ] + image = var.maxengine_server_image + image_pull_policy = "Always" + name = "maxengine-server" + port { + container_port = 9000 + } + resources { + limits = { + "google.com/tpu" = 8 + } + requests = { + "google.com/tpu" = 8 + } + } + security_context { + privileged = true + } + } + container { + image = var.jetstream_http_server_image + image_pull_policy = "Always" + name = "jetstream-http" + port { + container_port = 8000 + } + } + node_selector = { + "cloud.google.com/gke-tpu-accelerator" = "tpu-v5-lite-podslice" + "cloud.google.com/gke-tpu-topology" = "2x4" + } + } + } + } +} + +resource "kubernetes_service" "service_jetstream_svc" { + metadata { + name = "jetstream-svc" + } + spec { + port { + name = "jetstream-http" + port = 8000 + protocol = "TCP" + target_port = 8000 + } + port { + name = "jetstream-grpc" + port = 9000 + protocol = "TCP" + target_port = 9000 + } + selector = { + app = "maxengine-server" + } + } +} \ No newline at end of file diff --git a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/maxengine/variables.tf 
b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/maxengine/variables.tf new file mode 100644 index 000000000..00096c088 --- /dev/null +++ b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/maxengine/variables.tf @@ -0,0 +1,43 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +variable "bucket_name" { + description = "Name of Google Cloud Storage bucket hosting unscanned checkpoints" + type = string + nullable = false +} + +variable "metrics_port" { + description = "Port to emit metrics from" + type = number + default = 9100 + nullable = true +} + +variable "maxengine_server_image" { + description = "maxengine-server container image" + type = string + default = "us-docker.pkg.dev/cloud-tpu-images/inference/maxengine-server:v0.2.2" + nullable = false +} + +variable "jetstream_http_server_image" { + description = "jetstream-http container image" + type = string + default = "us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-http:v0.2.2" + nullable = false +} + diff --git a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/monitoring-templates/jetstream-podmonitoring.yaml.tftpl b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/monitoring-templates/jetstream-podmonitoring.yaml.tftpl new file mode 100644 index 000000000..581d7d3b6 --- /dev/null +++ 
b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/monitoring-templates/jetstream-podmonitoring.yaml.tftpl @@ -0,0 +1,12 @@ +apiVersion: monitoring.googleapis.com/v1 +kind: PodMonitoring +metadata: + name: "jetstream-podmonitoring" + namespace: ${namespace} +spec: + endpoints: + - port: ${metrics_port} + interval: 1s + path: / + targetLabels: + metadata: ['pod', 'container', 'node'] diff --git a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/providers.tf b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/providers.tf new file mode 100644 index 000000000..70c82e817 --- /dev/null +++ b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/providers.tf @@ -0,0 +1,36 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +data "google_client_config" "identity" { + count = var.credentials_config.fleet_host != null ? 1 : 0 +} + +provider "kubernetes" { + config_path = ( + var.credentials_config.kubeconfig == null + ? null + : pathexpand(var.credentials_config.kubeconfig.path) + ) + config_context = try( + var.credentials_config.kubeconfig.context, null + ) + host = ( + var.credentials_config.fleet_host == null + ? 
null + : var.credentials_config.fleet_host + ) + token = try(data.google_client_config.identity.0.access_token, null) +} diff --git a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/sample-terraform.tfvars b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/sample-terraform.tfvars new file mode 100644 index 000000000..4114b6f5e --- /dev/null +++ b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/sample-terraform.tfvars @@ -0,0 +1,21 @@ +# How to (horizontally) scale the workload. Allowed values are: +# - null (no scaling), +# - Workload resources: +# - "cpu" (scale on cpu utilization). +# - Workload metrics (i.e. custom metrics): +# - "jetstream_prefill_backlog_size" +# - "jetstream_slots_used_percentage" +# - Other possibilities coming soon... +# +# See `autoscaling.md` for more details and recommendations. +custom_metrics_enabled = true +metrics_port = 9100 + +# Demonstrating autoscaling with jetstream_prefill_backlog_size, change as desired. +# For jetstream_prefill_backlog_size. (experiment with this to determine optimal values). +hpa_type = "jetstream_prefill_backlog_size" +hpa_averagevalue_target = 10 + +# Adjust these if you want different min/max values +hpa_min_replicas = 1 +hpa_max_replicas = 2 diff --git a/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/variables.tf b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/variables.tf new file mode 100644 index 000000000..fff306d01 --- /dev/null +++ b/tutorials-and-examples/inference-servers/jetstream/maxtext/single-host-inference/terraform/variables.tf @@ -0,0 +1,127 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+variable "credentials_config" {
+  description = "Configure how Terraform authenticates to the cluster."
+  type = object({
+    fleet_host = optional(string)
+    kubeconfig = optional(object({
+      context = optional(string)
+      path    = optional(string, "~/.kube/config")
+    }))
+  })
+  nullable = false # a null input falls back to the default below; the validation and provider config dereference this object
+  default = { # used when the caller supplies nothing (or null): the local kubeconfig
+    kubeconfig = {
+      path : "~/.kube/config"
+    }
+  }
+  validation { # XOR: exactly one of fleet_host / kubeconfig may be non-null
+    condition = (
+      (var.credentials_config.fleet_host != null) !=
+      (var.credentials_config.kubeconfig != null)
+    )
+    error_message = "Exactly one of fleet host or kubeconfig must be set."
+  }
+}
+
+variable "namespace" {
+  description = "Namespace used for Jetstream resources."
+  type        = string
+  nullable    = false
+  default     = "default"
+}
+
+variable "maxengine_server_image" {
+  description = "maxengine-server container image"
+  type        = string
+  default     = "us-docker.pkg.dev/cloud-tpu-images/inference/maxengine-server:v0.2.2"
+  nullable    = false
+}
+
+variable "jetstream_http_server_image" {
+  description = "jetstream-http container image"
+  type        = string
+  default     = "us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-http:v0.2.2"
+  nullable    = false
+}
+
+
+variable "bucket_name" {
+  description = "Name of Google Cloud Storage bucket hosting unscanned checkpoints"
+  type        = string
+  nullable    = false
+}
+
+variable "templates_path" {
+  description = "Path where manifest templates will be read from. Set to null to use the default manifests"
+  type        = string
+  default     = null
+}
+
+variable "hpa_type" {
+  description = "How the Jetstream workload should be scaled."
+  type        = string
+  default     = null
+  nullable    = true
+  validation { # null skips the check; otherwise the value must contain "jetstream_" or "memory_used" (regexall is unanchored)
+    condition     = var.hpa_type == null ? true : length(regexall("jetstream_.*", var.hpa_type)) > 0 || length(regexall("memory_used", var.hpa_type)) > 0
+    error_message = "Allowed values for hpa_type are {null, memory_used, jetstream metrics (e.g., \"jetstream_prefill_backlog_size\", \"jetstream_slots_used_percentage\")}."
+  }
+}
+
+variable "hpa_min_replicas" {
+  description = "Minimum number of HPA replicas."
+  type        = number
+  default     = 1
+  nullable    = false
+}
+
+variable "hpa_max_replicas" {
+  description = "Maximum number of HPA replicas."
+  type        = number
+  default     = 5
+  nullable    = false
+}
+
+# TODO: combine hpa variables into a single object (so that they can be
+# validated together)
+variable "hpa_averagevalue_target" {
+  description = "AverageValue target for the `hpa_type` metric. Must be set if `hpa_type` is not null."
+  type        = number
+  default     = null
+  nullable    = true
+}
+
+variable "project_id" {
+  description = "Project id of existing or created project."
+  type        = string
+  nullable    = false
+}
+
+variable "custom_metrics_enabled" {
+  description = "Enable custom metrics collection"
+  type        = bool
+  default     = false
+  nullable    = false
+}
+
+variable "metrics_port" {
+  description = "Port to scrape metrics from"
+  type        = number
+  nullable    = true # NOTE(review): no default is declared, so a value (or an explicit null) must always be supplied - confirm this is intended
+}
\ No newline at end of file