Skip to content

Commit

Permalink
fix: refactor kuberay-monitoring and inference server (#773)
Browse files Browse the repository at this point in the history
fix: refactor kube-monitoring

Change-Id: Ib74be5baa3e3cdb2348532ae4a3d1831ecde37d1

Co-authored-by: Gen Lu <[email protected]>
  • Loading branch information
genlu2011 and Gen Lu committed Aug 16, 2024
1 parent 1f4c968 commit 98074e1
Show file tree
Hide file tree
Showing 23 changed files with 465 additions and 217 deletions.
16 changes: 15 additions & 1 deletion applications/rag/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ module "kuberay-cluster" {
}

module "inference-server" {
source = "../../tutorials-and-examples/hf-tgi"
source = "../../modules/inference-service"
providers = { kubernetes = kubernetes.rag }
namespace = local.kubernetes_namespace
additional_labels = var.additional_labels
Expand Down Expand Up @@ -324,3 +324,17 @@ module "frontend" {
members_allowlist = var.frontend_members_allowlist != "" ? split(",", var.frontend_members_allowlist) : []
depends_on = [module.namespace]
}

# Deploys Google Managed Prometheus (GMP) scrape configs for the RAG app's
# workloads via the local gmp-engine chart; targets come from podmonitoring.yaml.
resource "helm_release" "gmp-apps" {
  name      = "gmp-apps"
  provider  = helm.rag
  chart     = "../../charts/gmp-engine/"
  namespace = local.kubernetes_namespace
  # Timeout is increased to guarantee sufficient scale-up time for Autopilot nodes.
  timeout = 1200
  # Install after the monitored workloads so their pods/labels exist.
  depends_on = [module.inference-server, module.frontend]
  values = [
    "${file("${path.module}/podmonitoring.yaml")}"
  ]
}

11 changes: 11 additions & 0 deletions applications/rag/podmonitoring.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Helm values consumed by the gmp-engine chart (passed via helm_release
# "gmp-apps" in main.tf); each entry becomes a GMP PodMonitoring target.
podMonitoring:
  # Scrape the inference server's metrics port every 30s.
  - name: mistral-7b-instruct
    selector:
      app: mistral-7b-instruct
    port: metrics
    interval: 30s
  # Scrape the RAG frontend's metrics port every 30s.
  # NOTE(review): assumes the frontend exposes a port named "metrics" — confirm
  # against the frontend deployment.
  - name: rag-frontend
    selector:
      app: rag-frontend
    port: metrics
    interval: 30s
28 changes: 28 additions & 0 deletions charts/gmp-engine/Chart.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
apiVersion: v2
name: gmp-engine
description: A Helm chart for Kubernetes

# A chart can be either an 'application' or a 'library' chart.
#
# Application charts are a collection of templates that can be packaged into versioned archives
# to be deployed.
#
# Library charts provide useful utilities or functions for the chart developer. They're included as
# a dependency of application charts to inject those utilities and functions into the rendering
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
type: application

# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.1.0

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "1.0.0"

dependencies:
  # gmp-frontend is vendored under charts/gmp-frontend; its install is gated by
  # the gmp-frontend.enabled value (defaults to false in values.yaml).
  # NOTE(review): Helm dependency entries conventionally carry a `version`
  # field (and `repository` for remote charts) — confirm `helm lint`/`helm
  # dependency build` accept this entry without one.
  - name: gmp-frontend
    condition: gmp-frontend.enabled
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
apiVersion: v2
name: gmp-engine
name: gmp-frontend
description: A Helm chart for Kubernetes

# A chart can be either an 'application' or a 'library' chart.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,14 @@ spec:
labels:
app: {{ .Values.name }}
spec:
serviceAccountName: {{ .Values.serviceAccount }}
serviceAccountName: {{ required "serviceAccount is required!" .Values.serviceAccount }}
containers:
- name: {{ .Values.name }}
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
imagePullPolicy: {{ .Values.image.pullPolicy }}
args:
- "--web.listen-address=:9090"
- "--query.project-id={{ .Values.projectID }}"
- "--query.project-id={{ required "projectID is required!" .Values.projectID }}"
ports:
- name: web
containerPort: 9090
Expand Down
17 changes: 17 additions & 0 deletions charts/gmp-engine/charts/gmp-frontend/values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Default values for gmp-frontend.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.

name: "gmp-frontend"
# GCP project whose Managed Prometheus data the frontend queries.
# Required: the deployment template fails rendering when left empty.
projectID: ""
# Kubernetes ServiceAccount the frontend pods run as. Required (template
# rejects an empty value).
serviceAccount: ""

image:
  repository: gke.gcr.io/prometheus-engine/frontend
  pullPolicy: IfNotPresent
  tag: "v0.5.0-gke.0"

replicaCount: 2

# NOTE(review): 1 millicore CPU and 5Mi memory are unusually low — confirm
# these are intentional minimal requests and not typos.
cpu: "1m"
memory: "5Mi"
10 changes: 10 additions & 0 deletions charts/gmp-engine/values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Default values for gmp-engine.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.

# List of PodMonitoring scrape targets; overridden per-application (see
# applications/rag/podmonitoring.yaml for the expected entry shape).
podMonitoring: []

# Optional gmp-frontend subchart (query UI for Managed Prometheus); disabled
# by default. projectID and serviceAccount are required when enabled.
gmp-frontend:
  enabled: false
  projectID: ""
  serviceAccount: ""
2 changes: 2 additions & 0 deletions modules/inference-service/README
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Inference Service
This module is currently designed specifically for the Mistral-7B-Instruct-v0.1 model. Future developments will expand the module to support the creation of customized models more broadly.
187 changes: 187 additions & 0 deletions modules/inference-service/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

locals {
  # Parse the comma-separated "key1=value1,key2=value2" string into a map;
  # an empty input string yields an empty map.
  # NOTE(review): assumes keys/values contain no ',' or '=' — a value with an
  # embedded '=' would be truncated at the first '='.
  additional_labels = length(var.additional_labels) == 0 ? {} : tomap({
    for item in split(",", var.additional_labels) :
    split("=", item)[0] => split("=", item)[1]
  })
}

# Internal LoadBalancer Service fronting the TGI inference deployment
# (selects pods labeled app=mistral-7b-instruct).
resource "kubernetes_service" "inference_service" {
  metadata {
    name = "mistral-7b-instruct-service"
    labels = {
      app = "mistral-7b-instruct"
    }
    namespace = var.namespace
    annotations = {
      # Provision a GCP internal (VPC-only) load balancer, not a public one.
      "cloud.google.com/load-balancer-type" = "Internal"
      # Enable container-native load balancing via Network Endpoint Groups.
      "cloud.google.com/neg" = "{\"ingress\":true}"
    }
  }
  spec {
    selector = {
      app = "mistral-7b-instruct"
    }
    # Route each client IP to the same pod.
    session_affinity = "ClientIP"
    # Expose port 80 externally, forwarding to the container's port 8080.
    port {
      protocol    = "TCP"
      port        = 80
      target_port = 8080
    }

    type = "LoadBalancer"
  }
}

# Deploys the Mistral-7B-Instruct-v0.1 model behind HuggingFace
# text-generation-inference (TGI). An init container stages the weights from
# GCS into a shared emptyDir before the server starts.
resource "kubernetes_deployment" "inference_deployment" {
  # Image pull + model download can take well over the default create timeout.
  timeouts {
    create = "30m"
  }
  metadata {
    name      = "mistral-7b-instruct"
    namespace = var.namespace
    labels = merge({
      app = "mistral-7b-instruct"
    }, local.additional_labels)
  }

  spec {
    # It takes more than 10m for the deployment to be ready on Autopilot cluster
    # Set the progress deadline to 30m to avoid the deployment controller
    # considering the deployment to be failed
    progress_deadline_seconds = 1800
    replicas                  = 1

    selector {
      match_labels = merge({
        app = "mistral-7b-instruct"
      }, local.additional_labels)
    }

    template {
      metadata {
        labels = merge({
          app = "mistral-7b-instruct"
        }, local.additional_labels)
      }

      spec {
        # Stage the model weights into the shared "model-storage" volume;
        # gsutil cp -r creates /model-data/Mistral-7B-Instruct-v0.1/.
        init_container {
          name    = "download-model"
          image   = "google/cloud-sdk:473.0.0-alpine"
          command = ["gsutil", "cp", "-r", "gs://vertex-model-garden-public-us/mistralai/Mistral-7B-Instruct-v0.1/", "/model-data/"]
          volume_mount {
            mount_path = "/model-data"
            name       = "model-storage"
          }
        }
        container {
          image = "ghcr.io/huggingface/text-generation-inference:1.1.0"
          name  = "mistral-7b-instruct"

          # Named "metrics" so the PodMonitoring scrape config can target it;
          # the same port serves inference traffic (see PORT below).
          port {
            name           = "metrics"
            container_port = 8080
            protocol       = "TCP"
          }

          args = ["--model-id", "$(MODEL_ID)"]

          # Point TGI at the locally staged weights rather than the HF Hub.
          env {
            name  = "MODEL_ID"
            value = "/model/Mistral-7B-Instruct-v0.1"
          }

          # Shard the model across the two GPUs requested below.
          env {
            name  = "NUM_SHARD"
            value = "2"
          }

          env {
            name  = "PORT"
            value = "8080"
          }

          resources {
            limits = {
              "nvidia.com/gpu" = "2"
            }
            requests = {
              # Sufficient storage to fit the Mistral-7B-Instruct-v0.1 model
              "ephemeral-storage" = "20Gi"
              "nvidia.com/gpu"    = "2"
            }
          }

          volume_mount {
            mount_path = "/dev/shm"
            name       = "dshm"
          }

          volume_mount {
            mount_path = "/data"
            name       = "data"
          }

          volume_mount {
            mount_path = "/model"
            name       = "model-storage"
            # Bool literal instead of the original string "true" (both are
            # accepted by the provider; the bool is idiomatic HCL).
            read_only = true
          }

          # TODO(review): no liveness/readiness probe is configured. An earlier
          # draft probed HTTP :8080; add a probe once the server's health
          # endpoint and warm-up time are confirmed.
        }

        # Memory-backed /dev/shm for inter-process tensor sharing.
        volume {
          name = "dshm"
          empty_dir {
            medium = "Memory"
          }
        }

        volume {
          name = "data"
          empty_dir {}
        }

        # Holds the downloaded model weights; shared between the init container
        # (writer) and the TGI container (read-only).
        volume {
          name = "model-storage"
          empty_dir {}
        }

        # Require NVIDIA L4 GPUs; on Autopilot additionally request local-SSD
        # ephemeral storage and the Accelerator compute class.
        node_selector = merge({
          "cloud.google.com/gke-accelerator" = "nvidia-l4"
        }, var.autopilot_cluster ? {
          "cloud.google.com/gke-ephemeral-storage-local-ssd" = "true"
          "cloud.google.com/compute-class"                   = "Accelerator"
        } : {})
      }
    }
  }
}
28 changes: 28 additions & 0 deletions modules/inference-service/outputs.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

output "inference_service_name" {
  description = "Name of model inference service"
  value       = kubernetes_service.inference_service.metadata[0].name
}

output "inference_service_namespace" {
  description = "Namespace of model inference service"
  value       = kubernetes_service.inference_service.metadata[0].namespace
}

output "inference_service_endpoint" {
  description = "Endpoint (internal load balancer IP) of the model inference service; empty until the LB is provisioned."
  # try() returns "" whenever any step of the traversal is null, absent, or an
  # empty list. The original nested null-checks missed the case where
  # load_balancer[0].ingress is an empty list (LB not yet provisioned), which
  # would make ingress[0] an out-of-range index.
  value = try(kubernetes_service.inference_service.status[0].load_balancer[0].ingress[0].ip, "")
}
31 changes: 31 additions & 0 deletions modules/inference-service/variables.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

variable "namespace" {
  type        = string
  description = "Kubernetes namespace where resources are deployed"
  default     = "default"
}

variable "additional_labels" {
  // string is used instead of map(string) since blueprint metadata does not support maps.
  type        = string
  description = "Additional labels to add to Kubernetes resources."
  default     = ""
}

variable "autopilot_cluster" {
  type = bool
  # Description added for consistency with the other variables in this module.
  description = "Set to true when deploying to a GKE Autopilot cluster; enables Autopilot-specific node selectors (local-SSD ephemeral storage, Accelerator compute class)."
  default     = false
}
Loading

0 comments on commit 98074e1

Please sign in to comment.