Skip to content

Commit

Permalink
fix: refactor kuberay-monitoring and inference server (#773)
Browse files Browse the repository at this point in the history
fix: refactor kube-monitoring

Change-Id: Ib74be5baa3e3cdb2348532ae4a3d1831ecde37d1

Co-authored-by: Gen Lu <[email protected]>
  • Loading branch information
genlu2011 and Gen Lu committed Aug 16, 2024
1 parent 1f4c968 commit 98074e1
Show file tree
Hide file tree
Showing 23 changed files with 465 additions and 217 deletions.
16 changes: 15 additions & 1 deletion applications/rag/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ module "kuberay-cluster" {
}

module "inference-server" {
source = "../../tutorials-and-examples/hf-tgi"
source = "../../modules/inference-service"
providers = { kubernetes = kubernetes.rag }
namespace = local.kubernetes_namespace
additional_labels = var.additional_labels
Expand Down Expand Up @@ -324,3 +324,17 @@ module "frontend" {
members_allowlist = var.frontend_members_allowlist != "" ? split(",", var.frontend_members_allowlist) : []
depends_on = [module.namespace]
}

# Deploys Google Managed Prometheus (GMP) scrape configs for the RAG app's
# workloads via the local gmp-engine chart; targets come from podmonitoring.yaml.
resource "helm_release" "gmp-apps" {
  name      = "gmp-apps"
  provider  = helm.rag
  chart     = "../../charts/gmp-engine/"
  namespace = local.kubernetes_namespace
  # Timeout is increased to guarantee sufficient scale-up time for Autopilot nodes.
  timeout = 1200
  # Install after the monitored workloads so their pods/labels exist.
  depends_on = [module.inference-server, module.frontend]
  values = [
    "${file("${path.module}/podmonitoring.yaml")}"
  ]
}

11 changes: 11 additions & 0 deletions applications/rag/podmonitoring.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Helm values consumed by the gmp-engine chart (passed via helm_release
# "gmp-apps" in main.tf); each entry becomes a GMP PodMonitoring target.
podMonitoring:
  # Scrape the inference server's metrics port every 30s.
  - name: mistral-7b-instruct
    selector:
      app: mistral-7b-instruct
    port: metrics
    interval: 30s
  # Scrape the RAG frontend's metrics port every 30s.
  # NOTE(review): assumes the frontend exposes a port named "metrics" — confirm
  # against the frontend deployment.
  - name: rag-frontend
    selector:
      app: rag-frontend
    port: metrics
    interval: 30s
28 changes: 28 additions & 0 deletions charts/gmp-engine/Chart.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
apiVersion: v2
name: gmp-engine
description: A Helm chart for Kubernetes

# A chart can be either an 'application' or a 'library' chart.
#
# Application charts are a collection of templates that can be packaged into versioned archives
# to be deployed.
#
# Library charts provide useful utilities or functions for the chart developer. They're included as
# a dependency of application charts to inject those utilities and functions into the rendering
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
type: application

# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.1.0

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "1.0.0"

dependencies:
  # gmp-frontend is vendored under charts/gmp-frontend; its install is gated by
  # the gmp-frontend.enabled value (defaults to false in values.yaml).
  # NOTE(review): Helm dependency entries conventionally carry a `version`
  # field (and `repository` for remote charts) — confirm `helm lint`/`helm
  # dependency build` accept this entry without one.
  - name: gmp-frontend
    condition: gmp-frontend.enabled
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
apiVersion: v2
name: gmp-engine
name: gmp-frontend
description: A Helm chart for Kubernetes

# A chart can be either an 'application' or a 'library' chart.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,14 @@ spec:
labels:
app: {{ .Values.name }}
spec:
serviceAccountName: {{ .Values.serviceAccount }}
serviceAccountName: {{ required "serviceAccount is required!" .Values.serviceAccount }}
containers:
- name: {{ .Values.name }}
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
imagePullPolicy: {{ .Values.image.pullPolicy }}
args:
- "--web.listen-address=:9090"
- "--query.project-id={{ .Values.projectID }}"
- "--query.project-id={{ required "projectID is required!" .Values.projectID }}"
ports:
- name: web
containerPort: 9090
Expand Down
17 changes: 17 additions & 0 deletions charts/gmp-engine/charts/gmp-frontend/values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Default values for gmp-frontend.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.

name: "gmp-frontend"
# GCP project whose Managed Prometheus data the frontend queries.
# Required: the deployment template fails rendering when left empty.
projectID: ""
# Kubernetes ServiceAccount the frontend pods run as. Required (template
# rejects an empty value).
serviceAccount: ""

image:
  repository: gke.gcr.io/prometheus-engine/frontend
  pullPolicy: IfNotPresent
  tag: "v0.5.0-gke.0"

replicaCount: 2

# NOTE(review): 1 millicore CPU and 5Mi memory are unusually low — confirm
# these are intentional minimal requests and not typos.
cpu: "1m"
memory: "5Mi"
10 changes: 10 additions & 0 deletions charts/gmp-engine/values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Default values for gmp-engine.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.

# List of PodMonitoring scrape targets; overridden per-application (see
# applications/rag/podmonitoring.yaml for the expected entry shape).
podMonitoring: []

# Optional gmp-frontend subchart (query UI for Managed Prometheus); disabled
# by default. projectID and serviceAccount are required when enabled.
gmp-frontend:
  enabled: false
  projectID: ""
  serviceAccount: ""
2 changes: 2 additions & 0 deletions modules/inference-service/README
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Inference Service
This module is currently designed specifically for the Mistral-7B-Instruct-v0.1 model. Future developments will expand the module to support the creation of customized models more broadly.
187 changes: 187 additions & 0 deletions modules/inference-service/main.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

locals {
  # Parse the comma-separated "key1=value1,key2=value2" string into a map;
  # an empty input string yields an empty map.
  # NOTE(review): assumes keys/values contain no ',' or '=' — a value with an
  # embedded '=' would be truncated at the first '='.
  additional_labels = length(var.additional_labels) == 0 ? {} : tomap({
    for item in split(",", var.additional_labels) :
    split("=", item)[0] => split("=", item)[1]
  })
}

# Internal LoadBalancer Service fronting the TGI inference deployment
# (selects pods labeled app=mistral-7b-instruct).
resource "kubernetes_service" "inference_service" {
  metadata {
    name = "mistral-7b-instruct-service"
    labels = {
      app = "mistral-7b-instruct"
    }
    namespace = var.namespace
    annotations = {
      # Provision a GCP internal (VPC-only) load balancer, not a public one.
      "cloud.google.com/load-balancer-type" = "Internal"
      # Enable container-native load balancing via Network Endpoint Groups.
      "cloud.google.com/neg" = "{\"ingress\":true}"
    }
  }
  spec {
    selector = {
      app = "mistral-7b-instruct"
    }
    # Route each client IP to the same pod.
    session_affinity = "ClientIP"
    # Expose port 80 externally, forwarding to the container's port 8080.
    port {
      protocol    = "TCP"
      port        = 80
      target_port = 8080
    }

    type = "LoadBalancer"
  }
}

# Deploys the Mistral-7B-Instruct-v0.1 model behind HuggingFace
# text-generation-inference (TGI). An init container stages the weights from
# GCS into a shared emptyDir before the server starts.
resource "kubernetes_deployment" "inference_deployment" {
  # Image pull + model download can take well over the default create timeout.
  timeouts {
    create = "30m"
  }
  metadata {
    name      = "mistral-7b-instruct"
    namespace = var.namespace
    labels = merge({
      app = "mistral-7b-instruct"
    }, local.additional_labels)
  }

  spec {
    # It takes more than 10m for the deployment to be ready on Autopilot cluster
    # Set the progress deadline to 30m to avoid the deployment controller
    # considering the deployment to be failed
    progress_deadline_seconds = 1800
    replicas                  = 1

    selector {
      match_labels = merge({
        app = "mistral-7b-instruct"
      }, local.additional_labels)
    }

    template {
      metadata {
        labels = merge({
          app = "mistral-7b-instruct"
        }, local.additional_labels)
      }

      spec {
        # Stage the model weights into the shared "model-storage" volume;
        # gsutil cp -r creates /model-data/Mistral-7B-Instruct-v0.1/.
        init_container {
          name    = "download-model"
          image   = "google/cloud-sdk:473.0.0-alpine"
          command = ["gsutil", "cp", "-r", "gs://vertex-model-garden-public-us/mistralai/Mistral-7B-Instruct-v0.1/", "/model-data/"]
          volume_mount {
            mount_path = "/model-data"
            name       = "model-storage"
          }
        }
        container {
          image = "ghcr.io/huggingface/text-generation-inference:1.1.0"
          name  = "mistral-7b-instruct"

          # Named "metrics" so the PodMonitoring scrape config can target it;
          # the same port serves inference traffic (see PORT below).
          port {
            name           = "metrics"
            container_port = 8080
            protocol       = "TCP"
          }

          args = ["--model-id", "$(MODEL_ID)"]

          # Point TGI at the locally staged weights rather than the HF Hub.
          env {
            name  = "MODEL_ID"
            value = "/model/Mistral-7B-Instruct-v0.1"
          }

          # Shard the model across the two GPUs requested below.
          env {
            name  = "NUM_SHARD"
            value = "2"
          }

          env {
            name  = "PORT"
            value = "8080"
          }

          resources {
            limits = {
              "nvidia.com/gpu" = "2"
            }
            requests = {
              # Sufficient storage to fit the Mistral-7B-Instruct-v0.1 model
              "ephemeral-storage" = "20Gi"
              "nvidia.com/gpu"    = "2"
            }
          }

          volume_mount {
            mount_path = "/dev/shm"
            name       = "dshm"
          }

          volume_mount {
            mount_path = "/data"
            name       = "data"
          }

          volume_mount {
            mount_path = "/model"
            name       = "model-storage"
            # Bool literal instead of the original string "true" (both are
            # accepted by the provider; the bool is idiomatic HCL).
            read_only = true
          }

          # TODO(review): no liveness/readiness probe is configured. An earlier
          # draft probed HTTP :8080; add a probe once the server's health
          # endpoint and warm-up time are confirmed.
        }

        # Memory-backed /dev/shm for inter-process tensor sharing.
        volume {
          name = "dshm"
          empty_dir {
            medium = "Memory"
          }
        }

        volume {
          name = "data"
          empty_dir {}
        }

        # Holds the downloaded model weights; shared between the init container
        # (writer) and the TGI container (read-only).
        volume {
          name = "model-storage"
          empty_dir {}
        }

        # Require NVIDIA L4 GPUs; on Autopilot additionally request local-SSD
        # ephemeral storage and the Accelerator compute class.
        node_selector = merge({
          "cloud.google.com/gke-accelerator" = "nvidia-l4"
        }, var.autopilot_cluster ? {
          "cloud.google.com/gke-ephemeral-storage-local-ssd" = "true"
          "cloud.google.com/compute-class"                   = "Accelerator"
        } : {})
      }
    }
  }
}
28 changes: 28 additions & 0 deletions modules/inference-service/outputs.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

output "inference_service_name" {
  description = "Name of model inference service"
  value       = kubernetes_service.inference_service.metadata[0].name
}

output "inference_service_namespace" {
  description = "Namespace of model inference service"
  value       = kubernetes_service.inference_service.metadata[0].namespace
}

output "inference_service_endpoint" {
  description = "Endpoint (internal load balancer IP) of the model inference service; empty until the LB is provisioned."
  # try() returns "" whenever any step of the traversal is null, absent, or an
  # empty list. The original nested null-checks missed the case where
  # load_balancer[0].ingress is an empty list (LB not yet provisioned), which
  # would make ingress[0] an out-of-range index.
  value = try(kubernetes_service.inference_service.status[0].load_balancer[0].ingress[0].ip, "")
}
31 changes: 31 additions & 0 deletions modules/inference-service/variables.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

variable "namespace" {
  type        = string
  description = "Kubernetes namespace where resources are deployed"
  default     = "default"
}

variable "additional_labels" {
  // string is used instead of map(string) since blueprint metadata does not support maps.
  type        = string
  description = "Additional labels to add to Kubernetes resources."
  default     = ""
}

variable "autopilot_cluster" {
  type = bool
  # Description added for consistency with the other variables in this module.
  description = "Set to true when deploying to a GKE Autopilot cluster; enables Autopilot-specific node selectors (local-SSD ephemeral storage, Accelerator compute class)."
  default     = false
}
Loading

0 comments on commit 98074e1

Please sign in to comment.