kubernetes: first helm working with monitoring

phymbert · Feb 27, 2024 · cffa3a6 · cffa3a6
1 parent 19bc58e
commit cffa3a6
Show file tree

Hide file tree

Showing 23 changed files with 548 additions and 236 deletions.
diff --git a/.github/workflows/kubernetes.yaml b/.github/workflows/kubernetes.yaml
@@ -16,8 +16,39 @@ jobs:
       - name: Check out the repo
         uses: actions/checkout@v3
 
+      - name: Set up Helm
+        uses: azure/setup-helm@v3
+        with:
+          version: v3.12.1
+
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
+          check-latest: true
+
+      - name: Set up chart-testing
+        uses: helm/chart-testing-action@v2.6.1
+
+      - name: Install repo
+        id: install-repo
+        run: |
+          ct install --chart-dirs examples/kubernetes/ --charts examples/kubernetes/
+
+      - name: Run chart-testing (list-changed)
+        id: list-changed
+        run: |
+          changed=$(ct list-changed --target-branch ${{ github.event.repository.default_branch }})
+          if [[ -n "$changed" ]]; then
+            echo "changed=true" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Run chart-testing (lint)
+        if: steps.list-changed.outputs.changed == 'true'
+        run: ct lint --target-branch ${{ github.event.repository.default_branch }}
+
       - name: Create k8s Kind Cluster
         id: create_kind_cluster
+        if: steps.list-changed.outputs.changed == 'true'
         uses: container-tools/kind-action@v1
 #
 #      - name: Build and push Docker image
@@ -33,6 +64,7 @@ jobs:
 
       - name: Docker build
         id: docker_build
+        if: steps.list-changed.outputs.changed == 'true'
         run: |
           kubectl cluster-info
           kubectl get storageclass standard
@@ -48,12 +80,18 @@ jobs:
 
       - name: Deploy prometheus
         id: deploy_prometheus
+        if: steps.list-changed.outputs.changed == 'true'
         run: |
           helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
           helm repo update
           helm install kube-prometheus-stack prometheus-community/kube-prometheus-stack
 
       - name: Deploy llama.cpp server
         id: deploy_llama_cpp
+        if: steps.list-changed.outputs.changed == 'true'
         run: |
           helm install llama-cpp-stack ./examples/kubernetes/helm-charts
+      - name: Set up Helm
+        uses: azure/setup-helm@v3
+        with:
+          version: v3.12.1
diff --git a/examples/kubernetes/README.md b/examples/kubernetes/README.md
@@ -4,6 +4,15 @@ This example demonstrates how to deploy [llama.cpp server](../server) on a [kube
 
 ![llama.cpp.kubernetes.png](llama.cpp.kubernetes.png)
 
+We provide a [Helm chart](https://helm.sh/) repository to deploy llama.cpp at scale for completions and embeddings:
+
+```shell
+
+helm repo add llama.cpp https://ggerganov.github.io/llama.cpp
+helm repo update
+helm install example llama-cpp --namespace llama-cpp --create-namespace
+```
+
 ## Prerequisites
 
 Obviously you need a kubernetes cluster.
@@ -15,11 +24,26 @@ Required access to an API server with the following `roles`:
 
 If you do not have a real k8s cluster, you can give a try to [kind](https://kind.sigs.k8s.io/).
 
+### Metrics monitoring
+
+You might want to deploy the Prometheus Helm chart:
+
+```shell
+helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+helm repo update
+helm install \
+    --set prometheus.prometheusSpec.podMonitorSelectorNilUseHelmValues=false \
+    kube-prometheus-stack prometheus-community/kube-prometheus-stack \
+    --create-namespace \
+    --namespace monitoring
+```
+
 ## Goals
 
 Deploy a production ready LLM API over kubernetes, including:
 - High availability
 - multi models
+- support of embeddings and completions models
 - load balancing
 - Auto scaling
 - Security 
@@ -36,6 +60,7 @@ This example does not cover [NVidia based docker engine](https://docs.nvidia.com
 **Approach**
 1. Models file are downloaded once on a `PV` by a `Job` when the stack is deployed
 2. Server `Deployment` is using an init containers to verify if the model is downloaded
+3. `Ingress` rules route incoming requests to the target models
 4. `Probes` are used to monitor the `pods` healthiness
 5. [Prometheus](https://prometheus.io/) is used as the metrics server
 
diff --git a/examples/kubernetes/helm-charts/Chart.yaml b/examples/kubernetes/helm-charts/Chart.yaml
diff --git a/examples/kubernetes/helm-charts/templates/deployment.yaml b/examples/kubernetes/helm-charts/templates/deployment.yaml
diff --git a/examples/kubernetes/helm-charts/templates/serviceaccount.yaml b/examples/kubernetes/helm-charts/templates/serviceaccount.yaml
diff --git a/examples/kubernetes/helm-charts/values.yaml b/examples/kubernetes/helm-charts/values.yaml
diff --git a/examples/kubernetes/index.yaml b/examples/kubernetes/index.yaml
@@ -0,0 +1,3 @@
+apiVersion: v1
+entries: {}
+generated: "2024-02-27T12:54:11.664702812+01:00"
diff --git a/examples/kubernetes/helm-charts/.helmignore → examples/kubernetes/llama-cpp/.helmignore b/examples/kubernetes/helm-charts/.helmignore → examples/kubernetes/llama-cpp/.helmignore
diff --git a/examples/kubernetes/llama-cpp/Chart.yaml b/examples/kubernetes/llama-cpp/Chart.yaml
@@ -0,0 +1,6 @@
+apiVersion: v2
+name: llama-cpp
+description: llama.cpp Helm chart for Kubernetes
+type: application
+version: 0.0.1
+appVersion: "cbbd1efa06f8c09f9dff58ff9d9af509cc4c152b"
diff --git a/...ubernetes/helm-charts/templates/NOTES.txt → .../kubernetes/llama-cpp/templates/NOTES.txt b/...ubernetes/helm-charts/templates/NOTES.txt → .../kubernetes/llama-cpp/templates/NOTES.txt
@@ -1,10 +1,16 @@
 1. Get the application URL by running these commands:
-{{- if .Values.ingress.enabled }}
-{{- range $host := .Values.ingress.hosts }}
+{{- if .Values.ingresses.completions.enabled }}
+{{- range $host := .Values.ingresses.completions.hosts }}
   {{- range .paths }}
-  http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.host }}{{ .path }}
+  http{{ if $.Values.ingresses.completions.tls }}s{{ end }}://{{ if .host }}{{ .host }}{{else}}localhost{{ end }}{{ .path }} --data '{"messages": [{"role": "user", "message":"hello llama.cpp"}]}'
   {{- end }}
 {{- end }}
+{{- else if .Values.ingresses.embeddings.enabled }}
+{{- range $host := .Values.ingresses.embeddings.hosts }}
+  {{- range .paths }}
+  curl http{{ if $.Values.ingresses.embeddings.tls }}s{{ end }}://{{ $host.host }}{{ .path }} --data '{"input": "hello llama.cpp"}'
+  {{- end }}
+{{- end }}
 {{- else if contains "NodePort" .Values.service.type }}
   export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "server.llama.cpp.fullname" . }})
   export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}")

diff --git a/...rnetes/helm-charts/templates/_helpers.tpl → ...bernetes/llama-cpp/templates/_helpers.tpl b/...rnetes/helm-charts/templates/_helpers.tpl → ...bernetes/llama-cpp/templates/_helpers.tpl