diff --git a/.github/workflows/kubernetes.yaml b/.github/workflows/kubernetes.yaml index 61a15b53df1f6..4447b68bc76ef 100644 --- a/.github/workflows/kubernetes.yaml +++ b/.github/workflows/kubernetes.yaml @@ -16,8 +16,34 @@ jobs: - name: Check out the repo uses: actions/checkout@v3 + - name: Set up Helm + uses: azure/setup-helm@v3 + with: + version: v3.12.1 + + - uses: actions/setup-python@v4 + with: + python-version: '3.10' + check-latest: true + + - name: Set up chart-testing + uses: helm/chart-testing-action@v2.6.0 + + - name: Run chart-testing (list-changed) + id: list-changed + run: | + changed=$(ct list-changed --chart-dirs examples/kubernetes --target-branch ${{ github.event.repository.default_branch }}) + if [[ -n "$changed" ]]; then + echo "changed=true" >> "$GITHUB_OUTPUT" + fi + + - name: Run chart-testing (lint) + if: steps.list-changed.outputs.changed == 'true' + run: ct lint --chart-dirs examples/kubernetes --target-branch ${{ github.event.repository.default_branch }} + - name: Create k8s Kind Cluster id: create_kind_cluster + if: steps.list-changed.outputs.changed == 'true' uses: container-tools/kind-action@v1 # # - name: Build and push Docker image @@ -33,6 +59,7 @@ jobs: - name: Docker build id: docker_build + if: steps.list-changed.outputs.changed == 'true' run: | kubectl cluster-info kubectl get storageclass standard @@ -48,6 +75,7 @@ jobs: - name: Deploy prometheus id: deploy_prometheus + if: steps.list-changed.outputs.changed == 'true' run: | helm repo add prometheus-community https://prometheus-community.github.io/helm-charts helm repo update @@ -55,5 +83,7 @@ jobs: - name: Deploy llama.cpp server id: deploy_llama_cpp + if: steps.list-changed.outputs.changed == 'true' run: | - helm install llama-cpp-stack ./examples/kubernetes/helm-charts + # Install from the working tree so the chart under review is the one deployed + helm install llama-cpp-stack ./examples/kubernetes/llama-cpp diff --git a/examples/kubernetes/README.md 
b/examples/kubernetes/README.md index 0bad7ce89a391..5a0806977ae77 100644 --- a/examples/kubernetes/README.md +++ b/examples/kubernetes/README.md @@ -4,6 +4,15 @@ This example demonstrates how to deploy [llama.cpp server](../server) on a [kube ![llama.cpp.kubernetes.png](llama.cpp.kubernetes.png) +We provide a [Helm chart](https://helm.sh/) repository to deploy llama.cpp at scale for completions and embeddings: + +```shell + +helm repo add llama.cpp https://ggerganov.github.io/llama.cpp +helm repo update +helm install example llama.cpp/llama-cpp --namespace llama-cpp --create-namespace +``` + ## Prerequisites Obviously you need a kubernetes cluster. @@ -15,11 +24,26 @@ Required access to an API server with the following `roles`: If you do not have a real k8s cluster, you can give a try to [kind](https://kind.sigs.k8s.io/). +### Metrics monitoring + +You might want to deploy the Prometheus Helm chart: + +```shell +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +helm repo update +helm install \ + --set prometheus.prometheusSpec.podMonitorSelectorNilUseHelmValues=false \ + kube-prometheus-stack prometheus-community/kube-prometheus-stack \ + --create-namespace \ + --namespace monitoring +``` + ## Goals Deploy a production ready LLM API over kubernetes, including: - High availability - multi models +- support for embeddings and completions models - load balancing - Auto scaling - Security @@ -36,6 +60,7 @@ This example does not cover [NVidia based docker engine](https://docs.nvidia.com **Approach** 1. Models file are downloaded once on a `PV` by a `Job` when the stack is deployed 2. Server `Deployment` is using an init containers to verify if the model is downloaded +3. `Ingress` rules route incoming requests to the target models 3. `Probes` are used to monitor the `pods` healthiness 4. 
[Prometheus](https://prometheus.io/) is used as the metrics server diff --git a/examples/kubernetes/helm-charts/Chart.yaml b/examples/kubernetes/helm-charts/Chart.yaml deleted file mode 100644 index 44540d42838a8..0000000000000 --- a/examples/kubernetes/helm-charts/Chart.yaml +++ /dev/null @@ -1,24 +0,0 @@ -apiVersion: v2 -name: server.llama.cpp -description: llama.cpp Helm chart for Kubernetes - -# A chart can be either an 'application' or a 'library' chart. -# -# Application charts are a collection of templates that can be packaged into versioned archives -# to be deployed. -# -# Library charts provide useful utilities or functions for the chart developer. They're included as -# a dependency of application charts to inject those utilities and functions into the rendering -# pipeline. Library charts do not define any templates and therefore cannot be deployed. -type: application - -# This is the chart version. This version number should be incremented each time you make changes -# to the chart and its templates, including the app version. -# Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.1.0 - -# This is the version number of the application being deployed. This version number should be -# incremented each time you make changes to the application. Versions are not expected to -# follow Semantic Versioning. They should reflect the version the application is using. -# It is recommended to use it with quotes. -appVersion: "1.16.0" diff --git a/examples/kubernetes/helm-charts/templates/deployment.yaml b/examples/kubernetes/helm-charts/templates/deployment.yaml deleted file mode 100644 index 7b22a35c76dd6..0000000000000 --- a/examples/kubernetes/helm-charts/templates/deployment.yaml +++ /dev/null @@ -1,68 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: {{ include "server.llama.cpp.fullname" . }} - labels: - {{- include "server.llama.cpp.labels" . 
| nindent 4 }} -spec: - {{- if not .Values.autoscaling.enabled }} - replicas: {{ .Values.replicaCount }} - {{- end }} - selector: - matchLabels: - {{- include "server.llama.cpp.selectorLabels" . | nindent 6 }} - template: - metadata: - {{- with .Values.podAnnotations }} - annotations: - {{- toYaml . | nindent 8 }} - {{- end }} - labels: - {{- include "server.llama.cpp.labels" . | nindent 8 }} - {{- with .Values.podLabels }} - {{- toYaml . | nindent 8 }} - {{- end }} - spec: - {{- with .Values.imagePullSecrets }} - imagePullSecrets: - {{- toYaml . | nindent 8 }} - {{- end }} - serviceAccountName: {{ include "server.llama.cpp.serviceAccountName" . }} - securityContext: - {{- toYaml .Values.podSecurityContext | nindent 8 }} - containers: - - name: {{ .Chart.Name }} - securityContext: - {{- toYaml .Values.securityContext | nindent 12 }} - image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" - imagePullPolicy: {{ .Values.image.pullPolicy }} - ports: - - name: http - containerPort: {{ .Values.service.port }} - protocol: TCP - livenessProbe: - {{- toYaml .Values.livenessProbe | nindent 12 }} - readinessProbe: - {{- toYaml .Values.readinessProbe | nindent 12 }} - resources: - {{- toYaml .Values.resources | nindent 12 }} - {{- with .Values.volumeMounts }} - volumeMounts: - {{- toYaml . | nindent 12 }} - {{- end }} - {{- with .Values.volumes }} - volumes: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.nodeSelector }} - nodeSelector: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.affinity }} - affinity: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.tolerations }} - tolerations: - {{- toYaml . 
| nindent 8 }} - {{- end }} diff --git a/examples/kubernetes/helm-charts/templates/serviceaccount.yaml b/examples/kubernetes/helm-charts/templates/serviceaccount.yaml deleted file mode 100644 index 1ad075ff18355..0000000000000 --- a/examples/kubernetes/helm-charts/templates/serviceaccount.yaml +++ /dev/null @@ -1,13 +0,0 @@ -{{- if .Values.serviceAccount.create -}} -apiVersion: v1 -kind: ServiceAccount -metadata: - name: {{ include "server.llama.cpp.serviceAccountName" . }} - labels: - {{- include "server.llama.cpp.labels" . | nindent 4 }} - {{- with .Values.serviceAccount.annotations }} - annotations: - {{- toYaml . | nindent 4 }} - {{- end }} -automountServiceAccountToken: {{ .Values.serviceAccount.automount }} -{{- end }} diff --git a/examples/kubernetes/helm-charts/values.yaml b/examples/kubernetes/helm-charts/values.yaml deleted file mode 100644 index 5f41f989fb1a5..0000000000000 --- a/examples/kubernetes/helm-charts/values.yaml +++ /dev/null @@ -1,113 +0,0 @@ -# Default values for server.llama.cpp. -# This is a YAML-formatted file. -# Declare variables to be passed into your templates. - -replicaCount: 1 - -image: - repository: nginx - pullPolicy: IfNotPresent - # Overrides the image tag whose default is the chart appVersion. - tag: "" - -imagePullSecrets: [] -nameOverride: "" -fullnameOverride: "" - -serviceAccount: - # Specifies whether a service account should be created - create: true - # Automatically mount a ServiceAccount's API credentials? - automount: true - # Annotations to add to the service account - annotations: {} - # The name of the service account to use. 
- # If not set and create is true, a name is generated using the fullname template - name: "" - -podAnnotations: {} -podLabels: {} - -podSecurityContext: {} - # fsGroup: 2000 - -securityContext: {} - # capabilities: - # drop: - # - ALL - # readOnlyRootFilesystem: true - # runAsNonRoot: true - # runAsUser: 1000 - -service: - type: ClusterIP - port: 80 - -ingress: - enabled: false - className: "" - annotations: {} - # kubernetes.io/ingress.class: nginx - # kubernetes.io/tls-acme: "true" - hosts: - - host: chart-example.local - paths: - - path: / - pathType: ImplementationSpecific - tls: [] - # - secretName: chart-example-tls - # hosts: - # - chart-example.local - -resources: {} - # We usually recommend not to specify default resources and to leave this as a conscious - # choice for the user. This also increases chances charts run on environments with little - # resources, such as Minikube. If you do want to specify resources, uncomment the following - # lines, adjust them as necessary, and remove the curly braces after 'resources:'. - # limits: - # cpu: 100m - # memory: 128Mi - # requests: - # cpu: 100m - # memory: 128Mi - -startupProbe: - httpGet: - path: /health - port: http - -livenessProbe: - httpGet: - path: /health - port: http - -readinessProbe: - httpGet: - path: / - port: http - -autoscaling: - enabled: true - minReplicas: 1 - maxReplicas: 100 - targetCPUUtilizationPercentage: 80 - # targetMemoryUtilizationPercentage: 80 - -# Additional volumes on the output Deployment definition. -volumes: [] -# - name: foo -# secret: -# secretName: mysecret -# optional: false - -# Additional volumeMounts on the output Deployment definition. 
-volumeMounts: [] -# - name: foo -# mountPath: "/etc/foo" -# readOnly: true - -nodeSelector: {} - -tolerations: [] - -affinity: {} diff --git a/examples/kubernetes/index.yaml b/examples/kubernetes/index.yaml new file mode 100644 index 0000000000000..32edb60096fc7 --- /dev/null +++ b/examples/kubernetes/index.yaml @@ -0,0 +1,3 @@ +apiVersion: v1 +entries: {} +generated: "2024-02-27T12:54:11.664702812+01:00" diff --git a/examples/kubernetes/helm-charts/.helmignore b/examples/kubernetes/llama-cpp/.helmignore similarity index 100% rename from examples/kubernetes/helm-charts/.helmignore rename to examples/kubernetes/llama-cpp/.helmignore diff --git a/examples/kubernetes/llama-cpp/Chart.yaml b/examples/kubernetes/llama-cpp/Chart.yaml new file mode 100644 index 0000000000000..13cd200182883 --- /dev/null +++ b/examples/kubernetes/llama-cpp/Chart.yaml @@ -0,0 +1,6 @@ +apiVersion: v2 +name: llama-cpp +description: llama.cpp Helm chart for Kubernetes +type: application +version: 0.0.1 +appVersion: "cbbd1efa06f8c09f9dff58ff9d9af509cc4c152b" diff --git a/examples/kubernetes/helm-charts/templates/NOTES.txt b/examples/kubernetes/llama-cpp/templates/NOTES.txt similarity index 73% rename from examples/kubernetes/helm-charts/templates/NOTES.txt rename to examples/kubernetes/llama-cpp/templates/NOTES.txt index e01560fb14ecf..44d5a115a3d42 100644 --- a/examples/kubernetes/helm-charts/templates/NOTES.txt +++ b/examples/kubernetes/llama-cpp/templates/NOTES.txt @@ -1,10 +1,16 @@ 1. 
Get the application URL by running these commands: -{{- if .Values.ingress.enabled }} -{{- range $host := .Values.ingress.hosts }} +{{- if .Values.ingresses.completions.enabled }} +{{- range $host := .Values.ingresses.completions.hosts }} {{- range .paths }} - http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.host }}{{ .path }} + curl http{{ if $.Values.ingresses.completions.tls }}s{{ end }}://{{ if $host.host }}{{ $host.host }}{{ else }}localhost{{ end }}{{ .path }} --data '{"messages": [{"role": "user", "message":"hello llama.cpp"}]}' {{- end }} {{- end }} +{{- else if .Values.ingresses.embeddings.enabled }} +{{- range $host := .Values.ingresses.embeddings.hosts }} + {{- range .paths }} + curl http{{ if $.Values.ingresses.embeddings.tls }}s{{ end }}://{{ if $host.host }}{{ $host.host }}{{ else }}localhost{{ end }}{{ .path }} --data '{"input": "hello llama.cpp"}' + {{- end }} +{{- end }} {{- else if contains "NodePort" .Values.service.type }} export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "server.llama.cpp.fullname" . }}) export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") diff --git a/examples/kubernetes/helm-charts/templates/_helpers.tpl b/examples/kubernetes/llama-cpp/templates/_helpers.tpl similarity index 100% rename from examples/kubernetes/helm-charts/templates/_helpers.tpl rename to examples/kubernetes/llama-cpp/templates/_helpers.tpl diff --git a/examples/kubernetes/llama-cpp/templates/deployment.yaml b/examples/kubernetes/llama-cpp/templates/deployment.yaml new file mode 100644 index 0000000000000..2fb09f79cc282 --- /dev/null +++ b/examples/kubernetes/llama-cpp/templates/deployment.yaml @@ -0,0 +1,146 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "server.llama.cpp.fullname" . }} + labels: + {{- include "server.llama.cpp.labels" . 
| nindent 4 }} +spec: + {{- if not .Values.autoscaling.enabled }} + replicas: {{ .Values.replicaCount }} + {{- end }} + selector: + matchLabels: + {{- include "server.llama.cpp.selectorLabels" . | nindent 6 }} + template: + metadata: + annotations: + {{- include "server.llama.cpp.labels" . | nindent 8 }} + {{- if .Values.server.metrics }} + prometheus.io/scrape: 'true' + prometheus.io/port: '{{ .Values.server.port }}' + {{- end }} + {{- with .Values.podAnnotations }} + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + prometheus.io/scrape: 'true' + {{- include "server.llama.cpp.labels" . | nindent 8 }} + {{- with .Values.podLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + initContainers: + - name: wait-model + securityContext: + {{- toYaml .Values.securityContext | nindent 12 }} + image: {{ .Values.images.downloader.repository }}:{{ .Values.images.downloader.name }}-{{ .Values.images.downloader.tag }} + env: + - name: MODEL_PATH + value: {{ .Values.model.path }} + - name: MODEL_FILE + value: {{ regexReplaceAll "(.*/)?([^/]+).gguf" .Values.model.file "${2}.gguf" }} + - name: MODEL_SHA256 + value: {{ .Values.model.sha256 }} + - name: MODEL_DOWNLOAD_REPO + value: {{ .Values.model.repo }} + - name: MODEL_DOWNLOAD_FILE + value: {{ .Values.model.file }} + command: + - sh + - -c + args: + - > + set -eux; + while ! 
echo "${MODEL_SHA256} *${MODEL_PATH}/${MODEL_FILE}" | sha256sum -c - ; do + echo "waiting for model file${MODEL_PATH}/${MODEL_FILE}=${MODEL_SHA256}"; + sleep 1; + done + volumeMounts: + - mountPath: {{ .Values.model.path }} + name: models + readOnly: true + containers: + - name: {{ .Chart.Name }} + securityContext: + {{- toYaml .Values.securityContext | nindent 12 }} + image: "{{ .Values.images.server.repository }}:{{ .Values.images.server.name }}-{{ .Values.images.server.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.images.pullPolicy }} + command: + - {{ .Values.server.command }} + args: + - --host + - {{ .Values.server.host }} + - --port + - "{{ .Values.server.port }}" + - --model + - {{ .Values.model.path }}/{{ regexReplaceAll "(.*/)?([^/]+).gguf" .Values.model.file "${2}.gguf" }} + - --cont-batching + - --alias + - {{ .Values.model.alias }} + - --ctx-size + - "{{ .Values.server.kvCache.size }}" + - --parallel + - "{{ .Values.server.slots }}" + {{- if .Values.server.embeddings }} + - --embedding + {{- end }} + {{- if .Values.server.metrics }} + - --metrics + {{- end }} + - --log-format + - {{ .Values.server.log.format }} + {{- if .Values.server.log.disabled }} + - --log-disable + {{- end }} + {{- with .Values.server.extraArgs }} + {{- toYaml . | nindent 12 }} + {{- end }} + ports: + - name: http + containerPort: {{ .Values.server.port }} + protocol: TCP + startupProbe: + httpGet: + path: /health + port: {{ .Values.server.port }} + + livenessProbe: + httpGet: + path: /health + port: {{ .Values.server.port }} + + readinessProbe: + httpGet: + path: /health?fail_on_no_slot + port: {{ .Values.server.port }} + + volumeMounts: + - mountPath: {{ .Values.model.path }} + name: models + readOnly: true + {{- /* user-supplied mounts are appended to the mandatory models mount so the key is emitted once */}} + {{- with .Values.volumeMounts }} + {{- toYaml . | nindent 12 }} + {{- end }} + volumes: + - name: models + persistentVolumeClaim: + claimName: {{ include "server.llama.cpp.fullname" . 
}} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} diff --git a/examples/kubernetes/helm-charts/templates/hpa.yaml b/examples/kubernetes/llama-cpp/templates/hpa.yaml similarity index 100% rename from examples/kubernetes/helm-charts/templates/hpa.yaml rename to examples/kubernetes/llama-cpp/templates/hpa.yaml diff --git a/examples/kubernetes/llama-cpp/templates/ingress-completions.yaml b/examples/kubernetes/llama-cpp/templates/ingress-completions.yaml new file mode 100644 index 0000000000000..d1ef1bda4541c --- /dev/null +++ b/examples/kubernetes/llama-cpp/templates/ingress-completions.yaml @@ -0,0 +1,64 @@ +{{- if and .Values.server.completions .Values.ingresses.completions.enabled -}} +{{- $fullName := include "server.llama.cpp.fullname" . -}} +{{- $svcPort := .Values.service.port -}} +{{- if and .Values.ingresses.completions.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }} + {{- if not (hasKey .Values.ingresses.completions.annotations "kubernetes.io/ingress.class") }} + {{- $_ := set .Values.ingresses.completions.annotations "kubernetes.io/ingress.class" .Values.ingresses.completions.className}} + {{- end }} +{{- end }} +{{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}} +apiVersion: networking.k8s.io/v1 +{{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}} +apiVersion: networking.k8s.io/v1beta1 +{{- else -}} +apiVersion: extensions/v1beta1 +{{- end }} +kind: Ingress +metadata: + name: {{ $fullName }}-completions + labels: + {{- include "server.llama.cpp.labels" . | nindent 4 }} + {{- with .Values.ingresses.completions.annotations }} + annotations: + {{- toYaml . 
| nindent 4 }} + {{- end }} +spec: + {{- if and .Values.ingresses.completions.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }} + ingressClassName: {{ .Values.ingresses.completions.className }} + {{- end }} + {{- if .Values.ingresses.completions.tls }} + tls: + {{- range .Values.ingresses.completions.tls }} + - hosts: + {{- range .hosts }} + - {{ . | quote }} + {{- end }} + secretName: {{ .secretName }} + {{- end }} + {{- end }} + rules: + {{- range .Values.ingresses.completions.hosts }} + - http: + paths: + {{- range .paths }} + - path: {{ .path }} + {{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }} + pathType: {{ .pathType }} + {{- end }} + backend: + {{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }} + service: + name: {{ $fullName }} + port: + number: {{ $svcPort }} + {{- else }} + serviceName: {{ $fullName }} + servicePort: {{ $svcPort }} + {{- end }} + {{- end }} + {{- if .host }} + host: {{ .host | quote }} + {{- end }} + {{- end }} + +{{- end }} diff --git a/examples/kubernetes/helm-charts/templates/ingress.yaml b/examples/kubernetes/llama-cpp/templates/ingress-embeddings.yaml similarity index 59% rename from examples/kubernetes/helm-charts/templates/ingress.yaml rename to examples/kubernetes/llama-cpp/templates/ingress-embeddings.yaml index 94f8740557c7b..1085d62580e46 100644 --- a/examples/kubernetes/helm-charts/templates/ingress.yaml +++ b/examples/kubernetes/llama-cpp/templates/ingress-embeddings.yaml @@ -1,9 +1,9 @@ -{{- if .Values.ingress.enabled -}} +{{- if and .Values.server.embeddings .Values.ingresses.embeddings.enabled -}} {{- $fullName := include "server.llama.cpp.fullname" . 
-}} {{- $svcPort := .Values.service.port -}} -{{- if and .Values.ingress.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }} - {{- if not (hasKey .Values.ingress.annotations "kubernetes.io/ingress.class") }} - {{- $_ := set .Values.ingress.annotations "kubernetes.io/ingress.class" .Values.ingress.className}} +{{- if and .Values.ingresses.embeddings.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }} + {{- if not (hasKey .Values.ingresses.embeddings.annotations "kubernetes.io/ingress.class") }} + {{- $_ := set .Values.ingresses.embeddings.annotations "kubernetes.io/ingress.class" .Values.ingresses.embeddings.className}} {{- end }} {{- end }} {{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}} @@ -15,20 +15,20 @@ apiVersion: extensions/v1beta1 {{- end }} kind: Ingress metadata: - name: {{ $fullName }} + name: {{ $fullName }}-embeddings labels: {{- include "server.llama.cpp.labels" . | nindent 4 }} - {{- with .Values.ingress.annotations }} + {{- with .Values.ingresses.embeddings.annotations }} annotations: {{- toYaml . | nindent 4 }} {{- end }} spec: - {{- if and .Values.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }} - ingressClassName: {{ .Values.ingress.className }} + {{- if and .Values.ingresses.embeddings.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }} + ingressClassName: {{ .Values.ingresses.embeddings.className }} {{- end }} - {{- if .Values.ingress.tls }} + {{- if .Values.ingresses.embeddings.tls }} tls: - {{- range .Values.ingress.tls }} + {{- range .Values.ingresses.embeddings.tls }} - hosts: {{- range .hosts }} - {{ . 
| quote }} @@ -37,9 +37,8 @@ spec: {{- end }} {{- end }} rules: - {{- range .Values.ingress.hosts }} - - host: {{ .host | quote }} - http: + {{- range .Values.ingresses.embeddings.hosts }} + - http: paths: {{- range .paths }} - path: {{ .path }} @@ -58,4 +57,8 @@ spec: {{- end }} {{- end }} + {{- if .host }} + host: {{ .host | quote }} + {{- end }} {{- end }} + {{- end }} diff --git a/examples/kubernetes/llama-cpp/templates/jobs.yaml b/examples/kubernetes/llama-cpp/templates/jobs.yaml new file mode 100644 index 0000000000000..76942b0a7a7fd --- /dev/null +++ b/examples/kubernetes/llama-cpp/templates/jobs.yaml @@ -0,0 +1,53 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: download-model-{{ include "server.llama.cpp.fullname" . }} + labels: + {{- include "server.llama.cpp.labels" . | nindent 4 }} +spec: + template: + metadata: + name: download-model-{{ include "server.llama.cpp.fullname" . }} + {{- with .Values.jobAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "server.llama.cpp.labels" . | nindent 8 }} + {{- with .Values.jobLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + containers: + - name: {{ include "server.llama.cpp.fullname" . }}-download-model + securityContext: + {{- toYaml .Values.securityContext | nindent 12 }} + image: {{ .Values.images.downloader.repository }}:{{ .Values.images.downloader.name }}-{{ .Values.images.downloader.tag }} + env: + - name: MODEL_PATH + value: {{ .Values.model.path }} + - name: MODEL_FILE + value: {{ regexReplaceAll "(.*/)?([^/]+).gguf" .Values.model.file "${2}.gguf" }} + - name: MODEL_SHA256 + value: {{ .Values.model.sha256 }} + - name: MODEL_DOWNLOAD_REPO + value: {{ .Values.model.repo }} + - name: MODEL_DOWNLOAD_FILE + value: {{ .Values.model.file }} + command: + - sh + - -c + args: + - > + set -eux; + if ! 
echo "${MODEL_SHA256} *${MODEL_PATH}/${MODEL_FILE}" | sha256sum -c -s - ; then + wget -q -c -O ${MODEL_PATH}/${MODEL_FILE} https://huggingface.co/${MODEL_DOWNLOAD_REPO}/resolve/main/${MODEL_DOWNLOAD_FILE}; + fi + volumeMounts: + - mountPath: {{ .Values.model.path }} + name: models + restartPolicy: OnFailure + volumes: + - name: models + persistentVolumeClaim: + claimName: {{ include "server.llama.cpp.fullname" . }} \ No newline at end of file diff --git a/examples/kubernetes/llama-cpp/templates/pod-monitor.yaml b/examples/kubernetes/llama-cpp/templates/pod-monitor.yaml new file mode 100644 index 0000000000000..f2a9ba0ce29e4 --- /dev/null +++ b/examples/kubernetes/llama-cpp/templates/pod-monitor.yaml @@ -0,0 +1,16 @@ +{{- if .Values.server.metrics }} +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: {{ include "server.llama.cpp.fullname" . }} + labels: + {{- include "server.llama.cpp.labels" . | nindent 4 }} +spec: + selector: + matchLabels: + {{- include "server.llama.cpp.selectorLabels" . | nindent 6 }} + podMetricsEndpoints: + - port: http + interval: 30s + path: /metrics +{{end}} \ No newline at end of file diff --git a/examples/kubernetes/llama-cpp/templates/pvc.yaml b/examples/kubernetes/llama-cpp/templates/pvc.yaml new file mode 100644 index 0000000000000..681873c379598 --- /dev/null +++ b/examples/kubernetes/llama-cpp/templates/pvc.yaml @@ -0,0 +1,17 @@ +kind: PersistentVolumeClaim +apiVersion: v1 +metadata: + name: {{ include "server.llama.cpp.fullname" . }} + labels: + {{- include "server.llama.cpp.labels" . 
| nindent 4 }} +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: {{ .Values.model.size | quote }} +{{- if .Values.persistence.storageClass }} +{{- if (not (empty .Values.persistence.storageClass)) }} + storageClassName: "{{ .Values.persistence.storageClass }}" +{{- end }} +{{- end }} \ No newline at end of file diff --git a/examples/kubernetes/helm-charts/templates/service.yaml b/examples/kubernetes/llama-cpp/templates/service.yaml similarity index 89% rename from examples/kubernetes/helm-charts/templates/service.yaml rename to examples/kubernetes/llama-cpp/templates/service.yaml index aa8b2007a1bf4..8e5a222504c1c 100644 --- a/examples/kubernetes/helm-charts/templates/service.yaml +++ b/examples/kubernetes/llama-cpp/templates/service.yaml @@ -8,7 +8,7 @@ spec: type: {{ .Values.service.type }} ports: - port: {{ .Values.service.port }} - targetPort: http + targetPort: {{ .Values.server.port }} protocol: TCP name: http selector: diff --git a/examples/kubernetes/helm-charts/templates/tests/test-connection.yaml b/examples/kubernetes/llama-cpp/templates/tests/test-connection.yaml similarity index 92% rename from examples/kubernetes/helm-charts/templates/tests/test-connection.yaml rename to examples/kubernetes/llama-cpp/templates/tests/test-connection.yaml index 96ed674fd728d..5685bf3421180 100644 --- a/examples/kubernetes/helm-charts/templates/tests/test-connection.yaml +++ b/examples/kubernetes/llama-cpp/templates/tests/test-connection.yaml @@ -11,5 +11,5 @@ spec: - name: wget image: busybox command: ['wget'] - args: ['{{ include "server.llama.cpp.fullname" . }}:{{ .Values.service.port }}'] + args: ['{{ include "server.llama.cpp.fullname" . 
}}:{{ .Values.service.port }}/health'] restartPolicy: Never diff --git a/examples/kubernetes/llama-cpp/values.yaml b/examples/kubernetes/llama-cpp/values.yaml new file mode 100644 index 0000000000000..2ac6ed35ad4cc --- /dev/null +++ b/examples/kubernetes/llama-cpp/values.yaml @@ -0,0 +1,121 @@ +# Default values for server.llama.cpp. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. + +replicaCount: 2 + +images: + server: + repository: ghcr.io/ggerganov/llama.cpp + name: server + tag: + downloader: + repository: busybox + name: 1.36.1 + tag: "glibc" + + pullPolicy: IfNotPresent + # Overrides the image tag whose default is the chart appVersion. + +imagePullSecrets: [ ] +nameOverride: "" +fullnameOverride: "" + +podAnnotations: { } +podLabels: { } + +jobAnnotations: { } +jobLabels: { } + +podSecurityContext: + runAsNonRoot: true + +securityContext: + readOnlyRootFilesystem: false #FIXME + runAsNonRoot: true + runAsUser: 1000 + +model: + path: /tmp + alias: microsoft-phi2 + repo: ggml-org/models # TheBloke/phi-2-GGUF + file: tinyllamas/stories260K.gguf # phi-2.Q4_K_M.gguf + size: 2Mi # 1.8Gi + sha256: 047bf46455a544931cff6fef14d7910154c56afbc23ab1c5e56a72e69912c04b # 324356668fa5ba9f4135de348447bb2bbe2467eaa1b8fcfb53719de62fbd2499 + +server: + command: /server + host: 0.0.0.0 + port: 8080 + completions: true + embeddings: false + metrics: true + kvCache: + size: 64 + slots: 2 + log: + format: text + disabled: false + extraArgs: [] + +deployments: + init + +service: + type: ClusterIP + port: 80 + +ingresses: + completions: + enabled: true + className: "" + annotations: + kubernetes.io/ingress.class: nginx + + hosts: + - #host: llama-cpp.mydomain + paths: + - path: /v1/completions + pathType: Prefix + tls: [ ] + # - secretName: chart-example-tls + # hosts: + # - chart-example.local + + embeddings: + enabled: true + className: "" + annotations: + kubernetes.io/ingress.class: nginx + + hosts: + - #host: llama-cpp.mydomain + paths: + - 
path: /v1/embeddings + pathType: Prefix + tls: [ ] + # - secretName: chart-example-tls + # hosts: + # - chart-example.local + +resources: { } + +autoscaling: + enabled: true + minReplicas: 1 + maxReplicas: 4 + targetCPUUtilizationPercentage: 80 + targetMemoryUtilizationPercentage: 80 + +volumes: [ ] + +volumeMounts: [ ] + +nodeSelector: { } + +tolerations: [ ] + +affinity: { } + +persistence: + storageClass: \ No newline at end of file diff --git a/examples/kubernetes/llama.cpp.kubernetes.png b/examples/kubernetes/llama.cpp.kubernetes.png index 39d47cbf87d03..612f83275ddc0 100644 Binary files a/examples/kubernetes/llama.cpp.kubernetes.png and b/examples/kubernetes/llama.cpp.kubernetes.png differ diff --git a/phymbert-notes.md b/phymbert-notes.md new file mode 100644 index 0000000000000..7878f67a16423 --- /dev/null +++ b/phymbert-notes.md @@ -0,0 +1,32 @@ +# TODO + +test U server example +fix log-disable +add /models +helm example +empty input test + + + + initContainers: + - name: wait-model + securityContext: + {{- toYaml .Values.securityContext | nindent 12 }} + image: {{ .Values.images.downloader.repository }}:{{ .Values.images.downloader.name }}-{{ .Values.images.downloader.tag | default .Chart.Version }} + env: + - name: MODEL_PATH + value: {{ .Values.model.path }} + - name: MODEL_FILE + value: {{ .Values.model.file_basename }} + - name: MODEL_SHA256 + value: {{ .Values.model.sha256 }} + - name: MODEL_DOWNLOAD_FILE + value: {{ .Values.model.file }} + command: + - /bin/bash + - -c + args: + - > + if [ ! echo "${MODEL_SHA256} *${MODEL_PATH}/${MODEL_FILE}" | sha --algorithm 256 -c ]; then + wget -q --show-progress -c -O ${MODEL_PATH}/${MODEL_FILE} https://huggingface.co/${MODEL_PATH}/resolve/main/${MODEL_DOWNLOAD_FILE} + fi \ No newline at end of file