diff --git a/.github/workflows/kubernetes.yaml b/.github/workflows/kubernetes.yaml
index 61a15b53df1f6..4447b68bc76ef 100644
--- a/.github/workflows/kubernetes.yaml
+++ b/.github/workflows/kubernetes.yaml
@@ -16,8 +16,39 @@ jobs:
       - name: Check out the repo
         uses: actions/checkout@v3
 
+      - name: Set up Helm
+        uses: azure/setup-helm@v3
+        with:
+          version: v3.12.1
+
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
+          check-latest: true
+
+      - name: Set up chart-testing
+        uses: helm/chart-testing-action@v2.6.0
+
+      - name: Install repo
+        id: install-repo
+        run: |
+          ct install --chart-dirs examples/kubernetes/ --charts examples/kubernetes/
+
+      - name: Run chart-testing (list-changed)
+        id: list-changed
+        run: |
+          changed=$(ct list-changed --target-branch ${{ github.event.repository.default_branch }})
+          if [[ -n "$changed" ]]; then
+            echo "changed=true" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Run chart-testing (lint)
+        if: steps.list-changed.outputs.changed == 'true'
+        run: ct lint --target-branch ${{ github.event.repository.default_branch }}
+
       - name: Create k8s Kind Cluster
         id: create_kind_cluster
+        if: steps.list-changed.outputs.changed == 'true'
         uses: container-tools/kind-action@v1
 #
 #      - name: Build and push Docker image
@@ -33,6 +64,7 @@ jobs:
 
       - name: Docker build
         id: docker_build
+        if: steps.list-changed.outputs.changed == 'true'
         run: |
           kubectl cluster-info
           kubectl get storageclass standard
@@ -48,6 +80,7 @@ jobs:
 
       - name: Deploy prometheus
         id: deploy_prometheus
+        if: steps.list-changed.outputs.changed == 'true'
         run: |
           helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
           helm repo update
@@ -55,5 +88,6 @@ jobs:
 
       - name: Deploy llama.cpp server
         id: deploy_llama_cpp
+        if: steps.list-changed.outputs.changed == 'true'
         run: |
-          helm install llama-cpp-stack ./examples/kubernetes/helm-charts
+          helm install llama-cpp-stack ./examples/kubernetes/llama-cpp
diff --git a/examples/kubernetes/README.md b/examples/kubernetes/README.md
index 0bad7ce89a391..5a0806977ae77 100644
--- a/examples/kubernetes/README.md
+++ b/examples/kubernetes/README.md
@@ -4,6 +4,15 @@ This example demonstrates how to deploy [llama.cpp server](../server) on a [kube
 
 ![llama.cpp.kubernetes.png](llama.cpp.kubernetes.png)
 
+We provide a [Helm chart](https://helm.sh/) repository to deploy llama.cpp at scale for completions and embeddings:
+
+```shell
+
+helm repo add llama.cpp https://ggerganov.github.io/llama.cpp
+helm repo update
+helm install example llama.cpp/llama-cpp --namespace llama-cpp --create-namespace
+```
+
 ## Prerequisites
 
 Obviously you need a kubernetes cluster.
@@ -15,11 +24,26 @@ Required access to an API server with the following `roles`:
 
 If you do not have a real k8s cluster, you can give a try to [kind](https://kind.sigs.k8s.io/).
 
+### Metrics monitoring
+
+You might want to deploy the Prometheus Helm chart:
+
+```shell
+helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+helm repo update
+helm install \
+    --set prometheus.prometheusSpec.podMonitorSelectorNilUseHelmValues=false \
+    kube-prometheus-stack prometheus-community/kube-prometheus-stack \
+    --create-namespace \
+    --namespace monitoring
+```
+
 ## Goals
 
 Deploy a production ready LLM API over kubernetes, including:
 - High availability
 - multi models
+- support of embeddings and completions models
 - load balancing
 - Auto scaling
 - Security 
@@ -36,6 +60,7 @@ This example does not cover [NVidia based docker engine](https://docs.nvidia.com
 **Approach**
 1. Models file are downloaded once on a `PV` by a `Job` when the stack is deployed
 2. Server `Deployment` is using an init containers to verify if the model is downloaded
+3. `Ingress` rules route incoming requests to the target models
 3. `Probes` are used to monitor the `pods` healthiness
 4. [Prometheus](https://prometheus.io/) is used as the metrics server
 
diff --git a/examples/kubernetes/helm-charts/Chart.yaml b/examples/kubernetes/helm-charts/Chart.yaml
deleted file mode 100644
index 44540d42838a8..0000000000000
--- a/examples/kubernetes/helm-charts/Chart.yaml
+++ /dev/null
@@ -1,24 +0,0 @@
-apiVersion: v2
-name: server.llama.cpp
-description: llama.cpp Helm chart for Kubernetes
-
-# A chart can be either an 'application' or a 'library' chart.
-#
-# Application charts are a collection of templates that can be packaged into versioned archives
-# to be deployed.
-#
-# Library charts provide useful utilities or functions for the chart developer. They're included as
-# a dependency of application charts to inject those utilities and functions into the rendering
-# pipeline. Library charts do not define any templates and therefore cannot be deployed.
-type: application
-
-# This is the chart version. This version number should be incremented each time you make changes
-# to the chart and its templates, including the app version.
-# Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 0.1.0
-
-# This is the version number of the application being deployed. This version number should be
-# incremented each time you make changes to the application. Versions are not expected to
-# follow Semantic Versioning. They should reflect the version the application is using.
-# It is recommended to use it with quotes.
-appVersion: "1.16.0"
diff --git a/examples/kubernetes/helm-charts/templates/deployment.yaml b/examples/kubernetes/helm-charts/templates/deployment.yaml
deleted file mode 100644
index 7b22a35c76dd6..0000000000000
--- a/examples/kubernetes/helm-charts/templates/deployment.yaml
+++ /dev/null
@@ -1,68 +0,0 @@
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: {{ include "server.llama.cpp.fullname" . }}
-  labels:
-    {{- include "server.llama.cpp.labels" . | nindent 4 }}
-spec:
-  {{- if not .Values.autoscaling.enabled }}
-  replicas: {{ .Values.replicaCount }}
-  {{- end }}
-  selector:
-    matchLabels:
-      {{- include "server.llama.cpp.selectorLabels" . | nindent 6 }}
-  template:
-    metadata:
-      {{- with .Values.podAnnotations }}
-      annotations:
-        {{- toYaml . | nindent 8 }}
-      {{- end }}
-      labels:
-        {{- include "server.llama.cpp.labels" . | nindent 8 }}
-        {{- with .Values.podLabels }}
-        {{- toYaml . | nindent 8 }}
-        {{- end }}
-    spec:
-      {{- with .Values.imagePullSecrets }}
-      imagePullSecrets:
-        {{- toYaml . | nindent 8 }}
-      {{- end }}
-      serviceAccountName: {{ include "server.llama.cpp.serviceAccountName" . }}
-      securityContext:
-        {{- toYaml .Values.podSecurityContext | nindent 8 }}
-      containers:
-        - name: {{ .Chart.Name }}
-          securityContext:
-            {{- toYaml .Values.securityContext | nindent 12 }}
-          image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
-          imagePullPolicy: {{ .Values.image.pullPolicy }}
-          ports:
-            - name: http
-              containerPort: {{ .Values.service.port }}
-              protocol: TCP
-          livenessProbe:
-            {{- toYaml .Values.livenessProbe | nindent 12 }}
-          readinessProbe:
-            {{- toYaml .Values.readinessProbe | nindent 12 }}
-          resources:
-            {{- toYaml .Values.resources | nindent 12 }}
-          {{- with .Values.volumeMounts }}
-          volumeMounts:
-            {{- toYaml . | nindent 12 }}
-          {{- end }}
-      {{- with .Values.volumes }}
-      volumes:
-        {{- toYaml . | nindent 8 }}
-      {{- end }}
-      {{- with .Values.nodeSelector }}
-      nodeSelector:
-        {{- toYaml . | nindent 8 }}
-      {{- end }}
-      {{- with .Values.affinity }}
-      affinity:
-        {{- toYaml . | nindent 8 }}
-      {{- end }}
-      {{- with .Values.tolerations }}
-      tolerations:
-        {{- toYaml . | nindent 8 }}
-      {{- end }}
diff --git a/examples/kubernetes/helm-charts/templates/serviceaccount.yaml b/examples/kubernetes/helm-charts/templates/serviceaccount.yaml
deleted file mode 100644
index 1ad075ff18355..0000000000000
--- a/examples/kubernetes/helm-charts/templates/serviceaccount.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-{{- if .Values.serviceAccount.create -}}
-apiVersion: v1
-kind: ServiceAccount
-metadata:
-  name: {{ include "server.llama.cpp.serviceAccountName" . }}
-  labels:
-    {{- include "server.llama.cpp.labels" . | nindent 4 }}
-  {{- with .Values.serviceAccount.annotations }}
-  annotations:
-    {{- toYaml . | nindent 4 }}
-  {{- end }}
-automountServiceAccountToken: {{ .Values.serviceAccount.automount }}
-{{- end }}
diff --git a/examples/kubernetes/helm-charts/values.yaml b/examples/kubernetes/helm-charts/values.yaml
deleted file mode 100644
index 5f41f989fb1a5..0000000000000
--- a/examples/kubernetes/helm-charts/values.yaml
+++ /dev/null
@@ -1,113 +0,0 @@
-# Default values for server.llama.cpp.
-# This is a YAML-formatted file.
-# Declare variables to be passed into your templates.
-
-replicaCount: 1
-
-image:
-  repository: nginx
-  pullPolicy: IfNotPresent
-  # Overrides the image tag whose default is the chart appVersion.
-  tag: ""
-
-imagePullSecrets: []
-nameOverride: ""
-fullnameOverride: ""
-
-serviceAccount:
-  # Specifies whether a service account should be created
-  create: true
-  # Automatically mount a ServiceAccount's API credentials?
-  automount: true
-  # Annotations to add to the service account
-  annotations: {}
-  # The name of the service account to use.
-  # If not set and create is true, a name is generated using the fullname template
-  name: ""
-
-podAnnotations: {}
-podLabels: {}
-
-podSecurityContext: {}
-  # fsGroup: 2000
-
-securityContext: {}
-  # capabilities:
-  #   drop:
-  #   - ALL
-  # readOnlyRootFilesystem: true
-  # runAsNonRoot: true
-  # runAsUser: 1000
-
-service:
-  type: ClusterIP
-  port: 80
-
-ingress:
-  enabled: false
-  className: ""
-  annotations: {}
-    # kubernetes.io/ingress.class: nginx
-    # kubernetes.io/tls-acme: "true"
-  hosts:
-    - host: chart-example.local
-      paths:
-        - path: /
-          pathType: ImplementationSpecific
-  tls: []
-  #  - secretName: chart-example-tls
-  #    hosts:
-  #      - chart-example.local
-
-resources: {}
-  # We usually recommend not to specify default resources and to leave this as a conscious
-  # choice for the user. This also increases chances charts run on environments with little
-  # resources, such as Minikube. If you do want to specify resources, uncomment the following
-  # lines, adjust them as necessary, and remove the curly braces after 'resources:'.
-  # limits:
-  #   cpu: 100m
-  #   memory: 128Mi
-  # requests:
-  #   cpu: 100m
-  #   memory: 128Mi
-
-startupProbe:
-  httpGet:
-    path: /health
-    port: http
-
-livenessProbe:
-  httpGet:
-    path: /health
-    port: http
-
-readinessProbe:
-  httpGet:
-    path: /
-    port: http
-
-autoscaling:
-  enabled: true
-  minReplicas: 1
-  maxReplicas: 100
-  targetCPUUtilizationPercentage: 80
-  # targetMemoryUtilizationPercentage: 80
-
-# Additional volumes on the output Deployment definition.
-volumes: []
-# - name: foo
-#   secret:
-#     secretName: mysecret
-#     optional: false
-
-# Additional volumeMounts on the output Deployment definition.
-volumeMounts: []
-# - name: foo
-#   mountPath: "/etc/foo"
-#   readOnly: true
-
-nodeSelector: {}
-
-tolerations: []
-
-affinity: {}
diff --git a/examples/kubernetes/index.yaml b/examples/kubernetes/index.yaml
new file mode 100644
index 0000000000000..32edb60096fc7
--- /dev/null
+++ b/examples/kubernetes/index.yaml
@@ -0,0 +1,3 @@
+apiVersion: v1
+entries: {}
+generated: "2024-02-27T12:54:11.664702812+01:00"
diff --git a/examples/kubernetes/helm-charts/.helmignore b/examples/kubernetes/llama-cpp/.helmignore
similarity index 100%
rename from examples/kubernetes/helm-charts/.helmignore
rename to examples/kubernetes/llama-cpp/.helmignore
diff --git a/examples/kubernetes/llama-cpp/Chart.yaml b/examples/kubernetes/llama-cpp/Chart.yaml
new file mode 100644
index 0000000000000..13cd200182883
--- /dev/null
+++ b/examples/kubernetes/llama-cpp/Chart.yaml
@@ -0,0 +1,6 @@
+apiVersion: v2
+name: llama-cpp
+description: llama.cpp Helm chart for Kubernetes
+type: application
+version: 0.0.1
+appVersion: "cbbd1efa06f8c09f9dff58ff9d9af509cc4c152b"
diff --git a/examples/kubernetes/helm-charts/templates/NOTES.txt b/examples/kubernetes/llama-cpp/templates/NOTES.txt
similarity index 73%
rename from examples/kubernetes/helm-charts/templates/NOTES.txt
rename to examples/kubernetes/llama-cpp/templates/NOTES.txt
index e01560fb14ecf..44d5a115a3d42 100644
--- a/examples/kubernetes/helm-charts/templates/NOTES.txt
+++ b/examples/kubernetes/llama-cpp/templates/NOTES.txt
@@ -1,10 +1,16 @@
 1. Get the application URL by running these commands:
-{{- if .Values.ingress.enabled }}
-{{- range $host := .Values.ingress.hosts }}
+{{- if .Values.ingresses.completions.enabled }}
+{{- range $host := .Values.ingresses.completions.hosts }}
   {{- range .paths }}
-  http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.host }}{{ .path }}
+  curl http{{ if $.Values.ingresses.completions.tls }}s{{ end }}://{{ if $host.host }}{{ $host.host }}{{ else }}localhost{{ end }}{{ .path }} --data '{"messages": [{"role": "user", "message":"hello llama.cpp"}]}'
   {{- end }}
 {{- end }}
+{{- else if .Values.ingresses.embeddings.enabled }}
+{{- range $host := .Values.ingresses.embeddings.hosts }}
+  {{- range .paths }}
+  curl http{{ if $.Values.ingresses.embeddings.tls }}s{{ end }}://{{ if $host.host }}{{ $host.host }}{{ else }}localhost{{ end }}{{ .path }} --data '{"input": "hello llama.cpp"}'
+  {{- end }}
+{{- end }}
 {{- else if contains "NodePort" .Values.service.type }}
   export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "server.llama.cpp.fullname" . }})
   export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}")
diff --git a/examples/kubernetes/helm-charts/templates/_helpers.tpl b/examples/kubernetes/llama-cpp/templates/_helpers.tpl
similarity index 100%
rename from examples/kubernetes/helm-charts/templates/_helpers.tpl
rename to examples/kubernetes/llama-cpp/templates/_helpers.tpl
diff --git a/examples/kubernetes/llama-cpp/templates/deployment.yaml b/examples/kubernetes/llama-cpp/templates/deployment.yaml
new file mode 100644
index 0000000000000..2fb09f79cc282
--- /dev/null
+++ b/examples/kubernetes/llama-cpp/templates/deployment.yaml
@@ -0,0 +1,146 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ include "server.llama.cpp.fullname" . }}
+  labels:
+    {{- include "server.llama.cpp.labels" . | nindent 4 }}
+spec:
+  {{- if not .Values.autoscaling.enabled }}
+  replicas: {{ .Values.replicaCount }}
+  {{- end }}
+  selector:
+    matchLabels:
+      {{- include "server.llama.cpp.selectorLabels" . | nindent 6 }}
+  template:
+    metadata:
+      annotations:
+        {{- include "server.llama.cpp.labels" . | nindent 8 }}
+        {{- if .Values.server.metrics }}
+        prometheus.io/scrape: 'true'
+        prometheus.io/port: '{{ .Values.server.port }}'
+        {{- end }}
+      {{- with .Values.podAnnotations }}
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      labels:
+        prometheus.io/scrape: 'true'
+        {{- include "server.llama.cpp.labels" . | nindent 8 }}
+        {{- with .Values.podLabels }}
+        {{- toYaml . | nindent 8 }}
+        {{- end }}
+    spec:
+      {{- with .Values.imagePullSecrets }}
+      imagePullSecrets:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      securityContext:
+        {{- toYaml .Values.podSecurityContext | nindent 8 }}
+      initContainers:
+        - name: wait-model
+          securityContext:
+            {{- toYaml .Values.securityContext | nindent 12 }}
+          image: {{ .Values.images.downloader.repository }}:{{ .Values.images.downloader.name }}-{{ .Values.images.downloader.tag }}
+          env:
+            - name: MODEL_PATH
+              value: {{ .Values.model.path }}
+            - name: MODEL_FILE
+              value: {{  regexReplaceAll "(.*/)?([^/]+).gguf" .Values.model.file "${2}.gguf" }}
+            - name: MODEL_SHA256
+              value: {{ .Values.model.sha256 }}
+            - name: MODEL_DOWNLOAD_REPO
+              value: {{ .Values.model.repo }}
+            - name: MODEL_DOWNLOAD_FILE
+              value: {{ .Values.model.file }}
+          command:
+            - sh
+            - -c
+          args:
+            - >
+              set -eux;
+              while ! echo "${MODEL_SHA256} *${MODEL_PATH}/${MODEL_FILE}" | sha256sum -c - ; do
+                echo "waiting for model file${MODEL_PATH}/${MODEL_FILE}=${MODEL_SHA256}";
+                sleep 1;
+              done
+          volumeMounts:
+            - mountPath: {{ .Values.model.path }}
+              name: models
+              readOnly: true
+      containers:
+        - name: {{ .Chart.Name }}
+          securityContext:
+            {{- toYaml .Values.securityContext | nindent 12 }}
+          image: "{{ .Values.images.server.repository }}:{{ .Values.images.server.name }}-{{ .Values.images.server.tag | default .Chart.AppVersion }}"
+          imagePullPolicy: {{ .Values.images.pullPolicy }}
+          command:
+            - {{ .Values.server.command }}
+          args:
+            - --host
+            - {{ .Values.server.host }}
+            - --port
+            - "{{ .Values.server.port }}"
+            - --model
+            - {{ .Values.model.path }}/{{  regexReplaceAll "(.*/)?([^/]+).gguf" .Values.model.file "${2}.gguf" }}
+            - --cont-batching
+            - --alias
+            - {{ .Values.model.alias }}
+            - --ctx-size
+            - "{{ .Values.server.kvCache.size }}"
+            - --parallel
+            - "{{ .Values.server.slots }}"
+            {{- if .Values.server.embeddings }}
+            - --embedding
+            {{- end }}
+            {{- if .Values.server.metrics }}
+            - --metrics
+            {{- end }}
+            - --log-format
+            - {{ .Values.server.log.format }}
+            {{- if  .Values.server.log.disabled }}
+            - --log-disable
+            {{- end }}
+            {{- with .Values.server.extraArgs }}
+            {{- toYaml . | nindent 12 }}
+            {{- end }}
+          ports:
+            - name: http
+              containerPort: {{ .Values.server.port }}
+              protocol: TCP
+          startupProbe:
+            httpGet:
+              path: /health
+              port: {{ .Values.server.port }}
+
+          livenessProbe:
+            httpGet:
+              path: /health
+              port: {{ .Values.server.port }}
+
+          readinessProbe:
+            httpGet:
+              path: /health?fail_on_no_slot
+              port: {{ .Values.server.port }}
+
+          volumeMounts:
+            - mountPath: {{ .Values.model.path }}
+              name: models
+              readOnly: true
+            {{- /* Extra mounts from .Values.volumeMounts are appended to this list so the key is only rendered once. */}}
+            {{- with .Values.volumeMounts }}
+            {{- toYaml . | nindent 12 }}
+            {{- end }}
+      volumes:
+        - name: models
+          persistentVolumeClaim:
+            claimName: {{ include "server.llama.cpp.fullname" . }}
+      {{- with .Values.nodeSelector }}
+      nodeSelector:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.affinity }}
+      affinity:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.tolerations }}
+      tolerations:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
diff --git a/examples/kubernetes/helm-charts/templates/hpa.yaml b/examples/kubernetes/llama-cpp/templates/hpa.yaml
similarity index 100%
rename from examples/kubernetes/helm-charts/templates/hpa.yaml
rename to examples/kubernetes/llama-cpp/templates/hpa.yaml
diff --git a/examples/kubernetes/llama-cpp/templates/ingress-completions.yaml b/examples/kubernetes/llama-cpp/templates/ingress-completions.yaml
new file mode 100644
index 0000000000000..d1ef1bda4541c
--- /dev/null
+++ b/examples/kubernetes/llama-cpp/templates/ingress-completions.yaml
@@ -0,0 +1,64 @@
+{{- if and .Values.server.completions .Values.ingresses.completions.enabled -}}
+{{- $fullName := include "server.llama.cpp.fullname" . -}}
+{{- $svcPort := .Values.service.port -}}
+{{- if and .Values.ingresses.completions.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }}
+  {{- if not (hasKey .Values.ingresses.completions.annotations "kubernetes.io/ingress.class") }}
+  {{- $_ := set .Values.ingresses.completions.annotations "kubernetes.io/ingress.class" .Values.ingresses.completions.className}}
+  {{- end }}
+{{- end }}
+{{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}}
+apiVersion: networking.k8s.io/v1
+{{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}}
+apiVersion: networking.k8s.io/v1beta1
+{{- else -}}
+apiVersion: extensions/v1beta1
+{{- end }}
+kind: Ingress
+metadata:
+  name: {{ $fullName }}-completions
+  labels:
+    {{- include "server.llama.cpp.labels" . | nindent 4 }}
+  {{- with .Values.ingresses.completions.annotations }}
+  annotations:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+spec:
+  {{- if and .Values.ingresses.completions.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }}
+  ingressClassName: {{ .Values.ingresses.completions.className }}
+  {{- end }}
+  {{- if .Values.ingresses.completions.tls }}
+  tls:
+    {{- range .Values.ingresses.completions.tls }}
+    - hosts:
+        {{- range .hosts }}
+        - {{ . | quote }}
+        {{- end }}
+      secretName: {{ .secretName }}
+    {{- end }}
+  {{- end }}
+  rules:
+    {{- range .Values.ingresses.completions.hosts }}
+    - http:
+        paths:
+          {{- range .paths }}
+          - path: {{ .path }}
+            {{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }}
+            pathType: {{ .pathType }}
+            {{- end }}
+            backend:
+              {{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }}
+              service:
+                name: {{ $fullName }}
+                port:
+                  number: {{ $svcPort }}
+              {{- else }}
+              serviceName: {{ $fullName }}
+              servicePort: {{ $svcPort }}
+              {{- end }}
+          {{- end }}
+      {{- if .host }}
+      host: {{ .host | quote }}
+      {{- end }}
+    {{- end }}
+
+{{- end }}
diff --git a/examples/kubernetes/helm-charts/templates/ingress.yaml b/examples/kubernetes/llama-cpp/templates/ingress-embeddings.yaml
similarity index 59%
rename from examples/kubernetes/helm-charts/templates/ingress.yaml
rename to examples/kubernetes/llama-cpp/templates/ingress-embeddings.yaml
index 94f8740557c7b..1085d62580e46 100644
--- a/examples/kubernetes/helm-charts/templates/ingress.yaml
+++ b/examples/kubernetes/llama-cpp/templates/ingress-embeddings.yaml
@@ -1,9 +1,9 @@
-{{- if .Values.ingress.enabled -}}
+{{- if and .Values.server.embeddings .Values.ingresses.embeddings.enabled -}}
 {{- $fullName := include "server.llama.cpp.fullname" . -}}
 {{- $svcPort := .Values.service.port -}}
-{{- if and .Values.ingress.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }}
-  {{- if not (hasKey .Values.ingress.annotations "kubernetes.io/ingress.class") }}
-  {{- $_ := set .Values.ingress.annotations "kubernetes.io/ingress.class" .Values.ingress.className}}
+{{- if and .Values.ingresses.embeddings.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }}
+  {{- if not (hasKey .Values.ingresses.embeddings.annotations "kubernetes.io/ingress.class") }}
+  {{- $_ := set .Values.ingresses.embeddings.annotations "kubernetes.io/ingress.class" .Values.ingresses.embeddings.className}}
   {{- end }}
 {{- end }}
 {{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}}
@@ -15,20 +15,20 @@ apiVersion: extensions/v1beta1
 {{- end }}
 kind: Ingress
 metadata:
-  name: {{ $fullName }}
+  name: {{ $fullName }}-embeddings
   labels:
     {{- include "server.llama.cpp.labels" . | nindent 4 }}
-  {{- with .Values.ingress.annotations }}
+  {{- with .Values.ingresses.embeddings.annotations }}
   annotations:
     {{- toYaml . | nindent 4 }}
   {{- end }}
 spec:
-  {{- if and .Values.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }}
-  ingressClassName: {{ .Values.ingress.className }}
+  {{- if and .Values.ingresses.embeddings.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }}
+  ingressClassName: {{ .Values.ingresses.embeddings.className }}
   {{- end }}
-  {{- if .Values.ingress.tls }}
+  {{- if .Values.ingresses.embeddings.tls }}
   tls:
-    {{- range .Values.ingress.tls }}
+    {{- range .Values.ingresses.embeddings.tls }}
     - hosts:
         {{- range .hosts }}
         - {{ . | quote }}
@@ -37,9 +37,8 @@ spec:
     {{- end }}
   {{- end }}
   rules:
-    {{- range .Values.ingress.hosts }}
-    - host: {{ .host | quote }}
-      http:
+    {{- range .Values.ingresses.embeddings.hosts }}
+    - http:
         paths:
           {{- range .paths }}
           - path: {{ .path }}
@@ -58,4 +57,8 @@ spec:
               {{- end }}
           {{- end }}
     {{- end }}
+      {{- if .host }}
+      host: {{ .host | quote }}
+      {{- end }}
+
 {{- end }}
diff --git a/examples/kubernetes/llama-cpp/templates/jobs.yaml b/examples/kubernetes/llama-cpp/templates/jobs.yaml
new file mode 100644
index 0000000000000..76942b0a7a7fd
--- /dev/null
+++ b/examples/kubernetes/llama-cpp/templates/jobs.yaml
@@ -0,0 +1,53 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name:  download-model-{{ include "server.llama.cpp.fullname" . }}
+  labels:
+    {{- include "server.llama.cpp.labels" . | nindent 4 }}
+spec:
+  template:
+    metadata:
+      name: download-model-{{ include "server.llama.cpp.fullname" . }}
+      {{- with .Values.jobAnnotations }}
+      annotations:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      labels:
+        {{- include "server.llama.cpp.labels" . | nindent 8 }}
+        {{- with .Values.jobLabels }}
+        {{- toYaml . | nindent 8 }}
+        {{- end }}
+    spec:
+      containers:
+        - name: {{ include "server.llama.cpp.fullname" . }}-download-model
+          securityContext:
+            {{- toYaml .Values.securityContext | nindent 12 }}
+          image: {{ .Values.images.downloader.repository }}:{{ .Values.images.downloader.name }}-{{ .Values.images.downloader.tag }}
+          env:
+            - name: MODEL_PATH
+              value: {{ .Values.model.path }}
+            - name: MODEL_FILE
+              value: {{  regexReplaceAll "(.*/)?([^/]+).gguf" .Values.model.file "${2}.gguf" }}
+            - name: MODEL_SHA256
+              value: {{ .Values.model.sha256 }}
+            - name: MODEL_DOWNLOAD_REPO
+              value: {{ .Values.model.repo }}
+            - name: MODEL_DOWNLOAD_FILE
+              value: {{ .Values.model.file }}
+          command:
+            - sh
+            - -c
+          args:
+            - >
+              set -eux;
+              if ! echo "${MODEL_SHA256} *${MODEL_PATH}/${MODEL_FILE}" | sha256sum -c -s - ; then
+                wget -q -c -O ${MODEL_PATH}/${MODEL_FILE} https://huggingface.co/${MODEL_DOWNLOAD_REPO}/resolve/main/${MODEL_DOWNLOAD_FILE};
+              fi
+          volumeMounts:
+            - mountPath: {{ .Values.model.path }}
+              name: models
+      restartPolicy: OnFailure
+      volumes:
+        - name: models
+          persistentVolumeClaim:
+            claimName: {{ include "server.llama.cpp.fullname" . }}
\ No newline at end of file
diff --git a/examples/kubernetes/llama-cpp/templates/pod-monitor.yaml b/examples/kubernetes/llama-cpp/templates/pod-monitor.yaml
new file mode 100644
index 0000000000000..f2a9ba0ce29e4
--- /dev/null
+++ b/examples/kubernetes/llama-cpp/templates/pod-monitor.yaml
@@ -0,0 +1,16 @@
+{{- if .Values.server.metrics }}
+apiVersion: monitoring.coreos.com/v1
+kind: PodMonitor
+metadata:
+  name: {{ include "server.llama.cpp.fullname" . }}
+  labels:
+    {{- include "server.llama.cpp.labels" . | nindent 4 }}
+spec:
+  selector:
+    matchLabels:
+      {{- include "server.llama.cpp.selectorLabels" . | nindent 6 }}
+  podMetricsEndpoints:
+    - port: http
+      interval: 30s
+      path: /metrics
+{{end}}
\ No newline at end of file
diff --git a/examples/kubernetes/llama-cpp/templates/pvc.yaml b/examples/kubernetes/llama-cpp/templates/pvc.yaml
new file mode 100644
index 0000000000000..681873c379598
--- /dev/null
+++ b/examples/kubernetes/llama-cpp/templates/pvc.yaml
@@ -0,0 +1,17 @@
+kind: PersistentVolumeClaim
+apiVersion: v1
+metadata:
+  name: {{ include "server.llama.cpp.fullname" . }}
+  labels:
+    {{- include "server.llama.cpp.labels" . | nindent 4 }}
+spec:
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: {{ .Values.model.size | quote }}
+{{- if .Values.persistence.storageClass }}
+{{- if not (empty .Values.persistence.storageClass) }}
+  storageClassName: "{{ .Values.persistence.storageClass }}"
+{{- end }}
+{{- end }}
\ No newline at end of file
diff --git a/examples/kubernetes/helm-charts/templates/service.yaml b/examples/kubernetes/llama-cpp/templates/service.yaml
similarity index 89%
rename from examples/kubernetes/helm-charts/templates/service.yaml
rename to examples/kubernetes/llama-cpp/templates/service.yaml
index aa8b2007a1bf4..8e5a222504c1c 100644
--- a/examples/kubernetes/helm-charts/templates/service.yaml
+++ b/examples/kubernetes/llama-cpp/templates/service.yaml
@@ -8,7 +8,7 @@ spec:
   type: {{ .Values.service.type }}
   ports:
     - port: {{ .Values.service.port }}
-      targetPort: http
+      targetPort: {{ .Values.server.port }}
       protocol: TCP
       name: http
   selector:
diff --git a/examples/kubernetes/helm-charts/templates/tests/test-connection.yaml b/examples/kubernetes/llama-cpp/templates/tests/test-connection.yaml
similarity index 92%
rename from examples/kubernetes/helm-charts/templates/tests/test-connection.yaml
rename to examples/kubernetes/llama-cpp/templates/tests/test-connection.yaml
index 96ed674fd728d..5685bf3421180 100644
--- a/examples/kubernetes/helm-charts/templates/tests/test-connection.yaml
+++ b/examples/kubernetes/llama-cpp/templates/tests/test-connection.yaml
@@ -11,5 +11,5 @@ spec:
     - name: wget
       image: busybox
       command: ['wget']
-      args: ['{{ include "server.llama.cpp.fullname" . }}:{{ .Values.service.port }}']
+      args: ['{{ include "server.llama.cpp.fullname" . }}:{{ .Values.service.port }}/health']
   restartPolicy: Never
diff --git a/examples/kubernetes/llama-cpp/values.yaml b/examples/kubernetes/llama-cpp/values.yaml
new file mode 100644
index 0000000000000..2ac6ed35ad4cc
--- /dev/null
+++ b/examples/kubernetes/llama-cpp/values.yaml
@@ -0,0 +1,121 @@
+# Default values for server.llama.cpp.
+# This is a YAML-formatted file.
+# Declare variables to be passed into your templates.
+
+replicaCount: 2
+
+images:
+  server:
+    repository: ghcr.io/ggerganov/llama.cpp
+    name: server
+    # Overrides the image tag whose default is the chart appVersion.
+    tag: ""
+  downloader:
+    repository: busybox
+    name: "1.36.1"
+    tag: "glibc"
+
+  pullPolicy: IfNotPresent
+
+imagePullSecrets: [ ]
+nameOverride: ""
+fullnameOverride: ""
+
+podAnnotations: { }
+podLabels: { }
+
+jobAnnotations: { }
+jobLabels: { }
+
+podSecurityContext:
+  runAsNonRoot: true
+
+securityContext:
+  readOnlyRootFilesystem: false #FIXME
+  runAsNonRoot: true
+  runAsUser: 1000
+
+model:
+  path: /tmp
+  alias: microsoft-phi2
+  repo: ggml-org/models # TheBloke/phi-2-GGUF
+  file: tinyllamas/stories260K.gguf # phi-2.Q4_K_M.gguf
+  size: 2Mi # 1.8Gi
+  sha256: 047bf46455a544931cff6fef14d7910154c56afbc23ab1c5e56a72e69912c04b # 324356668fa5ba9f4135de348447bb2bbe2467eaa1b8fcfb53719de62fbd2499
+
+server:
+  command: /server
+  host: 0.0.0.0
+  port: 8080
+  completions: true
+  embeddings: false
+  metrics: true
+  kvCache:
+    size: 64
+  slots: 2
+  log:
+    format: text
+    disabled: false
+  extraArgs: []
+
+deployments:
+  init
+
+service:
+  type: ClusterIP
+  port: 80
+
+ingresses:
+  completions:
+    enabled: true
+    className: ""
+    annotations:
+      kubernetes.io/ingress.class: nginx
+
+    hosts:
+      - #host: llama-cpp.mydomain
+        paths:
+          - path: /v1/completions
+            pathType: Prefix
+    tls: [ ]
+    #  - secretName: chart-example-tls
+    #    hosts:
+    #      - chart-example.local
+
+  embeddings:
+    enabled: true
+    className: ""
+    annotations:
+      kubernetes.io/ingress.class: nginx
+
+    hosts:
+      - #host: llama-cpp.mydomain
+        paths:
+          - path: /v1/embeddings
+            pathType: Prefix
+    tls: [ ]
+    #  - secretName: chart-example-tls
+    #    hosts:
+    #      - chart-example.local
+
+resources: { }
+
+autoscaling:
+  enabled: true
+  minReplicas: 1
+  maxReplicas: 4
+  targetCPUUtilizationPercentage: 80
+  targetMemoryUtilizationPercentage: 80
+
+volumes: [ ]
+
+volumeMounts: [ ]
+
+nodeSelector: { }
+
+tolerations: [ ]
+
+affinity: { }
+
+persistence:
+  storageClass:
\ No newline at end of file
diff --git a/examples/kubernetes/llama.cpp.kubernetes.png b/examples/kubernetes/llama.cpp.kubernetes.png
index 39d47cbf87d03..612f83275ddc0 100644
Binary files a/examples/kubernetes/llama.cpp.kubernetes.png and b/examples/kubernetes/llama.cpp.kubernetes.png differ
diff --git a/phymbert-notes.md b/phymbert-notes.md
new file mode 100644
index 0000000000000..7878f67a16423
--- /dev/null
+++ b/phymbert-notes.md
@@ -0,0 +1,32 @@
+# TODO
+
+test U server example
+fix log-disable
+add /models
+helm example
+empty input test
+
+
+
+      initContainers:
+        - name: wait-model
+          securityContext:
+            {{- toYaml .Values.securityContext | nindent 12 }}
+          image: {{ .Values.images.downloader.repository }}:{{ .Values.images.downloader.name }}-{{ .Values.images.downloader.tag | default .Chart.Version }}
+          env:
+            - name: MODEL_PATH
+              value: {{ .Values.model.path }}
+            - name: MODEL_FILE
+              value: {{ .Values.model.file_basename }}
+            - name: MODEL_SHA256
+              value: {{ .Values.model.sha256 }}
+            - name: MODEL_DOWNLOAD_FILE
+              value: {{ .Values.model.file }}
+          command:
+            - /bin/bash
+            - -c
+          args:
+            - >
+              if [ ! echo "${MODEL_SHA256} *${MODEL_PATH}/${MODEL_FILE}" | sha --algorithm 256 -c ]; then
+                wget -q --show-progress -c -O ${MODEL_PATH}/${MODEL_FILE} https://huggingface.co/${MODEL_PATH}/resolve/main/${MODEL_DOWNLOAD_FILE}
+              fi
\ No newline at end of file