From abad607bf8d877307187832273a0d373f48e5843 Mon Sep 17 00:00:00 2001
From: Yuan Tang
Date: Wed, 29 Jan 2025 12:21:59 -0500
Subject: [PATCH 1/3] Add Kubernetes deployment guide

Signed-off-by: Yuan Tang
---
 docs/source/distributions/index.md | 8 +-
 .../distributions/kubernetes_deployment.md | 192 ++++++++++++++++++
 2 files changed, 199 insertions(+), 1 deletion(-)
 create mode 100644 docs/source/distributions/kubernetes_deployment.md

diff --git a/docs/source/distributions/index.md b/docs/source/distributions/index.md
index ee7f4f23cd..1f766e75e8 100644
--- a/docs/source/distributions/index.md
+++ b/docs/source/distributions/index.md
@@ -14,7 +14,12 @@ Another simple way to start interacting with Llama Stack is to just spin up a co
 
 **Conda**:
 
-Lastly, if you have a custom or an advanced setup or you are developing on Llama Stack you can also build a custom Llama Stack server. Using `llama stack build` and `llama stack run` you can build/run a custom Llama Stack server containing the exact combination of providers you wish. We have also provided various templates to make getting started easier. See [Building a Custom Distribution](building_distro) for more details.
+If you have a custom or advanced setup, or you are developing on Llama Stack, you can also build a custom Llama Stack server. Using `llama stack build` and `llama stack run` you can build/run a custom Llama Stack server containing the exact combination of providers you wish. We have also provided various templates to make getting started easier. See [Building a Custom Distribution](building_distro) for more details.
+
+
+**Kubernetes**:
+
+If you have built a container image and want to deploy it in a Kubernetes cluster instead of starting the Llama Stack server locally, see the [Kubernetes Deployment Guide](kubernetes_deployment) for more details.
 
 
 ```{toctree}
@@ -25,4 +30,5 @@ importing_as_library
 building_distro
 configuration
 selection
+kubernetes_deployment
 ```
diff --git a/docs/source/distributions/kubernetes_deployment.md b/docs/source/distributions/kubernetes_deployment.md
new file mode 100644
index 0000000000..cd307c111e
--- /dev/null
+++ b/docs/source/distributions/kubernetes_deployment.md
@@ -0,0 +1,192 @@
+# Kubernetes Deployment Guide
+
+Instead of starting the Llama Stack and vLLM servers locally, we can deploy them in a Kubernetes cluster. In this guide, we'll use a local [Kind](https://kind.sigs.k8s.io/) cluster and a vLLM inference service in the same cluster for demonstration purposes.
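+
+Before you start, make sure the `kind` and `kubectl` CLIs used throughout this guide are installed. The commands below are only a quick sanity check; any reasonably recent versions should work:
+
+```bash
+# Quick sanity check for the local tooling this guide relies on.
+kind version
+kubectl version --client
+```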
+ +First, create a local Kubernetes cluster via Kind: + +```bash +kind create cluster --image kindest/node:v1.32.0 --name llama-stack-test +``` + +Start vLLM server as a Kubernetes Pod and Service (remember to replace `` with your actual token and `` to meet your local system architecture): + +```bash +cat <" +--- +apiVersion: v1 +kind: Pod +metadata: + name: vllm-server + labels: + app: vllm +spec: + containers: + - name: llama-stack + image: + command: + - bash + - -c + - | + MODEL="meta-llama/Llama-3.2-1B-Instruct" + MODEL_PATH=/app/model/$(basename $MODEL) + huggingface-cli login --token $HUGGING_FACE_HUB_TOKEN + huggingface-cli download $MODEL --local-dir $MODEL_PATH --cache-dir $MODEL_PATH + python3 -m vllm.entrypoints.openai.api_server --model $MODEL_PATH --served-model-name $MODEL --port 8000 + ports: + - containerPort: 8000 + volumeMounts: + - name: llama-storage + mountPath: /app/model + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + volumes: + - name: llama-storage + persistentVolumeClaim: + claimName: vllm-models +--- +apiVersion: v1 +kind: Service +metadata: + name: vllm-server +spec: + selector: + app: vllm + ports: + - port: 8000 + targetPort: 8000 + type: NodePort +EOF +``` + +We can verify that the vLLM server has started successfully via the logs (this might take a couple of minutes to download the model): + +```bash +$ kubectl logs vllm-server +... +INFO: Started server process [1] +INFO: Waiting for application startup. +INFO: Application startup complete. +INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) +``` + +Then we can modify the Llama Stack run configuration YAML with the following inference provider: + +```yaml +providers: + inference: + - provider_id: vllm + provider_type: remote::vllm + config: + url: http://vllm-server.default.svc.cluster.local:8000/v1 + max_tokens: 4096 + api_token: fake +``` + +Once we have defined the run configuration for Llama Stack, we can build an image with that configuration and the server source code: + +```bash +cat >/tmp/test-vllm-llama-stack/Containerfile.llama-stack-run-k8s < Date: Wed, 29 Jan 2025 21:32:22 -0500 Subject: [PATCH 2/3] Pod -> Deployment, NodePort -> ClusterIP Signed-off-by: Yuan Tang --- .../distributions/kubernetes_deployment.md | 119 ++++++++++-------- 1 file changed, 67 insertions(+), 52 deletions(-) diff --git a/docs/source/distributions/kubernetes_deployment.md b/docs/source/distributions/kubernetes_deployment.md index cd307c111e..ecdf37837d 100644 --- a/docs/source/distributions/kubernetes_deployment.md +++ b/docs/source/distributions/kubernetes_deployment.md @@ -32,40 +32,47 @@ type: Opaque data: token: "" --- -apiVersion: v1 -kind: Pod +apiVersion: apps/v1 +kind: Deployment metadata: name: vllm-server - labels: - app: vllm spec: - containers: - - name: llama-stack - image: - command: - - bash - - -c - - | - MODEL="meta-llama/Llama-3.2-1B-Instruct" - MODEL_PATH=/app/model/$(basename $MODEL) - huggingface-cli login --token $HUGGING_FACE_HUB_TOKEN - huggingface-cli download $MODEL --local-dir $MODEL_PATH --cache-dir $MODEL_PATH - python3 -m vllm.entrypoints.openai.api_server --model $MODEL_PATH --served-model-name $MODEL --port 8000 - ports: - - containerPort: 8000 - volumeMounts: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: vllm + template: + metadata: + labels: + app.kubernetes.io/name: vllm + spec: + containers: + - name: llama-stack + image: + command: + - bash + - -c + - | + 
MODEL="meta-llama/Llama-3.2-1B-Instruct" + MODEL_PATH=/app/model/$(basename $MODEL) + huggingface-cli login --token $HUGGING_FACE_HUB_TOKEN + huggingface-cli download $MODEL --local-dir $MODEL_PATH --cache-dir $MODEL_PATH + python3 -m vllm.entrypoints.openai.api_server --model $MODEL_PATH --served-model-name $MODEL --port 8000 + ports: + - containerPort: 8000 + volumeMounts: + - name: llama-storage + mountPath: /app/model + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + volumes: - name: llama-storage - mountPath: /app/model - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - volumes: - - name: llama-storage - persistentVolumeClaim: - claimName: vllm-models + persistentVolumeClaim: + claimName: vllm-models --- apiVersion: v1 kind: Service @@ -73,11 +80,12 @@ metadata: name: vllm-server spec: selector: - app: vllm + app.kubernetes.io/name: vllm ports: - - port: 8000 + - protocol: TCP + port: 8000 targetPort: 8000 - type: NodePort + type: ClusterIP EOF ``` @@ -135,27 +143,34 @@ spec: requests: storage: 1Gi --- -apiVersion: v1 -kind: Pod +apiVersion: apps/v1 +kind: Deployment metadata: - name: llama-stack-pod - labels: - app: llama-stack + name: llama-stack-server spec: - containers: - - name: llama-stack - image: localhost/llama-stack-run-k8s:latest - imagePullPolicy: IfNotPresent - command: ["python", "-m", "llama_stack.distribution.server.server", "--yaml-config", "/app/config.yaml"] - ports: - - containerPort: 5000 - volumeMounts: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: llama-stack + template: + metadata: + labels: + app.kubernetes.io/name: llama-stack + spec: + containers: + - name: llama-stack + image: localhost/llama-stack-run-k8s:latest + imagePullPolicy: IfNotPresent + command: ["python", "-m", "llama_stack.distribution.server.server", "--yaml-config", "/app/config.yaml"] + ports: + - containerPort: 5000 + volumeMounts: + - name: llama-storage + mountPath: /root/.llama + volumes: - name: llama-storage - mountPath: /root/.llama - volumes: - - name: llama-storage - persistentVolumeClaim: - claimName: llama-pvc + persistentVolumeClaim: + claimName: llama-pvc --- apiVersion: v1 kind: Service @@ -163,7 +178,7 @@ metadata: name: llama-stack-service spec: selector: - app: llama-stack + app.kubernetes.io/name: llama-stack ports: - protocol: TCP port: 5000 From 3bcc778e16b14fc816de7e9d05db918ff2cf8279 Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Wed, 29 Jan 2025 21:39:29 -0500 Subject: [PATCH 3/3] env var and fix logs Signed-off-by: Yuan Tang --- docs/source/distributions/kubernetes_deployment.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source/distributions/kubernetes_deployment.md b/docs/source/distributions/kubernetes_deployment.md index ecdf37837d..6cca2bc476 100644 --- a/docs/source/distributions/kubernetes_deployment.md +++ b/docs/source/distributions/kubernetes_deployment.md @@ -8,7 +8,7 @@ First, create a local Kubernetes cluster via Kind: kind create cluster --image kindest/node:v1.32.0 --name llama-stack-test ``` -Start vLLM server as a Kubernetes Pod and Service (remember to replace `` with your actual token and `` to meet your local system architecture): +Start vLLM server as a Kubernetes Pod and Service: ```bash cat <" + token: $(HF_TOKEN) --- apiVersion: apps/v1 kind: Deployment @@ -48,7 +48,7 @@ spec: spec: containers: - name: llama-stack - image: + image: $(VLLM_IMAGE) command: - bash - -c @@ 
-92,7 +92,7 @@ EOF We can verify that the vLLM server has started successfully via the logs (this might take a couple of minutes to download the model): ```bash -$ kubectl logs vllm-server +$ kubectl logs -l app.kubernetes.io/name=vllm ... INFO: Started server process [1] INFO: Waiting for application startup. @@ -190,7 +190,7 @@ EOF We can check that the LlamaStack server has started: ```bash -$ kubectl logs vllm-server +$ kubectl logs -l app.kubernetes.io/name=llama-stack ... INFO: Started server process [1] INFO: Waiting for application startup.
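Once the Llama Stack server logs look healthy, a minimal way to confirm the rollout and reach the server from your workstation is to wait for the Deployment to become available and then port-forward its Service. This sketch assumes the `llama-stack-server` Deployment and `llama-stack-service` Service defined earlier, running in the default namespace:

```bash
# Wait until the Deployment reports all replicas available.
kubectl wait --for=condition=Available deployment/llama-stack-server --timeout=300s

# Confirm the pod is Running.
kubectl get pods -l app.kubernetes.io/name=llama-stack

# Forward the ClusterIP Service to localhost so a client on your machine can reach it.
kubectl port-forward service/llama-stack-service 5000:5000
```

With the port-forward running, a Llama Stack client on your machine should be able to reach the server at `http://localhost:5000`.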