From 0330d6559168ff9741dafc059abede9a189f2f04 Mon Sep 17 00:00:00 2001
From: Henry Chen <1474479+chenhunghan@users.noreply.github.com>
Date: Thu, 10 Aug 2023 22:12:10 +0300
Subject: [PATCH] Add helm chart, add CI for lint/release of helm chart, update README for helm installation (#3)

Signed-off-by: Hung-Han (Henry) Chen
---
 .github/workflows/helm-chart-lint-test.yaml   | 45 +++++++++++++++++
 .github/workflows/helm-chart-release.yaml     | 32 +++++++++++++
 README.md                                     | 48 +++++++++++++++++++
 apps/text-inference-batcher-nodejs/Dockerfile |  2 +-
 .../text-inference-batcher-nodejs/Chart.yaml  |  6 +++
 .../templates/deployment.yaml                 | 43 +++++++++++++++++
 .../templates/service.yaml                    | 15 ++++++
 .../text-inference-batcher-nodejs/values.yaml | 38 +++++++++++++++
 8 files changed, 228 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/helm-chart-lint-test.yaml
 create mode 100644 .github/workflows/helm-chart-release.yaml
 create mode 100644 charts/text-inference-batcher-nodejs/Chart.yaml
 create mode 100644 charts/text-inference-batcher-nodejs/templates/deployment.yaml
 create mode 100644 charts/text-inference-batcher-nodejs/templates/service.yaml
 create mode 100644 charts/text-inference-batcher-nodejs/values.yaml

diff --git a/.github/workflows/helm-chart-lint-test.yaml b/.github/workflows/helm-chart-lint-test.yaml
new file mode 100644
index 0000000..67d9ec2
--- /dev/null
+++ b/.github/workflows/helm-chart-lint-test.yaml
@@ -0,0 +1,45 @@
+name: Lint and Test Charts
+
+on: pull_request
+
+jobs:
+  lint-test:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
+      - name: Set up Helm
+        uses: azure/setup-helm@v3
+        with:
+          version: v3.12.1
+
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.9'
+          check-latest: true
+
+      - name: Set up chart-testing
+        uses: helm/chart-testing-action@v2.4.0
+
+      - name: Run chart-testing (list-changed)
+        id: list-changed
+        run: |
+          changed=$(ct list-changed --target-branch ${{ github.event.repository.default_branch }})
+          if [[ -n "$changed" ]]; then
+            echo "changed=true" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Run chart-testing (lint)
+        if: steps.list-changed.outputs.changed == 'true'
+        run: ct lint --target-branch ${{ github.event.repository.default_branch }} --validate-maintainers=false
+
+      - name: Create kind cluster
+        if: steps.list-changed.outputs.changed == 'true'
+        uses: helm/kind-action@v1.7.0
+
+      - name: Run chart-testing (install)
+        if: steps.list-changed.outputs.changed == 'true'
+        run: ct install --target-branch ${{ github.event.repository.default_branch }}
diff --git a/.github/workflows/helm-chart-release.yaml b/.github/workflows/helm-chart-release.yaml
new file mode 100644
index 0000000..d60f594
--- /dev/null
+++ b/.github/workflows/helm-chart-release.yaml
@@ -0,0 +1,32 @@
+name: Release Charts
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - 'charts/text-inference-batcher-nodejs/**'
+      - '.github/workflows/helm-chart-release.yaml'
+
+jobs:
+  release:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
+      - name: Configure Git
+        run: |
+          git config user.name "$GITHUB_ACTOR"
+          git config user.email "$GITHUB_ACTOR@users.noreply.github.com"
+
+      - name: Run chart-releaser
+        uses: helm/chart-releaser-action@v1.5.0
+        with:
+          charts_dir: charts
+        env:
+          CR_TOKEN: "${{ secrets.GITHUB_TOKEN }}"
diff --git a/README.md b/README.md
index 371f68f..16774ba 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,54 @@
 `text-inference-batcher` is a high-performance router optimized for maximum throughput in text inference workload.
 
+## Quick Start
+
+Quickly deploy three inference backends using [ialacol](https://github.com/chenhunghan/ialacol) in the `llm` namespace.
+
+```sh
+helm repo add ialacol https://chenhunghan.github.io/ialacol
+helm repo update
+# the classic llama-2 13B
+helm install llama-2 ialacol/ialacol \
+  --set deployment.env.DEFAULT_MODEL_HG_REPO_ID="TheBloke/Llama-2-13B-chat-GGML" \
+  --set deployment.env.DEFAULT_MODEL_FILE="llama-2-13b-chat.ggmlv3.q4_0.bin" \
+  -n llm
+# orca mini fine-tuned llama-2 https://huggingface.co/psmathur/orca_mini_v3_13b
+helm install orca-mini ialacol/ialacol \
+  --set deployment.env.DEFAULT_MODEL_HG_REPO_ID="TheBloke/orca_mini_v3_13B-GGML" \
+  --set deployment.env.DEFAULT_MODEL_FILE="orca_mini_v3_13b.ggmlv3.q4_0.bin" \
+  -n llm
+# just another fine-tuned variant
+helm install stable-platypus2 ialacol/ialacol \
+  --set deployment.env.DEFAULT_MODEL_HG_REPO_ID="TheBloke/Stable-Platypus2-13B-GGML" \
+  --set deployment.env.DEFAULT_MODEL_FILE="stable-platypus2-13b.ggmlv3.q4_0.bin" \
+  -n llm
+```
+
+Install `text-inference-batcher`, pointing it at the upstreams.
+
+```sh
+helm repo add text-inference-batcher https://chenhunghan.github.io/text-inference-batcher
+helm repo update
+helm install tib text-inference-batcher/text-inference-batcher-nodejs \
+  --set deployment.env.UPSTREAMS="http://llama-2:8000,http://orca-mini:8000,http://stable-platypus2:8000" \
+  -n llm
+```
+
+Port-forward `text-inference-batcher` for testing.
+
+```sh
+kubectl port-forward svc/tib 8000:8000 -n llm
+```
+
+A single gateway now serves all your inference backends:
+
+```sh
+openai -k "sk-" -b http://localhost:8000/v1 -vv api chat_completions.create -m llama-2-13b-chat.ggmlv3.q4_0.bin -g user "Hello world!"
+openai -k "sk-" -b http://localhost:8000/v1 -vv api chat_completions.create -m orca_mini_v3_13b.ggmlv3.q4_0.bin -g user "Hello world!"
+openai -k "sk-" -b http://localhost:8000/v1 -vv api chat_completions.create -m stable-platypus2-13b.ggmlv3.q4_0.bin -g user "Hello world!"
+```
+
 ## Features
 
 - Max throughput by queuing, and continuous batching of incoming requests.
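The `openai` CLI above is only one client: the batcher exposes the upstreams behind a single OpenAI-compatible `/v1` endpoint, so any HTTP client works. A minimal sketch with `curl`, assuming the port-forward from the Quick Start is still running and the `llama-2` backend is serving the listed model file:

```sh
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer sk-" \
  -d '{
        "model": "llama-2-13b-chat.ggmlv3.q4_0.bin",
        "messages": [{"role": "user", "content": "Hello world!"}]
      }'
```

Swap the `model` field for one of the other model files to reach a different upstream through the same gateway, just as the `openai` examples above do with `-m`.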
diff --git a/apps/text-inference-batcher-nodejs/Dockerfile b/apps/text-inference-batcher-nodejs/Dockerfile
index 74e9376..fc07d64 100644
--- a/apps/text-inference-batcher-nodejs/Dockerfile
+++ b/apps/text-inference-batcher-nodejs/Dockerfile
@@ -43,4 +43,4 @@ RUN --mount=type=cache,target=/tmp/.npm \
     --cache /tmp/.npm
 ENV NODE_ENV production
 EXPOSE 8000
-CMD ["node", "dist/index.js"]
+CMD ["node", "apps/text-inference-batcher-nodejs/dist/index.js"]
diff --git a/charts/text-inference-batcher-nodejs/Chart.yaml b/charts/text-inference-batcher-nodejs/Chart.yaml
new file mode 100644
index 0000000..c15940d
--- /dev/null
+++ b/charts/text-inference-batcher-nodejs/Chart.yaml
@@ -0,0 +1,6 @@
+apiVersion: v2
+appVersion: 0.0.1
+description: A Helm chart for text-inference-batcher with Node.js runtime
+name: text-inference-batcher-nodejs
+type: application
+version: 0.0.1
diff --git a/charts/text-inference-batcher-nodejs/templates/deployment.yaml b/charts/text-inference-batcher-nodejs/templates/deployment.yaml
new file mode 100644
index 0000000..87b86ea
--- /dev/null
+++ b/charts/text-inference-batcher-nodejs/templates/deployment.yaml
@@ -0,0 +1,43 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ .Release.Name }}
+  namespace: {{ .Release.Namespace | quote }}
+  labels:
+    app.kubernetes.io/instance: {{ .Chart.Name }}
+    app.kubernetes.io/name: {{ .Release.Name }}
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/instance: {{ .Chart.Name }}
+      app.kubernetes.io/name: {{ .Release.Name }}
+  replicas: {{ .Values.replicas }}
+  template:
+    metadata:
+      name: {{ .Release.Name }}
+      labels:
+        app.kubernetes.io/instance: {{ .Chart.Name }}
+        app.kubernetes.io/name: {{ .Release.Name }}
+    spec:
+      containers:
+        - name: {{ .Release.Name }}
+          image: {{ .Values.deployment.image }}
+          resources:
+            {{- toYaml .Values.resources | nindent 12 }}
+          env:
+            - name: UPSTREAMS
+              value: {{ (.Values.deployment.env).UPSTREAMS | quote }}
+            - name: MAX_CONNECT_PER_UPSTREAM
+              value: {{ (.Values.deployment.env).MAX_CONNECT_PER_UPSTREAM | quote }}
+      tolerations:
+{{- if .Values.tolerations }}
+{{ toYaml .Values.tolerations | indent 8 }}
+{{- end }}
+      nodeSelector:
+{{- if .Values.nodeSelector }}
+{{ toYaml .Values.nodeSelector | indent 8 }}
+{{- end }}
+      affinity:
+{{- if .Values.affinity }}
+{{ toYaml .Values.affinity | indent 8 }}
+{{- end }}
diff --git a/charts/text-inference-batcher-nodejs/templates/service.yaml b/charts/text-inference-batcher-nodejs/templates/service.yaml
new file mode 100644
index 0000000..7793702
--- /dev/null
+++ b/charts/text-inference-batcher-nodejs/templates/service.yaml
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ .Release.Name }}
+  namespace: {{ .Release.Namespace | quote }}
+spec:
+  selector:
+    app.kubernetes.io/instance: {{ .Chart.Name }}
+    app.kubernetes.io/name: {{ .Release.Name }}
+  type: "{{ .Values.service.type }}"
+  ports:
+    - protocol: TCP
+      port: {{ .Values.service.port }}
+      targetPort: 8000
+      name: http
diff --git a/charts/text-inference-batcher-nodejs/values.yaml b/charts/text-inference-batcher-nodejs/values.yaml
new file mode 100644
index 0000000..a877b6c
--- /dev/null
+++ b/charts/text-inference-batcher-nodejs/values.yaml
@@ -0,0 +1,38 @@
+replicas: 1
+
+deployment:
+  image: ghcr.io/chenhunghan/text-inference-batcher-nodejs:latest
+  env:
+    # upstream URLs separated by commas, e.g. "http://llama-2-7b-0:8000,http://llama-2-7b-1:8000,http://llama-2-13b-0:8000"
+    UPSTREAMS: ""
+    MAX_CONNECT_PER_UPSTREAM: 1
+resources:
+  {}
+  # limits:
+  #   cpu: 100m
+  #   memory: 128Mi
+  # requests:
+  #   cpu: 100m
+  #   memory: 128Mi
+service:
+  type: ClusterIP
+  port: 8000
+  annotations: {}
+    # If using an AWS load balancer, you'll need to override the default 60s load balancer idle timeout
+    # service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "1200"
+nodeSelector: {}
+tolerations: []
+  # e.g.
+  # - key: "computing"
+  #   operator: "Exists"
+  #   effect: "NoSchedule"
+affinity: {}
+  # e.g.
+  # nodeAffinity:
+  #   requiredDuringSchedulingIgnoredDuringExecution:
+  #     nodeSelectorTerms:
+  #     - matchExpressions:
+  #       - key: computing-lb
+  #         operator: In
+  #         values:
+  #         - "true"