From 0330d6559168ff9741dafc059abede9a189f2f04 Mon Sep 17 00:00:00 2001
From: Henry Chen <1474479+chenhunghan@users.noreply.github.com>
Date: Thu, 10 Aug 2023 22:12:10 +0300
Subject: [PATCH] Add helm chart, add CI for lint/release of helm chart, update README for helm installation (#3)

Signed-off-by: Hung-Han (Henry) Chen
---
 .github/workflows/helm-chart-lint-test.yaml   | 45 +++++++++++++++++
 .github/workflows/helm-chart-release.yaml     | 32 +++++++++++++
 README.md                                     | 48 +++++++++++++++++++
 apps/text-inference-batcher-nodejs/Dockerfile |  2 +-
 .../text-inference-batcher-nodejs/Chart.yaml  |  6 +++
 .../templates/deployment.yaml                 | 43 +++++++++++++++++
 .../templates/service.yaml                    | 15 ++++++
 .../text-inference-batcher-nodejs/values.yaml | 38 +++++++++++++++
 8 files changed, 228 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/helm-chart-lint-test.yaml
 create mode 100644 .github/workflows/helm-chart-release.yaml
 create mode 100644 charts/text-inference-batcher-nodejs/Chart.yaml
 create mode 100644 charts/text-inference-batcher-nodejs/templates/deployment.yaml
 create mode 100644 charts/text-inference-batcher-nodejs/templates/service.yaml
 create mode 100644 charts/text-inference-batcher-nodejs/values.yaml

diff --git a/.github/workflows/helm-chart-lint-test.yaml b/.github/workflows/helm-chart-lint-test.yaml
new file mode 100644
index 0000000..67d9ec2
--- /dev/null
+++ b/.github/workflows/helm-chart-lint-test.yaml
@@ -0,0 +1,45 @@
+name: Lint and Test Charts
+
+on: pull_request
+
+jobs:
+  lint-test:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
+      - name: Set up Helm
+        uses: azure/setup-helm@v3
+        with:
+          version: v3.12.1
+
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.9'
+          check-latest: true
+
+      - name: Set up chart-testing
+        uses: helm/chart-testing-action@v2.4.0
+
+      - name: Run chart-testing (list-changed)
+        id: list-changed
+        run: |
+          changed=$(ct list-changed --target-branch ${{ github.event.repository.default_branch }})
+          if [[ -n "$changed" ]]; then
+            echo "changed=true" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Run chart-testing (lint)
+        if: steps.list-changed.outputs.changed == 'true'
+        run: ct lint --target-branch ${{ github.event.repository.default_branch }} --validate-maintainers=false
+
+      - name: Create kind cluster
+        if: steps.list-changed.outputs.changed == 'true'
+        uses: helm/kind-action@v1.7.0
+
+      - name: Run chart-testing (install)
+        if: steps.list-changed.outputs.changed == 'true'
+        run: ct install --target-branch ${{ github.event.repository.default_branch }}
diff --git a/.github/workflows/helm-chart-release.yaml b/.github/workflows/helm-chart-release.yaml
new file mode 100644
index 0000000..d60f594
--- /dev/null
+++ b/.github/workflows/helm-chart-release.yaml
@@ -0,0 +1,32 @@
+name: Release Charts
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - 'charts/text-inference-batcher-nodejs/**'
+      - '.github/workflows/helm-chart-release.yaml'
+
+jobs:
+  release:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
+      - name: Configure Git
+        run: |
+          git config user.name "$GITHUB_ACTOR"
+          git config user.email "$GITHUB_ACTOR@users.noreply.github.com"
+
+      - name: Run chart-releaser
+        uses: helm/chart-releaser-action@v1.5.0
+        with:
+          charts_dir: charts
+        env:
+          CR_TOKEN: "${{ secrets.GITHUB_TOKEN }}"
diff --git a/README.md b/README.md
index 371f68f..16774ba 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,54 @@
 `text-inference-batcher` is a high-performance router optimized for maximum throughput in text inference workload.
 
+## Quick Start
+
+Quickly deploy three inference backends using [ialacol](https://github.com/chenhunghan/ialacol) in the `llm` namespace.
+
+```sh
+helm repo add ialacol https://chenhunghan.github.io/ialacol
+helm repo update
+# the classic llama-2 13B
+helm install llama-2 ialacol/ialacol \
+  --set deployment.env.DEFAULT_MODEL_HG_REPO_ID="TheBloke/Llama-2-13B-chat-GGML" \
+  --set deployment.env.DEFAULT_MODEL_FILE="llama-2-13b-chat.ggmlv3.q4_0.bin" \
+  -n llm
+# orca mini fine-tuned llama-2 https://huggingface.co/psmathur/orca_mini_v3_13b
+helm install orca-mini ialacol/ialacol \
+  --set deployment.env.DEFAULT_MODEL_HG_REPO_ID="TheBloke/orca_mini_v3_13B-GGML" \
+  --set deployment.env.DEFAULT_MODEL_FILE="orca_mini_v3_13b.ggmlv3.q4_0.bin" \
+  -n llm
+# just another fine-tuned variant
+helm install stable-platypus2 ialacol/ialacol \
+  --set deployment.env.DEFAULT_MODEL_HG_REPO_ID="TheBloke/Stable-Platypus2-13B-GGML" \
+  --set deployment.env.DEFAULT_MODEL_FILE="stable-platypus2-13b.ggmlv3.q4_0.bin" \
+  -n llm
+```
+
+Install `text-inference-batcher`, pointing it at the upstreams.
+
+```sh
+helm repo add text-inference-batcher https://chenhunghan.github.io/text-inference-batcher
+helm repo update
+helm install tib text-inference-batcher/text-inference-batcher-nodejs \
+  --set deployment.env.UPSTREAMS="http://llama-2:8000,http://orca-mini:8000,http://stable-platypus2:8000" \
+  -n llm
+```
+
+Port-forward `text-inference-batcher` for testing.
+
+```sh
+kubectl port-forward svc/tib 8000:8000 -n llm
+```
+
+A single gateway now serves all your inference backends:
+
+```sh
+openai -k "sk-" -b http://localhost:8000/v1 -vv api chat_completions.create -m llama-2-13b-chat.ggmlv3.q4_0.bin -g user "Hello world!"
+openai -k "sk-" -b http://localhost:8000/v1 -vv api chat_completions.create -m orca_mini_v3_13b.ggmlv3.q4_0.bin -g user "Hello world!"
+openai -k "sk-" -b http://localhost:8000/v1 -vv api chat_completions.create -m stable-platypus2-13b.ggmlv3.q4_0.bin -g user "Hello world!"
+```
+
 ## Features
 
 - Max throughput by queuing, and continuous batching of incoming requests.
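The `openai` CLI above is only one client: the batcher exposes the upstreams behind a single OpenAI-compatible `/v1` endpoint, so any HTTP client works. A minimal sketch with `curl`, assuming the port-forward from the Quick Start is still running and the `llama-2` backend is serving the listed model file:

```sh
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer sk-" \
  -d '{
        "model": "llama-2-13b-chat.ggmlv3.q4_0.bin",
        "messages": [{"role": "user", "content": "Hello world!"}]
      }'
```

Swap the `model` field for one of the other model files to reach a different upstream through the same gateway, just as the `openai` examples above do with `-m`.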
diff --git a/apps/text-inference-batcher-nodejs/Dockerfile b/apps/text-inference-batcher-nodejs/Dockerfile
index 74e9376..fc07d64 100644
--- a/apps/text-inference-batcher-nodejs/Dockerfile
+++ b/apps/text-inference-batcher-nodejs/Dockerfile
@@ -43,4 +43,4 @@ RUN --mount=type=cache,target=/tmp/.npm \
     --cache /tmp/.npm
 ENV NODE_ENV production
 EXPOSE 8000
-CMD ["node", "dist/index.js"]
+CMD ["node", "apps/text-inference-batcher-nodejs/dist/index.js"]
diff --git a/charts/text-inference-batcher-nodejs/Chart.yaml b/charts/text-inference-batcher-nodejs/Chart.yaml
new file mode 100644
index 0000000..c15940d
--- /dev/null
+++ b/charts/text-inference-batcher-nodejs/Chart.yaml
@@ -0,0 +1,6 @@
+apiVersion: v2
+appVersion: 0.0.1
+description: A Helm chart for text-inference-batcher with Node.js runtime
+name: text-inference-batcher-nodejs
+type: application
+version: 0.0.1
diff --git a/charts/text-inference-batcher-nodejs/templates/deployment.yaml b/charts/text-inference-batcher-nodejs/templates/deployment.yaml
new file mode 100644
index 0000000..87b86ea
--- /dev/null
+++ b/charts/text-inference-batcher-nodejs/templates/deployment.yaml
@@ -0,0 +1,43 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ .Release.Name }}
+  namespace: {{ .Release.Namespace | quote }}
+  labels:
+    app.kubernetes.io/instance: {{ .Chart.Name }}
+    app.kubernetes.io/name: {{ .Release.Name }}
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/instance: {{ .Chart.Name }}
+      app.kubernetes.io/name: {{ .Release.Name }}
+  replicas: {{ .Values.replicas }}
+  template:
+    metadata:
+      name: {{ .Release.Name }}
+      labels:
+        app.kubernetes.io/instance: {{ .Chart.Name }}
+        app.kubernetes.io/name: {{ .Release.Name }}
+    spec:
+      containers:
+        - name: {{ .Release.Name }}
+          image: {{ .Values.deployment.image }}
+          resources:
+            {{- toYaml .Values.resources | nindent 12 }}
+          env:
+            - name: UPSTREAMS
+              value: {{ (.Values.deployment.env).UPSTREAMS | quote }}
+            - name: MAX_CONNECT_PER_UPSTREAM
+              value: {{ (.Values.deployment.env).MAX_CONNECT_PER_UPSTREAM | quote }}
+      tolerations:
+{{- if .Values.tolerations }}
+{{ toYaml .Values.tolerations | indent 8 }}
+{{- end }}
+      nodeSelector:
+{{- if .Values.nodeSelector }}
+{{ toYaml .Values.nodeSelector | indent 8 }}
+{{- end }}
+      affinity:
+{{- if .Values.affinity }}
+{{ toYaml .Values.affinity | indent 8 }}
+{{- end }}
diff --git a/charts/text-inference-batcher-nodejs/templates/service.yaml b/charts/text-inference-batcher-nodejs/templates/service.yaml
new file mode 100644
index 0000000..7793702
--- /dev/null
+++ b/charts/text-inference-batcher-nodejs/templates/service.yaml
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ .Release.Name }}
+  namespace: {{ .Release.Namespace | quote }}
+spec:
+  selector:
+    app.kubernetes.io/instance: {{ .Chart.Name }}
+    app.kubernetes.io/name: {{ .Release.Name }}
+  type: "{{ .Values.service.type }}"
+  ports:
+    - protocol: TCP
+      port: {{ .Values.service.port }}
+      targetPort: 8000
+      name: http
diff --git a/charts/text-inference-batcher-nodejs/values.yaml b/charts/text-inference-batcher-nodejs/values.yaml
new file mode 100644
index 0000000..a877b6c
--- /dev/null
+++ b/charts/text-inference-batcher-nodejs/values.yaml
@@ -0,0 +1,38 @@
+replicas: 1
+
+deployment:
+  image: ghcr.io/chenhunghan/text-inference-batcher-nodejs:latest
+  env:
+    # upstream URLs separated by commas, e.g. "http://llama-2-7b-0:8000,http://llama-2-7b-1:8000,http://llama-2-13b-0:8000"
+    UPSTREAMS: ""
+    MAX_CONNECT_PER_UPSTREAM: 1
+resources:
+  {}
+  # limits:
+  #   cpu: 100m
+  #   memory: 128Mi
+  # requests:
+  #   cpu: 100m
+  #   memory: 128Mi
+service:
+  type: ClusterIP
+  port: 8000
+  annotations: {}
+    # If using an AWS load balancer, you'll need to override the default 60s load balancer idle timeout
+    # service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "1200"
+nodeSelector: {}
+tolerations: []
+  # e.g.
+  # - key: "computing"
+  #   operator: "Exists"
+  #   effect: "NoSchedule"
+affinity: {}
+  # e.g.
+  # nodeAffinity:
+  #   requiredDuringSchedulingIgnoredDuringExecution:
+  #     nodeSelectorTerms:
+  #     - matchExpressions:
+  #       - key: computing-lb
+  #         operator: In
+  #         values:
+  #         - "true"