From 041ec63732b261401f21101b7df8ae7de9a75fa0 Mon Sep 17 00:00:00 2001 From: greg pereira Date: Mon, 25 Nov 2024 13:52:05 -0800 Subject: [PATCH] Implementing healtcheck sidecar and probe Signed-off-by: greg pereira --- .env.github.example | 2 + .../pr-healthcheck-sidecar-image.yml | 167 ++++++++++ .gitignore | 6 +- Makefiles/containers-base/Makefile | 5 + Makefiles/local/Makefile | 42 +++ .../healthcheck-sidecar/kustomization.yaml | 11 + .../k8s/base/healthcheck-sidecar/service.yaml | 14 + deploy/k8s/overlays/kind/kustomization.yaml | 8 +- .../openshift/common/kustomization.yaml | 4 + .../openshift/prod/kustomization.yaml | 25 +- .../overlays/openshift/qa/kustomization.yaml | 24 ++ docs/healthcheck_sidecar.md | 99 ++++++ healthcheck-sidecar/Containerfile | 29 ++ healthcheck-sidecar/requirements.txt | 1 + healthcheck-sidecar/sidecar_script.py | 296 ++++++++++++++++++ healthcheck-sidecar/stubbed_model_server.py | 61 ++++ src/Containerfile | 4 +- src/healthcheck-probe.sh | 36 +++ 18 files changed, 822 insertions(+), 12 deletions(-) create mode 100644 .github/workflows/pr-healthcheck-sidecar-image.yml create mode 100644 deploy/k8s/base/healthcheck-sidecar/kustomization.yaml create mode 100644 deploy/k8s/base/healthcheck-sidecar/service.yaml create mode 100644 deploy/k8s/overlays/openshift/common/kustomization.yaml create mode 100644 docs/healthcheck_sidecar.md create mode 100644 healthcheck-sidecar/Containerfile create mode 100644 healthcheck-sidecar/requirements.txt create mode 100644 healthcheck-sidecar/sidecar_script.py create mode 100644 healthcheck-sidecar/stubbed_model_server.py create mode 100755 src/healthcheck-probe.sh diff --git a/.env.github.example b/.env.github.example index d6841e6c..1fe35e69 100644 --- a/.env.github.example +++ b/.env.github.example @@ -19,3 +19,5 @@ IL_MERLINITE_MODEL_NAME= IL_ENABLE_DEV_MODE=true #Enable this option if you want to enable UI features that helps in development, such as form Auto-Fill feature. 
NEXT_PUBLIC_EXPERIMENTAL_FEATURES=false + +SLACK_WEBHOOK_URL= diff --git a/.github/workflows/pr-healthcheck-sidecar-image.yml b/.github/workflows/pr-healthcheck-sidecar-image.yml new file mode 100644 index 00000000..b5b1a72b --- /dev/null +++ b/.github/workflows/pr-healthcheck-sidecar-image.yml @@ -0,0 +1,167 @@ +name: Publish QA Healthcheck Sidecar Container Images + +on: + push: + branches: + - main + paths: + - "healthcheck-sidecar/*" + - "!healthcheck-sidecar/stubbed_model_server.py" + +env: + GHCR_REGISTRY: ghcr.io + GHCR_HS_IMAGE_NAME: "${{ github.repository }}/healthcheck-sidecar" + QUAY_REGISTRY: quay.io + QUAY_HS_IMAGE_NAME: instructlab-ui/healthcheck-sidecar + +jobs: + build_and_publish_hs_qa_image: + name: Push QA Healthcheck Sidecar container image to GHCR and QUAY + runs-on: ubuntu-latest + environment: registry-creds + permissions: + packages: write + contents: write + attestations: write + id-token: write + + steps: + - name: Check out the repo + uses: actions/checkout@v4 + with: + token: ${{ secrets.BOT_PAT }} + ref: 'main' + + - name: Log in to the GHCR container image registry + uses: docker/login-action@v3 + with: + registry: "${{ env.GHCR_REGISTRY }}" + username: "${{ github.actor }}" + password: "${{ secrets.GITHUB_TOKEN }}" + + - name: Log in to the Quay container image registry + uses: docker/login-action@v3 + with: + registry: "${{ env.QUAY_REGISTRY }}" + username: "${{ secrets.QUAY_USERNAME }}" + password: "${{ secrets.QUAY_TOKEN }}" + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Cache Docker layers + uses: actions/cache@v4 + with: + path: /tmp/.buildx-cache + key: "${{ runner.os }}-buildx-${{ github.sha }}" + restore-keys: | + "${{ runner.os }}-buildx-" + + - name: Get Pull Request Number from Commit + id: get_pr_number + uses: actions/github-script@v7 + with: + script: | + console.log("Repository owner:", context.repo.owner); + console.log("Repository name:", context.repo.repo); + console.log("Current 
commit SHA:", context.sha); + + const prs = await github.rest.pulls.list({ + owner: context.repo.owner, + repo: context.repo.repo, + state: 'closed', + sort: 'updated', + direction: 'desc' + }); + console.log("Number of closed PRs fetched:", prs.data.length); + + for (const pr of prs.data) { + console.log("Checking PR #", pr.number, "- Merged:"); + if (pr.merged_at != "") { + console.log("Found merged PR:", pr.number); + return pr.number; + } + } + + console.log("No merged PR found in the recent closed PRs."); + return ''; + + - name: Extract GHCR metadata (tags, labels) for HS image + id: ghcr_hs_meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.GHCR_REGISTRY }}/${{ env.GHCR_HS_IMAGE_NAME }} + + - name: Extract Quay metadata (tags, labels) for HS image + id: quay_hs_meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.QUAY_REGISTRY }}/${{ env.QUAY_HS_IMAGE_NAME }} + + - name: Build and push HS image to GHCR + id: push-hs-ghcr + uses: docker/build-push-action@v6 + with: + context: healthcheck-sidecar + push: true + tags: |- + "${{ steps.ghcr_hs_meta.outputs.tags }}" + "${{ env.GHCR_REGISTRY }}/${{ env.GHCR_HS_IMAGE_NAME }}:pr-${{ steps.get_pr_number.outputs.result }}" + labels: ${{ steps.ghcr_hs_meta.outputs.labels }} + platforms: linux/amd64,linux/arm64 + cache-from: type=gha + cache-to: type=gha,mode=max + file: healthcheck-sidecar/Containerfile + + - name: Generate GHCR artifact attestation + uses: actions/attest-build-provenance@v1 + with: + subject-name: ${{ env.GHCR_REGISTRY }}/${{ env.GHCR_HS_IMAGE_NAME}} + subject-digest: ${{ steps.push-hs-ghcr.outputs.digest }} + push-to-registry: true + + - name: Build and push HS image to QUAY + id: push-hs-quay + uses: docker/build-push-action@v6 + with: + context: healthcheck-sidecar + push: true + tags: |- + "${{ steps.quay_hs_meta.outputs.tags }}" + "${{ env.QUAY_REGISTRY }}/${{ env.QUAY_HS_IMAGE_NAME }}:pr-${{ steps.get_pr_number.outputs.result }}" + labels: ${{ 
steps.quay_hs_meta.outputs.labels }} + platforms: linux/amd64,linux/arm64 + cache-from: type=gha + cache-to: type=gha,mode=max + file: healthcheck-sidecar/Containerfile + + - name: Generate QA HS Quay artifact attestation + uses: actions/attest-build-provenance@v1 + with: + subject-name: ${{ env.QUAY_REGISTRY }}/${{ env.QUAY_HS_IMAGE_NAME}} + subject-digest: ${{ steps.push-hs-quay.outputs.digest }} + push-to-registry: true + + - name: Update coderefs before code changes + run: |- + git pull --ff-only + + - name: Update QA Quay HS image + id: update_qa_hs_manifest_image + env: + PR_TAG: "pr-${{ steps.get_pr_number.outputs.result }}" + run: |- + sudo wget https://github.com/mikefarah/yq/releases/download/v4.34.1/yq_linux_amd64 -O /usr/local/bin/yq + sudo chmod +x /usr/local/bin/yq + yq -i ' + (.images[] | select(.name == "quay.io/${{ env.QUAY_HS_IMAGE_NAME }}") | .newTag) = env(PR_TAG) + ' deploy/k8s/overlays/openshift/qa/kustomization.yaml + + - name: Commit and push bump QA HS Image manifest + run: |- + git config user.name "platform-engineering-bot" + git config user.email "platform-engineering@redhat.com" + git add deploy/k8s/overlays/openshift/qa/kustomization.yaml + git commit -m "[CI AUTOMATION]: Bumping QA HS image to tag: pr-${{ steps.get_pr_number.outputs.result }}" -s + git push origin main + diff --git a/.gitignore b/.gitignore index 596d5356..aca4a9ed 100644 --- a/.gitignore +++ b/.gitignore @@ -20,8 +20,7 @@ auth.log tsconfig.tsbuildinfo .idea .vscode -ui.pid -pathservice.pid +*.pid /test-results/ /playwright-report/ /blob-report/ @@ -34,4 +33,5 @@ deploy/k8s/overlays/openshift/umami/qa/umami-secret.yaml deploy/k8s/overlays/openshift/umami/prod/umami-secret.yaml deploy/podman/github/secret.yaml deploy/podman/native/secret.yaml - +healthcheck-sidecar/sidecar.log +*/venv diff --git a/Makefiles/containers-base/Makefile b/Makefiles/containers-base/Makefile index 7cdb97fb..2d2e44be 100644 --- a/Makefiles/containers-base/Makefile +++ 
b/Makefiles/containers-base/Makefile @@ -36,3 +36,8 @@ ps-image: validate-container-engine pathservice/Containerfile ## Build container $(ECHO_PREFIX) printf " %-12s pathservice/Containerfile\n" "[$(CONTAINER_ENGINE), linux/$(TARGET_IMAGE_ARCH)]" $(CMD_PREFIX) $(CONTAINER_ENGINE) build --platform linux/$(TARGET_IMAGE_ARCH) -f pathservice/Containerfile -t quay.io/instructlab-ui/pathservice:$(TAG) . $(CMD_PREFIX) $(CONTAINER_ENGINE) tag quay.io/instructlab-ui/pathservice:$(TAG) quay.io/instructlab-ui/pathservice:main + +healthcheck-sidecar-image: validate-container-engine healthcheck-sidecar/Containerfile ## Build container image for the InstructLab Healthcheck-Sidecar + $(ECHO_PREFIX) printf " %-12s healthcheck-sidecar/Containerfile\n" "[$(CONTAINER_ENGINE), linux/$(TARGET_IMAGE_ARCH)]" + $(CMD_PREFIX) $(CONTAINER_ENGINE) build -f healthcheck-sidecar/Containerfile -t quay.io/instructlab-ui/healthcheck-sidecar:$(TAG) healthcheck-sidecar + $(CMD_PREFIX) $(CONTAINER_ENGINE) tag quay.io/instructlab-ui/healthcheck-sidecar:$(TAG) quay.io/instructlab-ui/healthcheck-sidecar:main diff --git a/Makefiles/local/Makefile b/Makefiles/local/Makefile index c75d61a6..0c0ed9d3 100644 --- a/Makefiles/local/Makefile +++ b/Makefiles/local/Makefile @@ -2,6 +2,16 @@ # ║ Local Development Targets ║ # ╚══════════════════════════════════════════════════════════╝ +#################### DEPENDENCY FUNCTIONS #################### + +.PHONY: check-python3 +check-python3: + $(CMD_PREFIX) if ! 
command -v python3 >/dev/null 2>&1; then \ + echo "Error: 'python3' is not installed."; \ + echo "Please visit https://www.python.org/downloads/ for installation instructions."; \ + exit 1; \ + fi + #################### DEPLOYMENT FUNCTIONS #################### .PHONY: start-dev-local @@ -17,3 +27,35 @@ stop-dev-local: ## Stop the npm and pathservice local instances $(CMD_PREFIX) if [ -f ui.pid ]; then kill -2 `cat ui.pid` && rm ui.pid || echo "Failed to stop ui"; fi $(CMD_PREFIX) if [ -f pathservice.pid ]; then kill -2 `cat pathservice.pid` && rm pathservice.pid || echo "Failed to stop pathservice"; fi $(CMD_PREFIX) echo "Development environment stopped." + +.PHONY: start-healthcheck-sidecar-local +start-healthcheck-sidecar-local: check-python3 ## Start the healthcheck-sidecar local instance + $(CMD_PREFIX) echo "Starting healthcheck-sidecar..." + $(CMD_PREFIX) cd healthcheck-sidecar; \ + python3 -m venv venv; \ + source venv/bin/activate; \ + venv/bin/python -m pip install -r requirements.txt; \ + python sidecar_script.py & echo $$! > healthcheck-sidecar.pid + $(CMD_PREFIX) echo "Healthcheck-Sidecar started at http://localhost:8080/health." + +.PHONY: stop-healthcheck-sidecar-local +stop-healthcheck-sidecar-local: ## Stop the healtcheck-sidecar local instance + $(CMD_PREFIX) echo "Stopping healthcheck-sidecar..." + $(CMD_PREFIX) if [ -f healthcheck-sidecar.pid ]; then kill -2 `cat healthcheck-sidecar.pid` && rm healthcheck-sidecar.pid || echo "Failed to stop healthcheck-sidecar"; fi + $(CMD_PREFIX) echo "Healthcheck-Sidecar stopped." + +.PHONY: start-healthcheck-sidecar-model-server-local +start-healthcheck-sidecar-model-server-local: check-python3 ## Start the healthcheck-sidecar model server instance + $(CMD_PREFIX) echo "Starting Stubbed model server..." + $(CMD_PREFIX) cd healthcheck-sidecar; \ + python3 -m venv venv; \ + source venv/bin/activate; \ + venv/bin/python -m pip install -r requirements.txt; \ + python stubbed_model_server.py & echo $$! 
> stubbed-model-server.pid + $(CMD_PREFIX) echo "Stubbed Model Server started at http://localhost:8001, serving `/health` and `/v1/models`." + +.PHONY: stop-healthcheck-sidecar-model-server-local +stop-healthcheck-sidecar-model-server-local: ## Stop the healtcheck-sidecar model server local instance + $(CMD_PREFIX) echo "Stopping stubbed model server..." + $(CMD_PREFIX) if [ -f stubbed-model-server.pid ]; then kill -2 `cat stubbed-model-server.pid` && rm stubbed-model-server.pid || echo "Failed to stop stubbed model server"; fi + $(CMD_PREFIX) echo "Stubbed Model Server stopped." diff --git a/deploy/k8s/base/healthcheck-sidecar/kustomization.yaml b/deploy/k8s/base/healthcheck-sidecar/kustomization.yaml new file mode 100644 index 00000000..96ab7ccc --- /dev/null +++ b/deploy/k8s/base/healthcheck-sidecar/kustomization.yaml @@ -0,0 +1,11 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: instructlab +resources: + - service.yaml +labels: + - includeSelectors: true + pairs: + app.kubernetes.io/component: ui + app.kubernetes.io/instance: ui + app.kubernetes.io/name: ui diff --git a/deploy/k8s/base/healthcheck-sidecar/service.yaml b/deploy/k8s/base/healthcheck-sidecar/service.yaml new file mode 100644 index 00000000..6f585c9b --- /dev/null +++ b/deploy/k8s/base/healthcheck-sidecar/service.yaml @@ -0,0 +1,14 @@ +--- +apiVersion: v1 +kind: Service +metadata: + name: healthcheck-sidecar + labels: + app.kubernetes.io/component: ui +spec: + ports: + - name: web + port: 8080 + selector: + app.kubernetes.io/name: ui + type: ClusterIP diff --git a/deploy/k8s/overlays/kind/kustomization.yaml b/deploy/k8s/overlays/kind/kustomization.yaml index dfed5672..138344c7 100644 --- a/deploy/k8s/overlays/kind/kustomization.yaml +++ b/deploy/k8s/overlays/kind/kustomization.yaml @@ -24,10 +24,7 @@ patches: patch: |- - op: replace path: /spec/template/spec/containers/0/image - value: quay.io/instructlab-ui/ui:main # Override this image if you want to use a 
different UI image - - op: replace - path: /spec/template/spec/containers/0/imagePullPolicy - value: Always + value: quay.io/instructlab-ui/ui:main # Override the pathservice image for Kind deployment - target: @@ -37,6 +34,3 @@ patches: - op: replace path: /spec/template/spec/containers/0/image value: quay.io/instructlab-ui/pathservice:main # Override this image if you want to use a different pathservice image - - op: replace - path: /spec/template/spec/containers/0/imagePullPolicy - value: Always diff --git a/deploy/k8s/overlays/openshift/common/kustomization.yaml b/deploy/k8s/overlays/openshift/common/kustomization.yaml new file mode 100644 index 00000000..c6ac9b9b --- /dev/null +++ b/deploy/k8s/overlays/openshift/common/kustomization.yaml @@ -0,0 +1,4 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: +- ../../../base/healthcheck-sidecar/ diff --git a/deploy/k8s/overlays/openshift/prod/kustomization.yaml b/deploy/k8s/overlays/openshift/prod/kustomization.yaml index 87c83cc4..60c84c20 100644 --- a/deploy/k8s/overlays/openshift/prod/kustomization.yaml +++ b/deploy/k8s/overlays/openshift/prod/kustomization.yaml @@ -3,6 +3,7 @@ kind: Kustomization namespace: instructlab resources: - ../../../base + - ../common - issuer.yaml - certificate.yaml - prod.env.sealedsecret.yaml @@ -23,7 +24,6 @@ patches: - op: replace path: /spec/tls/0/hosts/0 value: ui.instructlab.ai - - target: kind: Deployment name: ui @@ -31,8 +31,31 @@ patches: - op: replace path: /spec/template/spec/containers/0/envFrom/0/secretRef/name value: prod.env + - op: add + path: /spec/template/spec/containers/0/readinessProbe + value: + exec: + command: + - sh + - -c + - "/opt/app-root/src/src/healthcheck-probe.sh" + initialDelaySeconds: 5 + periodSeconds: 10 + - op: add + path: /spec/template/spec/containers/- + value: + name: model-endpoint-healthcheck-sidecar + image: quay.io/instructlab-ui/healthcheck-sidecar + # imagePullPolicy: Always # until image lands in quay cannot 
use pullPolicy: Always + ports: + - containerPort: 8080 + envFrom: + - secretRef: + name: prod.env images: - name: quay.io/instructlab-ui/ui newTag: v1.0.0-beta.3 - name: quay.io/instructlab-ui/pathservice newTag: v1.0.0-beta.3 + - name: quay.io/instructlab-ui/healthcheck-sidecar + newTag: main # not currently available in our quay org diff --git a/deploy/k8s/overlays/openshift/qa/kustomization.yaml b/deploy/k8s/overlays/openshift/qa/kustomization.yaml index 729e3cb1..5c11ecd3 100644 --- a/deploy/k8s/overlays/openshift/qa/kustomization.yaml +++ b/deploy/k8s/overlays/openshift/qa/kustomization.yaml @@ -3,6 +3,7 @@ kind: Kustomization namespace: instructlab resources: - ../../../base + - ../common - issuer.yaml - certificate.yaml - qa.env.sealedsecret.yaml @@ -30,8 +31,31 @@ patches: - op: replace path: /spec/template/spec/containers/0/envFrom/0/secretRef/name value: qa.env + - op: add + path: /spec/template/spec/containers/0/readinessProbe + value: + exec: + command: + - sh + - -c + - "/opt/app-root/src/src/healthcheck-probe.sh" + initialDelaySeconds: 5 + periodSeconds: 10 + - op: add + path: /spec/template/spec/containers/- + value: + name: model-endpoint-healthcheck-sidecar + image: quay.io/instructlab-ui/healthcheck-sidecar + imagePullPolicy: Always # until image lands in quay cannot use pullPolicy: Always + ports: + - containerPort: 8080 + envFrom: + - secretRef: + name: qa.env images: - name: quay.io/instructlab-ui/ui newTag: pr-435 - name: quay.io/instructlab-ui/pathservice newTag: pr-435 + - name: quay.io/instructlab-ui/healthcheck-sidecar + newTag: main diff --git a/docs/healthcheck_sidecar.md b/docs/healthcheck_sidecar.md new file mode 100644 index 00000000..ee9a4d44 --- /dev/null +++ b/docs/healthcheck_sidecar.md @@ -0,0 +1,99 @@ +# Healthcheck Sidecar + +The healthcheck-sidecar is a simple python based container image to help monitor the model endpoints, to help maintainers identify outages. + +## How Does It Work? 
+ +The sidecar continually polls the `/health` and `/v1/models` enpdoints on the address stored in the `IL_GRANITE_API` env variable. +It the serves those results to `http://localhost:8080/health`. In this way it can get picked up by other containers in the pod, +or could forward thes datapoints to anywhere in your cluster by backing its deployment with a `service`. Whenever the sidecar can +no longer recieves a `200` status code on either of those endpoints (`/health` and `/v1/models`), it will parse the status data +into a slack notification template and posted to the `SLACK_WEBHOOK_URL`. + +You can get a `SLACK_WEBHOOK_URL` for yourself by [creating a slack application](https://api.slack.com/apps/new), installing and +authorizing the app, and then enabling incoming webhooks. This process is a lot easier than it sounds, Slack does a fantastic job +[documenting this process](https://api.slack.com/quickstart), and providing tools such as the +[block kit builder](https://app.slack.com/block-kit-builder) to help you design the message templates, if you want extend them further. + +The UI container now has the [healthcheck-probe.sh](../src/healthcheck-probe.sh) built into it. This script will run as a readiness +probe, IE. the UI container will not come online if it determines its dependent model endpoints are down. + +## What Does It Require? + +The script requires 2 values, `IL_GRANITE_API` and `IL_GRANITE_MODEL_NAME`. These should be set in your `.env` file at the root of the +repo and `source`d into your environment. There is also an optional `SLACK_WEBHOOK_URL` environment variable. If you set the +`SLACK_WEBHOOK_URL` env variable, the healthcheck-sidecar will post to slack channel or user backed by that webhook on `outage` and +`resolution` incidents. 
+ +## Building the image + +The simplest way to build the image is to use the `make healthcheck-sidecar-image` make target from the root of the repo, or you +can build it from the source: + +```bash +podman build \ + --platform "linux/$(uname -m)" \ + -f healthcheck-sidecar/Containerfile \ + -t quay.io/instructlab-ui/healthcheck-sidecar:main \ + healthcheck-sidecar +``` + +And you can run the image with: + +```bash +podman run \ + --platform "linux/$(uname -m)" \ + -e SLACK_NOTIFICATION_WEBHOOK="$(SLACK_NOTIFICATION_WEBHOOK)" \ + -e IL_GRANITE_API="$(IL_GRANITE_API)" \ + -e IL_GRANITE_MODEL_NAME="$(IL_GRANITE_MODEL_NAME)" \ + --user 1001750000 \ + quay.io/instructlab-ui/healthcheck-sidecar:main +``` + +You don't have run with user `1001750000`, in fact the default user for the container image is `default`. However in Openshift, it will +run with an ephemeral user in the valid range due to the `restricted-v2` scc. As such I find it helpful to include for testing purposes. + +## Local Development + +In particular, the process of testing the outage and resolution incidents feature was quite difficult. To expedite development, there +is a [stubbed python model server](../healthcheck-sidecar/stubbed_model_server.py) that is complient with the OpenAI spec, and with +serving runtimes like llamacpp and vllm. You can have either one, or both running at the time, it should not break either script. +Since this script is only meant for debugging purposes, it was not included in the contianer image. + +However this process generates a lot of noise in the way of slack messages so if you do want to work with this I suggest you +[comment out the notification publishing logic](../healthcheck-sidecar/sidecar_script.py#L247-254). + +We currently do not support this in `kind` for local development at this time. If there is interest we could update the deployments +to support that. 
+ +## Prod and QA Deployment + +The `kustomization.yaml` files in the Prod and QA overlays both contain 2 patches realted to the healthcheck sidecar. The first +will patch in the `readinessProbe` into the UI container, ensuring that it is dependendnt on the results of the +[healthcheck-probe.sh](../src/healthcheck-probe.sh) script. They next patch the sidecar itself into the UI deployment. Finally, +both Openshift overlay kustomizations also include a reference to the common directory, which will add the healthcheck-sidecar +`service` to the manifest list. + +## Testing the Slack Notification Feature + +Begin by setting your `SLACK_NOTIFICATION_WEBHOOK`, as the server can run without, just won't report outages. +You can start a stubbed model server at `http://localhost:8001` through the make target: `make start-stubbed-model-server-local` +Make sure to then set your `IL_GRANITE_API` env variable to match that as displayed below. + +```bash +export IL_GRANITE_API="http://localhost:8001" +``` + +> Note: +> You must include the protocal (`http://`) otherwise it will hit the exceptions and not function properly. + +Now you can bring up your Healthcheck-Sidecar service, use the `make start-healthcheck-sidecar-local` target. +Once that comes online, it should start polling the stubbed model server. Simply stop the stubbed model server +(`make stop-stubbed-model-server-local`) to simulate an outage. Finally, bringing the stubbed model server +back online will simulate a resolution incident (one more, `make start-stubbed-model-server-local`). 
+ +To switch back to the default prod deployment use: + +```bash +export IL_GRANITE_API="https://proxy.nexodus.io/chat/granite" +``` diff --git a/healthcheck-sidecar/Containerfile b/healthcheck-sidecar/Containerfile new file mode 100644 index 00000000..e16416bd --- /dev/null +++ b/healthcheck-sidecar/Containerfile @@ -0,0 +1,29 @@ +FROM registry.access.redhat.com/ubi9-minimal:9.5-1731593028 + +WORKDIR /home + +RUN useradd -u 1001 -g root --home-dir /opt/app-root/src -s /sbin/nologin --comment "Default Application User" default && \ + mkdir -p /opt/app-root/src/.local && \ + touch /opt/app-root/src/sidecar.log && \ + chmod -R g+w /opt/app-root/src + +WORKDIR /opt/app-root/src + +RUN microdnf install -y jq python3 python3-pip && \ + microdnf clean all + +COPY sidecar_script.py . +COPY requirements.txt . + +RUN python3 -m pip install -r requirements.txt + +RUN chown -R default:root /opt/app-root/src/ + +USER default + +ENV HOME=/opt/app-root/src \ + PATH=/opt/app-root/src/.local/bin:$PATH + +RUN chmod -R g+rwX /opt/app-root/src + +ENTRYPOINT ["python3", "sidecar_script.py"] diff --git a/healthcheck-sidecar/requirements.txt b/healthcheck-sidecar/requirements.txt new file mode 100644 index 00000000..f2293605 --- /dev/null +++ b/healthcheck-sidecar/requirements.txt @@ -0,0 +1 @@ +requests diff --git a/healthcheck-sidecar/sidecar_script.py b/healthcheck-sidecar/sidecar_script.py new file mode 100644 index 00000000..2e3ca293 --- /dev/null +++ b/healthcheck-sidecar/sidecar_script.py @@ -0,0 +1,296 @@ +import http.server +import socketserver +import json +import threading +import time +import requests +import os +import logging +from datetime import datetime + +################## SETUP LOGGING AND VALIDATE ENV ################## + +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) + +file_handler = logging.FileHandler('sidecar.log') +file_handler.setLevel(logging.DEBUG) +console_handler = logging.StreamHandler() +console_handler.setLevel(logging.INFO) 
+logger.addHandler(file_handler) +logger.addHandler(console_handler) + +formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') +file_handler.setFormatter(formatter) +console_handler.setFormatter(formatter) + +API_REQUEST_HEADERS = { + "Content-Type": "application/json" +} + +def validate_env() -> None: + if not os.getenv("IL_GRANITE_API"): + error = "expecting granite API endpoint as env variable `$IL_GRANITE_API`, which does not exist." + logging.error(error) + raise ValueError(error) + if not os.getenv("IL_GRANITE_MODEL_NAME"): + error = "expecting granite model name as env variable `$IL_GRANITE_MODEL_NAME`, which does not exist." + logging.error(error) + raise ValueError(error) + +################################ CLASSES #################################### + +class ModelsAPIStatus: + def __init__(self, status: str, model_name: str, models: list, available: bool): + self.status = status + self.model_name = model_name + self.models = models + self.available = available + def to_dict(self): + return { + "status": self.status, + "model_name": self.model_name, + "models": self.models, + "available": self.available, + } + +class APIHealthStatus: + def __init__(self, health_api_status: str, models_api_status: ModelsAPIStatus): + self.health_api_status = health_api_status + self.models_api_status = models_api_status + def to_dict(self): + return { + "health_api_status": self.health_api_status, + "models_api_status": self.models_api_status.to_dict(), + } + +class HealthHandler(http.server.SimpleHTTPRequestHandler): + def do_GET(self): + if self.path == "/health": + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.end_headers() + tmp_write_copy = {} + for model_name, api_health_status in health_status.items(): + if isinstance(api_health_status, dict): + health_status[model_name] = APIHealthStatus( + api_health_status["health_api_status"], + ModelsAPIStatus( + api_health_status["models_api_status"]["status"], + 
api_health_status["models_api_status"]["model_name"], + api_health_status["models_api_status"]["models"], + api_health_status["models_api_status"]["available"] + ) + ), + tmp_write_copy[model_name] = api_health_status.to_dict() + self.wfile.write(json.dumps(tmp_write_copy).encode()) + else: + self.send_response(404) + self.end_headers() + + +def extract_model_ids(response_json: dict) -> list: + """ + Extracts the 'id' values from all entries in the 'data' field of the response JSON. + + Args: + response_json (dict): The JSON response containing the 'data' field. + + Returns: + list: A list of 'id' values from the entries in 'data'. + """ + models = [] + for model in response_json["data"]: + models.append(model["id"]) + return models + +def send_slack_notification(payload: dict, slack_webhook_url: str) -> None: + try: + response = requests.post( + slack_webhook_url, + json=payload, + headers={"Content-Type": "application/json"}, + ) + if response.status_code != 200: + logger.error(f"Failed to send Slack notification: {response.status_code} - {response.text}") + else: + logger.info("Successfully sent slack notification of the incident.") + except Exception as e: + logger.error(f"Error sending Slack notification: {e}") + +def create_slack_message(granite_status: APIHealthStatus, incident_type: str, granite_health_api_url: str, granite_models_api_url: str) -> dict: + """ + Creates a Slack message payload for the Granite outage incident. + + Args: + granite_status (dict): The current status of the Granite endpoint. + incident_type (str): Either 'outage' or 'resolution'. + + Returns: + dict: Slack message payload. 
+ """ + if incident_type == "outage": + header = ":meow_outage: Granite Outage Incident" + status_text = "Granite endpoint went DOWN" + color = "#FF0000" # Red for outage + elif incident_type == "resolution": + header = ":meow_green: Granite Outage Resolved" + status_text = "Granite endpoint is BACK UP" + color = "#36A64F" # Green for resolution + else: + raise ValueError("Invalid incident type. Must be 'outage' or 'resolution'.") + + timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + granite_status_dict = granite_status.to_dict() + granite_status_dict["health_api_url"] = granite_health_api_url + granite_status_dict["models_api_url"] = granite_models_api_url + granite_status_json = json.dumps(granite_status_dict, indent=2) + + slack_message = { + "blocks": [ + { + "type": "header", + "text": { + "type": "plain_text", + "text": header + } + }, + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "Look alive @Anil Vishnoi @brent-salisbury @grpereir ." + } + }, + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": f"{status_text} at *{timestamp}*." + } + }, + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "*Granite Status Details:*\n```json\n" + granite_status_json + "\n```" + } + } + ], + "attachments": [ + { + "color": color, + "blocks": [] + } + ] + } + + return slack_message + +def check_health_api(health_api_url: str, api_health_status: APIHealthStatus) -> APIHealthStatus: + try: + health_api_response = requests.get(url=health_api_url, headers=API_REQUEST_HEADERS, timeout=5) + if health_api_response.ok: + api_health_status.health_api_status = "healthy" + else: + api_health_status.health_api_status = "unhealthy" + + except Exception as healthAPIConnectionError: + exception = f"Cannot connect to {api_health_status.models_api_status.model_name} Health API at {health_api_url}. 
Exception: {healthAPIConnectionError}"
+        logger.debug(exception)
+
+    return api_health_status
+
+def check_models_api(models_api_url: str, api_health_status: APIHealthStatus) -> APIHealthStatus:
+    """Query the model server's /v1/models endpoint and record its status.
+
+    Updates ``api_health_status.models_api_status`` in place (status, model list,
+    and whether the expected model name is served) and returns the same object.
+    Connection failures are logged at debug level and leave the prior status.
+    """
+    try:
+        models_api_response = requests.get(url=models_api_url, headers=API_REQUEST_HEADERS, timeout=5)
+        if models_api_response.ok:
+            api_health_status.models_api_status.status = "healthy"
+        else:
+            api_health_status.models_api_status.status = "unhealthy"
+        # Parse the response body once; reused for the length check and extraction.
+        models_payload = models_api_response.json()
+        if len(models_payload["data"]) > 0:
+            models = extract_model_ids(models_payload)
+            api_health_status.models_api_status.models = models
+            api_health_status.models_api_status.available = api_health_status.models_api_status.model_name in api_health_status.models_api_status.models
+        else:
+            api_health_status.models_api_status.models = []
+            api_health_status.models_api_status.available = False
+    except Exception as modelsAPIConnectionError:
+        exception = f"{datetime.now()}: Cannot connect to {api_health_status.models_api_status.model_name} Models API at {models_api_url}. Exception: {modelsAPIConnectionError}"
+        logger.debug(exception)
+    return api_health_status
+
+def check_health_and_models_api(health_api_url: str, models_api_url: str, model_name: str) -> APIHealthStatus:
+    """Run both the /health and /v1/models checks and return a fresh status object."""
+    local_health_status = APIHealthStatus("unknown", ModelsAPIStatus("unknown", model_name, [], False))
+    local_health_status = check_health_api(health_api_url=health_api_url, api_health_status=local_health_status)
+    local_health_status = check_models_api(models_api_url=models_api_url, api_health_status=local_health_status)
+    return local_health_status
+
+def update_health_status():
+    """Background poll loop: refresh granite health every 10s and post Slack
+    outage/resolution notifications on state transitions.
+
+    ``status_initialized`` suppresses notifications for the very first poll so a
+    sidecar restart does not fire a spurious alert.
+    """
+    global health_status, incident_state, status_initialized
+    while True:
+        try:
+            new_granite_status = check_health_and_models_api(
+                health_api_url=granite_health_api_url,
+                models_api_url=granite_models_api_url,
+                model_name=granite_model_name,
+            )
+            if new_granite_status is not None:
+                with health_status_lock:
+                    health_status["granite"] = new_granite_status
+                logger.info(f"Updated health_status: {health_status['granite']}")
+                # Transition healthy -> unhealthy: open an incident (once).
+                if (health_status["granite"].health_api_status != "healthy" or not health_status["granite"].models_api_status.available) and not incident_state["granite"] and status_initialized:
+                    incident_state["granite"] = True
+                    if enable_slack_posts:
+                        outage_notification_payload = create_slack_message(health_status["granite"], "outage", granite_health_api_url, granite_models_api_url)
+                        send_slack_notification(payload=outage_notification_payload, slack_webhook_url=slack_webhook_url)
+                # Transition unhealthy -> healthy: close the incident.
+                elif (health_status["granite"].health_api_status == "healthy" and health_status["granite"].models_api_status.available) and incident_state["granite"] and status_initialized:
+                    incident_state["granite"] = False
+                    if enable_slack_posts:
+                        resolution_notification_payload = create_slack_message(health_status["granite"], "resolution", granite_health_api_url, granite_models_api_url)
+                        send_slack_notification(payload=resolution_notification_payload, slack_webhook_url=slack_webhook_url)
+                status_initialized = True
+            else:
+                logger.info(f"check_health_and_models_api returned None for {granite_model_name}")
+                status_initialized = True
+        except Exception as e:
+            logger.error(f"Error updating health status: {e}")
+            status_initialized = True
+        time.sleep(10)
+
+
+if __name__ == "__main__":
+    validate_env()
+
+    enable_slack_posts = False
+
+    if os.getenv("SLACK_WEBHOOK_URL"):
+        logger.info("Env variable `$SLACK_WEBHOOK_URL` is set, running with slack posting functionality.")
+        logger.info("Warning: slack posting functionality is prone to noise and fragility against local model server deployment and manipulation. Be warned.")
+        enable_slack_posts = True
+        slack_webhook_url = os.getenv("SLACK_WEBHOOK_URL")
+
+    granite_health_api_url = f"{os.getenv('IL_GRANITE_API')}/health"
+    granite_models_api_url = f"{os.getenv('IL_GRANITE_API')}/v1/models"
+    granite_model_name = os.getenv("IL_GRANITE_MODEL_NAME")
+
+    incident_state = {
+        "granite": False,
+    }
+
+    health_status = {
+        "granite": APIHealthStatus("unknown", ModelsAPIStatus("unknown", granite_model_name, [], False)),
+    }
+
+    status_initialized = False
+
+    health_status_lock = threading.Lock()  # Lock for synchronizing access
+
+    threading.Thread(target=update_health_status, daemon=True).start()
+
+    with socketserver.TCPServer(("", 8080), HealthHandler) as httpd:
+        print("Serving health status on port 8080")
+        logger.info("Serving health status on port 8080")
+        httpd.serve_forever()
+
diff --git a/healthcheck-sidecar/stubbed_model_server.py b/healthcheck-sidecar/stubbed_model_server.py
new file mode 100644
index 00000000..79be7f86
--- /dev/null
+++ b/healthcheck-sidecar/stubbed_model_server.py
@@ -0,0 +1,61 @@
+import json
+import os
+from http.server import SimpleHTTPRequestHandler, HTTPServer
+import logging
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+
+def validate_env() -> None:
+    if not os.getenv("IL_GRANITE_MODEL_NAME"):
+        error = "expecting granite model name as env variable `$IL_GRANITE_MODEL_NAME`, which does not exist."
+        logging.error(error)
+        raise ValueError(error)
+
+class HealthHandler(SimpleHTTPRequestHandler):
+    def do_GET(self):
+        model_name = os.getenv("IL_GRANITE_MODEL_NAME")
+        if self.path == "/health":
+            self.send_response(200)
+            self.send_header("Content-Type", "application/json")
+            self.end_headers()
+            self.wfile.write(b'{"status": "healthy"}')
+        elif self.path == "/v1/models":
+            stubbed_models = {
+                "object": "list",
+                "data": [
+                    {
+                        "id": model_name,
+                        "object": "model",
+                        "owned_by": "vllm",
+                        "max_model_len": 4096,
+                        "permission": [
+                            {
+                                "object": "model_permission",
+                                "allow_create_engine": False,
+                                "allow_sampling": True,
+                                "allow_logprobs": False,
+                                "allow_fine_tuning": True,
+                            }
+                        ]
+                    }
+                ]
+            }
+            self.send_response(200)
+            self.send_header("Content-Type", "application/json")
+            self.end_headers()
+            self.wfile.write(str.encode(json.dumps(stubbed_models)))
+        else:
+            self.send_response(404)
+            self.end_headers()
+
+# Start the server
+def run(server_class=HTTPServer, handler_class=HealthHandler, port=8001):
+    server_address = ("", port)
+    httpd = server_class(server_address, handler_class)
+    print(f"Serving on http://localhost:{port}/health and http://localhost:{port}/v1/models")
+    httpd.serve_forever()
+
+if __name__ == "__main__":
+    validate_env()
+    run()
diff --git a/src/Containerfile b/src/Containerfile
index 995993d8..383b6289 100644
--- a/src/Containerfile
+++ b/src/Containerfile
@@ -5,8 +5,10 @@ USER root
 WORKDIR /opt/app-root/src

 COPY ./ .
+
+RUN dnf install -y jq
 RUN mkdir -p node_modules
-RUN chown -R default:root package*.json next-env.d.ts node_modules
+RUN chown -R default:root package*.json next-env.d.ts node_modules /opt/app-root/src/src/healthcheck-probe.sh

 USER default
diff --git a/src/healthcheck-probe.sh b/src/healthcheck-probe.sh
new file mode 100755
index 00000000..c13d6788
--- /dev/null
+++ b/src/healthcheck-probe.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+# -*- indent-tabs-mode: nil; tab-width: 2; sh-indentation: 2; -*-
+
+# Probe script to run as a readinessProbe (https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/#define-readiness-probes)
+# Requires jq.
+
+set -x
+set -e
+set -o pipefail
+
+# -f makes curl exit non-zero on HTTP 4xx/5xx, not only on connection failures.
+if ! curl -fsS "http://localhost:8080/health" >/dev/null 2>&1; then
+  echo "Error: Could not access the sidecar server."
+  exit 1
+fi
+
+## Below checks the actual health of the model endpoints, which I am not sure we want. The UI container
+## should be dependent on only the sidecar health server, and not its results.
+# granite_health_api_status=$(echo $health_curl | jq '.granite.health_api_status' | cut -d "\"" -f 2)
+# granite_models_api_status=$(echo $health_curl | jq '.granite.models_api_status.status' | cut -d "\"" -f 2)
+# granite_model_available=$(echo $health_curl | jq '.granite.models_api_status.available')
+
+# if [[ "$granite_health_api_status" != "healthy" ]]; then
+#   echo "\`.granite.health_api_status\` did not evaluate to healthy: ${granite_health_api_status}"
+#   exit 1
+# fi
+
+# if [[ "$granite_models_api_status" != "healthy" ]]; then
+#   echo "\`.granite.models_api_status\` did not evaluate to healthy: ${granite_models_api_status}"
+#   exit 1
+# fi
+
+# if [[ "$granite_model_available" != true ]]; then
+#   echo "\`.granite.models_api_status.available\` did not evaluate to true: ${granite_model_available}"
+#   exit 1
+# fi