From 041ec63732b261401f21101b7df8ae7de9a75fa0 Mon Sep 17 00:00:00 2001 From: greg pereira Date: Mon, 25 Nov 2024 13:52:05 -0800 Subject: [PATCH] Implementing healtcheck sidecar and probe Signed-off-by: greg pereira --- .env.github.example | 2 + .../pr-healthcheck-sidecar-image.yml | 167 ++++++++++ .gitignore | 6 +- Makefiles/containers-base/Makefile | 5 + Makefiles/local/Makefile | 42 +++ .../healthcheck-sidecar/kustomization.yaml | 11 + .../k8s/base/healthcheck-sidecar/service.yaml | 14 + deploy/k8s/overlays/kind/kustomization.yaml | 8 +- .../openshift/common/kustomization.yaml | 4 + .../openshift/prod/kustomization.yaml | 25 +- .../overlays/openshift/qa/kustomization.yaml | 24 ++ docs/healthcheck_sidecar.md | 99 ++++++ healthcheck-sidecar/Containerfile | 29 ++ healthcheck-sidecar/requirements.txt | 1 + healthcheck-sidecar/sidecar_script.py | 296 ++++++++++++++++++ healthcheck-sidecar/stubbed_model_server.py | 61 ++++ src/Containerfile | 4 +- src/healthcheck-probe.sh | 36 +++ 18 files changed, 822 insertions(+), 12 deletions(-) create mode 100644 .github/workflows/pr-healthcheck-sidecar-image.yml create mode 100644 deploy/k8s/base/healthcheck-sidecar/kustomization.yaml create mode 100644 deploy/k8s/base/healthcheck-sidecar/service.yaml create mode 100644 deploy/k8s/overlays/openshift/common/kustomization.yaml create mode 100644 docs/healthcheck_sidecar.md create mode 100644 healthcheck-sidecar/Containerfile create mode 100644 healthcheck-sidecar/requirements.txt create mode 100644 healthcheck-sidecar/sidecar_script.py create mode 100644 healthcheck-sidecar/stubbed_model_server.py create mode 100755 src/healthcheck-probe.sh diff --git a/.env.github.example b/.env.github.example index d6841e6c..1fe35e69 100644 --- a/.env.github.example +++ b/.env.github.example @@ -19,3 +19,5 @@ IL_MERLINITE_MODEL_NAME= IL_ENABLE_DEV_MODE=true #Enable this option if you want to enable UI features that helps in development, such as form Auto-Fill feature. 
NEXT_PUBLIC_EXPERIMENTAL_FEATURES=false + +SLACK_WEBHOOK_URL= diff --git a/.github/workflows/pr-healthcheck-sidecar-image.yml b/.github/workflows/pr-healthcheck-sidecar-image.yml new file mode 100644 index 00000000..b5b1a72b --- /dev/null +++ b/.github/workflows/pr-healthcheck-sidecar-image.yml @@ -0,0 +1,167 @@ +name: Publish QA Healthcheck Sidecar Container Images + +on: + push: + branches: + - main + paths: + - "healthcheck-sidecar/*" + - "!healthcheck-sidecar/stubbed_model_server.py" + +env: + GHCR_REGISTRY: ghcr.io + GHCR_HS_IMAGE_NAME: "${{ github.repository }}/healthcheck-sidecar" + QUAY_REGISTRY: quay.io + QUAY_HS_IMAGE_NAME: instructlab-ui/healthcheck-sidecar + +jobs: + build_and_publish_hs_qa_image: + name: Push QA Healthcheck Sidecar container image to GHCR and QUAY + runs-on: ubuntu-latest + environment: registry-creds + permissions: + packages: write + contents: write + attestations: write + id-token: write + + steps: + - name: Check out the repo + uses: actions/checkout@v4 + with: + token: ${{ secrets.BOT_PAT }} + ref: 'main' + + - name: Log in to the GHCR container image registry + uses: docker/login-action@v3 + with: + registry: "${{ env.GHCR_REGISTRY }}" + username: "${{ github.actor }}" + password: "${{ secrets.GITHUB_TOKEN }}" + + - name: Log in to the Quay container image registry + uses: docker/login-action@v3 + with: + registry: "${{ env.QUAY_REGISTRY }}" + username: "${{ secrets.QUAY_USERNAME }}" + password: "${{ secrets.QUAY_TOKEN }}" + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Cache Docker layers + uses: actions/cache@v4 + with: + path: /tmp/.buildx-cache + key: "${{ runner.os }}-buildx-${{ github.sha }}" + restore-keys: | + "${{ runner.os }}-buildx-" + + - name: Get Pull Request Number from Commit + id: get_pr_number + uses: actions/github-script@v7 + with: + script: | + console.log("Repository owner:", context.repo.owner); + console.log("Repository name:", context.repo.repo); + console.log("Current 
commit SHA:", context.sha); + + const prs = await github.rest.pulls.list({ + owner: context.repo.owner, + repo: context.repo.repo, + state: 'closed', + sort: 'updated', + direction: 'desc' + }); + console.log("Number of closed PRs fetched:", prs.data.length); + + for (const pr of prs.data) { + console.log("Checking PR #", pr.number, "- Merged:"); + if (pr.merged_at != "") { + console.log("Found merged PR:", pr.number); + return pr.number; + } + } + + console.log("No merged PR found in the recent closed PRs."); + return ''; + + - name: Extract GHCR metadata (tags, labels) for HS image + id: ghcr_hs_meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.GHCR_REGISTRY }}/${{ env.GHCR_HS_IMAGE_NAME }} + + - name: Extract Quay metadata (tags, labels) for HS image + id: quay_hs_meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.QUAY_REGISTRY }}/${{ env.QUAY_HS_IMAGE_NAME }} + + - name: Build and push HS image to GHCR + id: push-hs-ghcr + uses: docker/build-push-action@v6 + with: + context: healthcheck-sidecar + push: true + tags: |- + "${{ steps.ghcr_hs_meta.outputs.tags }}" + "${{ env.GHCR_REGISTRY }}/${{ env.GHCR_HS_IMAGE_NAME }}:pr-${{ steps.get_pr_number.outputs.result }}" + labels: ${{ steps.ghcr_hs_meta.outputs.labels }} + platforms: linux/amd64,linux/arm64 + cache-from: type=gha + cache-to: type=gha,mode=max + file: healthcheck-sidecar/Containerfile + + - name: Generate GHCR artifact attestation + uses: actions/attest-build-provenance@v1 + with: + subject-name: ${{ env.GHCR_REGISTRY }}/${{ env.GHCR_HS_IMAGE_NAME}} + subject-digest: ${{ steps.push-hs-ghcr.outputs.digest }} + push-to-registry: true + + - name: Build and push HS image to QUAY + id: push-hs-quay + uses: docker/build-push-action@v6 + with: + context: healthcheck-sidecar + push: true + tags: |- + "${{ steps.quay_hs_meta.outputs.tags }}" + "${{ env.QUAY_REGISTRY }}/${{ env.QUAY_HS_IMAGE_NAME }}:pr-${{ steps.get_pr_number.outputs.result }}" + labels: ${{ 
steps.quay_hs_meta.outputs.labels }} + platforms: linux/amd64,linux/arm64 + cache-from: type=gha + cache-to: type=gha,mode=max + file: healthcheck-sidecar/Containerfile + + - name: Generate QA HS Quay artifact attestation + uses: actions/attest-build-provenance@v1 + with: + subject-name: ${{ env.QUAY_REGISTRY }}/${{ env.QUAY_HS_IMAGE_NAME}} + subject-digest: ${{ steps.push-hs-quay.outputs.digest }} + push-to-registry: true + + - name: Update coderefs before code changes + run: |- + git pull --ff-only + + - name: Update QA Quay HS image + id: update_qa_hs_manifest_image + env: + PR_TAG: "pr-${{ steps.get_pr_number.outputs.result }}" + run: |- + sudo wget https://github.com/mikefarah/yq/releases/download/v4.34.1/yq_linux_amd64 -O /usr/local/bin/yq + sudo chmod +x /usr/local/bin/yq + yq -i ' + (.images[] | select(.name == "quay.io/${{ env.QUAY_HS_IMAGE_NAME }}") | .newTag) = env(PR_TAG) + ' deploy/k8s/overlays/openshift/qa/kustomization.yaml + + - name: Commit and push bump QA HS Image manifest + run: |- + git config user.name "platform-engineering-bot" + git config user.email "platform-engineering@redhat.com" + git add deploy/k8s/overlays/openshift/qa/kustomization.yaml + git commit -m "[CI AUTOMATION]: Bumping QA HS image to tag: pr-${{ steps.get_pr_number.outputs.result }}" -s + git push origin main + diff --git a/.gitignore b/.gitignore index 596d5356..aca4a9ed 100644 --- a/.gitignore +++ b/.gitignore @@ -20,8 +20,7 @@ auth.log tsconfig.tsbuildinfo .idea .vscode -ui.pid -pathservice.pid +*.pid /test-results/ /playwright-report/ /blob-report/ @@ -34,4 +33,5 @@ deploy/k8s/overlays/openshift/umami/qa/umami-secret.yaml deploy/k8s/overlays/openshift/umami/prod/umami-secret.yaml deploy/podman/github/secret.yaml deploy/podman/native/secret.yaml - +healthcheck-sidecar/sidecar.log +*/venv diff --git a/Makefiles/containers-base/Makefile b/Makefiles/containers-base/Makefile index 7cdb97fb..2d2e44be 100644 --- a/Makefiles/containers-base/Makefile +++ 
b/Makefiles/containers-base/Makefile @@ -36,3 +36,8 @@ ps-image: validate-container-engine pathservice/Containerfile ## Build container $(ECHO_PREFIX) printf " %-12s pathservice/Containerfile\n" "[$(CONTAINER_ENGINE), linux/$(TARGET_IMAGE_ARCH)]" $(CMD_PREFIX) $(CONTAINER_ENGINE) build --platform linux/$(TARGET_IMAGE_ARCH) -f pathservice/Containerfile -t quay.io/instructlab-ui/pathservice:$(TAG) . $(CMD_PREFIX) $(CONTAINER_ENGINE) tag quay.io/instructlab-ui/pathservice:$(TAG) quay.io/instructlab-ui/pathservice:main + +healthcheck-sidecar-image: validate-container-engine healthcheck-sidecar/Containerfile ## Build container image for the InstructLab Healthcheck-Sidecar + $(ECHO_PREFIX) printf " %-12s healthcheck-sidecar/Containerfile\n" "[$(CONTAINER_ENGINE), linux/$(TARGET_IMAGE_ARCH)]" + $(CMD_PREFIX) $(CONTAINER_ENGINE) build -f healthcheck-sidecar/Containerfile -t quay.io/instructlab-ui/healthcheck-sidecar:$(TAG) healthcheck-sidecar + $(CMD_PREFIX) $(CONTAINER_ENGINE) tag quay.io/instructlab-ui/healthcheck-sidecar:$(TAG) quay.io/instructlab-ui/healthcheck-sidecar:main diff --git a/Makefiles/local/Makefile b/Makefiles/local/Makefile index c75d61a6..0c0ed9d3 100644 --- a/Makefiles/local/Makefile +++ b/Makefiles/local/Makefile @@ -2,6 +2,16 @@ # ║ Local Development Targets ║ # ╚══════════════════════════════════════════════════════════╝ +#################### DEPENDENCY FUNCTIONS #################### + +.PHONY: check-python3 +check-python3: + $(CMD_PREFIX) if ! 
command -v python3 >/dev/null 2>&1; then \ + echo "Error: 'python3' is not installed."; \ + echo "Please visit https://www.python.org/downloads/ for installation instructions."; \ + exit 1; \ + fi + #################### DEPLOYMENT FUNCTIONS #################### .PHONY: start-dev-local @@ -17,3 +27,35 @@ stop-dev-local: ## Stop the npm and pathservice local instances $(CMD_PREFIX) if [ -f ui.pid ]; then kill -2 `cat ui.pid` && rm ui.pid || echo "Failed to stop ui"; fi $(CMD_PREFIX) if [ -f pathservice.pid ]; then kill -2 `cat pathservice.pid` && rm pathservice.pid || echo "Failed to stop pathservice"; fi $(CMD_PREFIX) echo "Development environment stopped." + +.PHONY: start-healthcheck-sidecar-local +start-healthcheck-sidecar-local: check-python3 ## Start the healthcheck-sidecar local instance + $(CMD_PREFIX) echo "Starting healthcheck-sidecar..." + $(CMD_PREFIX) cd healthcheck-sidecar; \ + python3 -m venv venv; \ + source venv/bin/activate; \ + venv/bin/python -m pip install -r requirements.txt; \ + python sidecar_script.py & echo $$! > healthcheck-sidecar.pid + $(CMD_PREFIX) echo "Healthcheck-Sidecar started at http://localhost:8080/health." + +.PHONY: stop-healthcheck-sidecar-local +stop-healthcheck-sidecar-local: ## Stop the healtcheck-sidecar local instance + $(CMD_PREFIX) echo "Stopping healthcheck-sidecar..." + $(CMD_PREFIX) if [ -f healthcheck-sidecar.pid ]; then kill -2 `cat healthcheck-sidecar.pid` && rm healthcheck-sidecar.pid || echo "Failed to stop healthcheck-sidecar"; fi + $(CMD_PREFIX) echo "Healthcheck-Sidecar stopped." + +.PHONY: start-healthcheck-sidecar-model-server-local +start-healthcheck-sidecar-model-server-local: check-python3 ## Start the healthcheck-sidecar model server instance + $(CMD_PREFIX) echo "Starting Stubbed model server..." + $(CMD_PREFIX) cd healthcheck-sidecar; \ + python3 -m venv venv; \ + source venv/bin/activate; \ + venv/bin/python -m pip install -r requirements.txt; \ + python stubbed_model_server.py & echo $$! 
> stubbed-model-server.pid + $(CMD_PREFIX) echo "Stubbed Model Server started at http://localhost:8001, serving `/health` and `/v1/models`." + +.PHONY: stop-healthcheck-sidecar-model-server-local +stop-healthcheck-sidecar-model-server-local: ## Stop the healtcheck-sidecar model server local instance + $(CMD_PREFIX) echo "Stopping stubbed model server..." + $(CMD_PREFIX) if [ -f stubbed-model-server.pid ]; then kill -2 `cat stubbed-model-server.pid` && rm stubbed-model-server.pid || echo "Failed to stop stubbed model server"; fi + $(CMD_PREFIX) echo "Stubbed Model Server stopped." diff --git a/deploy/k8s/base/healthcheck-sidecar/kustomization.yaml b/deploy/k8s/base/healthcheck-sidecar/kustomization.yaml new file mode 100644 index 00000000..96ab7ccc --- /dev/null +++ b/deploy/k8s/base/healthcheck-sidecar/kustomization.yaml @@ -0,0 +1,11 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: instructlab +resources: + - service.yaml +labels: + - includeSelectors: true + pairs: + app.kubernetes.io/component: ui + app.kubernetes.io/instance: ui + app.kubernetes.io/name: ui diff --git a/deploy/k8s/base/healthcheck-sidecar/service.yaml b/deploy/k8s/base/healthcheck-sidecar/service.yaml new file mode 100644 index 00000000..6f585c9b --- /dev/null +++ b/deploy/k8s/base/healthcheck-sidecar/service.yaml @@ -0,0 +1,14 @@ +--- +apiVersion: v1 +kind: Service +metadata: + name: healthcheck-sidecar + labels: + app.kubernetes.io/component: ui +spec: + ports: + - name: web + port: 8080 + selector: + app.kubernetes.io/name: ui + type: ClusterIP diff --git a/deploy/k8s/overlays/kind/kustomization.yaml b/deploy/k8s/overlays/kind/kustomization.yaml index dfed5672..138344c7 100644 --- a/deploy/k8s/overlays/kind/kustomization.yaml +++ b/deploy/k8s/overlays/kind/kustomization.yaml @@ -24,10 +24,7 @@ patches: patch: |- - op: replace path: /spec/template/spec/containers/0/image - value: quay.io/instructlab-ui/ui:main # Override this image if you want to use a 
different UI image - - op: replace - path: /spec/template/spec/containers/0/imagePullPolicy - value: Always + value: quay.io/instructlab-ui/ui:main # Override the pathservice image for Kind deployment - target: @@ -37,6 +34,3 @@ patches: - op: replace path: /spec/template/spec/containers/0/image value: quay.io/instructlab-ui/pathservice:main # Override this image if you want to use a different pathservice image - - op: replace - path: /spec/template/spec/containers/0/imagePullPolicy - value: Always diff --git a/deploy/k8s/overlays/openshift/common/kustomization.yaml b/deploy/k8s/overlays/openshift/common/kustomization.yaml new file mode 100644 index 00000000..c6ac9b9b --- /dev/null +++ b/deploy/k8s/overlays/openshift/common/kustomization.yaml @@ -0,0 +1,4 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: +- ../../../base/healthcheck-sidecar/ diff --git a/deploy/k8s/overlays/openshift/prod/kustomization.yaml b/deploy/k8s/overlays/openshift/prod/kustomization.yaml index 87c83cc4..60c84c20 100644 --- a/deploy/k8s/overlays/openshift/prod/kustomization.yaml +++ b/deploy/k8s/overlays/openshift/prod/kustomization.yaml @@ -3,6 +3,7 @@ kind: Kustomization namespace: instructlab resources: - ../../../base + - ../common - issuer.yaml - certificate.yaml - prod.env.sealedsecret.yaml @@ -23,7 +24,6 @@ patches: - op: replace path: /spec/tls/0/hosts/0 value: ui.instructlab.ai - - target: kind: Deployment name: ui @@ -31,8 +31,31 @@ patches: - op: replace path: /spec/template/spec/containers/0/envFrom/0/secretRef/name value: prod.env + - op: add + path: /spec/template/spec/containers/0/readinessProbe + value: + exec: + command: + - sh + - -c + - "/opt/app-root/src/src/healthcheck-probe.sh" + initialDelaySeconds: 5 + periodSeconds: 10 + - op: add + path: /spec/template/spec/containers/- + value: + name: model-endpoint-healthcheck-sidecar + image: quay.io/instructlab-ui/healthcheck-sidecar + # imagePullPolicy: Always # until image lands in quay cannot 
use pullPolicy: Always + ports: + - containerPort: 8080 + envFrom: + - secretRef: + name: prod.env images: - name: quay.io/instructlab-ui/ui newTag: v1.0.0-beta.3 - name: quay.io/instructlab-ui/pathservice newTag: v1.0.0-beta.3 + - name: quay.io/instructlab-ui/healthcheck-sidecar + newTag: main # not currently available in our quay org diff --git a/deploy/k8s/overlays/openshift/qa/kustomization.yaml b/deploy/k8s/overlays/openshift/qa/kustomization.yaml index 729e3cb1..5c11ecd3 100644 --- a/deploy/k8s/overlays/openshift/qa/kustomization.yaml +++ b/deploy/k8s/overlays/openshift/qa/kustomization.yaml @@ -3,6 +3,7 @@ kind: Kustomization namespace: instructlab resources: - ../../../base + - ../common - issuer.yaml - certificate.yaml - qa.env.sealedsecret.yaml @@ -30,8 +31,31 @@ patches: - op: replace path: /spec/template/spec/containers/0/envFrom/0/secretRef/name value: qa.env + - op: add + path: /spec/template/spec/containers/0/readinessProbe + value: + exec: + command: + - sh + - -c + - "/opt/app-root/src/src/healthcheck-probe.sh" + initialDelaySeconds: 5 + periodSeconds: 10 + - op: add + path: /spec/template/spec/containers/- + value: + name: model-endpoint-healthcheck-sidecar + image: quay.io/instructlab-ui/healthcheck-sidecar + imagePullPolicy: Always # until image lands in quay cannot use pullPolicy: Always + ports: + - containerPort: 8080 + envFrom: + - secretRef: + name: qa.env images: - name: quay.io/instructlab-ui/ui newTag: pr-435 - name: quay.io/instructlab-ui/pathservice newTag: pr-435 + - name: quay.io/instructlab-ui/healthcheck-sidecar + newTag: main diff --git a/docs/healthcheck_sidecar.md b/docs/healthcheck_sidecar.md new file mode 100644 index 00000000..ee9a4d44 --- /dev/null +++ b/docs/healthcheck_sidecar.md @@ -0,0 +1,99 @@ +# Healthcheck Sidecar + +The healthcheck-sidecar is a simple python based container image to help monitor the model endpoints, to help maintainers identify outages. + +## How Does It Work? 
+ +The sidecar continually polls the `/health` and `/v1/models` enpdoints on the address stored in the `IL_GRANITE_API` env variable. +It the serves those results to `http://localhost:8080/health`. In this way it can get picked up by other containers in the pod, +or could forward thes datapoints to anywhere in your cluster by backing its deployment with a `service`. Whenever the sidecar can +no longer recieves a `200` status code on either of those endpoints (`/health` and `/v1/models`), it will parse the status data +into a slack notification template and posted to the `SLACK_WEBHOOK_URL`. + +You can get a `SLACK_WEBHOOK_URL` for yourself by [creating a slack application](https://api.slack.com/apps/new), installing and +authorizing the app, and then enabling incoming webhooks. This process is a lot easier than it sounds, Slack does a fantastic job +[documenting this process](https://api.slack.com/quickstart), and providing tools such as the +[block kit builder](https://app.slack.com/block-kit-builder) to help you design the message templates, if you want extend them further. + +The UI container now has the [healthcheck-probe.sh](../src/healthcheck-probe.sh) built into it. This script will run as a readiness +probe, IE. the UI container will not come online if it determines its dependent model endpoints are down. + +## What Does It Require? + +The script requires 2 values, `IL_GRANITE_API` and `IL_GRANITE_MODEL_NAME`. These should be set in your `.env` file at the root of the +repo and `source`d into your environment. There is also an optional `SLACK_WEBHOOK_URL` environment variable. If you set the +`SLACK_WEBHOOK_URL` env variable, the healthcheck-sidecar will post to slack channel or user backed by that webhook on `outage` and +`resolution` incidents. 
+ +## Building the image + +The simplest way to build the image is to use the `make healthcheck-sidecar-image` make target from the root of the repo, or you +can build it from the source: + +```bash +podman build \ + --platform "linux/$(uname -m)" \ + -f healthcheck-sidecar/Containerfile \ + -t quay.io/instructlab-ui/healthcheck-sidecar:main \ + healthcheck-sidecar +``` + +And you can run the image with: + +```bash +podman run \ + --platform "linux/$(uname -m)" \ + -e SLACK_NOTIFICATION_WEBHOOK="$(SLACK_NOTIFICATION_WEBHOOK)" \ + -e IL_GRANITE_API="$(IL_GRANITE_API)" \ + -e IL_GRANITE_MODEL_NAME="$(IL_GRANITE_MODEL_NAME)" \ + --user 1001750000 \ + quay.io/instructlab-ui/healthcheck-sidecar:main +``` + +You don't have run with user `1001750000`, in fact the default user for the container image is `default`. However in Openshift, it will +run with an ephemeral user in the valid range due to the `restricted-v2` scc. As such I find it helpful to include for testing purposes. + +## Local Development + +In particular, the process of testing the outage and resolution incidents feature was quite difficult. To expedite development, there +is a [stubbed python model server](../healthcheck-sidecar/stubbed_model_server.py) that is complient with the OpenAI spec, and with +serving runtimes like llamacpp and vllm. You can have either one, or both running at the time, it should not break either script. +Since this script is only meant for debugging purposes, it was not included in the contianer image. + +However this process generates a lot of noise in the way of slack messages so if you do want to work with this I suggest you +[comment out the notification publishing logic](../healthcheck-sidecar/sidecar_script.py#L247-254). + +We currently do not support this in `kind` for local development at this time. If there is interest we could update the deployments +to support that. 
+ +## Prod and QA Deployment + +The `kustomization.yaml` files in the Prod and QA overlays both contain 2 patches realted to the healthcheck sidecar. The first +will patch in the `readinessProbe` into the UI container, ensuring that it is dependendnt on the results of the +[healthcheck-probe.sh](../src/healthcheck-probe.sh) script. They next patch the sidecar itself into the UI deployment. Finally, +both Openshift overlay kustomizations also include a reference to the common directory, which will add the healthcheck-sidecar +`service` to the manifest list. + +## Testing the Slack Notification Feature + +Begin by setting your `SLACK_NOTIFICATION_WEBHOOK`, as the server can run without, just won't report outages. +You can start a stubbed model server at `http://localhost:8001` through the make target: `make start-stubbed-model-server-local` +Make sure to then set your `IL_GRANITE_API` env variable to match that as displayed below. + +```bash +export IL_GRANITE_API="http://localhost:8001" +``` + +> Note: +> You must include the protocal (`http://`) otherwise it will hit the exceptions and not function properly. + +Now you can bring up your Healthcheck-Sidecar service, use the `make start-healthcheck-sidecar-local` target. +Once that comes online, it should start polling the stubbed model server. Simply stop the stubbed model server +(`make stop-stubbed-model-server-local`) to simulate an outage. Finally, bringing the stubbed model server +back online will simulate a resolution incident (one more, `make start-stubbed-model-server-local`). 
+ +To switch back to the default prod deployment use: + +```bash +export IL_GRANITE_API="https://proxy.nexodus.io/chat/granite" +``` diff --git a/healthcheck-sidecar/Containerfile b/healthcheck-sidecar/Containerfile new file mode 100644 index 00000000..e16416bd --- /dev/null +++ b/healthcheck-sidecar/Containerfile @@ -0,0 +1,29 @@ +FROM registry.access.redhat.com/ubi9-minimal:9.5-1731593028 + +WORKDIR /home + +RUN useradd -u 1001 -g root --home-dir /opt/app-root/src -s /sbin/nologin --comment "Default Application User" default && \ + mkdir -p /opt/app-root/src/.local && \ + touch /opt/app-root/src/sidecar.log && \ + chmod -R g+w /opt/app-root/src + +WORKDIR /opt/app-root/src + +RUN microdnf install -y jq python3 python3-pip && \ + microdnf clean all + +COPY sidecar_script.py . +COPY requirements.txt . + +RUN python3 -m pip install -r requirements.txt + +RUN chown -R default:root /opt/app-root/src/ + +USER default + +ENV HOME=/opt/app-root/src \ + PATH=/opt/app-root/src/.local/bin:$PATH + +RUN chmod -R g+rwX /opt/app-root/src + +ENTRYPOINT ["python3", "sidecar_script.py"] diff --git a/healthcheck-sidecar/requirements.txt b/healthcheck-sidecar/requirements.txt new file mode 100644 index 00000000..f2293605 --- /dev/null +++ b/healthcheck-sidecar/requirements.txt @@ -0,0 +1 @@ +requests diff --git a/healthcheck-sidecar/sidecar_script.py b/healthcheck-sidecar/sidecar_script.py new file mode 100644 index 00000000..2e3ca293 --- /dev/null +++ b/healthcheck-sidecar/sidecar_script.py @@ -0,0 +1,296 @@ +import http.server +import socketserver +import json +import threading +import time +import requests +import os +import logging +from datetime import datetime + +################## SETUP LOGGING AND VALIDATE ENV ################## + +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) + +file_handler = logging.FileHandler('sidecar.log') +file_handler.setLevel(logging.DEBUG) +console_handler = logging.StreamHandler() +console_handler.setLevel(logging.INFO) 
+logger.addHandler(file_handler) +logger.addHandler(console_handler) + +formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') +file_handler.setFormatter(formatter) +console_handler.setFormatter(formatter) + +API_REQUEST_HEADERS = { + "Content-Type": "application/json" +} + +def validate_env() -> None: + if not os.getenv("IL_GRANITE_API"): + error = "expecting granite API endpoint as env variable `$IL_GRANITE_API`, which does not exist." + logging.error(error) + raise ValueError(error) + if not os.getenv("IL_GRANITE_MODEL_NAME"): + error = "expecting granite model name as env variable `$IL_GRANITE_MODEL_NAME`, which does not exist." + logging.error(error) + raise ValueError(error) + +################################ CLASSES #################################### + +class ModelsAPIStatus: + def __init__(self, status: str, model_name: str, models: list, available: bool): + self.status = status + self.model_name = model_name + self.models = models + self.available = available + def to_dict(self): + return { + "status": self.status, + "model_name": self.model_name, + "models": self.models, + "available": self.available, + } + +class APIHealthStatus: + def __init__(self, health_api_status: str, models_api_status: ModelsAPIStatus): + self.health_api_status = health_api_status + self.models_api_status = models_api_status + def to_dict(self): + return { + "health_api_status": self.health_api_status, + "models_api_status": self.models_api_status.to_dict(), + } + +class HealthHandler(http.server.SimpleHTTPRequestHandler): + def do_GET(self): + if self.path == "/health": + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.end_headers() + tmp_write_copy = {} + for model_name, api_health_status in health_status.items(): + if isinstance(api_health_status, dict): + health_status[model_name] = APIHealthStatus( + api_health_status["health_api_status"], + ModelsAPIStatus( + api_health_status["models_api_status"]["status"], + 
api_health_status["models_api_status"]["model_name"], + api_health_status["models_api_status"]["models"], + api_health_status["models_api_status"]["available"] + ) + ), + tmp_write_copy[model_name] = api_health_status.to_dict() + self.wfile.write(json.dumps(tmp_write_copy).encode()) + else: + self.send_response(404) + self.end_headers() + + +def extract_model_ids(response_json: dict) -> list: + """ + Extracts the 'id' values from all entries in the 'data' field of the response JSON. + + Args: + response_json (dict): The JSON response containing the 'data' field. + + Returns: + list: A list of 'id' values from the entries in 'data'. + """ + models = [] + for model in response_json["data"]: + models.append(model["id"]) + return models + +def send_slack_notification(payload: dict, slack_webhook_url: str) -> None: + try: + response = requests.post( + slack_webhook_url, + json=payload, + headers={"Content-Type": "application/json"}, + ) + if response.status_code != 200: + logger.error(f"Failed to send Slack notification: {response.status_code} - {response.text}") + else: + logger.info("Successfully sent slack notification of the incident.") + except Exception as e: + logger.error(f"Error sending Slack notification: {e}") + +def create_slack_message(granite_status: APIHealthStatus, incident_type: str, granite_health_api_url: str, granite_models_api_url: str) -> dict: + """ + Creates a Slack message payload for the Granite outage incident. + + Args: + granite_status (dict): The current status of the Granite endpoint. + incident_type (str): Either 'outage' or 'resolution'. + + Returns: + dict: Slack message payload. 
+ """ + if incident_type == "outage": + header = ":meow_outage: Granite Outage Incident" + status_text = "Granite endpoint went DOWN" + color = "#FF0000" # Red for outage + elif incident_type == "resolution": + header = ":meow_green: Granite Outage Resolved" + status_text = "Granite endpoint is BACK UP" + color = "#36A64F" # Green for resolution + else: + raise ValueError("Invalid incident type. Must be 'outage' or 'resolution'.") + + timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + granite_status_dict = granite_status.to_dict() + granite_status_dict["health_api_url"] = granite_health_api_url + granite_status_dict["models_api_url"] = granite_models_api_url + granite_status_json = json.dumps(granite_status_dict, indent=2) + + slack_message = { + "blocks": [ + { + "type": "header", + "text": { + "type": "plain_text", + "text": header + } + }, + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "Look alive @Anil Vishnoi @brent-salisbury @grpereir ." + } + }, + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": f"{status_text} at *{timestamp}*." + } + }, + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "*Granite Status Details:*\n```json\n" + granite_status_json + "\n```" + } + } + ], + "attachments": [ + { + "color": color, + "blocks": [] + } + ] + } + + return slack_message + +def check_health_api(health_api_url: str, api_health_status: APIHealthStatus) -> APIHealthStatus: + try: + health_api_response = requests.get(url=health_api_url, headers=API_REQUEST_HEADERS, timeout=5) + if health_api_response.ok: + api_health_status.health_api_status = "healthy" + else: + api_health_status.health_api_status = "unhealthy" + + except Exception as healthAPIConnectionError: + exception = f"Cannot connect to {api_health_status.models_api_status.model_name} Health API at {health_api_url}. 
Exception: {healthAPIConnectionError}"
+        logger.debug(exception)
+
+    return api_health_status
+
+def check_models_api(models_api_url: str, api_health_status: APIHealthStatus) -> APIHealthStatus:
+    """Query the model server's /v1/models endpoint and record its status.
+
+    Updates ``api_health_status.models_api_status`` in place (status, model list,
+    and whether the expected model name is served) and returns the same object.
+    Connection failures are logged at debug level and leave the prior status.
+    """
+    try:
+        models_api_response = requests.get(url=models_api_url, headers=API_REQUEST_HEADERS, timeout=5)
+        if models_api_response.ok:
+            api_health_status.models_api_status.status = "healthy"
+        else:
+            api_health_status.models_api_status.status = "unhealthy"
+        # Parse the response body once; reused for the length check and extraction.
+        models_payload = models_api_response.json()
+        if len(models_payload["data"]) > 0:
+            models = extract_model_ids(models_payload)
+            api_health_status.models_api_status.models = models
+            api_health_status.models_api_status.available = api_health_status.models_api_status.model_name in api_health_status.models_api_status.models
+        else:
+            api_health_status.models_api_status.models = []
+            api_health_status.models_api_status.available = False
+    except Exception as modelsAPIConnectionError:
+        exception = f"{datetime.now()}: Cannot connect to {api_health_status.models_api_status.model_name} Models API at {models_api_url}. Exception: {modelsAPIConnectionError}"
+        logger.debug(exception)
+    return api_health_status
+
+def check_health_and_models_api(health_api_url: str, models_api_url: str, model_name: str) -> APIHealthStatus:
+    """Run both the /health and /v1/models checks and return a fresh status object."""
+    local_health_status = APIHealthStatus("unknown", ModelsAPIStatus("unknown", model_name, [], False))
+    local_health_status = check_health_api(health_api_url=health_api_url, api_health_status=local_health_status)
+    local_health_status = check_models_api(models_api_url=models_api_url, api_health_status=local_health_status)
+    return local_health_status
+
+def update_health_status():
+    """Background poll loop: refresh granite health every 10s and post Slack
+    outage/resolution notifications on state transitions.
+
+    ``status_initialized`` suppresses notifications for the very first poll so a
+    sidecar restart does not fire a spurious alert.
+    """
+    global health_status, incident_state, status_initialized
+    while True:
+        try:
+            new_granite_status = check_health_and_models_api(
+                health_api_url=granite_health_api_url,
+                models_api_url=granite_models_api_url,
+                model_name=granite_model_name,
+            )
+            if new_granite_status is not None:
+                with health_status_lock:
+                    health_status["granite"] = new_granite_status
+                logger.info(f"Updated health_status: {health_status['granite']}")
+                # Transition healthy -> unhealthy: open an incident (once).
+                if (health_status["granite"].health_api_status != "healthy" or not health_status["granite"].models_api_status.available) and not incident_state["granite"] and status_initialized:
+                    incident_state["granite"] = True
+                    if enable_slack_posts:
+                        outage_notification_payload = create_slack_message(health_status["granite"], "outage", granite_health_api_url, granite_models_api_url)
+                        send_slack_notification(payload=outage_notification_payload, slack_webhook_url=slack_webhook_url)
+                # Transition unhealthy -> healthy: close the incident.
+                elif (health_status["granite"].health_api_status == "healthy" and health_status["granite"].models_api_status.available) and incident_state["granite"] and status_initialized:
+                    incident_state["granite"] = False
+                    if enable_slack_posts:
+                        resolution_notification_payload = create_slack_message(health_status["granite"], "resolution", granite_health_api_url, granite_models_api_url)
+                        send_slack_notification(payload=resolution_notification_payload, slack_webhook_url=slack_webhook_url)
+                status_initialized = True
+            else:
+                logger.info(f"check_health_and_models_api returned None for {granite_model_name}")
+                status_initialized = True
+        except Exception as e:
+            logger.error(f"Error updating health status: {e}")
+            status_initialized = True
+        time.sleep(10)
+
+
+if __name__ == "__main__":
+    validate_env()
+
+    enable_slack_posts = False
+
+    if os.getenv("SLACK_WEBHOOK_URL"):
+        logger.info("Env variable `$SLACK_WEBHOOK_URL` is set, running with slack posting functionality.")
+        logger.info("Warning: slack posting functionality is prone to noise and fragility against local model server deployment and manipulation. Be warned.")
+        enable_slack_posts = True
+        slack_webhook_url = os.getenv("SLACK_WEBHOOK_URL")
+
+    granite_health_api_url = f"{os.getenv('IL_GRANITE_API')}/health"
+    granite_models_api_url = f"{os.getenv('IL_GRANITE_API')}/v1/models"
+    granite_model_name = os.getenv("IL_GRANITE_MODEL_NAME")
+
+    incident_state = {
+        "granite": False,
+    }
+
+    health_status = {
+        "granite": APIHealthStatus("unknown", ModelsAPIStatus("unknown", granite_model_name, [], False)),
+    }
+
+    status_initialized = False
+
+    health_status_lock = threading.Lock()  # Lock for synchronizing access
+
+    threading.Thread(target=update_health_status, daemon=True).start()
+
+    with socketserver.TCPServer(("", 8080), HealthHandler) as httpd:
+        print("Serving health status on port 8080")
+        logger.info("Serving health status on port 8080")
+        httpd.serve_forever()
+
diff --git a/healthcheck-sidecar/stubbed_model_server.py b/healthcheck-sidecar/stubbed_model_server.py
new file mode 100644
index 00000000..79be7f86
--- /dev/null
+++ b/healthcheck-sidecar/stubbed_model_server.py
@@ -0,0 +1,61 @@
+import json
+import os
+from http.server import SimpleHTTPRequestHandler, HTTPServer
+import logging
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+
+def validate_env() -> None:
+    if not os.getenv("IL_GRANITE_MODEL_NAME"):
+        error = "expecting granite model name as env variable `$IL_GRANITE_MODEL_NAME`, which does not exist."
+        logging.error(error)
+        raise ValueError(error)
+
+class HealthHandler(SimpleHTTPRequestHandler):
+    def do_GET(self):
+        model_name = os.getenv("IL_GRANITE_MODEL_NAME")
+        if self.path == "/health":
+            self.send_response(200)
+            self.send_header("Content-Type", "application/json")
+            self.end_headers()
+            self.wfile.write(b'{"status": "healthy"}')
+        elif self.path == "/v1/models":
+            stubbed_models = {
+                "object": "list",
+                "data": [
+                    {
+                        "id": model_name,
+                        "object": "model",
+                        "owned_by": "vllm",
+                        "max_model_len": 4096,
+                        "permission": [
+                            {
+                                "object": "model_permission",
+                                "allow_create_engine": False,
+                                "allow_sampling": True,
+                                "allow_logprobs": False,
+                                "allow_fine_tuning": True,
+                            }
+                        ]
+                    }
+                ]
+            }
+            self.send_response(200)
+            self.send_header("Content-Type", "application/json")
+            self.end_headers()
+            self.wfile.write(str.encode(json.dumps(stubbed_models)))
+        else:
+            self.send_response(404)
+            self.end_headers()
+
+# Start the server
+def run(server_class=HTTPServer, handler_class=HealthHandler, port=8001):
+    server_address = ("", port)
+    httpd = server_class(server_address, handler_class)
+    print(f"Serving on http://localhost:{port}/health and http://localhost:{port}/v1/models")
+    httpd.serve_forever()
+
+if __name__ == "__main__":
+    validate_env()
+    run()
diff --git a/src/Containerfile b/src/Containerfile
index 995993d8..383b6289 100644
--- a/src/Containerfile
+++ b/src/Containerfile
@@ -5,8 +5,10 @@ USER root
 WORKDIR /opt/app-root/src

 COPY ./ .
+
+RUN dnf install -y jq
 RUN mkdir -p node_modules
-RUN chown -R default:root package*.json next-env.d.ts node_modules
+RUN chown -R default:root package*.json next-env.d.ts node_modules /opt/app-root/src/src/healthcheck-probe.sh

 USER default
diff --git a/src/healthcheck-probe.sh b/src/healthcheck-probe.sh
new file mode 100755
index 00000000..c13d6788
--- /dev/null
+++ b/src/healthcheck-probe.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+# -*- indent-tabs-mode: nil; tab-width: 2; sh-indentation: 2; -*-
+
+# Probe script to run as a readinessProbe (https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/#define-readiness-probes)
+# Requires jq.
+
+set -x
+set -e
+set -o pipefail
+
+# -f makes curl exit non-zero on HTTP 4xx/5xx, not only on connection failures.
+if ! curl -fsS "http://localhost:8080/health" >/dev/null 2>&1; then
+  echo "Error: Could not access the sidecar server."
+  exit 1
+fi
+
+## Below checks the actual health of the model endpoints, which I am not sure we want. The UI container
+## should be dependent on only the sidecar health server, and not its results.
+# granite_health_api_status=$(echo $health_curl | jq '.granite.health_api_status' | cut -d "\"" -f 2)
+# granite_models_api_status=$(echo $health_curl | jq '.granite.models_api_status.status' | cut -d "\"" -f 2)
+# granite_model_available=$(echo $health_curl | jq '.granite.models_api_status.available')
+
+# if [[ "$granite_health_api_status" != "healthy" ]]; then
+#   echo "\`.granite.health_api_status\` did not evaluate to healthy: ${granite_health_api_status}"
+#   exit 1
+# fi
+
+# if [[ "$granite_models_api_status" != "healthy" ]]; then
+#   echo "\`.granite.models_api_status\` did not evaluate to healthy: ${granite_models_api_status}"
+#   exit 1
+# fi
+
+# if [[ "$granite_model_available" != true ]]; then
+#   echo "\`.granite.models_api_status.available\` did not evaluate to true: ${granite_model_available}"
+#   exit 1
+# fi