chore: pipeline health checks during rollout (#795)
Co-authored-by: afwilcox <[email protected]>

#!/bin/bash
# Designed for use alongside tools like Helm to provide a secondary health check on a release,
# in addition to the default Helm health checks. Ideally, set the timeout slightly lower than
# the default helm install/upgrade timeout so this script can run and collect information from
# ephemeral workloads before Helm cleans them up in the event of an atomic failure.
#
# Note that this script does not surface log details, but it does provide commands to fetch them if found.
#
# Requirement: workloads to health check need the label 'app.kubernetes.io/name'
#
# Dependencies: oc, jq

# ENV:
# OC_NAMESPACE: namespace to scan
# SKIP_AUTH: set to "true" to skip auth and use your existing local kubeconfig
# OC_SERVER: OpenShift server URL
# TIMEOUT_SECONDS: timeout in seconds for the health check
# POLL_INTERVAL_SECONDS: interval in seconds between health check polls
# LABEL_SELECTOR: label selector used to filter the resources to health check
# ERROR_EXPR: regular expression to search for in logs
# FORCE_PASS: set to 1 to force the health check to pass

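# Example invocation (values are illustrative, not project defaults):
#   SKIP_AUTH=true OC_NAMESPACE=my-namespace \
#   LABEL_SELECTOR="app.kubernetes.io/instance=my-app-123" \
#   ./rollout_healthcheck.sh
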
# TODO: break out funcs into plugins

# defined before the input checks below so it is available as soon as validation fails
help_str() {
  echo "Usage: SKIP_AUTH=true LABEL_SELECTOR=\"app.kubernetes.io/instance=nr-compliance-enforcement-PRNUM\" OC_NAMESPACE=c1c7ed-dev .github/scripts/rollout_healthcheck.sh"
  echo ""
  echo "Ensure that you have oc, jq and curl installed, and that you've logged in with oc"
}

if [ -z "$LABEL_SELECTOR" ]; then
  echo "LABEL_SELECTOR is not set. Exiting..."
  help_str
  exit 1
fi
if [ -z "$OC_NAMESPACE" ]; then
  echo "OC_NAMESPACE is not set. Exiting..."
  help_str
  exit 1
fi

# configure defaults
if [ -z "$ERROR_EXPR" ]; then
  ERROR_EXPR="error|fatal|exception|stacktrace"
fi
if [ -z "$TIMEOUT_SECONDS" ]; then
  TIMEOUT_SECONDS=420 # 7m
fi
if [ -z "$POLL_INTERVAL_SECONDS" ]; then
  POLL_INTERVAL_SECONDS=15
fi
# prevent pipeline blocking but still get details if desired
if [ -z "$FORCE_PASS" ]; then
  FORCE_PASS=0
fi

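# Override example (illustrative values, combined with the required variables above):
#   TIMEOUT_SECONDS=240 POLL_INTERVAL_SECONDS=10 FORCE_PASS=1 ./rollout_healthcheck.sh
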
# will be set to 1 if a timeout occurs from an unfinished rollout
# we use this to fetch triage info
TIMED_OUT=0

# string of commands to run after the health check for additional info
COMMANDS_TO_RUN=""

# global flag to indicate whether the health check passed
HEALTH_CHECK_FAILED=0

set -e # failfast
trap 'echo "Error occurred at line $LINENO while executing function $FUNCNAME"' ERR

# Handle auth (OC_NAMESPACE was already validated above)
OC_TEMP_TOKEN=""
if [ "$SKIP_AUTH" != "true" ]; then | ||
if [ -z "$OC_SERVER" ]; then | ||
echo "OC_SERVER is not set. Exiting..." | ||
help_str | ||
exit 1 | ||
fi | ||
if [ -z "$OC_TOKEN" ]; then | ||
echo "OC_TOKEN is not set. Exiting..." | ||
help_str | ||
exit 1 | ||
fi | ||
# Auth flow | ||
OC_TEMP_TOKEN=$(curl -k -X POST $OC_SERVER/api/v1/namespaces/$OC_NAMESPACE/serviceaccounts/pipeline/token --header "Authorization: Bearer $OC_TOKEN" -d '{"spec": {"expirationSeconds": 600}}' -H 'Content-Type: application/json; charset=utf-8' | jq -r '.status.token' ) | ||
oc login --token=$OC_TEMP_TOKEN --server=$OC_SERVER | ||
oc project $OC_NAMESPACE # Safeguard! | ||
fi | ||
|
||
get_workload_list() {
  local label_selector=$1
  local workload_list
  workload_list=$(oc get all -n "$OC_NAMESPACE" -l "$label_selector" -oname)
  echo "$workload_list"
}

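# Illustrative output (names are hypothetical):
#   get_workload_list "app.kubernetes.io/instance=my-app-123"
#   -> deployment.apps/my-app-123-backend
#      pod/my-app-123-backend-6d9f7c9b8-xyz12
#      service/my-app-123-backend
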
echo_red() {
  echo -e "\033[0;31m$1\033[0m"
}

echo_green() {
  echo -e "\033[0;32m$1\033[0m"
}

echo_yellow() {
  echo -e "\033[0;33m$1\033[0m"
}

echo_checkmark() {
  echo "✅"
}

echo_cross() {
  echo "❌"
}

# checks if the deployment has replicas in the correct states
_health_check_deployment() {
  local healthy="false"
  local no_unready_replicas="false"
  local equal_ready_replicas="false"
  local one_available_replica="false"
  local workload_name=$1
  local replicas
  local unavailable_replicas
  local ready_replicas
  local available_replicas
  replicas=$(oc get -n "$OC_NAMESPACE" "$workload_name" -o jsonpath='{.status.replicas}')
  # the following status keys may be absent from the manifest when their value is 0
  unavailable_replicas=$(oc get -n "$OC_NAMESPACE" "$workload_name" -o jsonpath='{.status.unavailableReplicas}')
  available_replicas=$(oc get -n "$OC_NAMESPACE" "$workload_name" -o jsonpath='{.status.availableReplicas}')
  ready_replicas=$(oc get -n "$OC_NAMESPACE" "$workload_name" -o jsonpath='{.status.readyReplicas}')
  if [ -z "$replicas" ]; then
    replicas=0
  fi
  if [ -z "$unavailable_replicas" ]; then
    unavailable_replicas=0
  fi
  if [ -z "$ready_replicas" ]; then
    ready_replicas=0
  fi
  if [ -z "$available_replicas" ]; then
    available_replicas=0
  fi
  # begin replica flag setting
  if [ "$replicas" -eq "$ready_replicas" ]; then
    equal_ready_replicas="true"
  fi
  if [ "$unavailable_replicas" -eq 0 ]; then
    no_unready_replicas="true"
  fi
  # at least one replica must be available; a separate count variable avoids
  # clobbering the flag when more than one replica is available
  if [ "$available_replicas" -ge 1 ]; then
    one_available_replica="true"
  fi
  if [ "$equal_ready_replicas" == "true" ] && [ "$no_unready_replicas" == "true" ] && [ "$one_available_replica" == "true" ]; then
    healthy="true"
  fi
  echo "$healthy"
}

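# Illustrative outcomes (assumed status values):
#   replicas=2 readyReplicas=2 availableReplicas=2 (unavailableReplicas absent) -> "true"
#   replicas=2 readyReplicas=1 availableReplicas=1 unavailableReplicas=1        -> "false"
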
# checks if a pod is running
_pod_running() {
  local pod_name=$1
  local ready="false"
  local pod_status
  pod_status=$(oc get -n "$OC_NAMESPACE" "$pod_name" -o jsonpath='{.status.phase}')
  if [ "$pod_status" == "Running" ]; then
    ready="true"
  fi
  echo "$ready"
}

# polls deployments and pods for readiness before kicking off triage info collection
poll_deployments() {
  local succeeded="true"
  local deployment_list=$1
  local pod_list=$2
  local deployment_status
  for deployment in $deployment_list; do
    deployment_status=$(_health_check_deployment "$deployment")
    if [ "$deployment_status" == "false" ]; then
      succeeded="false"
      break
    fi
  done
  for pod in $pod_list; do
    if [ "$(_pod_running "$pod")" == "false" ]; then
      succeeded="false"
      break
    fi
  done
  echo "$succeeded"
}

# iterates through deployments and finds their corresponding replicaset list
# verifies that the latest replicaset has ready replicas and that
# kubernetes did not quietly roll back to a previous replicaset
# Dependency: deployments and replicasets need the app.kubernetes.io/name label
on_latest_replicasets() {
  local deployment_list=$1
  local latest_replicaset
  local lrsr
  local app_name_labels
  # nothing to verify when no deployments matched the selector
  if [ -z "$deployment_list" ]; then
    return
  fi
  app_name_labels=$(echo "$deployment_list" | xargs oc get -n "$OC_NAMESPACE" -ojson | jq -r '.items[].metadata.labels["app.kubernetes.io/name"]' | sort | uniq)
  for app_name in $app_name_labels; do
    latest_replicaset=$(oc get -n "$OC_NAMESPACE" replicaset -l "app.kubernetes.io/name=$app_name,$LABEL_SELECTOR" --sort-by=.metadata.creationTimestamp -ojson | jq -r '.items[-1].metadata.name')
    # readyReplicas may be absent; default it to 0 so the numeric test below is safe
    lrsr=$(oc get -n "$OC_NAMESPACE" replicaset "$latest_replicaset" -ojson | jq -r '.status.readyReplicas // 0')
    if [ "$lrsr" -eq 0 ]; then
      echo_red "$(echo_cross) Deployment $app_name latest replicaset $latest_replicaset has 0 ready replicas"
      HEALTH_CHECK_FAILED=1
    else
      echo_green "$(echo_checkmark) Deployment $app_name latest replicaset $latest_replicaset in use with $lrsr ready replicas"
    fi
  done
}

# pods should be marked as ready and running to be considered healthy
all_pods_ready() {
  local pod_list=$1
  for pod in $pod_list; do
    if [ "$(_pod_running "$pod")" == "false" ]; then
      echo_red "$(echo_cross) Pod $pod has not finished startup and is not classified as running"
      HEALTH_CHECK_FAILED=1
    else
      echo_green "$(echo_checkmark) Pod $pod is running"
    fi
  done
}

# checks that pods aren't restarting during the release
no_pod_restarts() {
  local pod_list=$1
  local restarts
  for pod in $pod_list; do
    # sum restart counts across all containers so multi-container pods yield a single number
    restarts=$(oc get -n "$OC_NAMESPACE" "$pod" -ojson | jq -r '[.status.containerStatuses[].restartCount] | add')
    if [ "$restarts" -gt 0 ]; then
      echo_red "$(echo_cross) Pod $pod has $restarts restarts!"
      HEALTH_CHECK_FAILED=1
      COMMANDS_TO_RUN+="\noc describe -n $OC_NAMESPACE $pod;"
    else
      echo_green "$(echo_checkmark) Pod $pod has no restarts"
    fi
  done
}

# simple heuristic check of pods under the label selector
# checks the last 100 lines of each pod's logs for error expression matches
no_error_logs() {
  local pod_list=$1
  local error_logs
  for pod in $pod_list; do
    error_logs=$(oc logs -n "$OC_NAMESPACE" "$pod" --all-containers --tail=100 --since=60m | grep -E "$ERROR_EXPR" || true)
    if [ -n "$error_logs" ]; then
      echo_red "$(echo_cross) Pod $pod has error logs"
      HEALTH_CHECK_FAILED=1
      COMMANDS_TO_RUN+="\noc logs -n $OC_NAMESPACE $pod --all-containers --tail=100 --since=60m | grep -E \"$ERROR_EXPR\" || true;"
    else
      echo_green "$(echo_checkmark) Pod $pod has no recent error logs"
    fi
  done
}

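# ERROR_EXPR is a plain extended regex, so it can be tuned per project,
# e.g. (illustrative): ERROR_EXPR="error|fatal|exception|stacktrace|panic"
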
# fetches and filters all namespace events and attempts to find
# any non-informational events related to the workloads rolled out by helm
# TODO: consider tweaking sensitivity of event filtering
no_associated_events() {
  local events=""
  # eg: app.kubernetes.io/instance=nr-compliance-enforcement-771 -> nr-compliance-enforcement-771
  local object_pattern
  object_pattern=$(echo "$LABEL_SELECTOR" | cut -d'=' -f2)
  local time_window
  # note: '-d <relative date>' requires GNU date
  time_window=$(date -u -d '5 minutes ago' +'%Y-%m-%dT%H:%M:%SZ')
  local event_summary=""
  local event_count=0
  local event_ln_check=0
  events=$(oc get events -n "$OC_NAMESPACE" -o json | jq '
    [.items[] |
    select(
      .type == "Warning" and
      (.lastTimestamp // .eventTime) >= "'$time_window'" and
      .count >= 3
    ) |
    {
      name: .involvedObject.name,
      message: .message,
      reason: .reason,
      count: .count,
      lastSeen: (.lastTimestamp // .eventTime)
    }]
  ')
  event_ln_check=$(echo "$events" | jq -r 'length')
  if [ "$event_ln_check" -gt 0 ]; then
    event_summary=$(echo -e "$events" | jq -r '.[] | [.name, .message, .reason] | @tsv')
    event_summary=$(echo -e "$event_summary" | grep "$object_pattern" || true)
    # exit early: no applicable events remain after filtering
    if [ -z "$event_summary" ]; then
      echo_green "$(echo_checkmark) No warning-type events associated with release $object_pattern"
      return
    fi
    event_count=$(echo "$event_summary" | wc -l)
    echo_red "$(echo_cross) Found the following $event_count warning (error) events associated with this helm release:"
    echo_yellow "\tNote: warning event history can persist between deployments and may not be related to the current rollout. Wait 5 minutes for them to be filtered out."
    echo -e "$event_summary" | sed 's/^/\t/' # tab-indent the events for readability
    HEALTH_CHECK_FAILED=1
    COMMANDS_TO_RUN+="\noc get events -n $OC_NAMESPACE | grep -Ei $object_pattern;"
  else
    echo_green "$(echo_checkmark) No warning-type events associated with release $object_pattern"
  fi
}

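# Sample filtered event summary line (hypothetical workload and message):
#   my-app-123-backend-6d9f7c9b8-xyz12   Back-off restarting failed container   BackOff
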
# Creates a triage report for the rollout,
# summarizing details and providing commands to run for more info if applicable
triage_rollout() {
  local deployment_list=$1
  local pod_list=$2
  local replicaset_list=$3
  local statefulset_list=$4
  echo_yellow "Status of workloads rolled out under $LABEL_SELECTOR:"
  on_latest_replicasets "$deployment_list"
  all_pods_ready "$pod_list"
  no_pod_restarts "$pod_list"
  no_error_logs "$pod_list"
  no_associated_events
  if [ "$COMMANDS_TO_RUN" != "" ]; then
    echo_yellow "Run these to get more information about pod logs or events:"
    echo -e "$COMMANDS_TO_RUN\n"
    echo ""
    echo_yellow "To remove log-related failures during your next rollout, delete any pods listed here."
    echo ""
  fi
  if [ "$TIMED_OUT" -eq 1 ]; then
    echo_red "Polling timed out, indicating the helm install was not successful or took too long to complete"
    HEALTH_CHECK_FAILED=1
  fi
  echo_yellow "Triage complete."
  echo ""
  echo_yellow "Overall Health Check Status:"
  if [ "$HEALTH_CHECK_FAILED" -eq 1 ] && [ "$FORCE_PASS" -eq 1 ]; then
    echo_green "$(echo_checkmark) Health check passed (forced); review logs for details"
    HEALTH_CHECK_FAILED=0
  elif [ "$HEALTH_CHECK_FAILED" -eq 1 ]; then
    echo_red "$(echo_cross) Health check failed"
  else
    echo_green "$(echo_checkmark) Health check passed"
  fi
}

main() {
  echo_yellow "Beginning Polled Health Check for Workloads labeled with $LABEL_SELECTOR..."
  echo_yellow "Polling timeout set to $TIMEOUT_SECONDS seconds at a polling interval of $POLL_INTERVAL_SECONDS seconds."
  echo "..."
  local workload_list
  local deployment_list
  local pod_list
  local replicaset_list
  local statefulset_list
  local start_time
  workload_list=$(get_workload_list "$LABEL_SELECTOR")
  echo_yellow "Found the following workloads to health check:"
  echo "---"
  echo "$workload_list"
  echo "---"
  if [ -z "$workload_list" ]; then
    echo_red "No workloads found to health check. Helm install could be stuck or not started yet! Exiting..."
    exit 1
  fi
  # grep exits non-zero when nothing matches; '|| true' keeps set -e from aborting
  deployment_list=$(echo -e "$workload_list" | grep "deployment" || true)
  pod_list=$(echo -e "$workload_list" | grep "pod" || true)
  start_time=$(date +%s)
  echo_yellow "Polling deployments:"
  echo "---"
  echo "$deployment_list"
  echo "---"
  echo_yellow "Beginning polling..."
  while [ "$(poll_deployments "$deployment_list" "$pod_list")" == "false" ]; do
    echo "..."
    if [ $(($(date +%s) - start_time)) -gt "$TIMEOUT_SECONDS" ]; then
      echo_red "One or more deployments did not finish within the timeout period!"
      echo_red "Collecting triage info..."
      TIMED_OUT=1
      break
    fi
    sleep "$POLL_INTERVAL_SECONDS"
  done
  echo_yellow "Polling finished..."
  replicaset_list=$(echo -e "$workload_list" | grep "replicaset" || true)
  statefulset_list=$(echo -e "$workload_list" | grep "statefulset" || true)
  triage_rollout "$deployment_list" "$pod_list" "$replicaset_list" "$statefulset_list"
  exit "$HEALTH_CHECK_FAILED"
}
main
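# The exit code mirrors HEALTH_CHECK_FAILED: 0 when healthy (or when FORCE_PASS=1),
# 1 when any check failed, so CI steps can gate on this script directly.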