From 6597a2d54757da16c18be260914607c5d6f1ad37 Mon Sep 17 00:00:00 2001 From: Michael Smith Date: Fri, 22 Nov 2024 18:45:57 -0500 Subject: [PATCH] chore: add updates to force redeployment on Vercel (#348) ## Changes made - Updated `check.sh` script to add support for automatic re-deploying in the event that the registry has a partial/full outage. --------- Co-authored-by: Cian Johnston --- .github/scripts/check.sh | 83 +++++++++++++++++++++++++++++++++--- .github/workflows/check.yaml | 3 +- 2 files changed, 80 insertions(+), 6 deletions(-) diff --git a/.github/scripts/check.sh b/.github/scripts/check.sh index abb47907..5c1e83d4 100755 --- a/.github/scripts/check.sh +++ b/.github/scripts/check.sh @@ -2,11 +2,17 @@ set -o pipefail set -u +VERBOSE="${VERBOSE:-0}" +if [[ "${VERBOSE}" -ne "0" ]]; then + set -x +fi + # List of required environment variables required_vars=( "INSTATUS_API_KEY" "INSTATUS_PAGE_ID" "INSTATUS_COMPONENT_ID" + "VERCEL_API_KEY" ) # Check if each required variable is set @@ -24,7 +30,7 @@ declare -a modules=() declare -a failures=() # Collect all module directories containing a main.tf file -for path in $(find . -not -path '*/.*' -type f -name main.tf -maxdepth 2 | cut -d '/' -f 2 | sort -u); do +for path in $(find . -maxdepth 2 -not -path '*/.*' -type f -name main.tf | cut -d '/' -f 2 | sort -u); do modules+=("${path}") done @@ -45,7 +51,7 @@ create_incident() { local incident_name="Testing Instatus" local message="The following modules are experiencing issues:\n" for i in "${!failures[@]}"; do - message+="$(($i + 1)). ${failures[$i]}\n" + message+="$((i + 1)). 
${failures[$i]}\n" done component_status="PARTIALOUTAGE" @@ -74,6 +80,70 @@ create_incident() { echo "$incident_id" } +force_redeploy_registry () { + # These are not secret values; safe to just expose directly in script + local VERCEL_TEAM_SLUG="codercom" + local VERCEL_TEAM_ID="team_tGkWfhEGGelkkqUUm9nXq17r" + local VERCEL_APP="registry" + + local latest_res + latest_res=$(curl "https://api.vercel.com/v6/deployments?app=$VERCEL_APP&limit=1&slug=$VERCEL_TEAM_SLUG&teamId=$VERCEL_TEAM_ID&target=production&state=BUILDING,INITIALIZING,QUEUED,READY" \ + --fail \ + --silent \ + --header "Authorization: Bearer $VERCEL_API_KEY" \ + --header "Content-Type: application/json" + ) + + # If we have zero deployments, something is VERY wrong. Make the whole + # script exit with a non-zero status code + local latest_id + latest_id=$(echo "${latest_res}" | jq -r '.deployments[0].uid') + if [[ "${latest_id}" = "null" ]]; then + echo "Unable to pull any previous deployments for redeployment" + echo "Please redeploy the latest deployment manually in Vercel." + echo "https://vercel.com/codercom/registry/deployments" + exit 1 + fi + + local latest_date_ts_seconds + latest_date_ts_seconds=$(echo "${latest_res}" | jq -r '.deployments[0].createdAt/1000|floor') + local current_date_ts_seconds + current_date_ts_seconds="$(date +%s)" + local max_redeploy_interval_seconds=7200 # 2 hours + if (( current_date_ts_seconds - latest_date_ts_seconds < max_redeploy_interval_seconds )); then + echo "The registry was deployed less than 2 hours ago." + echo "Not automatically re-deploying the regitstry." + echo "A human reading this message should decide if a redeployment is necessary." + echo "Please check the Vercel dashboard for more information." 
+ echo "https://vercel.com/codercom/registry/deployments" + exit 1 + fi + + local latest_deployment_state + latest_deployment_state="$(echo "${latest_res}" | jq -r '.deployments[0].state')" + if [[ "${latest_deployment_state}" != "READY" ]]; then + echo "Last deployment was not in READY state. Skipping redeployment." + echo "A human reading this message should decide if a redeployment is necessary." + echo "Please check the Vercel dashboard for more information." + echo "https://vercel.com/codercom/registry/deployments" + exit 1 + fi + + echo "=============================================================" + echo "!!! Redeploying registry with deployment ID: ${latest_id} !!!" + echo "=============================================================" + + if ! curl -X POST "https://api.vercel.com/v13/deployments?forceNew=1&skipAutoDetectionConfirmation=1&slug=$VERCEL_TEAM_SLUG&teamId=$VERCEL_TEAM_ID" \ + --fail \ + --header "Authorization: Bearer $VERCEL_API_KEY" \ + --header "Content-Type: application/json" \ + --data-raw "{ \"deploymentId\": \"${latest_id}\", \"name\": \"${VERCEL_APP}\", \"target\": \"production\" }"; then + echo "DEPLOYMENT FAILED! Please check the Vercel dashboard for more information." + echo "https://vercel.com/codercom/registry/deployments" + exit 1 + fi +} + # Check each module's accessibility for module in "${modules[@]}"; do # Trim leading/trailing whitespace from module name @@ -81,7 +151,6 @@ for module in "${modules[@]}"; do url="${REGISTRY_BASE_URL}/modules/${module}" printf "=== Checking module %s at %s\n" "${module}" "${url}" status_code=$(curl --output /dev/null --head --silent --fail --location "${url}" --retry 3 --write-out "%{http_code}") - # shellcheck disable=SC2181 if (( status_code != 200 )); then printf "==> FAIL(%s)\n" "${status_code}" status=1 @@ -94,11 +163,11 @@ done # Determine overall status and update Instatus component if (( status == 0 )); then echo "All modules are operational." 
- # set to + # set to update_component_status "OPERATIONAL" else echo "The following modules have issues: ${failures[*]}" - # check if all modules are down + # check if all modules are down if (( ${#failures[@]} == ${#modules[@]} )); then update_component_status "MAJOROUTAGE" else @@ -108,6 +177,10 @@ else # Create a new incident incident_id=$(create_incident) echo "Created incident with ID: $incident_id" + + # If a module is down, force a redeployment to try getting things back online + # ASAP + force_redeploy_registry fi exit "${status}" diff --git a/.github/workflows/check.yaml b/.github/workflows/check.yaml index 4095073e..02422ff2 100644 --- a/.github/workflows/check.yaml +++ b/.github/workflows/check.yaml @@ -2,7 +2,7 @@ name: Health # Check modules health on registry.coder.com on: schedule: - - cron: "*/13 * * * *" # Runs every 13th minute + - cron: "0,15,30,45 * * * *" # Runs every 15 minutes workflow_dispatch: # Allows manual triggering of the workflow if needed jobs: @@ -20,3 +20,4 @@ jobs: INSTATUS_API_KEY: ${{ secrets.INSTATUS_API_KEY }} INSTATUS_PAGE_ID: ${{ secrets.INSTATUS_PAGE_ID }} INSTATUS_COMPONENT_ID: ${{ secrets.INSTATUS_COMPONENT_ID }} + VERCEL_API_KEY: ${{ secrets.VERCEL_API_KEY }}