From ba5bad522102466e98db2ba7ac60d71ee18909a6 Mon Sep 17 00:00:00 2001 From: Rei1010 <56469400+Rei1010@users.noreply.github.com> Date: Wed, 5 Feb 2025 16:40:56 +0800 Subject: [PATCH] Optimize E2E with pod status check (#847) Signed-off-by: wen.rui --- .github/workflows/call-e2e.yaml | 3 +-- hack/deploy-helm.sh | 5 +++-- hack/util.sh | 38 +++++++++++++++++++++++++++------ 3 files changed, 36 insertions(+), 10 deletions(-) diff --git a/.github/workflows/call-e2e.yaml b/.github/workflows/call-e2e.yaml index c6fd17f3d..ba9194b18 100644 --- a/.github/workflows/call-e2e.yaml +++ b/.github/workflows/call-e2e.yaml @@ -28,6 +28,7 @@ jobs: environment: ${{ matrix.device }} env: E2E_TYPE: ${{ inputs.type }} + HAMI_VERSION: ${{ inputs.ref }} steps: - name: checkout code uses: actions/checkout@v4 @@ -68,8 +69,6 @@ jobs: ssh root@$VSPHERE_GPU_VM_IP "nerdctl image ls | grep hami" - name: deploy hami helm - env: - HAMI_VERSION: ${{ inputs.ref }} run: | make helm-deploy diff --git a/hack/deploy-helm.sh b/hack/deploy-helm.sh index ce2e98bca..02474298c 100755 --- a/hack/deploy-helm.sh +++ b/hack/deploy-helm.sh @@ -42,8 +42,9 @@ else fi # Set Helm Chart source based on E2E_TYPE. +echo "E2E Type is: ${E2E_TYPE}" + if [ "${E2E_TYPE}" == "pullrequest" ]; then - echo "E2E Type is: ${E2E_TYPE}" # Ensure the charts directory exists and contains a .tgz file if [ -d "charts" ] && [ -n "$(ls charts/*.tgz 2>/dev/null)" ]; then HELM_SOURCE=$(ls charts/*.tgz | head -n 1) # Use the first .tgz file found @@ -96,7 +97,7 @@ fi echo "Checking Pod status..." kubectl --kubeconfig "${KUBE_CONF}" get po -n "${TARGET_NS}" -if ! util::check_pods_status "${KUBE_CONF}" "${TARGET_NS}"; then +if ! util::check_pods_status "${KUBE_CONF}" ; then echo "Error: Pods are not running correctly." exit 1 fi diff --git a/hack/util.sh b/hack/util.sh index 49699374e..7f4a0969b 100755 --- a/hack/util.sh +++ b/hack/util.sh @@ -116,9 +116,38 @@ function util::wait_ip_reachable { # Check Pod status in a namespace. function util::check_pods_status { local kubeconfig=${1:-""} - local namespace=${2:-"hami-system"} + local namespace=${2:-""} + local retries=${3:-10} + local interval=${4:-30} + + local attempt=0 local unhealthy_pods - unhealthy_pods=$(kubectl get po -n "$namespace" --kubeconfig "$kubeconfig" --no-headers | awk '!/Running|Succeeded/ {print $1}') + + while (( attempt < retries )); do + echo "Checking Pod status (Attempt $(( attempt + 1 ))/$retries)..." + + # Checking unhealthy pods in namespaces,ignore the Running & Succeeded status + if [[ -z "$namespace" ]]; then + unhealthy_pods=$(kubectl get po -A --kubeconfig "$kubeconfig" --no-headers --ignore-not-found | awk '!/Running|Succeeded|Completed/ {print $2}') + else + unhealthy_pods=$(kubectl get po -n "$namespace" --kubeconfig "$kubeconfig" --no-headers --ignore-not-found | awk '!/Running|Succeeded|Completed/ {print $1}') + fi + + if [[ -z "$unhealthy_pods" ]]; then + echo "PASS: All Pods are in Running or Succeeded state." + return 0 + fi + + echo "Found unhealthy pods:" + echo "$unhealthy_pods" + + if (( attempt < retries - 1 )); then + echo "Retrying pod check in ${interval}s..." + sleep "$interval" + fi + + (( attempt++ )) + done if [[ -n "$unhealthy_pods" ]]; then echo "Found unhealthy pods in namespace $namespace:" @@ -134,8 +163,5 @@ function util::check_pods_status { done return 1 - else - echo "PASS: All Pods are in Running state." - return 0 fi -} \ No newline at end of file +}