Skip to content

Commit

Permalink
Optimize E2E with pod status check (#847)
Browse files Browse the repository at this point in the history
Signed-off-by: wen.rui <[email protected]>
  • Loading branch information
Rei1010 authored Feb 5, 2025
1 parent 03eef07 commit ba5bad5
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 10 deletions.
3 changes: 1 addition & 2 deletions .github/workflows/call-e2e.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ jobs:
environment: ${{ matrix.device }}
env:
E2E_TYPE: ${{ inputs.type }}
HAMI_VERSION: ${{ inputs.ref }}
steps:
- name: checkout code
uses: actions/checkout@v4
Expand Down Expand Up @@ -68,8 +69,6 @@ jobs:
ssh root@$VSPHERE_GPU_VM_IP "nerdctl image ls | grep hami"
- name: deploy hami helm
env:
HAMI_VERSION: ${{ inputs.ref }}
run: |
make helm-deploy
Expand Down
5 changes: 3 additions & 2 deletions hack/deploy-helm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,9 @@ else
fi

# Set Helm Chart source based on E2E_TYPE.
echo "E2E Type is: ${E2E_TYPE}"

if [ "${E2E_TYPE}" == "pullrequest" ]; then
echo "E2E Type is: ${E2E_TYPE}"
# Ensure the charts directory exists and contains a .tgz file
if [ -d "charts" ] && [ -n "$(ls charts/*.tgz 2>/dev/null)" ]; then
HELM_SOURCE=$(ls charts/*.tgz | head -n 1) # Use the first .tgz file found
Expand Down Expand Up @@ -96,7 +97,7 @@ fi
echo "Checking Pod status..."
kubectl --kubeconfig "${KUBE_CONF}" get po -n "${TARGET_NS}"

if ! util::check_pods_status "${KUBE_CONF}" "${TARGET_NS}"; then
if ! util::check_pods_status "${KUBE_CONF}" ; then
echo "Error: Pods are not running correctly."
exit 1
fi
Expand Down
38 changes: 32 additions & 6 deletions hack/util.sh
Original file line number Diff line number Diff line change
Expand Up @@ -116,9 +116,38 @@ function util::wait_ip_reachable {
# Check Pod status in a namespace (or in all namespaces when none is given), retrying until all Pods are healthy.
function util::check_pods_status {
local kubeconfig=${1:-""}
local namespace=${2:-"hami-system"}
local namespace=${2:-""}
local retries=${3:-10}
local interval=${4:-30}

local attempt=0
local unhealthy_pods
unhealthy_pods=$(kubectl get po -n "$namespace" --kubeconfig "$kubeconfig" --no-headers | awk '!/Running|Succeeded/ {print $1}')

while (( attempt < retries )); do
echo "Checking Pod status (Attempt $(( attempt + 1 ))/$retries)..."

# Check for unhealthy pods in the namespace(s), ignoring pods in Running, Succeeded, or Completed status.
if [[ -z "$namespace" ]]; then
unhealthy_pods=$(kubectl get po -A --kubeconfig "$kubeconfig" --no-headers --ignore-not-found | awk '!/Running|Succeeded|Completed/ {print $2}')
else
unhealthy_pods=$(kubectl get po -n "$namespace" --kubeconfig "$kubeconfig" --no-headers --ignore-not-found | awk '!/Running|Succeeded|Completed/ {print $1}')
fi

if [[ -z "$unhealthy_pods" ]]; then
echo "PASS: All Pods are in Running or Succeeded state."
return 0
fi

echo "Found unhealthy pods:"
echo "$unhealthy_pods"

if (( attempt < retries - 1 )); then
echo "Retrying pod check in ${interval}s..."
sleep "$interval"
fi

(( attempt++ ))
done

if [[ -n "$unhealthy_pods" ]]; then
echo "Found unhealthy pods in namespace $namespace:"
Expand All @@ -134,8 +163,5 @@ function util::check_pods_status {
done

return 1
else
echo "PASS: All Pods are in Running state."
return 0
fi
}
}

0 comments on commit ba5bad5

Please sign in to comment.