From 87ed14e8cb1cea6d19a6fdaa541d1b455e889dc0 Mon Sep 17 00:00:00 2001 From: shiva kumar Date: Tue, 10 Sep 2024 11:31:20 +0530 Subject: [PATCH] Pre-compiled end-to-end gpu driver validation 1. Pre-compiled end-to-end gpu driver validation Signed-off-by: shiva kumar --- .github/workflows/ci.yaml | 16 ++++++++++------ .github/workflows/image.yaml | 20 +++++++++++--------- .github/workflows/precompiled.yaml | 26 ++++++++++++++++++++------ tests/cases/nvidia-driver.sh | 8 ++++++++ tests/ci-remote-exec.sh | 11 +++-------- tests/ci-run-e2e.sh | 10 +++------- tests/local.sh | 4 +--- tests/remote-exec-local.sh | 8 +------- tests/scripts/.definitions.sh | 4 ---- tests/scripts/findkernelversion.sh | 2 ++ tests/scripts/install-operator.sh | 2 +- tests/scripts/kernel-upgrade-helper.sh | 2 -- tests/scripts/upgrade-kernel.sh | 8 +++++++- 13 files changed, 67 insertions(+), 54 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 264bc62a..4859984c 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -20,7 +20,8 @@ on: types: - completed branches: - - main + # SHIVA + - e2etestdriver jobs: e2e-tests-nvidiadriver: @@ -61,15 +62,18 @@ jobs: - name: Validate gpu driver env: TEST_CASE: "./tests/cases/nvidia-driver.sh" - OPERATOR_OPTIONS: "--set driver.repository=${{ env.PRIVATE_REGISTRY }}/nvidia" + GPU_OPERATOR_OPTIONS: "--set driver.repository=${{ env.PRIVATE_REGISTRY }}/nvidia" run: | rc=0 - for driver_version in ${DRIVER_VERSIONS}; do - echo "Running e2e for DRIVER_VERSION=$driver_version" + for DRIVER_VERSION in ${DRIVER_VERSIONS}; do + echo "Running e2e for DRIVER_VERSION=$DRIVER_VERSION" status=0 - ./tests/ci-run-e2e.sh "${TEST_CASE}" "${COMMIT_SHORT_SHA}-${driver_version}" "${OPERATOR_OPTIONS}" || status=$? + TEST_CASE_ARGS="${GPU_OPERATOR_OPTIONS} --set driver.version=${COMMIT_SHORT_SHA}-${DRIVER_VERSION}" + # add escape character for space + TEST_CASE_ARGS=$(printf '%q ' "$TEST_CASE_ARGS") + ./tests/ci-run-e2e.sh "${TEST_CASE}" "${TEST_CASE_ARGS}" || status=$? if [ $status -ne 0 ]; then - echo "e2e validation failed for driver version $driver_version with status $status" + echo "e2e validation failed for driver version $DRIVER_VERSION with status $status" rc=$status fi done diff --git a/.github/workflows/image.yaml b/.github/workflows/image.yaml index 0aa960c0..9e2cbef1 100644 --- a/.github/workflows/image.yaml +++ b/.github/workflows/image.yaml @@ -16,17 +16,19 @@ name: image on: - pull_request: - types: - - opened - - synchronize - branches: - - main - - release-* + # SHIVA + # pull_request: + # types: + # - opened + # - synchronize + # branches: + # - main + # - release-* push: branches: - - main - - release-* + # - main + # - release-* + - e2etestdriver jobs: image: diff --git a/.github/workflows/precompiled.yaml b/.github/workflows/precompiled.yaml index f4c34f0a..b35ad366 100644 --- a/.github/workflows/precompiled.yaml +++ b/.github/workflows/precompiled.yaml @@ -16,8 +16,18 @@ name: Precompiled images on: - schedule: - - cron: '00 09 * * *' # scheduled job + # SHIVA + # schedule: + # - cron: '00 09 * * *' # scheduled job + pull_request: + types: + - opened + - synchronize + branches: + - e2etestdriver + push: + branches: + - e2etestdriver jobs: set-driver-version-matrix: @@ -201,7 +211,7 @@ jobs: UPGRADE_KERNEL_SCRIPT: "./tests/scripts/upgrade-kernel.sh" run: | status=0 - ./tests/ci-remote-exec.sh "${UPGRADE_KERNEL_SCRIPT}" "${KERNEL_VERSION}" || status=$? + ./tests/ci-remote-exec.sh "${UPGRADE_KERNEL_SCRIPT}" \"${KERNEL_VERSION}\" || status=$? # On the target system, all scripts/test-case exit with code 1 for error handling. # However, since reboot-related disconnections break the SSH connection # and can cause the entire job to exit, we should ignore all errors except @@ -220,16 +230,19 @@ jobs: - name: Precompiled e2e test gpu driver validation env: TEST_CASE: "./tests/cases/nvidia-driver.sh" - OPERATOR_OPTIONS: "--set driver.repository=${{ env.PRIVATE_REGISTRY }}/nvidia --set driver.usePrecompiled=true" + GPU_OPERATOR_OPTIONS: "--set driver.repository=${{ env.PRIVATE_REGISTRY }}/nvidia --set driver.usePrecompiled=true" run: | rc=0 # for precompiled driver we are setting driver branch as driver version driver_versions_json='${{ needs.set-driver-version-matrix.outputs.driver_branch }}' driver_versions=$(echo "$driver_versions_json" | jq -r '.[]') for DRIVER_VERSION in $driver_versions; do - echo "Running e2e for DRIVER_VERSION=$driver_version" + echo "Running e2e for DRIVER_VERSION=$DRIVER_VERSION" status=0 - ./tests/ci-run-e2e.sh "${TEST_CASE}" "${DRIVER_VERSION}" "${OPERATOR_OPTIONS}" || status=$? + TEST_CASE_ARGS="${GPU_OPERATOR_OPTIONS} --set driver.version=${DRIVER_VERSION}" + # add escape character for space + TEST_CASE_ARGS=$(printf '%q ' "$TEST_CASE_ARGS") + ./tests/ci-run-e2e.sh "${TEST_CASE}" "${TEST_CASE_ARGS}" || status=$? if [ $status -eq 1 ]; then echo "e2e validation failed for driver version $DRIVER_VERSION with status $status" rc=$status @@ -237,6 +250,7 @@ jobs: done ./tests/scripts/pull.sh /tmp/logs logs exit $rc + - name: Archive test logs if: ${{ failure() }} uses: actions/upload-artifact@v4 diff --git a/tests/cases/nvidia-driver.sh b/tests/cases/nvidia-driver.sh index d2afad83..dcd9b509 100755 --- a/tests/cases/nvidia-driver.sh +++ b/tests/cases/nvidia-driver.sh @@ -1,6 +1,14 @@ #! /bin/bash # This test case runs the operator installation / test case with the default options. +if [[ $# -lt 1 ]]; then + echo "Error: $0 must be called with driver options" + exit 1 +fi + +# export gpu-operator options +export TEST_CASE_ARGS="$1" + SCRIPTS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../scripts && pwd )" source "${SCRIPTS_DIR}"/.definitions.sh diff --git a/tests/ci-remote-exec.sh b/tests/ci-remote-exec.sh index 72ed2046..e9ed1b97 100755 --- a/tests/ci-remote-exec.sh +++ b/tests/ci-remote-exec.sh @@ -2,16 +2,11 @@ set -xe -if [[ $# -gt 2 ]]; then - echo "Error: ci-remote-exec.sh must be called with 1(REMOTE_EXEC) or 2(REMOTE_EXEC, REMOTE_EXEC_ARGS) argument only." +if [[ $# -lt 1 ]]; then + echo "Error:$0 must be called with 1(REMOTE_EXEC) or more than 1 args (REMOTE_EXEC, ARGS1 ARGS2 etc)" exit 1 fi -export REMOTE_EXEC="${1}" -# to pass more enviroment variable , use space seprated REMOTE_EXEC_ARGS -# e.g. REMOTE_EXEC_ARGS="env1=arg1 env2=arg2" etc -export REMOTE_EXEC_ARGS="${2}" - TEST_DIR="$(pwd)/tests" -${TEST_DIR}/remote-exec-local.sh +${TEST_DIR}/remote-exec-local.sh "$@" diff --git a/tests/ci-run-e2e.sh b/tests/ci-run-e2e.sh index 522aa558..9a3b328a 100755 --- a/tests/ci-run-e2e.sh +++ b/tests/ci-run-e2e.sh @@ -2,15 +2,11 @@ set -xe -if [[ $# -ne 3 ]]; then - echo "TEST_CASE TARGET_DRIVER_VERSION TEST_CASE_ARGS are required" +if [[ $# -ne 2 ]]; then + echo "TEST_CASE TEST_CASE_ARGS are required" exit 1 fi -export TEST_CASE="${1}" -export TARGET_DRIVER_VERSION="${2}" -export TEST_CASE_ARGS="${3}" - TEST_DIR="$(pwd)/tests" -${TEST_DIR}/local.sh +${TEST_DIR}/local.sh "$@" diff --git a/tests/local.sh b/tests/local.sh index c7a1c0f2..75a0fa1f 100755 --- a/tests/local.sh +++ b/tests/local.sh @@ -23,6 +23,4 @@ remote SKIP_PREREQUISITES="${SKIP_PREREQUISITES}" ./tests/scripts/prerequisites. # are forwarded to the remote shell. remote \ PROJECT="${PROJECT}" \ - TARGET_DRIVER_VERSION="${TARGET_DRIVER_VERSION}" \ - TEST_CASE_ARGS=\"${TEST_CASE_ARGS}\" \ - ${TEST_CASE} + "$@" \ No newline at end of file diff --git a/tests/remote-exec-local.sh b/tests/remote-exec-local.sh index 4cace8a0..2cf7cc05 100755 --- a/tests/remote-exec-local.sh +++ b/tests/remote-exec-local.sh @@ -15,13 +15,7 @@ source ${SCRIPT_DIR}/.local.sh # Sync the project folder to the remote ${SCRIPT_DIR}/push.sh -# We trigger the installation of prerequisites on the remote instance -remote SKIP_PREREQUISITES="${SKIP_PREREQUISITES}" ./tests/scripts/prerequisites.sh - # We trigger the specified script on the remote instance. -# Note: We need to ensure that the required environment variables -# are forwarded to the remote shell. remote \ PROJECT="${PROJECT}" \ - REMOTE_EXEC_ARGS=\"${REMOTE_EXEC_ARGS}\" \ - ${REMOTE_EXEC} + "$@" \ No newline at end of file diff --git a/tests/scripts/.definitions.sh b/tests/scripts/.definitions.sh index bb38f2fe..945bb04c 100644 --- a/tests/scripts/.definitions.sh +++ b/tests/scripts/.definitions.sh @@ -16,15 +16,11 @@ CASES_DIR="$( cd "${TEST_DIR}/cases" && pwd )" : ${HELM_NVIDIA_REPO:="https://helm.ngc.nvidia.com/nvidia"} -: ${TARGET_DRIVER_VERSION:="550.90.07"} - : ${DAEMON_POD_STATUS_TIME_OUT:="15m"} : ${POD_STATUS_TIME_OUT:="2m"} : ${LOG_DIR:="/tmp/logs"} -: ${TEST_CASE_ARGS:="--set driver.repository=ghcr.io/nvidia"} -: ${REMOTE_EXEC_ARGS:=""} : ${SYSTEM_ONLINE_CHECK_TIMEOUT:="900"} : ${BASE_TARGET:="jammy"} diff --git a/tests/scripts/findkernelversion.sh b/tests/scripts/findkernelversion.sh index b0f12343..4eb83d20 100755 --- a/tests/scripts/findkernelversion.sh +++ b/tests/scripts/findkernelversion.sh @@ -30,3 +30,5 @@ if [[ $status -eq 0 ]]; then else export should_continue=true fi +# SHIVA +export should_continue=true diff --git a/tests/scripts/install-operator.sh b/tests/scripts/install-operator.sh index bbd5221b..2b4bcbaf 100755 --- a/tests/scripts/install-operator.sh +++ b/tests/scripts/install-operator.sh @@ -12,7 +12,7 @@ echo "Current kernel version: $CURRENT_KERNEL" SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" source ${SCRIPT_DIR}/.definitions.sh -OPERATOR_OPTIONS="${TEST_CASE_ARGS} --set driver.version=${TARGET_DRIVER_VERSION}" +OPERATOR_OPTIONS="${TEST_CASE_ARGS}" # add helm driver repo helm repo add nvidia ${HELM_NVIDIA_REPO} && helm repo update diff --git a/tests/scripts/kernel-upgrade-helper.sh b/tests/scripts/kernel-upgrade-helper.sh index cfb3facf..a8926397 100755 --- a/tests/scripts/kernel-upgrade-helper.sh +++ b/tests/scripts/kernel-upgrade-helper.sh @@ -8,8 +8,6 @@ fi SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" source "${SCRIPT_DIR}"/.definitions.sh -export KERNEL_VERSION="${REMOTE_EXEC_ARGS}" - echo "Checking current kernel version..." CURRENT_KERNEL=$(uname -r) echo "Current kernel version: $CURRENT_KERNEL" diff --git a/tests/scripts/upgrade-kernel.sh b/tests/scripts/upgrade-kernel.sh index e7d90ec3..0c575574 100755 --- a/tests/scripts/upgrade-kernel.sh +++ b/tests/scripts/upgrade-kernel.sh @@ -1,8 +1,14 @@ #! /bin/bash # This test case runs the operator installation / test case with the default options. +if [[ $# -ne 1 ]]; then + echo "Error: $0 must be called with kernel_version" + exit 1 +fi + SCRIPTS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../scripts && pwd )" source "${SCRIPTS_DIR}"/.definitions.sh -# Run an end-to-end test cycle +# export kernel version and Run an end-to-end test cycle +export KERNEL_VERSION="$1" "${SCRIPTS_DIR}"/kernel-upgrade-helper.sh