Precompiled CI: build driver images as workflow artifacts, run the e2e tests
against the built tar, and push the image from a separate job.

Summary (text before the first "diff --git" line is ignored by git apply/patch):
  * precompiled.yaml: the build job is renamed precompiled-build-image, sets
    GENERATE_ARTIFACTS="false", saves each image with `docker save` and uploads
    the tar as an artifact. The e2e job downloads the tar and hands it to
    ci-run-e2e.sh. The new publish-precompiled-image job needs the e2e job and
    does `docker load` + `docker push`.
  * tests/: nvidia-driver.sh imports the tar with `ctr` when given a second
    argument; ci-run-e2e.sh accepts more than two arguments; the kernel-version
    loop moves into tests/scripts/ci-precompiled-helpers.sh.

Review amendments folded into this revision:
  * ci-precompiled-helpers.sh: usage message now lists KERNEL_FLAVORS (it named
    DRIVER_BRANCHES twice) and prints the function name instead of $0; the
    function returns instead of exiting, keeps kernel_versions local and resets
    should_continue for every flavor; header comments added.
  * precompiled.yaml: driver_branch_json in the e2e job is single-quoted like in
    determine-e2e-test-matrix; `gh run download` is pinned to the current run
    id; the duplicated matrix_values_not_empty=1 write is dropped (hunk headers
    adjusted); the empty check uses the array length.

NOTE(review): line breaks, indentation and blank context lines were rebuilt
from a whitespace-collapsed copy of this patch using the hunk line counts;
verify them against the target files before applying. The index hashes are
those of the original patch.
NOTE(review): the "# SHIVA" markers, the commented-out pull_request triggers
and the push branches no-test / e2etestandpushimage look like temporary
debugging settings - confirm before merging.

diff --git a/.github/workflows/image.yaml b/.github/workflows/image.yaml
index 4c98fe7f..223ee39f 100644
--- a/.github/workflows/image.yaml
+++ b/.github/workflows/image.yaml
@@ -16,17 +16,19 @@
 name: image
 
 on:
-  pull_request:
-    types:
-      - opened
-      - synchronize
-    branches:
-      - main
-      - release-*
+  # SHIVA
+  # pull_request:
+  #   types:
+  #     - opened
+  #     - synchronize
+  #   branches:
+  #     - main
+  #     - release-*
   push:
     branches:
-      - main
-      - release-*
+      # - main
+      # - release-*
+      - no-test
 
 jobs:
   image:
diff --git a/.github/workflows/precompiled.yaml b/.github/workflows/precompiled.yaml
index 54a1ad69..2d8f0cf7 100644
--- a/.github/workflows/precompiled.yaml
+++ b/.github/workflows/precompiled.yaml
@@ -17,7 +17,20 @@ name: Precompiled images
 
 on:
   schedule:
+    # SHIVA
     - cron: '00 09 * * *' # scheduled job
+  # pull_request:
+  #   types:
+  #     - opened
+  #     - synchronize
+  #   branches:
+  #     - main
+  #     - release-*
+  push:
+    branches:
+      # - main
+      # - release-*
+      - e2etestandpushimage
 
 jobs:
   set-driver-version-matrix:
@@ -31,7 +44,7 @@ jobs:
       - name: Read driver versions
         id: extract_driver_branch
         run: |
-          # get driver-branch
+          # get driver_branch
           DRIVER_BRANCH=("535" "550")
           driver_branch_json=$(printf '%s\n' "${DRIVER_BRANCH[@]}" | jq -R . | jq -cs .)
           echo "driver_branch=$driver_branch_json" >> $GITHUB_OUTPUT
@@ -41,12 +54,12 @@ jobs:
           kernel_flavors_json=$(printf '%s\n' "${KERNEL_FLAVORS[@]}" | jq -R . | jq -cs .)
           echo "kernel_flavors=$kernel_flavors_json" >> $GITHUB_OUTPUT
 
-  precompiled-image:
+  precompiled-build-image:
     needs: set-driver-version-matrix
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        driver-branch: ${{ fromJson(needs.set-driver-version-matrix.outputs.driver_branch) }}
+        driver_branch: ${{ fromJson(needs.set-driver-version-matrix.outputs.driver_branch) }}
         flavor: ${{ fromJson(needs.set-driver-version-matrix.outputs.kernel_flavors) }}
     steps:
       - uses: actions/checkout@v4
@@ -59,7 +72,7 @@ jobs:
           REPO_FULL_NAME="${{ github.repository }}"
           echo "LABEL_IMAGE_SOURCE=https://github.com/${REPO_FULL_NAME}" >> $GITHUB_ENV
 
-          GENERATE_ARTIFACTS="true"
+          GENERATE_ARTIFACTS="false"
           echo "PUSH_ON_BUILD=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV
           echo "BUILD_MULTI_ARCH_IMAGES=${GENERATE_ARTIFACTS}" >> $GITHUB_ENV
 
@@ -79,10 +92,10 @@ jobs:
           VERSION: ${COMMIT_SHORT_SHA}
           BASE_TARGET: jammy
         run: |
-          make DRIVER_BRANCH=${{ matrix.driver-branch }} KERNEL_FLAVOR=${{ matrix.flavor }} build-base-${BASE_TARGET}
+          make DRIVER_BRANCH=${{ matrix.driver_branch }} KERNEL_FLAVOR=${{ matrix.flavor }} build-base-${BASE_TARGET}
 
           trap "docker rm -f base-${BASE_TARGET}-${{ matrix.flavor }}" EXIT
-          docker run -d --name base-${BASE_TARGET}-${{ matrix.flavor }} ghcr.io/nvidia/driver:base-${BASE_TARGET}-${{ matrix.flavor }}-${{ matrix.driver-branch }}
+          docker run -d --name base-${BASE_TARGET}-${{ matrix.flavor }} ghcr.io/nvidia/driver:base-${BASE_TARGET}-${{ matrix.flavor }}-${{ matrix.driver_branch }}
           # try 3 times every 10 seconds to get the file, if success exit the loop
           for i in {1..3}; do
             docker cp base-${BASE_TARGET}-${{ matrix.flavor }}:/var/kernel_version.txt kernel_version.txt && break
@@ -96,12 +109,33 @@ jobs:
           DIST: signed_ubuntu22.04
         run: |
           source kernel_version.txt && \
-          make DRIVER_VERSIONS=${DRIVER_VERSIONS} DRIVER_BRANCH=${{ matrix.driver-branch }} build-${DIST}-${DRIVER_VERSION}
+          make DRIVER_VERSIONS=${DRIVER_VERSIONS} DRIVER_BRANCH=${{ matrix.driver_branch }} build-${DIST}-${DRIVER_VERSION}
+
+      - name: Save build image as a tar
+        env:
+          DIST: "ubuntu22.04"
+          PRIVATE_REGISTRY: "ghcr.io"
+          COMMIT_SHORT_SHA: ${COMMIT_SHORT_SHA}
+        run: |
+          source kernel_version.txt
+          docker images "${PRIVATE_REGISTRY}/nvidia/driver:${{ matrix.driver_branch }}-${KERNEL_VERSION}-${DIST}"
+          docker save "${PRIVATE_REGISTRY}/nvidia/driver:${{ matrix.driver_branch }}-${KERNEL_VERSION}-${DIST}" \
+            -o ./driver-images-${{ matrix.driver_branch }}-${KERNEL_VERSION}-${DIST}.tar
+          # set env for artifacts upload
+          echo "KERNEL_VERSION=$KERNEL_VERSION" >> $GITHUB_ENV
+          echo "DIST=$DIST" >> $GITHUB_ENV
+
+      - name: Upload build image as an artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: driver-images-${{ matrix.driver_branch }}-${{ env.KERNEL_VERSION }}-${{ env.DIST }}
+          path: ./driver-images-${{ matrix.driver_branch }}-${{ env.KERNEL_VERSION }}-${{ env.DIST }}.tar
+          retention-days: 1
 
   determine-e2e-test-matrix:
     runs-on: ubuntu-latest
     needs:
-      - precompiled-image
+      - precompiled-build-image
       - set-driver-version-matrix
     outputs:
       matrix_values_not_empty: ${{ steps.set_kernel_version.outputs.matrix_values_not_empty }}
@@ -125,37 +159,22 @@ jobs:
           echo "matrix_values_not_empty=0" >> $GITHUB_OUTPUT
 
           kernel_flavors_json='${{ needs.set-driver-version-matrix.outputs.kernel_flavors }}'
-          kernel_flavors=$(echo "$kernel_flavors_json" | jq -r '.[]')
+          KERNEL_FLAVORS=($(echo "$kernel_flavors_json" | jq -r '.[]'))
           driver_branch_json='${{ needs.set-driver-version-matrix.outputs.driver_branch }}'
-          driver_branch=$(echo "$driver_branch_json" | jq -r '.[]')
-
-          kernel_versions=()
-          for kernel_flavor in $kernel_flavors; do
-            # FIXME -- remove if condition, once azure kernel upgrade starts working
-            if [[ "$kernel_flavor" == "azure" ]]; then
-              echo "skipping azure kernel testing"
-              continue
-            fi
-            for DRIVER_BRANCH in $driver_branch; do
-              source ./tests/scripts/findkernelversion.sh "$BASE_TARGET" "${kernel_flavor}" "$DRIVER_BRANCH" "$DIST"
-              if [[ "$should_continue" == true ]]; then
-                echo "matrix_values_not_empty=1" >> $GITHUB_OUTPUT
-                break
-              fi
-            done
-            if [[ "$should_continue" == false ]]; then
-              echo "Skipping e2e tests for the following driver tag: ${KERNEL_VERSION}-${kernel_flavor}-${DIST}"
-            else
-              KERNEL_VERSION=$(echo "$KERNEL_VERSION" | tr -d ' \n')
-              kernel_versions+=("$KERNEL_VERSION")
-              echo "Adding the following tag to the e2e test matrix: ${KERNEL_VERSION}-${kernel_flavor}-${DIST}"
-            fi
-          done
+          DRIVER_BRANCHES=($(echo "$driver_branch_json" | jq -r '.[]'))
+          source ./tests/scripts/ci-precompiled-helpers.sh
+          KERNEL_VERSIONS=($(get_kernel_versions_to_test $BASE_TARGET KERNEL_FLAVORS[@] DRIVER_BRANCHES[@] $DIST))
+          if [ "${#KERNEL_VERSIONS[@]}" -eq 0 ]; then
+            # no new kernel release
+            echo "Skipping e2e tests"
+            exit 0
+          fi
+          echo "matrix_values_not_empty=1" >> $GITHUB_OUTPUT
 
           # Convert array to JSON format and assign
           echo "[]" > $GITHUB_WORKSPACE/matrix_values.json
-          printf '%s\n' "${kernel_versions[@]}" | jq -R . | jq -s . > $GITHUB_WORKSPACE/matrix_values.json
+          printf '%s\n' "${KERNEL_VERSIONS[@]}" | jq -R . | jq -s . > $GITHUB_WORKSPACE/matrix_values.json
           echo "matrix_values=$(cat $GITHUB_WORKSPACE/matrix_values.json | jq -c .)" >> $GITHUB_OUTPUT
 
   e2e-tests-nvidiadriver:
     runs-on: ubuntu-latest
@@ -169,6 +188,12 @@ jobs:
     steps:
       - name: Check out code
         uses: actions/checkout@v4
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
       - name: Set up Holodeck
         uses: NVIDIA/holodeck@v0.2.1
         env:
@@ -195,6 +220,15 @@ jobs:
           echo "PRIVATE_REGISTRY=ghcr.io" >> $GITHUB_ENV
           KERNEL_VERSION="${{ matrix.kernel_version }}"
           echo "KERNEL_VERSION=$KERNEL_VERSION" >> $GITHUB_ENV
+          echo "DIST=ubuntu22.04" >> $GITHUB_ENV
+          driver_branch_json='${{ needs.set-driver-version-matrix.outputs.driver_branch }}'
+          DRIVER_BRANCHES=($(echo "$driver_branch_json" | jq -r '.[]'))
+          echo "DRIVER_BRANCHES=${DRIVER_BRANCHES[*]}" >> $GITHUB_ENV
+
+      - name: Install GitHub CLI
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y gh
 
       - name: Upgrade the kernel for Precompiled e2e test
         env:
@@ -220,23 +254,29 @@ jobs:
       - name: Precompiled e2e test gpu driver validation
         env:
           TEST_CASE: "./tests/cases/nvidia-driver.sh"
-          GPU_OPERATOR_OPTIONS: "--set driver.repository=${{ env.PRIVATE_REGISTRY }}/nvidia --set driver.usePrecompiled=true"
+          GPU_OPERATOR_OPTIONS: "--set driver.repository=${{ env.PRIVATE_REGISTRY }}/nvidia --set driver.usePrecompiled=true \
+            --set driver.imagePullPolicy=Never"
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         run: |
           rc=0
           # for precompiled driver we are setting driver branch as driver version
-          driver_versions_json='${{ needs.set-driver-version-matrix.outputs.driver_branch }}'
-          driver_versions=$(echo "$driver_versions_json" | jq -r '.[]')
-          for DRIVER_VERSION in $driver_versions; do
+          DRIVER_BRANCHES=(${{ env.DRIVER_BRANCHES }})
+          for DRIVER_VERSION in "${DRIVER_BRANCHES[@]}"; do
             echo "Running e2e for DRIVER_VERSION=$DRIVER_VERSION"
+            image="driver-images-${DRIVER_VERSION}-${KERNEL_VERSION}-${DIST}"
+            echo "Downloading $image in tests directory"
+            gh run download "${{ github.run_id }}" --name "$image" --dir ./tests/
             status=0
-            OPERATOR_OPTIONS="${GPU_OPERATOR_OPTIONS} --set driver.version=${DRIVER_VERSION}"
+            TEST_CASE_ARGS="${GPU_OPERATOR_OPTIONS} --set driver.version=${DRIVER_VERSION}"
             # add escape character for space
-            OPERATOR_OPTIONS=$(printf '%q ' "$OPERATOR_OPTIONS")
-            ./tests/ci-run-e2e.sh "${TEST_CASE}" "${OPERATOR_OPTIONS}" || status=$?
+            TEST_CASE_ARGS=$(printf '%q ' "$TEST_CASE_ARGS")
+            IMAGE_PATH="./tests/driver-images-${DRIVER_VERSION}-${KERNEL_VERSION}-${DIST}.tar"
+            ./tests/ci-run-e2e.sh "${TEST_CASE}" "${TEST_CASE_ARGS}" ${IMAGE_PATH} || status=$?
             if [ $status -eq 1 ]; then
               echo "e2e validation failed for driver version $DRIVER_VERSION with status $status"
               rc=$status
             fi
+            rm -f $IMAGE_PATH
           done
           ./tests/scripts/pull.sh /tmp/logs logs
           exit $rc
@@ -248,3 +288,41 @@ jobs:
           name: nvidiadriver-Precompiled-e2e-test-logs
           path: ./logs/
           retention-days: 15
+
+  publish-precompiled-image:
+    runs-on: ubuntu-latest
+    needs:
+      - set-driver-version-matrix
+      - determine-e2e-test-matrix
+      - e2e-tests-nvidiadriver
+    strategy:
+      matrix:
+        driver_branch: ${{ fromJson(needs.set-driver-version-matrix.outputs.driver_branch) }}
+        kernel_version: ${{ fromJson(needs.determine-e2e-test-matrix.outputs.matrix_values) }}
+    steps:
+      - name: Check out code
+        uses: actions/checkout@v4
+      - name: Login to GitHub Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Set image vars
+        run: |
+          echo "PRIVATE_REGISTRY=ghcr.io" >> $GITHUB_ENV
+          echo "DIST=ubuntu22.04" >> $GITHUB_ENV
+
+      - name: Download built image artifact
+        uses: actions/download-artifact@v4
+        with:
+          name: driver-images-${{ matrix.driver_branch }}-${{ matrix.kernel_version }}-${{ env.DIST }}
+          path: ./
+
+      - name: Publish image
+        run: |
+          image_path="./driver-images-${{ matrix.driver_branch }}-${{ matrix.kernel_version }}-${{ env.DIST }}.tar"
+          echo "uploading $image_path"
+          docker load -i $image_path
+          docker push ${PRIVATE_REGISTRY}/nvidia/driver:${{ matrix.driver_branch }}-${{ matrix.kernel_version }}-${{ env.DIST }}
diff --git a/tests/cases/nvidia-driver.sh b/tests/cases/nvidia-driver.sh
index dcd9b509..8ddafad0 100755
--- a/tests/cases/nvidia-driver.sh
+++ b/tests/cases/nvidia-driver.sh
@@ -8,6 +8,10 @@ fi
 
 # export gpu-operator options
 export TEST_CASE_ARGS="$1"
+if [[ $# -eq 2 ]]; then
+    export IMAGE_PATH="$2"
+    sudo ctr -n k8s.io images import "$IMAGE_PATH"
+fi
 
 SCRIPTS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../scripts && pwd )"
 source "${SCRIPTS_DIR}"/.definitions.sh
diff --git a/tests/ci-run-e2e.sh b/tests/ci-run-e2e.sh
index 9a3b328a..452bb369 100755
--- a/tests/ci-run-e2e.sh
+++ b/tests/ci-run-e2e.sh
@@ -2,7 +2,7 @@
 
 set -xe
 
-if [[ $# -ne 2 ]]; then
+if [[ $# -lt 2 ]]; then
     echo "TEST_CASE TEST_CASE_ARGS are required"
     exit 1
 fi
diff --git a/tests/scripts/ci-precompiled-helpers.sh b/tests/scripts/ci-precompiled-helpers.sh
new file mode 100644
index 00000000..2efa9a75
--- /dev/null
+++ b/tests/scripts/ci-precompiled-helpers.sh
@@ -0,0 +1,41 @@
+# Helper functions sourced by the precompiled-driver CI workflow.
+
+# get_kernel_versions_to_test BASE_TARGET KERNEL_FLAVORS[@] DRIVER_BRANCHES[@] DIST
+#
+# Prints on stdout, space separated, one kernel version per kernel flavor for
+# which findkernelversion.sh leaves should_continue=true for at least one
+# driver branch. The two array arguments are passed by name (NAME[@]) and read
+# through indirect expansion. Output of findkernelversion.sh is redirected to
+# stderr so that stdout carries only the versions.
+get_kernel_versions_to_test() {
+    if [[ "$#" -ne 4 ]]; then
+        echo " Error:${FUNCNAME[0]} must be called with BASE_TARGET KERNEL_FLAVORS DRIVER_BRANCHES DIST" >&2
+        return 1
+    fi
+
+    local BASE_TARGET="$1"
+    local -a KERNEL_FLAVORS=("${!2}")
+    local -a DRIVER_BRANCHES=("${!3}")
+    local DIST="$4"
+
+    local -a kernel_versions=()
+    for kernel_flavor in "${KERNEL_FLAVORS[@]}"; do
+        # FIXME -- remove if condition, once azure kernel upgrade starts working
+        if [[ "$kernel_flavor" == "azure" ]]; then
+            continue
+        fi
+        # start every flavor from a known state instead of a leftover value
+        should_continue=false
+        for DRIVER_BRANCH in "${DRIVER_BRANCHES[@]}"; do
+            source ./tests/scripts/findkernelversion.sh "$BASE_TARGET" "${kernel_flavor}" "$DRIVER_BRANCH" "$DIST" >&2
+            if [[ "$should_continue" == true ]]; then
+                break
+            fi
+        done
+        if [[ "$should_continue" == true ]]; then
+            KERNEL_VERSION=$(echo "$KERNEL_VERSION" | tr -d ' \n')
+            kernel_versions+=("$KERNEL_VERSION")
+        fi
+    done
+    echo "${kernel_versions[@]}"
+}
diff --git a/tests/scripts/findkernelversion.sh b/tests/scripts/findkernelversion.sh
index b0f12343..b66aca83 100755
--- a/tests/scripts/findkernelversion.sh
+++ b/tests/scripts/findkernelversion.sh
@@ -10,7 +10,7 @@ export KERNEL_FLAVOR="${2}"
 export DRIVER_BRANCH="${3}"
 export DIST="${4}"
 
-export REGCTL_VERSION=v0.4.7
+export REGCTL_VERSION=v0.7.1
 mkdir -p bin
 curl -sSLo bin/regctl https://github.com/regclient/regclient/releases/download/${REGCTL_VERSION}/regctl-linux-amd64
 chmod a+x bin/regctl
@@ -22,8 +22,6 @@ export $(grep -oP 'KERNEL_VERSION=[^ ]+' ./kernel_version.txt)
 
 # calculate driver tag
 status=0
-echo "regctl tag ls nvcr.io/nvidia/driver | grep "^${DRIVER_BRANCH}-${KERNEL_VERSION}-${DIST}$""
-
 regctl tag ls nvcr.io/nvidia/driver | grep "^${DRIVER_BRANCH}-${KERNEL_VERSION}-${DIST}$" || status=$?
 if [[ $status -eq 0 ]]; then
     export should_continue=false