Skip to content

Commit

Permalink
Pre-compiled end-to-end gpu driver validation
Browse files Browse the repository at this point in the history
1. Pre-compiled end-to-end gpu driver validation

Signed-off-by: shiva kumar <[email protected]>
  • Loading branch information
shivakunv committed Sep 10, 2024
1 parent 7990081 commit bd7372b
Show file tree
Hide file tree
Showing 15 changed files with 74 additions and 58 deletions.
14 changes: 8 additions & 6 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ on:
types:
- completed
branches:
- main
# SHIVA
- e2etestdriver-no

jobs:
e2e-tests-nvidiadriver:
Expand Down Expand Up @@ -61,15 +62,16 @@ jobs:
- name: Validate gpu driver
env:
TEST_CASE: "./tests/cases/nvidia-driver.sh"
OPERATOR_OPTIONS: "--set driver.repository=${{ env.PRIVATE_REGISTRY }}/nvidia"
GPU_OPERATOR_OPTIONS: "--set driver.repository=${{ env.PRIVATE_REGISTRY }}/nvidia"
run: |
rc=0
for driver_version in ${DRIVER_VERSIONS}; do
echo "Running e2e for DRIVER_VERSION=$driver_version"
for DRIVER_VERSION in ${DRIVER_VERSIONS}; do
echo "Running e2e for DRIVER_VERSION=$DRIVER_VERSION"
status=0
./tests/ci-run-e2e.sh "${TEST_CASE}" "${COMMIT_SHORT_SHA}-${driver_version}" "${OPERATOR_OPTIONS}" || status=$?
TEST_CASE_ARGS="${GPU_OPERATOR_OPTIONS} --set driver.version=${COMMIT_SHORT_SHA}-${DRIVER_VERSION}"
./tests/ci-run-e2e.sh "${TEST_CASE}" "${TEST_CASE_ARGS}" || status=$?
if [ $status -ne 0 ]; then
echo "e2e validation failed for driver version $driver_version with status $status"
echo "e2e validation failed for driver version $DRIVER_VERSION with status $status"
rc=$status
fi
done
Expand Down
20 changes: 11 additions & 9 deletions .github/workflows/image.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,19 @@
name: image

on:
pull_request:
types:
- opened
- synchronize
branches:
- main
- release-*
# SHIVA
# pull_request:
# types:
# - opened
# - synchronize
# branches:
# - main
# - release-*
push:
branches:
- main
- release-*
# - main
# - release-*
- e2etestdriver-no

jobs:
image:
Expand Down
35 changes: 27 additions & 8 deletions .github/workflows/precompiled.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,18 @@
name: Precompiled images

on:
schedule:
- cron: '00 09 * * *' # scheduled job
# SHIVA
# schedule:
# - cron: '00 09 * * *' # scheduled job
pull_request:
types:
- opened
- synchronize
branches:
- e2etestdriver
push:
branches:
- e2etestdriver

jobs:
set-driver-version-matrix:
Expand All @@ -42,6 +52,8 @@ jobs:
echo "kernel_flavors=$kernel_flavors_json" >> $GITHUB_OUTPUT
precompiled-image:
# SHIVA
if: false
needs: set-driver-version-matrix
runs-on: ubuntu-latest
strategy:
Expand Down Expand Up @@ -101,7 +113,8 @@ jobs:
determine-e2e-test-matrix:
runs-on: ubuntu-latest
needs:
- precompiled-image
# SHIVA
# - precompiled-image
- set-driver-version-matrix
outputs:
matrix_values_not_empty: ${{ steps.set_kernel_version.outputs.matrix_values_not_empty }}
Expand Down Expand Up @@ -201,7 +214,8 @@ jobs:
UPGRADE_KERNEL_SCRIPT: "./tests/scripts/upgrade-kernel.sh"
run: |
status=0
./tests/ci-remote-exec.sh "${UPGRADE_KERNEL_SCRIPT}" "${KERNEL_VERSION}" || status=$?
# SHIVA
# ./tests/ci-remote-exec.sh "${UPGRADE_KERNEL_SCRIPT}" \"${KERNEL_VERSION}\" || status=$?
# On the target system, all scripts/test-case exit with code 1 for error handling.
# However, since reboot-related disconnections break the SSH connection
# and can cause the entire job to exit, we should ignore all errors except
Expand All @@ -211,7 +225,8 @@ jobs:
echo "Kernel version $KERNEL_VERSION upgrade failed"
exit 1
fi
./tests/scripts/remote_retry.sh || status=$?
# SHIVA
# ./tests/scripts/remote_retry.sh || status=$?
if [ $status -ne 0 ]; then
echo "Failed to connect to remote instance"
exit $status
Expand All @@ -220,23 +235,27 @@ jobs:
- name: Precompiled e2e test gpu driver validation
env:
TEST_CASE: "./tests/cases/nvidia-driver.sh"
OPERATOR_OPTIONS: "--set driver.repository=${{ env.PRIVATE_REGISTRY }}/nvidia --set driver.usePrecompiled=true"
GPU_OPERATOR_OPTIONS: "--set driver.repository=${{ env.PRIVATE_REGISTRY }}/nvidia --set driver.usePrecompiled=true"
run: |
rc=0
# for precompiled driver we are setting driver branch as driver version
driver_versions_json='${{ needs.set-driver-version-matrix.outputs.driver_branch }}'
driver_versions=$(echo "$driver_versions_json" | jq -r '.[]')
for DRIVER_VERSION in $driver_versions; do
echo "Running e2e for DRIVER_VERSION=$driver_version"
echo "Running e2e for DRIVER_VERSION=$DRIVER_VERSION"
status=0
./tests/ci-run-e2e.sh "${TEST_CASE}" "${DRIVER_VERSION}" "${OPERATOR_OPTIONS}" || status=$?
TEST_CASE_ARGS="${GPU_OPERATOR_OPTIONS} --set driver.version=${DRIVER_VERSION}"
# shell-quote TEST_CASE_ARGS with printf %q so that its embedded spaces are escaped
TEST_CASE_ARGS=$(printf '%q ' "$TEST_CASE_ARGS")
./tests/ci-run-e2e.sh "${TEST_CASE}" "${TEST_CASE_ARGS}" || status=$?
if [ $status -eq 1 ]; then
echo "e2e validation failed for driver version $DRIVER_VERSION with status $status"
rc=$status
fi
done
./tests/scripts/pull.sh /tmp/logs logs
exit $rc
- name: Archive test logs
if: ${{ failure() }}
uses: actions/upload-artifact@v4
Expand Down
8 changes: 8 additions & 0 deletions tests/cases/nvidia-driver.sh
Original file line number Diff line number Diff line change
@@ -1,8 +1,16 @@
#! /bin/bash
# Test case: runs the end-to-end NVIDIA driver cycle with caller-supplied
# gpu-operator options.
#
# Usage: nvidia-driver.sh OPTIONS...
#   OPTIONS  one or more words of gpu-operator options; accepted either as a
#            single quoted string or as separate arguments.
# Exits with status 1 when called without any options.

if [[ $# -lt 1 ]]; then
    echo "Error: $0 must be called with driver options"
    exit 1
fi

SCRIPTS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../scripts && pwd )"
source "${SCRIPTS_DIR}"/.definitions.sh

# Export the gpu-operator options for the scripts started below. "$*" joins
# every argument into one string, so the options survive whether the caller's
# shell kept them as one word or split them on spaces; taking only "$1" would
# drop everything after the first word in the split case.
export TEST_CASE_ARGS="$*"

# Run an end-to-end test cycle
"${SCRIPTS_DIR}"/end-to-end-nvidia-driver.sh
11 changes: 3 additions & 8 deletions tests/ci-remote-exec.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,11 @@

set -xe

if [[ $# -gt 2 ]]; then
echo "Error: ci-remote-exec.sh must be called with 1(REMOTE_EXEC) or 2(REMOTE_EXEC, REMOTE_EXEC_ARGS) argument only."
if [[ $# -lt 1 ]]; then
echo "Error:$0 must be called with 1(REMOTE_EXEC) or more than 1 args (REMOTE_EXEC, ARGS1 ARGS2 etc)"
exit 1
fi

export REMOTE_EXEC="${1}"
# To pass more environment variables, use space-separated REMOTE_EXEC_ARGS
# e.g. REMOTE_EXEC_ARGS="env1=arg1 env2=arg2" etc
export REMOTE_EXEC_ARGS="${2}"

TEST_DIR="$(pwd)/tests"

${TEST_DIR}/remote-exec-local.sh
${TEST_DIR}/remote-exec-local.sh "$@"
10 changes: 3 additions & 7 deletions tests/ci-run-e2e.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,11 @@

set -xe

if [[ $# -ne 3 ]]; then
echo "TEST_CASE TARGET_DRIVER_VERSION TEST_CASE_ARGS are required"
if [[ $# -ne 2 ]]; then
echo "TEST_CASE TEST_CASE_ARGS are required"
exit 1
fi

export TEST_CASE="${1}"
export TARGET_DRIVER_VERSION="${2}"
export TEST_CASE_ARGS="${3}"

TEST_DIR="$(pwd)/tests"

${TEST_DIR}/local.sh
${TEST_DIR}/local.sh "$@"
4 changes: 1 addition & 3 deletions tests/local.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,4 @@ remote SKIP_PREREQUISITES="${SKIP_PREREQUISITES}" ./tests/scripts/prerequisites.
# are forwarded to the remote shell.
remote \
PROJECT="${PROJECT}" \
TARGET_DRIVER_VERSION="${TARGET_DRIVER_VERSION}" \
TEST_CASE_ARGS=\"${TEST_CASE_ARGS}\" \
${TEST_CASE}
"$@"
8 changes: 1 addition & 7 deletions tests/remote-exec-local.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,7 @@ source ${SCRIPT_DIR}/.local.sh
# Sync the project folder to the remote
${SCRIPT_DIR}/push.sh

# We trigger the installation of prerequisites on the remote instance
remote SKIP_PREREQUISITES="${SKIP_PREREQUISITES}" ./tests/scripts/prerequisites.sh

# We trigger the specified script on the remote instance.
# Note: We need to ensure that the required environment variables
# are forwarded to the remote shell.
remote \
PROJECT="${PROJECT}" \
REMOTE_EXEC_ARGS=\"${REMOTE_EXEC_ARGS}\" \
${REMOTE_EXEC}
"$@"
4 changes: 0 additions & 4 deletions tests/scripts/.definitions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,11 @@ CASES_DIR="$( cd "${TEST_DIR}/cases" && pwd )"

: ${HELM_NVIDIA_REPO:="https://helm.ngc.nvidia.com/nvidia"}

: ${TARGET_DRIVER_VERSION:="550.90.07"}

: ${DAEMON_POD_STATUS_TIME_OUT:="15m"}
: ${POD_STATUS_TIME_OUT:="2m"}

: ${LOG_DIR:="/tmp/logs"}

: ${TEST_CASE_ARGS:="--set driver.repository=ghcr.io/nvidia"}
: ${REMOTE_EXEC_ARGS:=""}
: ${SYSTEM_ONLINE_CHECK_TIMEOUT:="900"}

: ${BASE_TARGET:="jammy"}
2 changes: 1 addition & 1 deletion tests/scripts/.local.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/env bash

function remote() {
${SCRIPT_DIR}/remote.sh "cd ${PROJECT} && "$@""
${SCRIPT_DIR}/remote.sh "cd ${PROJECT} && $@"
}

function remote_retry() {
Expand Down
2 changes: 2 additions & 0 deletions tests/scripts/findkernelversion.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,5 @@ if [[ $status -eq 0 ]]; then
else
export should_continue=true
fi
# SHIVA
export should_continue=true
2 changes: 1 addition & 1 deletion tests/scripts/install-operator.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ echo "Current kernel version: $CURRENT_KERNEL"
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source ${SCRIPT_DIR}/.definitions.sh

OPERATOR_OPTIONS="${TEST_CASE_ARGS} --set driver.version=${TARGET_DRIVER_VERSION}"
OPERATOR_OPTIONS="${TEST_CASE_ARGS}"

# add helm driver repo
helm repo add nvidia ${HELM_NVIDIA_REPO} && helm repo update
Expand Down
2 changes: 0 additions & 2 deletions tests/scripts/kernel-upgrade-helper.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@ fi
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source "${SCRIPT_DIR}"/.definitions.sh

export KERNEL_VERSION="${REMOTE_EXEC_ARGS}"

echo "Checking current kernel version..."
CURRENT_KERNEL=$(uname -r)
echo "Current kernel version: $CURRENT_KERNEL"
Expand Down
2 changes: 1 addition & 1 deletion tests/scripts/remote.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@ source ${SCRIPT_DIR}/.definitions.sh
source ${SCRIPT_DIR}/.local.sh

# keep alive 60sec and timeout after 30 tries
ssh -o ServerAliveInterval=60 -o ServerAliveCountMax=30 -i ${private_key} ${instance_hostname} "${@}"
ssh -o ServerAliveInterval=60 -o ServerAliveCountMax=30 -i ${private_key} ${instance_hostname} "$@"
8 changes: 7 additions & 1 deletion tests/scripts/upgrade-kernel.sh
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
#! /bin/bash
# This script runs the kernel upgrade helper with the kernel version given as its only argument.

if [[ $# -ne 1 ]]; then
echo "Error: $0 must be called with kernel_version"
exit 1
fi

SCRIPTS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../scripts && pwd )"
source "${SCRIPTS_DIR}"/.definitions.sh

# Run an end-to-end test cycle
# Export the kernel version and run the kernel upgrade helper
export KERNEL_VERSION="$1"
"${SCRIPTS_DIR}"/kernel-upgrade-helper.sh

0 comments on commit bd7372b

Please sign in to comment.