Skip to content

Commit

Permalink
Pre-compiled end-to-end gpu driver validation
Browse files Browse the repository at this point in the history
1. Pre-compiled end-to-end GPU driver validation

Signed-off-by: shiva kumar <[email protected]>
  • Loading branch information
shivakunv committed Aug 24, 2024
1 parent 25e821d commit b3958ad
Show file tree
Hide file tree
Showing 8 changed files with 149 additions and 65 deletions.
53 changes: 10 additions & 43 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ on:
types:
- completed
branches:
- e2etestdriver
- e2etestdriver_no

pull_request:
types:
Expand All @@ -29,12 +29,12 @@ on:
branches:
# - main
# - release-*
- e2etestdriver
- e2etestdriver_no
push:
branches:
# - main
# - release-*
- e2etestdriver
- e2etestdriver_no

jobs:
e2e-tests-nvidiadriver:
Expand All @@ -55,12 +55,12 @@ jobs:
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws_ssh_key: ${{ secrets.AWS_SSH_KEY }}
holodeck_config: "tests/holodeck.yaml"

- name: Get public dns name
id: get_public_dns_name
uses: mikefarah/yq@master
with:
cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml
cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml

- name: Set and Calculate test vars
run: |
Expand All @@ -70,17 +70,19 @@ jobs:
echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
DRIVER_VERSIONS=$(grep '^DRIVER_VERSIONS ?=' versions.mk | awk -F' ?= ' '{print $2}')
echo "DRIVER_VERSIONS=$DRIVER_VERSIONS" >> $GITHUB_ENV
echo "PRIVATE_REGISTRY=ghcr.io" >> $GITHUB_ENV
- name: Validate gpu driver
env:
TEST_CASE: "./tests/cases/nvidia-driver.sh"
USE_PRECOMPILED: "0"
OPERATOR_OPTIONS: "--set driver.repository=${{ env.PRIVATE_REGISTRY }}/nvidia"
run: |
rc=0
for driver_version in ${DRIVER_VERSIONS}; do
echo "Running e2e for DRIVER_VERSION=$driver_version"
status=0
./tests/ci-run-e2e.sh ${TEST_CASE} ${COMMIT_SHORT_SHA}-${driver_version} ${USE_PRECOMPILED} || status=$?
echo "SHIVA==== ${OPERATOR_OPTIONS}"
./tests/ci-run-e2e.sh "${TEST_CASE}" "${COMMIT_SHORT_SHA}-${driver_version}" "${OPERATOR_OPTIONS}" || status=$?
if [ $status -ne 0 ]; then
echo "e2e validation failed for driver version $driver_version with status $status"
rc=$status
Expand All @@ -96,38 +98,3 @@ jobs:
name: nvidiadriver-e2e-test-logs
path: ./logs/
retention-days: 15

- name: Precompiled e2e test- upgrade kernel and Validate gpu driver
env:
TEST_CASE_KERNEL_UPGRADE: "./tests/cases/nvidia-kernel-upgrade.sh"
TEST_CASE: "./tests/cases/nvidia-driver.sh"
USE_PRECOMPILED: "1"
run: |
rc=0
for driver_version in ${DRIVER_VERSIONS}; do
echo "Running e2e for DRIVER_VERSION=$driver_version"
status=0
./tests/ci-run-e2e.sh ${TEST_CASE_KERNEL_UPGRADE} ${driver_version} ${USE_PRECOMPILED} || status=$?
if [ $status -ne 0 ]; then
echo "Kernel upgrade failed"
rc=$status
else
DRIVER_BRANCH=$(echo "${driver_version}" | cut -d '.' -f 1)
DRIVER_VERSION="${DRIVER_BRANCH}"
./tests/ci-run-e2e.sh ${TEST_CASE} ${DRIVER_VERSION} ${USE_PRECOMPILED} || status=$?
if [ $status -ne 0 ]; then
echo "e2e validation failed for driver version $DRIVER_VERSION with status $status"
rc=$status
fi
fi
done
./tests/scripts/pull.sh /tmp/logs logs
exit $rc
- name: Archive test logs
if: ${{ failure() }}
uses: actions/upload-artifact@v4
with:
name: nvidiadriver-Precompiled-e2e-test-logs
path: ./logs/
retention-days: 15
120 changes: 120 additions & 0 deletions .github/workflows/ci_precompiled.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# Copyright 2024 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name: Pre-Compiled End-to-end tests

# Triggers for the pre-compiled e2e workflow.
# NOTE(review): the workflow_run filter uses branch `e2etestdriver_no` while
# pull_request and push use `e2etestdriver` — confirm the mismatch is intended.
on:
# Fires when a run of the workflow named `image` completes.
workflow_run:
workflows: [image]
types:
- completed
branches:
- e2etestdriver_no

# Fires when a pull request against a listed branch is opened or updated.
pull_request:
types:
- opened
- synchronize
branches:
# - main
# - release-*
- e2etestdriver
# Fires on pushes to a listed branch (main / release-* are commented out).
push:
branches:
# - main
# - release-*
- e2etestdriver

jobs:
# Single e2e job; the per-flavor matrix below is commented out, so the job
# runs once on the ubuntu-latest runner.
e2e-tests-nvidiadriver:
# strategy:
# matrix:
# flavor:
# - aws
# - azure
# - generic
# - nvidia
# - oracle
runs-on: ubuntu-latest

steps:
# Fetch the repository so later steps can read versions.mk and the tests/ tree.
- name: Check out code
uses: actions/checkout@v4

# Runs the Holodeck action with the configuration in tests/holodeck.yaml.
# The AWS credentials are supplied twice: as environment variables and as
# action inputs.
# NOTE(review): the `uses:` reference below looks mangled (e-mail obfuscation
# residue in place of the action name/version) — confirm against the repository.
- name: Set up Holodeck
uses: NVIDIA/[email protected]
env:
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SSH_KEY: ${{ secrets.AWS_SSH_KEY }}
with:
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws_ssh_key: ${{ secrets.AWS_SSH_KEY }}
holodeck_config: "tests/holodeck.yaml"

# Selects the value of the "public-dns-name" property from the cached
# holodeck.yaml; a later step reads it as
# steps.get_public_dns_name.outputs.result.
# NOTE(review): `@master` is a moving ref — consider pinning to a release tag.
- name: Get public dns name
id: get_public_dns_name
uses: mikefarah/yq@master
with:
cmd: yq '.status.properties[] | select(.name == "public-dns-name") | .value' /github/workspace/.cache/holodeck.yaml

# Exports the variables later steps rely on by appending to $GITHUB_ENV:
#   instance_hostname - "ubuntu@" plus the DNS name from get_public_dns_name
#   private_key       - path of key.pem inside the workspace
#   COMMIT_SHORT_SHA  - first 8 characters of GITHUB_SHA
#   DRIVER_VERSIONS   - value of the `DRIVER_VERSIONS ?=` line in versions.mk
#   PRIVATE_REGISTRY  - fixed to ghcr.io
# It also writes the AWS SSH key secret to key.pem and restricts it to mode 400.
- name: Set and Calculate test vars
run: |
echo "instance_hostname=ubuntu@${{ steps.get_public_dns_name.outputs.result }}" >> $GITHUB_ENV
echo "private_key=${{ github.workspace }}/key.pem" >> $GITHUB_ENV
echo "${{ secrets.AWS_SSH_KEY }}" > ${{ github.workspace }}/key.pem && chmod 400 ${{ github.workspace }}/key.pem
echo "COMMIT_SHORT_SHA=${GITHUB_SHA:0:8}" >> $GITHUB_ENV
DRIVER_VERSIONS=$(grep '^DRIVER_VERSIONS ?=' versions.mk | awk -F' ?= ' '{print $2}')
echo "DRIVER_VERSIONS=$DRIVER_VERSIONS" >> $GITHUB_ENV
echo "PRIVATE_REGISTRY=ghcr.io" >> $GITHUB_ENV
# Precompiled-driver e2e: for every version in DRIVER_VERSIONS, run the
# kernel-upgrade case first and, only if it succeeds, validate the driver
# branch (major version) with driver.usePrecompiled=true.
# The step exits with the status of the last failing test case (0 if none).
- name: Precompiled e2e test- upgrade kernel and Validate gpu driver
  env:
    TEST_CASE_KERNEL_UPGRADE: "./tests/cases/nvidia-kernel-upgrade.sh"
    TEST_CASE: "./tests/cases/nvidia-driver.sh"
    OPERATOR_OPTIONS: "--set driver.repository=${{ env.PRIVATE_REGISTRY }}/nvidia --set driver.usePrecompiled=true"
  run: |
    rc=0
    for driver_version in ${DRIVER_VERSIONS}; do
      echo "Running e2e for DRIVER_VERSION=$driver_version"
      status=0
      # ci-run-e2e.sh takes exactly three arguments; each one is quoted on its
      # own so OPERATOR_OPTIONS (which contains spaces) stays a single argument.
      ./tests/ci-run-e2e.sh "${TEST_CASE_KERNEL_UPGRADE}" "${driver_version}" "${OPERATOR_OPTIONS}" || status=$?
      if [ $status -ne 0 ]; then
        echo "Kernel upgrade failed"
        rc=$status
      else
        ./tests/scripts/remote_retry.sh
        # Precompiled drivers are addressed by branch: keep only the part of
        # the version before the first dot.
        DRIVER_BRANCH=$(echo "${driver_version}" | cut -d '.' -f 1)
        DRIVER_VERSION="${DRIVER_BRANCH}"
        ./tests/ci-run-e2e.sh "${TEST_CASE}" "${DRIVER_VERSION}" "${OPERATOR_OPTIONS}" || status=$?
        if [ $status -ne 0 ]; then
          echo "e2e validation failed for driver version $DRIVER_VERSION with status $status"
          rc=$status
        fi
      fi
    done
    # Collect logs before propagating the aggregated result.
    ./tests/scripts/pull.sh /tmp/logs logs
    exit $rc
# Uploads ./logs/ as the artifact "nvidiadriver-Precompiled-e2e-test-logs",
# kept for 15 days. The `if: failure()` condition limits this to runs where an
# earlier step failed.
- name: Archive test logs
if: ${{ failure() }}
uses: actions/upload-artifact@v4
with:
name: nvidiadriver-Precompiled-e2e-test-logs
path: ./logs/
retention-days: 15
4 changes: 2 additions & 2 deletions .github/workflows/image.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@ on:
branches:
# - main
# - release-*
- e2etestdriver
- e2etestdriver_no
push:
branches:
# - main
# - release-*
- e2etestdriver
- e2etestdriver_no

jobs:
image:
Expand Down
10 changes: 6 additions & 4 deletions tests/ci-run-e2e.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@
set -xe

if [[ $# -ne 3 ]]; then
echo "TEST_CASE TARGET_DRIVER_VERSION USE_PRECOMPILED are required"
echo "TEST_CASE TARGET_DRIVER_VERSION OPERATOR_OPTIONS are required"
exit 1
fi

export TEST_CASE=${1}
export TARGET_DRIVER_VERSION=${2}
export USE_PRECOMPILED=${3}
export TEST_CASE="${1}"
export TARGET_DRIVER_VERSION="${2}"
export OPERATOR_OPTIONS="${3}"

echo "SHIVA===== ${OPERATOR_OPTIONS}"

TEST_DIR="$(pwd)/tests"

Expand Down
7 changes: 2 additions & 5 deletions tests/local.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,6 @@ SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/scripts && pwd )"
source ${SCRIPT_DIR}/.definitions.sh
source ${SCRIPT_DIR}/.local.sh

if [ "${USE_PRECOMPILED}" == "1" ]; then
remote_retry
fi

# Sync the project folder to the remote
${SCRIPT_DIR}/push.sh

Expand All @@ -25,8 +21,9 @@ remote SKIP_PREREQUISITES="${SKIP_PREREQUISITES}" ./tests/scripts/prerequisites.
# We trigger the specified test case on the remote instance.
# Note: We need to ensure that the required environment variables
# are forwarded to the remote shell.
echo "SHIVA====== ${OPERATOR_OPTIONS}"
remote \
PROJECT="${PROJECT}" \
TARGET_DRIVER_VERSION="${TARGET_DRIVER_VERSION}" \
USE_PRECOMPILED="${USE_PRECOMPILED}" \
OPERATOR_OPTIONS=\"${OPERATOR_OPTIONS}\" \
${TEST_CASE}
4 changes: 1 addition & 3 deletions tests/scripts/.definitions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@ CASES_DIR="$( cd "${TEST_DIR}/cases" && pwd )"

: ${TEST_NAMESPACE:="test-operator"}

: ${PRIVATE_REGISTRY:="ghcr.io"}

: ${HELM_NVIDIA_REPO:="https://helm.ngc.nvidia.com/nvidia"}

: ${TARGET_DRIVER_VERSION:="550.90.07"}
Expand All @@ -25,7 +23,7 @@ CASES_DIR="$( cd "${TEST_DIR}/cases" && pwd )"

: ${LOG_DIR:="/tmp/logs"}

: ${USE_PRECOMPILED:="0"}
: ${OPERATOR_OPTIONS:="--set driver.repository=ghcr.io/nvidia"}
: ${SYSTEM_ONLINE_CHECK_TIMEOUT:="900"}

: ${BASE_TARGET:="jammy"}
2 changes: 2 additions & 0 deletions tests/scripts/.local.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
#!/usr/bin/env bash

# Run a command on the remote instance through ${SCRIPT_DIR}/remote.sh,
# prefixed with a `cd` into ${PROJECT}.
#
# Arguments: the command (and its arguments) to execute remotely.
# Returns:   the exit status of remote.sh. Nothing may run after that call,
#            otherwise the function would report the later command's status
#            and hide remote failures.
# Output:    only what remote.sh prints, so callers can safely capture stdout.
function remote() {
    # NOTE(review): "$@" sits outside the quoted prefix, so only the first
    # argument is joined to the `cd` string — kept as-is; confirm against
    # remote.sh before changing the quoting.
    ${SCRIPT_DIR}/remote.sh "cd ${PROJECT} && "$@""
}

function remote_retry() {
Expand Down
14 changes: 6 additions & 8 deletions tests/scripts/install-operator.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,19 @@ echo "Current kernel version: $CURRENT_KERNEL"
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source ${SCRIPT_DIR}/.definitions.sh

if [ "${USE_PRECOMPILED}" == "1" ]; then
OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set driver.usePrecompiled=true"
fi

OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set driver.repository=${PRIVATE_REGISTRY}/nvidia --set driver.version=${TARGET_DRIVER_VERSION}"
echo "SHIVA=== ${OPERATOR_OPTIONS}"
OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set driver.version=${TARGET_DRIVER_VERSION}"

# add helm driver repo
helm repo add nvidia ${HELM_NVIDIA_REPO} && helm repo update

# Create the test namespace
kubectl create namespace "${TEST_NAMESPACE}"

echo "SHIVA==1 ${OPERATOR_OPTIONS}"
# Run the helm install command
echo "OPERATOR_OPTIONS: $OPERATOR_OPTIONS"
${HELM} install gpu-operator nvidia/gpu-operator \
eval ${HELM} install gpu-operator nvidia/gpu-operator \
-n "${TEST_NAMESPACE}" \
${OPERATOR_OPTIONS} \
"${OPERATOR_OPTIONS}" \
--wait
echo "SHIVA==2 ${OPERATOR_OPTIONS}"

0 comments on commit b3958ad

Please sign in to comment.