Skip to content

Commit

Permalink
Pre-compiled end-to-end gpu driver validation
Browse files Browse the repository at this point in the history
Signed-off-by: shiva kumar <[email protected]>
  • Loading branch information
shivakunv committed Sep 12, 2024
1 parent af8df3c commit d3253d2
Show file tree
Hide file tree
Showing 4 changed files with 14 additions and 47 deletions.
52 changes: 7 additions & 45 deletions .github/workflows/precompiled.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ jobs:
echo "SHIVA############# ${PRIVATE_REGISTRY}/nvidia/driver:${{ matrix.driver-branch }}-${KERNEL_VERSION}-${DIST}"
docker images "${PRIVATE_REGISTRY}/nvidia/driver:${{ matrix.driver-branch }}-${KERNEL_VERSION}-${DIST}"
docker save "${PRIVATE_REGISTRY}/nvidia/driver:${{ matrix.driver-branch }}-${KERNEL_VERSION}-${DIST}" \
| gzip > ./driver-images-${{ matrix.driver-branch }}-${KERNEL_VERSION}-${DIST}.tar.gz
-o ./driver-images-${{ matrix.driver-branch }}-${KERNEL_VERSION}-${DIST}.tar
# set env for artifacts upload
echo "KERNEL_VERSION=$KERNEL_VERSION" >> $GITHUB_ENV
Expand All @@ -132,7 +132,7 @@ jobs:
uses: actions/upload-artifact@v4
with:
name: driver-images-${{ matrix.driver-branch }}-${{ env.KERNEL_VERSION }}-${{ env.DIST }}
path: ./driver-images-${{ matrix.driver-branch }}-${{ env.KERNEL_VERSION }}-${{ env.DIST }}.tar.gz
path: ./driver-images-${{ matrix.driver-branch }}-${{ env.KERNEL_VERSION }}-${{ env.DIST }}.tar
retention-days: 1

determine-e2e-test-matrix:
Expand Down Expand Up @@ -256,6 +256,8 @@ jobs:
echo "Downloading $image in tests directory"
gh run download --name $image --dir ./tests/
done
# SHIVA
ls ./tests/*
- name: Upgrade the kernel for Precompiled e2e test
env:
Expand Down Expand Up @@ -291,7 +293,7 @@ jobs:
TEST_CASE_ARGS="${GPU_OPERATOR_OPTIONS} --set driver.version=${DRIVER_VERSION}"
# add escape character for space
TEST_CASE_ARGS=$(printf '%q ' "$TEST_CASE_ARGS")
IMAGE_PATH="./tests/driver-images-${DRIVER_VERSION}-${KERNEL_VERSION}-${DIST}.tar.gz"
IMAGE_PATH="./tests/driver-images-${DRIVER_VERSION}-${KERNEL_VERSION}-${DIST}.tar"
./tests/ci-run-e2e.sh "${TEST_CASE}" "${TEST_CASE_ARGS}" ${IMAGE_PATH} || status=$?
if [ $status -eq 1 ]; then
echo "e2e validation failed for driver version $DRIVER_VERSION with status $status"
Expand All @@ -305,9 +307,9 @@ jobs:
run: |
ls ./tests/*
for DRIVER_BRANCH in $driver_branch; do
image_path="./tests/driver-images-${DRIVER_BRANCH}-${KERNEL_VERSION}-${DIST}.tar.gz"
image_path="./tests/driver-images-${DRIVER_BRANCH}-${KERNEL_VERSION}-${DIST}.tar"
echo "uploading $image_path"
docker load -i "$image"
docker load -i $image_path
echo "docker tag driver:${DRIVER_BRANCH}-${KERNEL_VERSION}-${DIST} ${PRIVATE_REGISTRY}/nvidia/driver:${DRIVER_BRANCH}-${KERNEL_VERSION}-${DIST}"
docker tag driver:${DRIVER_BRANCH}-${KERNEL_VERSION}-${DIST} \
${PRIVATE_REGISTRY}/nvidia/driver:${DRIVER_BRANCH}-${KERNEL_VERSION}-${DIST}
Expand All @@ -322,43 +324,3 @@ jobs:
name: nvidiadriver-Precompiled-e2e-test-logs
path: ./logs/
retention-days: 15

# SHIVA
# This should be inside e2e, as downloading twice is not needed; also check the feasibility of uploading images directly from artifacts to ghcr.io
precompiled-push-image:
runs-on: ubuntu-latest
needs:
- set-driver-version-matrix
- determine-e2e-test-matrix
strategy:
matrix:
kernel-version: ${{ fromJson(needs.determine-e2e-test-matrix.outputs.matrix_values) }}
driver-branch: ${{ fromJson(needs.set-driver-version-matrix.outputs.driver_branch) }}
steps:
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Set and Calculate test vars
run: |
echo "DIST=ubuntu22.04" >> $GITHUB_ENV
echo "PRIVATE_REGISTRY=ghcr.io" >> $GITHUB_ENV
# - name: Download Docker image artifact
# uses: actions/download-artifact@v4
# with:
# name: driver-images-${{ matrix.driver-branch}}-${{ matrix.kernel-version }}-${{ env.DIST }}
# path: ./

- name: Push built image
run: |
ls ./*
# docker load -i ${{ matrix.driver-branch }}-${{ matrix.kernel-version }}-${{ env.DIST }}.tar.gz
# docker tag driver:${{ matrix.driver-branch }}-${{ matrix.kernel-version }}-${{ env.DIST }} \
# ${PRIVATE_REGISTRY}/nvidia/driver:${{ matrix.driver-branch }}-${{ matrix.kernel-version }}-${{ env.DIST }}
# docker push ${PRIVATE_REGISTRY}/nvidia/driver:${{ matrix.driver-branch }}-${{ matrix.kernel-version }}-${{ env.DIST }}
- name: Remove built image tar
run: rm -f driver-images-${{ matrix.driver-branch }}-${{ matrix.kernel-version }}-${{ env.DIST }}.tar.gz
1 change: 1 addition & 0 deletions tests/cases/nvidia-driver.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ fi
export TEST_CASE_ARGS="$1"
if [[ $# -eq 2 ]]; then
export IMAGE_PATH="$2"
docker load -i "$IMAGE_PATH"
fi

SCRIPTS_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )"/../scripts && pwd )"
Expand Down
4 changes: 2 additions & 2 deletions tests/scripts/install-operator.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ kubectl create namespace "${TEST_NAMESPACE}"

# Run the helm install command
echo "OPERATOR_OPTIONS: ${OPERATOR_OPTIONS}"
eval ${HELM} install gpu-operator \
-n "${TEST_NAMESPACE}" ${IMAGE_PATH} \
eval ${HELM} install gpu-operator nvidia/gpu-operator \
-n "${TEST_NAMESPACE}" \
"${OPERATOR_OPTIONS}" \
--wait
4 changes: 4 additions & 0 deletions tests/scripts/verify-operator.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,8 @@ if [ $exit_status -ne 0 ]; then
exit 1
else
echo "All gpu-operator pods are ready."
# SHIVA
curl -o ${SCRIPT_DIR}/must-gather.sh "https://raw.githubusercontent.com/NVIDIA/gpu-operator/main/hack/must-gather.sh"
chmod +x ${SCRIPT_DIR}/must-gather.sh
ARTIFACT_DIR="${LOG_DIR}" ${SCRIPT_DIR}/must-gather.sh
fi

0 comments on commit d3253d2

Please sign in to comment.