fold A100/H100 logic into helper

NVIDIA · Oct 31, 2024 · 51164b5 · 51164b5
1 parent 93017fe
commit 51164b5
Show file tree

Hide file tree

Showing 3 changed files with 25 additions and 99 deletions.
diff --git a/.github/workflows/_sandbox.yaml b/.github/workflows/_sandbox.yaml
@@ -8,18 +8,15 @@ jobs:
     uses: ./.github/workflows/_test_slurm_pyxis.yaml
     strategy:
       matrix:
-        N_GPU: [8]
+        GPU: [A100, H100]
+        N_GPU: [2] #, 4, 8]
       fail-fast: false
-    secrets:
-      SSH_PRIVATE_KEY: ${{ secrets.AWS_PRIVATE_KEY }}
-      SLURM_LOGIN_USER: ${{ secrets.AWS_LOGIN_USER }}
-      CONTAINER_REGISTRY_TOKEN: ${{ secrets.github_token }}
+    secrets: inherit
     with:
       NAME: ${{ inputs.ARTIFACT_PREFIX }}-${{ matrix.N_GPU }}GPU
-      SLURM_LOGIN_HOSTNAME: ${{ vars.HOSTNAME_AWS_SLURM }}
-      OUTPUT_BASEDIR: /home/ubuntu
       OUTPUT_MOUNTPOINT: /output
       NODES: 1
+      GPU_TYPE: ${{ matrix.GPU }}
       GPUS_PER_NODE: ${{ matrix.N_GPU }}
       NTASKS: 1
       NTASKS_PER_NODE: 1
@@ -40,71 +37,3 @@ jobs:
           test_single_gpu_encoder.py \
           test_multigpu_encoder.py \
           test_model_parallel_encoder.py
-
-  sitrep:
-    needs: te-multi-gpu
-    if: success() || failure()
-    runs-on: ubuntu-latest
-    env:
-      ARTIFACT_NAME_FULL: ${{ inputs.ARTIFACT_PREFIX }}-multigpu-test
-      BADGE_FILENAME_FULL: badge-${{ inputs.ARTIFACT_PREFIX }}-multigpu-test.json
-    steps:
-      - name: Check out the repository under ${GITHUB_WORKSPACE}
-        uses: actions/checkout@v4
-
-      - name: Download artifacts
-        uses: actions/download-artifact@v4
-        with:
-          pattern: |
-            ${{ inputs.ARTIFACT_PREFIX }}-*
-          merge-multiple: true
-
-      - name: Generate sitrep
-        shell: bash -x -e {0}
-        run: |
-          # bring in utility functions
-          source .github/workflows/scripts/to_json.sh
-
-          test_outcome_files=$(find -name pytest-report.jsonl)
-
-          badge_label='TE Multi GPU tests'
-          passed_tests=$(cat ${test_outcome_files} | jq -r 'select(."$report_type" == "CollectReport" and .outcome == "passed") | .outcome' | wc -l)
-          failed_tests=$(cat ${test_outcome_files} | jq -r 'select(."$report_type" == "CollectReport" and .outcome == "failed") | .outcome' | wc -l)
-          total_tests=$((failed_tests + passed_tests))
-          
-          if [[ ${total_tests} == 0 ]]; then
-            badge_message='error'
-            badge_color=red
-            summary='TE multi GPU tests did not complete due to errors.'
-          else
-            badge_message="${passed_tests}/${total_tests} passed"
-            if [[ ${failed_tests} == 0 ]]; then
-              badge_color=brightgreen
-            else
-              badge_color=yellow
-            fi
-            summary="TE multi GPU tests : $badge_message"
-          fi
-
-          run_id=${{ github.run_id }} \
-          to_json \
-            run_id \
-            summary \
-            total_tests passed_tests failed_tests \
-            badge_label badge_color badge_message \
-          > sitrep.json
-
-          schemaVersion=1 \
-          label="${badge_label}" \
-          message="${badge_message}" \
-          color="${badge_color}" \
-          to_json schemaVersion label message color \
-          > ${{ env.BADGE_FILENAME_FULL }}
-
-      - name: Upload training logs as artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          name: ${{ env.ARTIFACT_NAME_FULL }}
-          path: |
-            sitrep.json
-            ${{ env.BADGE_FILENAME_FULL }}
diff --git a/.github/workflows/_test_slurm_pyxis.yaml b/.github/workflows/_test_slurm_pyxis.yaml
@@ -2,27 +2,14 @@ name: ~test multi-node jobs via SLURM+Pyxis
 
 on:
   workflow_call:
-    secrets:
-      SSH_PRIVATE_KEY:
-        required: true
-        description: SSH private key for accessing the SLURM login node
-      SLURM_LOGIN_USER:
-        required: true
-        description: Username for the SLURM login node
-      CONTAINER_REGISTRY_TOKEN:
-        required: true
-        description: Token for accessing the container registry
     inputs:
       NAME:
         type: string
         description: Name of the test case and output artifact tarball
         required: true
-      SLURM_LOGIN_HOSTNAME:
-        type: string
-        description: Hostname of the SLURM login node
-        required: true
       OUTPUT_BASEDIR:
         type: string
+        default: /nfs/cluster
         description: Base directory for the SLURM scratch space
         required: true
       OUTPUT_MOUNTPOINT:
@@ -33,6 +20,10 @@ on:
         type: number
         description: Number of nodes to request
         required: true
+      GPU_TYPE:
+        type: string
+        description: Type of GPU to request; A100 and H100 are supported.
+        required: true
       GPUS_PER_NODE:
         type: number
         description: Number of GPUs per node to request
@@ -78,15 +69,21 @@ on:
         value: ${{ jobs.run-test.outputs.SLURM_EXITCODE }}
 
 jobs:
-
   run-test:
     name: ${{ inputs.NAME }}
     runs-on: jumpbox
+    env:
+      SLURM_LOGIN_USER: ${{ inputs.GPU_TYPE == 'A100' && secrets.SLURM_LOGIN_USER || secrets.AWS_LOGIN_USER }}
+      SLURM_LOGIN_HOSTNAME: ${{ inputs.GPU_TYPE == 'A100' && vars.HOSTNAME_SLURM_LOGIN || vars.HOSTNAME_AWS_SLURM }}
     outputs:
       SLURM_JOB_ID: ${{ steps.submit.outputs.SLURM_JOB_ID }}
       SLURM_STATE: ${{ steps.exit-info.outputs.SLURM_STATE }}
       SLURM_EXITCODE: ${{ steps.exit-info.outputs.SLURM_EXITCODE }}
     steps:
+      - name: Validate inputs
+        run: |
+          [[ "${{ inputs.GPU_TYPE }}" == "A100" || "${{ inputs.GPU_TYPE }}" == "H100" ]]
+
       - name: Print environment variables
         run: env
 
@@ -96,7 +93,7 @@ jobs:
       - name: Setup SSH agent
         uses: webfactory/ssh-agent@v0.9.0
         with:
-          ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
+          ssh-private-key: ${{ inputs.GPU_TYPE == 'A100' && secrets.SSH_PRIVATE_KEY || secrets.AWS_PRIVATE_KEY }}
 
       - name: Setup SSH known hosts
         id: ssh-known-hosts
@@ -123,7 +120,7 @@ jobs:
         id: submit
         shell: bash -O expand_aliases -x -e {0}
         run: |
-          alias SSH='ssh ${{ secrets.SLURM_LOGIN_USER }}@${{ inputs.SLURM_LOGIN_HOSTNAME }}'
+          alias SSH='ssh ${SLURM_LOGIN_USER}@${SLURM_LOGIN_HOSTNAME}'
           SSH mkdir -p ${{ steps.meta.outputs.OUTPUT_PATH }}
           SLURM_JOB_ID=$(SSH sbatch --parsable <<"EOF"
           #!/bin/bash
@@ -133,7 +130,7 @@ jobs:
           #SBATCH --gpus-per-node=${{ inputs.GPUS_PER_NODE }}
           #SBATCH --time=${{ inputs.TIME_LIMIT }}
           #SBATCH --output=${{ steps.meta.outputs.LOG_FILE }}
-          #SBATCH --export="${{ inputs.EXTRA_EXPORTS }},ENROOT_PASSWORD=${{ secrets.CONTAINER_REGISTRY_TOKEN }}"
+          #SBATCH --export="${{ inputs.EXTRA_EXPORTS }},ENROOT_PASSWORD=${{ secrets.github_token }}"
 
           # preload enroot container using one task per node
           time srun \
@@ -172,14 +169,14 @@ jobs:
         run: |
           . .github/workflows/scripts/wait_for_slurm_job.sh
 
-          wait_for_slurm_job ${{ secrets.SLURM_LOGIN_USER }}@${{ inputs.SLURM_LOGIN_HOSTNAME }} ${{ steps.submit.outputs.SLURM_JOB_ID }}
+          wait_for_slurm_job ${SLURM_LOGIN_USER}@${SLURM_LOGIN_HOSTNAME} ${{ steps.submit.outputs.SLURM_JOB_ID }}
 
       - name: Query for job exit info
         id: exit-info
         shell: bash -exu -o pipefail {0}
         run: |
           JOB_INFO=$(
-            ssh ${{ secrets.SLURM_LOGIN_USER }}@${{ inputs.SLURM_LOGIN_HOSTNAME }} \
+            ssh ${SLURM_LOGIN_USER}@${SLURM_LOGIN_HOSTNAME} \
             sacct -j ${{ steps.submit.outputs.SLURM_JOB_ID }} --format=JobID,JobName,State,Exitcode --parsable2 --noheader |\
             grep -E '^[0-9]+\|'
           )
@@ -196,7 +193,7 @@ jobs:
           echo "******************** TAIL OF SLURM LOG BEG ********************"
           echo "***************************************************************"
           echo "***************************************************************"
-          ssh ${{ secrets.SLURM_LOGIN_USER }}@${{ inputs.SLURM_LOGIN_HOSTNAME }} tail -n 200 ${{ steps.meta.outputs.LOG_FILE }}
+          ssh ${SLURM_LOGIN_USER}@${SLURM_LOGIN_HOSTNAME} tail -n 200 ${{ steps.meta.outputs.LOG_FILE }}
           echo "***************************************************************"
           echo "***************************************************************"
           echo "******************** TAIL OF SLURM LOG END ********************"
@@ -212,7 +209,7 @@ jobs:
         shell: bash -x -e {0}
         run: |
           function rsync-down() {
-            rsync -rtz --progress ${{ secrets.SLURM_LOGIN_USER }}@${{ inputs.SLURM_LOGIN_HOSTNAME }}:$1 $2
+            rsync -rtz --progress ${SLURM_LOGIN_USER}@${SLURM_LOGIN_HOSTNAME}:$1 $2
           }
           mkdir -p artifacts/
           rsync-down ${{ steps.meta.outputs.LOG_FILE }} artifacts/
@@ -243,5 +240,5 @@ jobs:
         if: always() && steps.exit-info.outputs.SLURM_EXITCODE != 0
         shell: bash -x -e {0}
         run: |
-          ssh ${{ secrets.SLURM_LOGIN_USER }}@${{ inputs.SLURM_LOGIN_HOSTNAME }} \
+          ssh ${SLURM_LOGIN_USER}@${SLURM_LOGIN_HOSTNAME} \
             scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
diff --git a/.github/workflows/_test_te.yaml b/.github/workflows/_test_te.yaml
@@ -21,7 +21,7 @@ jobs:
       matrix:
         N_GPU: [2, 4, 8]
       fail-fast: false
-    secrets:
+    secrets: inherit
       SSH_PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }}
       SLURM_LOGIN_USER: ${{ secrets.CLUSTER_LOGIN_USER }}
       CONTAINER_REGISTRY_TOKEN: ${{ secrets.github_token }}