Skip to content

Commit

Permalink
fold A100/H100 logic into helper
Browse files (browse the repository at this point in the history)
  • Loading branch information
olupton committed Oct 31, 2024
1 parent 93017fe commit 51164b5
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 99 deletions.
79 changes: 4 additions & 75 deletions .github/workflows/_sandbox.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,15 @@ jobs:
uses: ./.github/workflows/_test_slurm_pyxis.yaml
strategy:
matrix:
N_GPU: [8]
GPU: [A100, H100]
N_GPU: [2] #, 4, 8]
fail-fast: false
secrets:
SSH_PRIVATE_KEY: ${{ secrets.AWS_PRIVATE_KEY }}
SLURM_LOGIN_USER: ${{ secrets.AWS_LOGIN_USER }}
CONTAINER_REGISTRY_TOKEN: ${{ secrets.github_token }}
secrets: inherit
with:
NAME: ${{ inputs.ARTIFACT_PREFIX }}-${{ matrix.N_GPU }}GPU
SLURM_LOGIN_HOSTNAME: ${{ vars.HOSTNAME_AWS_SLURM }}
OUTPUT_BASEDIR: /home/ubuntu
OUTPUT_MOUNTPOINT: /output
NODES: 1
GPU_TYPE: ${{ matrix.GPU }}
GPUS_PER_NODE: ${{ matrix.N_GPU }}
NTASKS: 1
NTASKS_PER_NODE: 1
Expand All @@ -40,71 +37,3 @@ jobs:
test_single_gpu_encoder.py \
test_multigpu_encoder.py \
test_model_parallel_encoder.py
sitrep:
needs: te-multi-gpu
if: success() || failure()
runs-on: ubuntu-latest
env:
ARTIFACT_NAME_FULL: ${{ inputs.ARTIFACT_PREFIX }}-multigpu-test
BADGE_FILENAME_FULL: badge-${{ inputs.ARTIFACT_PREFIX }}-multigpu-test.json
steps:
- name: Check out the repository under ${GITHUB_WORKSPACE}
uses: actions/checkout@v4

- name: Download artifacts
uses: actions/download-artifact@v4
with:
pattern: |
${{ inputs.ARTIFACT_PREFIX }}-*
merge-multiple: true

- name: Generate sitrep
shell: bash -x -e {0}
run: |
# bring in utility functions
source .github/workflows/scripts/to_json.sh
test_outcome_files=$(find -name pytest-report.jsonl)
badge_label='TE Multi GPU tests'
passed_tests=$(cat ${test_outcome_files} | jq -r 'select(."$report_type" == "CollectReport" and .outcome == "passed") | .outcome' | wc -l)
failed_tests=$(cat ${test_outcome_files} | jq -r 'select(."$report_type" == "CollectReport" and .outcome == "failed") | .outcome' | wc -l)
total_tests=$((failed_tests + passed_tests))
if [[ ${total_tests} == 0 ]]; then
badge_message='error'
badge_color=red
summary='TE multi GPU tests did not complete due to errors.'
else
badge_message="${passed_tests}/${total_tests} passed"
if [[ ${failed_tests} == 0 ]]; then
badge_color=brightgreen
else
badge_color=yellow
fi
summary="TE multi GPU tests : $badge_message"
fi
run_id=${{ github.run_id }} \
to_json \
run_id \
summary \
total_tests passed_tests failed_tests \
badge_label badge_color badge_message \
> sitrep.json
schemaVersion=1 \
label="${badge_label}" \
message="${badge_message}" \
color="${badge_color}" \
to_json schemaVersion label message color \
> ${{ env.BADGE_FILENAME_FULL }}
- name: Upload training logs as artifacts
uses: actions/upload-artifact@v4
with:
name: ${{ env.ARTIFACT_NAME_FULL }}
path: |
sitrep.json
${{ env.BADGE_FILENAME_FULL }}
43 changes: 20 additions & 23 deletions .github/workflows/_test_slurm_pyxis.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,27 +2,14 @@ name: ~test multi-node jobs via SLURM+Pyxis

on:
workflow_call:
secrets:
SSH_PRIVATE_KEY:
required: true
description: SSH private key for accessing the SLURM login node
SLURM_LOGIN_USER:
required: true
description: Username for the SLURM login node
CONTAINER_REGISTRY_TOKEN:
required: true
description: Token for accessing the container registry
inputs:
NAME:
type: string
description: Name of the test case and output artifact tarball
required: true
SLURM_LOGIN_HOSTNAME:
type: string
description: Hostname of the SLURM login node
required: true
OUTPUT_BASEDIR:
type: string
default: /nfs/cluster
description: Base directory for the SLURM scratch space
required: true
OUTPUT_MOUNTPOINT:
Expand All @@ -33,6 +20,10 @@ on:
type: number
description: Number of nodes to request
required: true
GPU_TYPE:
type: string
description: Type of GPU to request; A100 and H100 are supported.
required: true
GPUS_PER_NODE:
type: number
description: Number of GPUs per node to request
Expand Down Expand Up @@ -78,15 +69,21 @@ on:
value: ${{ jobs.run-test.outputs.SLURM_EXITCODE }}

jobs:

run-test:
name: ${{ inputs.NAME }}
runs-on: jumpbox
env:
SLURM_LOGIN_USER: ${{ inputs.GPU_TYPE == 'A100' && secrets.SLURM_LOGIN_USER || secrets.AWS_LOGIN_USER }}
SLURM_LOGIN_HOSTNAME: ${{ inputs.GPU_TYPE == 'A100' && vars.HOSTNAME_SLURM_LOGIN || vars.HOSTNAME_AWS_SLURM }}
outputs:
SLURM_JOB_ID: ${{ steps.submit.outputs.SLURM_JOB_ID }}
SLURM_STATE: ${{ steps.exit-info.outputs.SLURM_STATE }}
SLURM_EXITCODE: ${{ steps.exit-info.outputs.SLURM_EXITCODE }}
steps:
- name: Validate inputs
run: |
[[ "${{ inputs.GPU_TYPE }}" == "A100" || "${{ inputs.GPU_TYPE }}" == "H100" ]]
- name: Print environment variables
run: env

Expand All @@ -96,7 +93,7 @@ jobs:
- name: Setup SSH agent
uses: webfactory/ssh-agent@v0.9.0
with:
ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }}
ssh-private-key: ${{ inputs.GPU_TYPE == 'A100' && secrets.SSH_PRIVATE_KEY || secrets.AWS_PRIVATE_KEY }}

- name: Setup SSH known hosts
id: ssh-known-hosts
Expand All @@ -123,7 +120,7 @@ jobs:
id: submit
shell: bash -O expand_aliases -x -e {0}
run: |
alias SSH='ssh ${{ secrets.SLURM_LOGIN_USER }}@${{ inputs.SLURM_LOGIN_HOSTNAME }}'
alias SSH='ssh ${SLURM_LOGIN_USER}@${SLURM_LOGIN_HOSTNAME}'
SSH mkdir -p ${{ steps.meta.outputs.OUTPUT_PATH }}
SLURM_JOB_ID=$(SSH sbatch --parsable <<"EOF"
#!/bin/bash
Expand All @@ -133,7 +130,7 @@ jobs:
#SBATCH --gpus-per-node=${{ inputs.GPUS_PER_NODE }}
#SBATCH --time=${{ inputs.TIME_LIMIT }}
#SBATCH --output=${{ steps.meta.outputs.LOG_FILE }}
#SBATCH --export="${{ inputs.EXTRA_EXPORTS }},ENROOT_PASSWORD=${{ secrets.CONTAINER_REGISTRY_TOKEN }}"
#SBATCH --export="${{ inputs.EXTRA_EXPORTS }},ENROOT_PASSWORD=${{ secrets.github_token }}"
# preload enroot container using one task per node
time srun \
Expand Down Expand Up @@ -172,14 +169,14 @@ jobs:
run: |
. .github/workflows/scripts/wait_for_slurm_job.sh
wait_for_slurm_job ${{ secrets.SLURM_LOGIN_USER }}@${{ inputs.SLURM_LOGIN_HOSTNAME }} ${{ steps.submit.outputs.SLURM_JOB_ID }}
wait_for_slurm_job ${SLURM_LOGIN_USER}@${SLURM_LOGIN_HOSTNAME} ${{ steps.submit.outputs.SLURM_JOB_ID }}
- name: Query for job exit info
id: exit-info
shell: bash -exu -o pipefail {0}
run: |
JOB_INFO=$(
ssh ${{ secrets.SLURM_LOGIN_USER }}@${{ inputs.SLURM_LOGIN_HOSTNAME }} \
ssh ${SLURM_LOGIN_USER}@${SLURM_LOGIN_HOSTNAME} \
sacct -j ${{ steps.submit.outputs.SLURM_JOB_ID }} --format=JobID,JobName,State,Exitcode --parsable2 --noheader |\
grep -E '^[0-9]+\|'
)
Expand All @@ -196,7 +193,7 @@ jobs:
echo "******************** TAIL OF SLURM LOG BEG ********************"
echo "***************************************************************"
echo "***************************************************************"
ssh ${{ secrets.SLURM_LOGIN_USER }}@${{ inputs.SLURM_LOGIN_HOSTNAME }} tail -n 200 ${{ steps.meta.outputs.LOG_FILE }}
ssh ${SLURM_LOGIN_USER}@${SLURM_LOGIN_HOSTNAME} tail -n 200 ${{ steps.meta.outputs.LOG_FILE }}
echo "***************************************************************"
echo "***************************************************************"
echo "******************** TAIL OF SLURM LOG END ********************"
Expand All @@ -212,7 +209,7 @@ jobs:
shell: bash -x -e {0}
run: |
function rsync-down() {
rsync -rtz --progress ${{ secrets.SLURM_LOGIN_USER }}@${{ inputs.SLURM_LOGIN_HOSTNAME }}:$1 $2
rsync -rtz --progress ${SLURM_LOGIN_USER}@${SLURM_LOGIN_HOSTNAME}:$1 $2
}
mkdir -p artifacts/
rsync-down ${{ steps.meta.outputs.LOG_FILE }} artifacts/
Expand Down Expand Up @@ -243,5 +240,5 @@ jobs:
if: always() && steps.exit-info.outputs.SLURM_EXITCODE != 0
shell: bash -x -e {0}
run: |
ssh ${{ secrets.SLURM_LOGIN_USER }}@${{ inputs.SLURM_LOGIN_HOSTNAME }} \
ssh ${SLURM_LOGIN_USER}@${SLURM_LOGIN_HOSTNAME} \
scancel ${{ steps.submit.outputs.SLURM_JOB_ID }}
2 changes: 1 addition & 1 deletion .github/workflows/_test_te.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ jobs:
matrix:
N_GPU: [2, 4, 8]
fail-fast: false
secrets:
secrets: inherit
SSH_PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }}
SLURM_LOGIN_USER: ${{ secrets.CLUSTER_LOGIN_USER }}
CONTAINER_REGISTRY_TOKEN: ${{ secrets.github_token }}
Expand Down

0 comments on commit 51164b5

Please sign in to comment.