fold A100/H100 logic into helper #1
name: ~test TransformerEngine

on:
  workflow_call:
    inputs:
      TE_IMAGE:
        type: string
        description: 'JAX+TE+PAXML image'
        required: true
        default: 'ghcr.io/nvidia/upstream-pax:latest'
      ARTIFACT_PREFIX:
        type: string
        description: 'Name of the artifact zip file'
        required: false
        default: 'te'
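
# te-multi-gpu dispatches the TransformerEngine JAX encoder examples to a Slurm cluster
# through the reusable _test_slurm_pyxis.yaml workflow, once per GPU count in the matrix.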
jobs:
  te-multi-gpu:
    uses: ./.github/workflows/_test_slurm_pyxis.yaml
    strategy:
      matrix:
        N_GPU: [2, 4, 8]
      fail-fast: false
    secrets:
      SSH_PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }}
      SLURM_LOGIN_USER: ${{ secrets.CLUSTER_LOGIN_USER }}
      CONTAINER_REGISTRY_TOKEN: ${{ secrets.github_token }}
    with:
      NAME: ${{ inputs.ARTIFACT_PREFIX }}-${{ matrix.N_GPU }}GPU
      SLURM_LOGIN_HOSTNAME: ${{ vars.HOSTNAME_SLURM_LOGIN }}
      OUTPUT_BASEDIR: /nfs/cluster
      OUTPUT_MOUNTPOINT: /output
      NODES: 1
      GPUS_PER_NODE: ${{ matrix.N_GPU }}
      NTASKS: 1
      NTASKS_PER_NODE: 1
      TIME_LIMIT: '00:10:00'
      EXTRA_EXPORTS: 'VOCAB_PATH=gs://t5-data/vocabs/cc_all.32000.100extra/sentencepiece.model'
      IMAGE: ${{ inputs.TE_IMAGE }}
      SRUN_PREAMBLE: |
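        # Log the visible GPUs, then install pytest tooling and the encoder example
        # dependencies inside the container before the tests run.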
        nvidia-smi
        pip install \
          pytest \
          pytest-reportlog \
          cuda-python \
          -r ${SRC_PATH_TRANSFORMER_ENGINE}/examples/jax/encoder/requirements.txt
      SRUN_SCRIPT: |
        set -ex
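        # Run the single-GPU, multi-GPU, and model-parallel encoder examples, writing a
        # pytest-reportlog JSONL file to the mounted output directory for the sitrep job.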
        cd ${SRC_PATH_TRANSFORMER_ENGINE}/examples/jax/encoder
        pytest --report-log=/output/pytest-report.jsonl \
          test_single_gpu_encoder.py \
          test_multigpu_encoder.py \
          test_model_parallel_encoder.py
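
  # sitrep runs even when the test job fails: it collects the report logs from every GPU
  # count, tallies pass/fail totals, and publishes a status badge plus a sitrep.json artifact.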
  sitrep:
    needs: te-multi-gpu
    if: success() || failure()
    runs-on: ubuntu-latest
    env:
      ARTIFACT_NAME_FULL: ${{ inputs.ARTIFACT_PREFIX }}-multigpu-test
      BADGE_FILENAME_FULL: badge-${{ inputs.ARTIFACT_PREFIX }}-multigpu-test.json
    steps:
      - name: Check out the repository under ${GITHUB_WORKSPACE}
        uses: actions/checkout@v4
      - name: Download artifacts
        uses: actions/download-artifact@v4
        with:
          pattern: ${{ inputs.ARTIFACT_PREFIX }}-*
          merge-multiple: true
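      # The downloaded artifacts bring the pytest-report.jsonl logs from every
      # te-multi-gpu variant into the workspace for the summary below.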
      - name: Generate sitrep
        shell: bash -x -e {0}
        run: |
          # bring in utility functions
          source .github/workflows/scripts/to_json.sh

          test_outcome_files=$(find . -name pytest-report.jsonl)
          badge_label='TE Multi GPU tests'
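          # Tally passed and failed entries across all report logs to build the badge text.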
          passed_tests=$(cat ${test_outcome_files} | jq -r 'select(."$report_type" == "CollectReport" and .outcome == "passed") | .outcome' | wc -l)
          failed_tests=$(cat ${test_outcome_files} | jq -r 'select(."$report_type" == "CollectReport" and .outcome == "failed") | .outcome' | wc -l)
          total_tests=$((failed_tests + passed_tests))

          if [[ ${total_tests} == 0 ]]; then
            badge_message='error'
            badge_color=red
            summary='TE multi GPU tests did not complete due to errors.'
          else
            badge_message="${passed_tests}/${total_tests} passed"
            if [[ ${failed_tests} == 0 ]]; then
              badge_color=brightgreen
            else
              badge_color=yellow
            fi
            summary="TE multi GPU tests : $badge_message"
          fi
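          # to_json (from scripts/to_json.sh) serializes the named shell variables into a
          # JSON object on stdout; write the full sitrep and a badge file in the
          # shields.io endpoint format (schemaVersion/label/message/color).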
          run_id=${{ github.run_id }} \
          to_json \
            run_id \
            summary \
            total_tests passed_tests failed_tests \
            badge_label badge_color badge_message \
          > sitrep.json

          schemaVersion=1 \
          label="${badge_label}" \
          message="${badge_message}" \
          color="${badge_color}" \
          to_json schemaVersion label message color \
          > ${{ env.BADGE_FILENAME_FULL }}
      - name: Upload sitrep and badge as artifacts
        uses: actions/upload-artifact@v4
        with:
          name: ${{ env.ARTIFACT_NAME_FULL }}
          path: |
            sitrep.json
            ${{ env.BADGE_FILENAME_FULL }}