From 4bb779166ad62765306f1391976daf1d1710ff7c Mon Sep 17 00:00:00 2001
From: Dmitry Rogozhkin
Date: Tue, 7 Jan 2025 20:45:10 +0000
Subject: [PATCH] ci: add test for Huggingface Accelerate

Signed-off-by: Dmitry Rogozhkin
---
 .github/workflows/_linux_accelerate.yml | 202 ++++++++++++++++++++++++
 1 file changed, 202 insertions(+)
 create mode 100644 .github/workflows/_linux_accelerate.yml

diff --git a/.github/workflows/_linux_accelerate.yml b/.github/workflows/_linux_accelerate.yml
new file mode 100644
index 000000000..f7164f45a
--- /dev/null
+++ b/.github/workflows/_linux_accelerate.yml
@@ -0,0 +1,202 @@
+# CI workflow: runs the Hugging Face Accelerate test suite against the nightly
+# XPU build of PyTorch and publishes results to the job summary and artifacts.
+name: Linux Accelerate Test
+
+on:
+  pull_request:
+    branches:
+      - main
+    paths:
+      - '.github/scripts/parse-junitxml.py'
+      - '.github/workflows/_linux_accelerate.yml'
+  workflow_dispatch:
+    inputs:
+      # NOTE(review): `pytorch` is not referenced by any step below; torch is
+      # always installed from the nightly XPU wheel index. Confirm intent.
+      pytorch:
+        required: false
+        type: string
+        default: 'nightly'
+        description: Pytorch branch/commit
+      python:
+        required: false
+        type: string
+        default: '3.10'
+        description: Python version
+      runner:
+        required: true
+        type: string
+        default: 'linux.idc.xpu'
+        description: Runner label
+      accelerate:
+        required: false
+        type: string
+        default: 'v1.2.1'
+        description: Accelerate version
+      transformers:
+        required: false
+        type: string
+        default: 'v4.47.1'
+        description: Transformers version
+
+permissions: read-all
+
+jobs:
+  Torch-XPU-Accelerate-Tests:
+    # `inputs` is empty on pull_request events, hence the explicit fallbacks.
+    runs-on: ${{ inputs.runner != '' && inputs.runner || 'linux.idc.xpu' }}
+    env:
+      CONDA_ENV_NAME: 'huggingface_accelerate_test'
+      WORK_DIR: 'accelerate'
+      NEOReadDebugKeys: 0
+      DisableScratchPages: 0
+      accelerate: ${{ inputs.accelerate != '' && inputs.accelerate || 'v1.2.1' }}
+      transformers: ${{ inputs.transformers != '' && inputs.transformers || 'v4.47.1' }}
+      python: ${{ inputs.python != '' && inputs.python || '3.10' }}
+      PYTORCH_DEBUG_XPU_FALLBACK: 1
+      ZE_AFFINITY_MASK: 0
+      PARSE_JUNIT: ${{ github.workspace }}/torch-xpu-ops/.github/scripts/parse-junitxml.py
+    steps:
+      - name: Checkout torch-xpu-ops
+        uses: actions/checkout@v4
+        with:
+          path: torch-xpu-ops
+      - name: Checkout Accelerate
+        uses: actions/checkout@v4
+        with:
+          repository: huggingface/accelerate
+          ref: ${{ env.accelerate }}
+          path: accelerate
+      - name: Prepare Conda ENV
+        run: |
+          which conda && conda clean -ay
+          conda remove --all -y -n $CONDA_ENV_NAME || rm -rf $(dirname ${CONDA_EXE})/../envs/$CONDA_ENV_NAME
+          conda create -y -n $CONDA_ENV_NAME python=${{ env.python }}
+          source activate $CONDA_ENV_NAME
+          pip install junitparser
+          pip install transformers==${{ env.transformers }}
+      - name: Prepare Stock XPU Pytorch
+        run: |
+          source activate $CONDA_ENV_NAME
+          pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu
+      - name: Prepare Accelerate
+        run: |
+          source activate $CONDA_ENV_NAME
+          cd $WORK_DIR
+          pip install -e .
+          pip install -e ".[testing]"
+          rm -rf tests_log && mkdir -p tests_log
+          rm -rf reports
+          cp ${{ github.workspace }}/torch-xpu-ops/.github/scripts/spec.py ./
+      - name: Report installed versions
+        run: |
+          source activate $CONDA_ENV_NAME
+          echo "pip installed packages:"
+          pip list | tee ${{ github.workspace }}/$WORK_DIR/tests_log/pip_list.txt
+          echo "lspci gpu devices:"
+          lspci -d ::0380 | tee ${{ github.workspace }}/$WORK_DIR/tests_log/lspci_0380.txt
+          echo "GPU render nodes:"
+          cat /sys/class/drm/render*/device/device | tee ${{ github.workspace }}/$WORK_DIR/tests_log/device_IDs.txt
+          echo "xpu-smi output:"
+          xpu-smi discovery -y --json --dump -1
+      - name: Sanity check installed packages
+        run: |
+          source activate $CONDA_ENV_NAME
+          # These checks are to exit earlier if for any reason torch
+          # packages were reinstalled back to CUDA versions (not expected).
+          pip show torch | grep Version | grep xpu
+          pip show torchaudio | grep Version | grep xpu
+          pip show torchvision | grep Version | grep xpu
+          python -c 'import torch; exit(not torch.xpu.is_available())'
+      - name: Run tests
+        run: |
+          source activate $CONDA_ENV_NAME
+          cd $WORK_DIR && rm -rf reports && mkdir -p reports
+          # Excluding tests due to:
+          # * tests/test_examples.py::FeatureExamplesTests::test_profiler fails on
+          #   Kineto profiler initialization for XPU device: PTI_ERROR_INTERNAL
+          # * tests/test_cli.py::ModelEstimatorTester::test_gated for failures due
+          #   to not root caused environment configuration issue
+          pattern="not test_profiler and not test_gated"
+          cmd=(python3 -m pytest -rsf --junitxml=reports/accelerate.xml -k "$pattern" tests/)
+          {
+            echo "### Running"
+            echo "\`\`\`"
+            echo "${cmd[@]@Q}"
+            echo "\`\`\`"
+          } >> $GITHUB_STEP_SUMMARY
+          "${cmd[@]}"
+      - name: Print result tables
+        if: ${{ ! cancelled() }}
+        run: |
+          source activate $CONDA_ENV_NAME
+          cd $WORK_DIR
+          {
+            echo "### Results"
+            python3 $PARSE_JUNIT reports/accelerate.xml --stats
+            echo "### Failed"
+            python3 $PARSE_JUNIT reports/accelerate.xml --errors --failed
+            echo "### Skipped"
+            python3 $PARSE_JUNIT reports/accelerate.xml --skipped
+          } >> $GITHUB_STEP_SUMMARY
+      - name: Print annotations
+        if: ${{ ! cancelled() }}
+        run: |
+          source activate $CONDA_ENV_NAME
+          {
+            echo "### Annotations"
+            echo "| | |"
+            echo "| --- | --- |"
+            echo "| jobs.$GITHUB_JOB.versions.os | $(source /etc/os-release && echo $VERSION_ID) |"
+            echo "| jobs.$GITHUB_JOB.versions.linux-kernel | $(uname -r) |"
+            echo "| jobs.$GITHUB_JOB.versions.python | $(python --version | cut -f2 -d' ') |"
+            # Versions of system GPU driver packages (first installed dpkg match).
+            packages=" \
+              level-zero \
+              libigc1 \
+              libigc2 \
+              libze1 \
+              libze-intel-gpu1 \
+              intel-i915-dkms \
+              intel-level-zero-gpu \
+              intel-opencl-icd"
+            for package in $packages; do
+              package_version=$(dpkg -l | grep $package | grep ii | head -1 | sed -E "s/ +/ /g" | cut -f3 -d" ")
+              echo "| jobs.$GITHUB_JOB.versions.$package | $package_version |"
+            done
+            # Versions of key Python packages (empty cell if the import fails).
+            packages="accelerate \
+              numpy \
+              torch \
+              torchaudio \
+              torchvision \
+              transformers"
+            for package in $packages; do
+              package_version=$(python -c "import $package; print($package.__version__)" || true)
+              echo "| jobs.$GITHUB_JOB.versions.$package | $package_version |"
+            done
+            # printing annotations for GPU cards
+            var="[$(cat /sys/class/drm/render*/device/vendor || true)]"
+            echo "| jobs.$GITHUB_JOB.drm.render_nodes_vendor_ids | $(echo $var | sed 's/ /,/g') |"
+            var="[$(cat /sys/class/drm/render*/device/device || true)]"
+            echo "| jobs.$GITHUB_JOB.drm.render_nodes_device_ids | $(echo $var | sed 's/ /,/g') |"
+            var=$(python -c "import torch; print(torch.version.xpu)" || true)
+            echo "| jobs.$GITHUB_JOB.torch.version.xpu | $var |"
+            var=$(python -c "import torch; print(torch.xpu.device_count())" || true)
+            echo "| jobs.$GITHUB_JOB.torch.xpu.device_count | $var |"
+            # printing annotations with key environment variables
+            # NOTE(review): PYTORCH_ENABLE_XPU_FALLBACK is not set in this job's env;
+            # its cell is empty unless the runner environment exports it.
+            echo "| jobs.$GITHUB_JOB.env.ZE_AFFINITY_MASK | $ZE_AFFINITY_MASK |"
+            echo "| jobs.$GITHUB_JOB.env.NEOReadDebugKeys | $NEOReadDebugKeys |"
+            echo "| jobs.$GITHUB_JOB.env.PYTORCH_ENABLE_XPU_FALLBACK | $PYTORCH_ENABLE_XPU_FALLBACK |"
+            echo "| jobs.$GITHUB_JOB.env.PYTORCH_DEBUG_XPU_FALLBACK | $PYTORCH_DEBUG_XPU_FALLBACK |"
+          } >> $GITHUB_STEP_SUMMARY
+      - name: Upload Test log
+        if: ${{ ! cancelled() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: Torch-XPU-Accelerate-Log-${{ github.event.pull_request.number || github.sha }}
+          path: |
+            ${{ github.workspace }}/accelerate/reports
+            ${{ github.workspace }}/accelerate/tests_log