From 1a1b2940e610e9704c5c88f98af86cf94b2d0a5e Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Mon, 16 Oct 2023 16:24:30 +0200 Subject: [PATCH] Better way to run AMD CI with different flavors (#26634) * Enable testing against mi250 * Change BERT to trigger tests * Revert BERT's change * AMD CI * AMD CI --------- Co-authored-by: Morgan Funtowicz Co-authored-by: ydshieh --- .../workflows/self-push-amd-mi210-caller.yml | 25 +++++++++++++++ .../workflows/self-push-amd-mi250-caller.yml | 25 +++++++++++++++ .github/workflows/self-push-amd.yml | 31 ++++++------------- utils/notification_service.py | 7 +++-- 4 files changed, 64 insertions(+), 24 deletions(-) create mode 100644 .github/workflows/self-push-amd-mi210-caller.yml create mode 100644 .github/workflows/self-push-amd-mi250-caller.yml diff --git a/.github/workflows/self-push-amd-mi210-caller.yml b/.github/workflows/self-push-amd-mi210-caller.yml new file mode 100644 index 00000000000000..5dd010ef66d8fb --- /dev/null +++ b/.github/workflows/self-push-amd-mi210-caller.yml @@ -0,0 +1,25 @@ +name: Self-hosted runner (AMD mi210 CI caller) + +on: + workflow_run: + workflows: ["Self-hosted runner (push-caller)"] + branches: ["main"] + types: [completed] + push: + branches: + - run_amd_push_ci_caller* + paths: + - "src/**" + - "tests/**" + - ".github/**" + - "templates/**" + - "utils/**" + +jobs: + run_amd_ci: + name: AMD mi210 + if: (cancelled() != true) && ((github.event_name != 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller'))) + uses: ./.github/workflows/self-push-amd.yml + with: + gpu_flavor: mi210 + secrets: inherit diff --git a/.github/workflows/self-push-amd-mi250-caller.yml b/.github/workflows/self-push-amd-mi250-caller.yml new file mode 100644 index 00000000000000..a55378c4caa54b --- /dev/null +++ b/.github/workflows/self-push-amd-mi250-caller.yml @@ -0,0 +1,25 @@ +name: Self-hosted runner (AMD mi250 CI caller) + +on: + workflow_run: + workflows: ["Self-hosted runner (push-caller)"] + branches: ["main"] + types: [completed] + push: + branches: + - run_amd_push_ci_caller* + paths: + - "src/**" + - "tests/**" + - ".github/**" + - "templates/**" + - "utils/**" + +jobs: + run_amd_ci: + name: AMD mi250 + if: (cancelled() != true) && ((github.event_name != 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller'))) + uses: ./.github/workflows/self-push-amd.yml + with: + gpu_flavor: mi250 + secrets: inherit diff --git a/.github/workflows/self-push-amd.yml b/.github/workflows/self-push-amd.yml index 0dfbbca7ba1232..8751698277685d 100644 --- a/.github/workflows/self-push-amd.yml +++ b/.github/workflows/self-push-amd.yml @@ -1,21 +1,11 @@ name: Self-hosted runner AMD GPU (push) on: - workflow_run: - workflows: ["Self-hosted runner (push-caller)"] - branches: ["main"] - types: [completed] - push: - branches: - - ci_* - - ci-* - paths: - - "src/**" - - "tests/**" - - ".github/**" - - "templates/**" - - "utils/**" - repository_dispatch: + workflow_call: + inputs: + gpu_flavor: + required: true + type: string env: HF_HOME: /mnt/cache @@ -45,8 +35,7 @@ jobs: strategy: matrix: machine_type: [single-gpu, multi-gpu] - gpu_flavor: [mi210] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}'] + runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] container: image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -65,8 +54,7 @@ jobs: strategy: matrix: machine_type: [single-gpu, multi-gpu] - gpu_flavor: [mi210] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}'] + runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] container: image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -164,8 +152,7 @@ jobs: matrix: folders: ${{ fromJson(needs.setup_gpu.outputs.matrix) }} machine_type: [single-gpu, multi-gpu] - gpu_flavor: [mi210] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}'] + runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] container: image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -321,7 +308,7 @@ jobs: CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_AMD }} ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} - CI_EVENT: push + CI_EVENT: Push CI (AMD) - ${{ inputs.gpu_flavor }} CI_TITLE_PUSH: ${{ github.event.head_commit.message }} CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }} CI_SHA: ${{ env.CI_SHA }} diff --git a/utils/notification_service.py b/utils/notification_service.py index 1d10fa5d821f0b..650d76b28c375c 100644 --- a/utils/notification_service.py +++ b/utils/notification_service.py @@ -897,6 +897,9 @@ def prepare_reports(title, header, reports, to_truncate=True): job_name_prefix = f"{framework} {version}" elif ci_event.startswith("Nightly CI"): job_name_prefix = "Nightly CI" + elif ci_event.startswith("Push CI (AMD) - "): + flavor = ci_event.replace("Push CI (AMD) - ", "") + job_name_prefix = f"AMD {flavor}" for model in model_results.keys(): for artifact_path in available_artifacts[f"run_all_tests_gpu_{model}_test_reports"].paths: @@ -962,7 +965,7 @@ def prepare_reports(title, header, reports, to_truncate=True): "Torch CUDA extension tests": "run_tests_torch_cuda_extensions_gpu_test_reports", } - if ci_event in ["push", "Nightly CI"] or ci_event.startswith("Past CI"): + if ci_event in ["push", "Nightly CI"] or ci_event.startswith("Past CI") or ci_event.startswith("Push CI (AMD)"): del additional_files["Examples directory"] del additional_files["PyTorch pipelines"] del additional_files["TensorFlow pipelines"] @@ -1027,6 +1030,6 @@ def prepare_reports(title, header, reports, to_truncate=True): message = Message(title, ci_title, model_results, additional_results, selected_warnings=selected_warnings) # send report only if there is any failure (for push CI) - if message.n_failures or ci_event != "push": + if message.n_failures or (ci_event != "push" and not ci_event.startswith("Push CI (AMD)")): message.post() message.post_reply()