From 1a1b2940e610e9704c5c88f98af86cf94b2d0a5e Mon Sep 17 00:00:00 2001
From: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
Date: Mon, 16 Oct 2023 16:24:30 +0200
Subject: [PATCH] Better way to run AMD CI with different flavors (#26634)

* Enable testing against mi250

* Change BERT to trigger tests

* Revert BERT's change

* AMD CI

* AMD CI

---------

Co-authored-by: Morgan Funtowicz <funtowiczmo@gmail.com>
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
---
 .../workflows/self-push-amd-mi210-caller.yml  | 25 +++++++++++++++
 .../workflows/self-push-amd-mi250-caller.yml  | 25 +++++++++++++++
 .github/workflows/self-push-amd.yml           | 31 ++++++-------------
 utils/notification_service.py                 |  7 +++--
 4 files changed, 64 insertions(+), 24 deletions(-)
 create mode 100644 .github/workflows/self-push-amd-mi210-caller.yml
 create mode 100644 .github/workflows/self-push-amd-mi250-caller.yml

diff --git a/.github/workflows/self-push-amd-mi210-caller.yml b/.github/workflows/self-push-amd-mi210-caller.yml
new file mode 100644
index 00000000000000..5dd010ef66d8fb
--- /dev/null
+++ b/.github/workflows/self-push-amd-mi210-caller.yml
@@ -0,0 +1,25 @@
+name: Self-hosted runner (AMD mi210 CI caller)
+
+on:
+  workflow_run:
+    workflows: ["Self-hosted runner (push-caller)"]
+    branches: ["main"]
+    types: [completed]
+  push:
+    branches:
+      - run_amd_push_ci_caller*
+    paths:
+      - "src/**"
+      - "tests/**"
+      - ".github/**"
+      - "templates/**"
+      - "utils/**"
+
+jobs:
+  run_amd_ci:
+    name: AMD mi210
+    if: (cancelled() != true) && ((github.event_name != 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller')))
+    uses: ./.github/workflows/self-push-amd.yml
+    with:
+      gpu_flavor: mi210
+    secrets: inherit
diff --git a/.github/workflows/self-push-amd-mi250-caller.yml b/.github/workflows/self-push-amd-mi250-caller.yml
new file mode 100644
index 00000000000000..a55378c4caa54b
--- /dev/null
+++ b/.github/workflows/self-push-amd-mi250-caller.yml
@@ -0,0 +1,25 @@
+name: Self-hosted runner (AMD mi250 CI caller)
+
+on:
+  workflow_run:
+    workflows: ["Self-hosted runner (push-caller)"]
+    branches: ["main"]
+    types: [completed]
+  push:
+    branches:
+      - run_amd_push_ci_caller*
+    paths:
+      - "src/**"
+      - "tests/**"
+      - ".github/**"
+      - "templates/**"
+      - "utils/**"
+
+jobs:
+  run_amd_ci:
+    name: AMD mi250
+    if: (cancelled() != true) && ((github.event_name != 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller')))
+    uses: ./.github/workflows/self-push-amd.yml
+    with:
+      gpu_flavor: mi250
+    secrets: inherit
diff --git a/.github/workflows/self-push-amd.yml b/.github/workflows/self-push-amd.yml
index 0dfbbca7ba1232..8751698277685d 100644
--- a/.github/workflows/self-push-amd.yml
+++ b/.github/workflows/self-push-amd.yml
@@ -1,21 +1,11 @@
 name: Self-hosted runner AMD GPU (push)
 
 on:
-  workflow_run:
-    workflows: ["Self-hosted runner (push-caller)"]
-    branches: ["main"]
-    types: [completed]
-  push:
-    branches:
-      - ci_*
-      - ci-*
-    paths:
-      - "src/**"
-      - "tests/**"
-      - ".github/**"
-      - "templates/**"
-      - "utils/**"
-  repository_dispatch:
+  workflow_call:
+    inputs:
+      gpu_flavor:
+        required: true
+        type: string
 
 env:
   HF_HOME: /mnt/cache
@@ -45,8 +35,7 @@ jobs:
     strategy:
       matrix:
         machine_type: [single-gpu, multi-gpu]
-        gpu_flavor: [mi210]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}']
+    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
     container:
       image: huggingface/transformers-pytorch-amd-gpu-push-ci  # <--- We test only for PyTorch for now
       options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -65,8 +54,7 @@ jobs:
     strategy:
       matrix:
         machine_type: [single-gpu, multi-gpu]
-        gpu_flavor: [mi210]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}']
+    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
     container:
       image: huggingface/transformers-pytorch-amd-gpu-push-ci  # <--- We test only for PyTorch for now
       options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -164,8 +152,7 @@ jobs:
       matrix:
         folders: ${{ fromJson(needs.setup_gpu.outputs.matrix) }}
         machine_type: [single-gpu, multi-gpu]
-        gpu_flavor: [mi210]
-    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}']
+    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
     container:
       image: huggingface/transformers-pytorch-amd-gpu-push-ci  # <--- We test only for PyTorch for now
       options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -321,7 +308,7 @@ jobs:
           CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
           CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_AMD }}
           ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
-          CI_EVENT: push
+          CI_EVENT: Push CI (AMD) - ${{ inputs.gpu_flavor }}
           CI_TITLE_PUSH: ${{ github.event.head_commit.message }}
           CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }}
           CI_SHA: ${{ env.CI_SHA }}
diff --git a/utils/notification_service.py b/utils/notification_service.py
index 1d10fa5d821f0b..650d76b28c375c 100644
--- a/utils/notification_service.py
+++ b/utils/notification_service.py
@@ -897,6 +897,9 @@ def prepare_reports(title, header, reports, to_truncate=True):
         job_name_prefix = f"{framework} {version}"
     elif ci_event.startswith("Nightly CI"):
         job_name_prefix = "Nightly CI"
+    elif ci_event.startswith("Push CI (AMD) - "):
+        flavor = ci_event.replace("Push CI (AMD) - ", "")
+        job_name_prefix = f"AMD {flavor}"
 
     for model in model_results.keys():
         for artifact_path in available_artifacts[f"run_all_tests_gpu_{model}_test_reports"].paths:
@@ -962,7 +965,7 @@ def prepare_reports(title, header, reports, to_truncate=True):
         "Torch CUDA extension tests": "run_tests_torch_cuda_extensions_gpu_test_reports",
     }
 
-    if ci_event in ["push", "Nightly CI"] or ci_event.startswith("Past CI"):
+    if ci_event in ["push", "Nightly CI"] or ci_event.startswith("Past CI") or ci_event.startswith("Push CI (AMD)"):
         del additional_files["Examples directory"]
         del additional_files["PyTorch pipelines"]
         del additional_files["TensorFlow pipelines"]
@@ -1027,6 +1030,6 @@ def prepare_reports(title, header, reports, to_truncate=True):
     message = Message(title, ci_title, model_results, additional_results, selected_warnings=selected_warnings)
 
     # send report only if there is any failure (for push CI)
-    if message.n_failures or ci_event != "push":
+    if message.n_failures or (ci_event != "push" and not ci_event.startswith("Push CI (AMD)")):
         message.post()
         message.post_reply()