From 241bffded38fb81da533b5c4902c933270ba042a Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Fri, 31 Jan 2025 15:07:12 -0800 Subject: [PATCH] Update A6000 workflows to use newer docker container - 24.09 vs 24.03 (#6967) - Issues with nv-sd updates, will follow up with a subsequent PR --- .github/workflows/nv-a6000.yml | 6 +++--- .github/workflows/nv-flash-attn.yml | 4 ++-- .github/workflows/nv-human-eval.yml | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/nv-a6000.yml b/.github/workflows/nv-a6000.yml index 0547431e3099..e48d9a7fffa2 100644 --- a/.github/workflows/nv-a6000.yml +++ b/.github/workflows/nv-a6000.yml @@ -23,7 +23,7 @@ jobs: unit-tests: runs-on: [self-hosted, nvidia, a6000] container: - image: nvcr.io/nvidia/pytorch:24.03-py3 + image: nvcr.io/nvidia/pytorch:24.09-py3 ports: - 80 options: --gpus all --shm-size "8G" @@ -57,8 +57,8 @@ jobs: run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.3" --cuda_ver="12" - python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.3" --cuda_ver="12" + python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2' unit/ --torch_ver="2.5" --cuda_ver="12" + python -m pytest --color=yes --durations=0 --verbose -rF -m 'inference_v2_ops' unit/ --torch_ver="2.5" --cuda_ver="12" - name: MII unit tests run: | BRANCH="main" diff --git a/.github/workflows/nv-flash-attn.yml b/.github/workflows/nv-flash-attn.yml index 310972323043..591969fbd986 100644 --- a/.github/workflows/nv-flash-attn.yml +++ b/.github/workflows/nv-flash-attn.yml @@ -18,7 +18,7 @@ jobs: unit-tests: runs-on: [self-hosted, nvidia, a6000] container: - image: nvcr.io/nvidia/pytorch:24.03-py3 + image: nvcr.io/nvidia/pytorch:24.09-py3 ports: - 80 options: --gpus all --shm-size "8G" @@ -53,7 +53,7 @@ jobs: run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - python -m pytest --color=yes --durations=0 --verbose -rF unit/sequence_parallelism/test_ulysses.py --torch_ver="2.3" --cuda_ver="12" + python -m pytest --color=yes --durations=0 --verbose -rF unit/sequence_parallelism/test_ulysses.py --torch_ver="2.5" --cuda_ver="12" - name: Open GitHub issue if nightly CI fails if: ${{ failure() && (github.event_name == 'schedule') }} uses: JasonEtco/create-an-issue@v2 diff --git a/.github/workflows/nv-human-eval.yml b/.github/workflows/nv-human-eval.yml index 2ecdf218b96a..3f59c42f697e 100644 --- a/.github/workflows/nv-human-eval.yml +++ b/.github/workflows/nv-human-eval.yml @@ -11,7 +11,7 @@ jobs: unit-tests: runs-on: [self-hosted, nvidia, a6000] container: - image: nvcr.io/nvidia/pytorch:24.03-py3 + image: nvcr.io/nvidia/pytorch:24.09-py3 ports: - 80 options: --gpus all --shm-size "8G" @@ -50,4 +50,4 @@ jobs: run: | unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch cd tests - python -m pytest --color=yes --durations=0 --verbose -rF -m 'evaluation' -k "test_human_eval" unit/ --torch_ver="2.3" --cuda_ver="12" + python -m pytest --color=yes --durations=0 --verbose -rF -m 'evaluation' -k "test_human_eval" unit/ --torch_ver="2.5" --cuda_ver="12"