From 052a9a79b444fdf1111a4352652e9730297ef07d Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 31 Oct 2024 13:45:01 -0400 Subject: [PATCH] only run the remainder of the gpu test suite if one case passes first (#2009) [skip ci] * only run the remainder of the gpu test suite if one case passes first * also reduce the test matrix --- .github/workflows/tests-nightly.yml | 7 ---- .github/workflows/tests.yml | 54 ++++++++++++++++++++++------- 2 files changed, 41 insertions(+), 20 deletions(-) diff --git a/.github/workflows/tests-nightly.yml b/.github/workflows/tests-nightly.yml index 90b1e23cd2..6a1dbcc915 100644 --- a/.github/workflows/tests-nightly.yml +++ b/.github/workflows/tests-nightly.yml @@ -82,13 +82,6 @@ jobs: num_gpus: 1 axolotl_extras: mamba-ssm nightly_build: "true" - - cuda: 121 - cuda_version: 12.1.1 - python_version: "3.11" - pytorch: 2.3.1 - num_gpus: 1 - axolotl_extras: mamba-ssm - nightly_build: "true" - cuda: 124 cuda_version: 12.4.1 python_version: "3.11" diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index ba50adfd35..2c735bd102 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -72,7 +72,7 @@ jobs: run: | find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \; - docker-e2e-tests: + docker-e2e-tests-1st: if: github.repository_owner == 'axolotl-ai-cloud' # this job needs to be run on self-hosted GPU runners... runs-on: [self-hosted, modal] @@ -83,24 +83,52 @@ jobs: fail-fast: false matrix: include: - - cuda: 121 - cuda_version: 12.1.1 - python_version: "3.10" - pytorch: 2.3.1 - num_gpus: 1 - axolotl_extras: mamba-ssm - - cuda: 121 - cuda_version: 12.1.1 - python_version: "3.11" - pytorch: 2.3.1 - num_gpus: 1 - axolotl_extras: mamba-ssm - cuda: 124 cuda_version: 12.4.1 python_version: "3.11" pytorch: 2.4.1 num_gpus: 1 axolotl_extras: + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Install Python + uses: actions/setup-python@v5 + with: + python-version: "3.10" + - name: Install Modal + run: | + python -m pip install --upgrade pip + pip install modal==0.63.64 jinja2 + - name: Update env vars + run: | + echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV + echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV + echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV + echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV + echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV + echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV + - name: Run tests job on Modal + run: | + modal run cicd.tests + + docker-e2e-tests: + if: github.repository_owner == 'axolotl-ai-cloud' + # this job needs to be run on self-hosted GPU runners... + runs-on: [self-hosted, modal] + timeout-minutes: 90 + needs: [pre-commit, pytest, docker-e2e-tests-1st] + + strategy: + fail-fast: false + matrix: + include: + - cuda: 121 + cuda_version: 12.1.1 + python_version: "3.10" + pytorch: 2.3.1 + num_gpus: 1 + axolotl_extras: mamba-ssm - cuda: 124 cuda_version: 12.4.1 python_version: "3.11"