DIFF ONLY - main sync #9

Closed
wants to merge 89 commits
Commits
e299312
optionally save the final FSDP model as a sharded state dict (#1828)
winglian Aug 19, 2024
5aac4bc
fix: dont change quant storage dtype in case of fsdp (#1837)
xgal Aug 20, 2024
649c19a
pretrain: fix with sample_packing=false (#1841)
tmm1 Aug 21, 2024
9f91724
feat: add jamba chat_template (#1843)
xgal Aug 21, 2024
f07802f
examples: fix tiny-llama pretrain yml syntax (#1840)
tmm1 Aug 21, 2024
957c956
rename jamba example (#1846) [skip ci]
xgal Aug 22, 2024
c3fc529
numpy 2.1.0 was released, but incompatible with numba (#1849) [skip ci]
winglian Aug 22, 2024
5b0b774
ensure that the bias is also in the correct dtype (#1848) [skip ci]
winglian Aug 22, 2024
9caa3eb
make the train_on_eos default to true so all eos tokens are treated t…
winglian Aug 22, 2024
7ed92e6
fix: prompt phi (#1845) [skip ci]
JohanWork Aug 22, 2024
de4ea2d
docs: minor syntax highlight fix (#1839)
tmm1 Aug 22, 2024
2f8037f
ensure that the hftrainer deepspeed config is set before the trainer …
winglian Aug 22, 2024
dcbff16
run nightly ci builds against upstream main (#1851)
winglian Aug 22, 2024
b33dc07
rename nightly test and add badge (#1853)
winglian Aug 22, 2024
fefa95e
most model types now support flash attention 2 regardless of multipac…
winglian Aug 22, 2024
328fd4b
add axolotl community license (#1862)
winglian Aug 23, 2024
e8ff5d5
don't mess with bnb since it needs compiled wheels (#1859)
winglian Aug 23, 2024
1f686c5
Liger Kernel integration (#1861)
winglian Aug 23, 2024
da0d581
add liger example (#1864)
winglian Aug 23, 2024
810ecd4
add liger to readme (#1865)
winglian Aug 23, 2024
77a4b9c
change up import to prevent AttributeError (#1863)
winglian Aug 23, 2024
22f4eaf
simplify logic (#1856)
winglian Aug 24, 2024
f245964
better handling of llama-3 tool role (#1782)
winglian Aug 25, 2024
8e29bde
Spectrum plugin (#1866)
winglian Aug 25, 2024
6819c12
update spectrum authors (#1869)
winglian Aug 26, 2024
2dac1ed
Fix `drop_long_seq` bug due to truncation in prompt tokenization stra…
chiwanpark Aug 26, 2024
17af1d7
clear cuda cache to help with memory leak/creep (#1858)
winglian Aug 26, 2024
f6362d2
Add Liger Kernel support for Qwen2 (#1871)
chiwanpark Aug 27, 2024
1e43660
Sample pack trust remote code v2 (#1873)
winglian Aug 27, 2024
159b8b9
monkey-patch transformers to simplify monkey-patching modeling code (…
tmm1 Aug 28, 2024
c1a61ae
fix liger plugin load issues (#1876)
tmm1 Aug 28, 2024
7037e3c
deepseekv2 liger support (#1878)
tmm1 Aug 28, 2024
e3a3845
Add liger kernel to features (#1881) [skip ci]
ByronHsu Aug 29, 2024
ce33e1e
pin liger-kernel to latest 0.2.1 (#1882) [skip ci]
winglian Aug 30, 2024
15408d0
Update supported models for Liger Kernel (#1875)
DocShotgun Sep 1, 2024
3c6b9ed
run pytests with varied pytorch versions too (#1883)
winglian Sep 1, 2024
bdab3ec
Fix RMSNorm monkey patch for Gemma models (#1886)
chiwanpark Sep 1, 2024
0aeb277
add e2e smoke tests for llama liger integration (#1884)
winglian Sep 1, 2024
4e5400c
support for auto_find_batch_size when packing (#1885)
winglian Sep 4, 2024
dca1fe4
fix optimizer + fsdp combination in example (#1893)
winglian Sep 4, 2024
f18f426
Docs for AMD-based HPC systems (#1891)
tijmen Sep 5, 2024
93b769a
lint fix and update gha regex (#1899)
winglian Sep 5, 2024
ab461d8
Fix documentation for pre-tokenized dataset (#1894)
alpayariyak Sep 5, 2024
6e35468
fix zero3 integration (#1897)
winglian Sep 5, 2024
3853ab7
bump accelerate to 0.34.2 (#1901)
winglian Sep 7, 2024
5c42f11
remove dynamic module loader monkeypatch as this was fixed upstream (…
winglian Sep 14, 2024
7b9f669
Trigger the original tokenization behavior when no advanced turn sett…
fozziethebeat Sep 14, 2024
d7eea2f
validation fixes 20240923 (#1925)
winglian Sep 24, 2024
b98d7d7
update upstream deps versions and replace lora+ (#1928)
winglian Sep 26, 2024
61aa291
fix for empty lora+ lr embedding (#1932)
winglian Sep 27, 2024
8443310
bump transformers to 4.45.1 (#1936)
winglian Sep 30, 2024
e1915f5
Multimodal Vision Llama - rudimentary support (#1940)
winglian Oct 3, 2024
4ca0a47
add 2.4.1 to base models (#1953)
winglian Oct 9, 2024
e8d3da0
upgrade pytorch from 2.4.0 => 2.4.1 (#1950)
winglian Oct 9, 2024
a560593
fix(log): update perplexity log to clarify from eval split (#1952) [s…
NanoCode012 Oct 9, 2024
dee7723
fix type annotations (#1941) [skip ci]
bxptr Oct 9, 2024
6d3caad
Comet integration (#1939)
Lothiraldan Oct 9, 2024
979534c
add mistral templates (#1927)
pandora-s-git Oct 10, 2024
8159cbd
lm_eval harness post train (#1926)
winglian Oct 10, 2024
2fbc6b0
Axo logo new (#1956)
winglian Oct 10, 2024
e73b8df
Add Support for `revision` Dataset Parameter to specify reading from …
thomascleberg Oct 11, 2024
922db77
Add MLFlow run name option in config (#1961)
awhazell Oct 11, 2024
7688385
add warning that sharegpt will be deprecated (#1957)
winglian Oct 11, 2024
df359c8
Handle image input as string paths for MMLMs (#1958)
afrizalhasbi Oct 11, 2024
09bf1ce
update hf deps (#1964)
winglian Oct 12, 2024
d20b48a
only install torchao for torch versions >= 2.4.0 (#1963)
winglian Oct 13, 2024
31591bd
Fixing Validation - Mistral Templates (#1962)
pandora-s-git Oct 13, 2024
ac128b7
fix: update eval causal lm metrics to add perplexity (#1951) [skip ci]
NanoCode012 Oct 13, 2024
1834cdc
Add support for qwen 2.5 chat template (#1934)
amazingvince Oct 13, 2024
cd2d89f
wip add new proposed message structure (#1904)
winglian Oct 13, 2024
68b1369
Reward model (#1879)
winglian Oct 13, 2024
ec4272c
add ds zero3 to multigpu biweekly tests (#1900)
winglian Oct 13, 2024
335027f
upgrade accelerate to 1.0.1 (#1969)
winglian Oct 14, 2024
6d9a3c4
examples: Fix config llama3 (#1833) [skip ci]
JohanWork Oct 14, 2024
54673fd
also debug if other debug args are set (#1977)
winglian Oct 17, 2024
f62e237
memoize dataset length for eval sample packing (#1974)
bursteratom Oct 17, 2024
67f744d
add pytorch 2.5.0 base images (#1979)
winglian Oct 18, 2024
e12a213
first pass at pytorch 2.5.0 support (#1982)
winglian Oct 21, 2024
955cca4
don't explicitly set cpu pytorch version (#1986)
winglian Oct 21, 2024
5c629ee
use torch 2.4.1 images as latest now that torch 2.5.0 is out (#1987)
winglian Oct 21, 2024
9bd5f7d
Log checkpoints as mlflow artifacts (#1976)
awhazell Oct 22, 2024
718cfb2
revert image tagged as main-latest (#1990)
winglian Oct 22, 2024
1d6a5e2
Refactor func load_model to class ModelLoader (#1909)
MengqingCao Oct 25, 2024
2501c1a
Fix: Gradient Accumulation issue (#1980)
NanoCode012 Oct 25, 2024
d3c45d2
fix zero3 (#1994)
winglian Oct 28, 2024
e1e0556
add option for resizing embeddings when adding new tokens (#2000)
winglian Oct 28, 2024
bfc77b0
Feat: Add support for tokenizer’s or custom jinja chat_template (#1970)
NanoCode012 Oct 29, 2024
107b67b
Hardware requirements (#1997) [skip ci]
OliverKunc Oct 29, 2024
8c3a727
feat: update yml chat_template to specify dataset field (#2001) [skip…
NanoCode012 Oct 29, 2024
14 changes: 13 additions & 1 deletion .github/workflows/base.yml
Original file line number Diff line number Diff line change
@@ -28,7 +28,19 @@ jobs:
cuda_version: 12.4.1
cudnn_version: ""
python_version: "3.11"
pytorch: 2.4.0
pytorch: 2.4.1
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
- cuda: "124"
cuda_version: 12.4.1
cudnn_version: ""
python_version: "3.11"
pytorch: 2.4.1
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
- cuda: "124"
cuda_version: 12.4.1
cudnn_version: ""
python_version: "3.11"
pytorch: 2.5.0
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
steps:
- name: Checkout
2 changes: 1 addition & 1 deletion .github/workflows/lint.yml
@@ -6,7 +6,7 @@ on:
- '**.py'
- 'requirements.txt'
- '.github/workflows/*.yml'
- "*.md"
- "*.[q]md"
- "examples/**/*.y[a]?ml"
workflow_dispatch:

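The `*.[q]md` pattern in the lint.yml change above uses a one-character class, so it matches `.qmd` files only — plain `.md` files no longer trigger the lint workflow. A quick sketch of that glob behavior, using POSIX `case` patterns as a stand-in for GitHub's path-filter syntax (the file names are illustrative, and GitHub's matcher may differ slightly from the shell's):

```shell
# Which paths would trigger the lint workflow under the new '*.[q]md' filter?
for path in README.md docs/config.qmd src/train.py; do
  case "$path" in
    *.[q]md) echo "match: $path" ;;   # only .qmd suffixes land here
    *)       echo "skip:  $path" ;;
  esac
done
```

Note that in `case` patterns `*` also matches `/`, so nested paths like `docs/config.qmd` match without a `**` prefix.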
14 changes: 12 additions & 2 deletions .github/workflows/main.yml
@@ -27,7 +27,12 @@ jobs:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.4.0
pytorch: 2.4.1
axolotl_extras:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.5.0
axolotl_extras:
runs-on: axolotl-gpu-runner
steps:
@@ -84,7 +89,12 @@ jobs:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.4.0
pytorch: 2.4.1
axolotl_extras:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.5.0
axolotl_extras:
runs-on: axolotl-gpu-runner
steps:
18 changes: 18 additions & 0 deletions .github/workflows/multi-gpu-e2e.yml
@@ -1,6 +1,9 @@
name: docker-multigpu-tests-biweekly

on:
pull_request:
paths:
- 'tests/e2e/multigpu/*.py'
workflow_dispatch:
schedule:
- cron: '0 0 * * 1,4' # Runs at 00:00 UTC every monday & thursday
@@ -18,6 +21,20 @@ jobs:
pytorch: 2.3.1
axolotl_extras:
num_gpus: 2
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.4.1
axolotl_extras:
num_gpus: 2
nightly_build: "true"
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.5.0
axolotl_extras:
num_gpus: 2
nightly_build: "true"
runs-on: [self-hosted, modal]
timeout-minutes: 120
steps:
@@ -39,6 +56,7 @@ jobs:
echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
- name: Run tests job on Modal
run: |
modal run cicd.multigpu
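The env-var step added above appends `KEY=value` lines to the file named by `$GITHUB_ENV`; GitHub Actions then loads those lines into the environment of every subsequent step in the job. A minimal local sketch of that handoff (the temp file and values here are illustrative, not part of the runner):

```shell
# Simulate the $GITHUB_ENV handoff between two workflow steps.
GITHUB_ENV=$(mktemp)

# "Step 1": append KEY=value lines, as the workflow step above does.
echo "N_GPUS=2" >> "$GITHUB_ENV"
echo "NIGHTLY_BUILD=true" >> "$GITHUB_ENV"

# "Step 2": the runner exports the accumulated lines before the next step runs.
set -a; . "$GITHUB_ENV"; set +a
echo "gpus=$N_GPUS nightly=$NIGHTLY_BUILD"

rm -f "$GITHUB_ENV"
```

This is why later steps (and the Modal invocation) can read `NIGHTLY_BUILD` without it being passed explicitly.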
14 changes: 12 additions & 2 deletions .github/workflows/nightlies.yml
@@ -26,7 +26,12 @@ jobs:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.4.0
pytorch: 2.4.1
axolotl_extras:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.5.0
axolotl_extras:
runs-on: axolotl-gpu-runner
steps:
@@ -83,7 +88,12 @@ jobs:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.4.0
pytorch: 2.4.1
axolotl_extras:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.5.0
axolotl_extras:
runs-on: axolotl-gpu-runner
steps:
2 changes: 1 addition & 1 deletion .github/workflows/pypi.yml
@@ -27,7 +27,7 @@ jobs:
run: |
pip3 install wheel packaging
pip3 install -e .
pip3 install -r requirements-tests.txt
pip3 install -r requirements-dev.txt -r requirements-tests.txt

- name: Extract tag name
id: tag
128 changes: 128 additions & 0 deletions .github/workflows/tests-nightly.yml
@@ -0,0 +1,128 @@
name: Tests Nightly against upstream main
on:
workflow_dispatch:
schedule:
- cron: '0 0 * * *' # Runs at 00:00 UTC every day

jobs:
pre-commit:
name: pre-commit
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: "3.10"
cache: 'pip' # caching pip dependencies
- uses: pre-commit/[email protected]
env:
SKIP: no-commit-to-branch

pytest:
name: PyTest
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python_version: ["3.10", "3.11"]
pytorch_version: ["2.3.1", "2.4.1", "2.5.0"]
timeout-minutes: 20

steps:
- name: Check out repository code
uses: actions/checkout@v3

- name: Setup Python
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python_version }}
cache: 'pip' # caching pip dependencies

- name: Install PyTorch
run: |
pip3 install torch==${{ matrix.pytorch_version }} --index-url https://download.pytorch.org/whl/cpu

- name: Update requirements.txt
run: |
sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt
sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt
sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt
sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt

- name: Install dependencies
run: |
pip3 install --upgrade pip
pip3 install --upgrade packaging
pip3 install -U -e .
pip3 install -r requirements-dev.txt -r requirements-tests.txt

- name: Run tests
run: |
pytest --ignore=tests/e2e/ tests/

- name: cleanup pip cache
run: |
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;

docker-e2e-tests:
if: github.repository_owner == 'axolotl-ai-cloud'
# this job needs to be run on self-hosted GPU runners...
runs-on: [self-hosted, modal]
timeout-minutes: 60
needs: [pre-commit, pytest]

strategy:
fail-fast: false
matrix:
include:
- cuda: 121
cuda_version: 12.1.1
python_version: "3.10"
pytorch: 2.3.1
num_gpus: 1
axolotl_extras: mamba-ssm
nightly_build: "true"
- cuda: 121
cuda_version: 12.1.1
python_version: "3.11"
pytorch: 2.3.1
num_gpus: 1
axolotl_extras: mamba-ssm
nightly_build: "true"
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.4.1
num_gpus: 1
axolotl_extras:
nightly_build: "true"
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.5.0
num_gpus: 1
axolotl_extras:
nightly_build: "true"
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Install Python
uses: actions/setup-python@v5
with:
python-version: "3.10"
- name: Install Modal
run: |
python -m pip install --upgrade pip
pip install modal==0.63.64 jinja2
- name: Update env vars
run: |
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
- name: Run tests job on Modal
run: |
modal run cicd.tests
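The "Update requirements.txt" step in the new nightly workflow swaps pinned releases for upstream `main` via `sed`. A sketch of its effect on a throwaway requirements file (the version pins below are illustrative, not taken from the repo):

```shell
# Rewrite release pins to git+main URLs, as the nightly workflow does.
tmp=$(mktemp -d)
cat > "$tmp/requirements.txt" <<'EOF'
transformers==4.45.1
peft==0.13.0
numpy>=1.24.0
EOF

# '#' as the sed delimiter avoids escaping the slashes in the URLs.
sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' "$tmp/requirements.txt"
sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' "$tmp/requirements.txt"

cat "$tmp/requirements.txt"   # lines without a matching prefix (numpy) are untouched
rm -rf "$tmp"
```

Note `sed -i` with no suffix argument is GNU sed behavior (fine on the ubuntu runners); BSD/macOS sed would need `-i ''`.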
25 changes: 20 additions & 5 deletions .github/workflows/tests.yml
@@ -36,6 +36,7 @@ jobs:
fail-fast: false
matrix:
python_version: ["3.10", "3.11"]
pytorch_version: ["2.3.1", "2.4.1", "2.5.0"]
timeout-minutes: 20

steps:
@@ -48,12 +49,20 @@
python-version: ${{ matrix.python_version }}
cache: 'pip' # caching pip dependencies

- name: Install dependencies
- name: upgrade pip
run: |
pip3 install --upgrade pip
pip3 install --upgrade packaging
pip3 install --upgrade packaging setuptools wheel

- name: Install PyTorch
run: |
pip3 install torch==${{ matrix.pytorch_version }}

- name: Install dependencies
run: |
pip3 show torch
pip3 install -U -e .
pip3 install -r requirements-tests.txt
pip3 install -r requirements-dev.txt -r requirements-tests.txt

- name: Run tests
run: |
@@ -67,7 +76,7 @@
if: github.repository_owner == 'axolotl-ai-cloud'
# this job needs to be run on self-hosted GPU runners...
runs-on: [self-hosted, modal]
timeout-minutes: 60
timeout-minutes: 90
needs: [pre-commit, pytest]

strategy:
@@ -89,7 +98,13 @@
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.4.0
pytorch: 2.4.1
num_gpus: 1
axolotl_extras:
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.5.0
num_gpus: 1
axolotl_extras:
steps:
2 changes: 1 addition & 1 deletion .isort.cfg
@@ -1,3 +1,3 @@
[settings]
profile=black
known_third_party=wandb
known_third_party=wandb,comet_ml
3 changes: 3 additions & 0 deletions .mypy.ini
@@ -11,6 +11,9 @@ ignore_errors = True
[mypy-axolotl.models.mixtral.*]
ignore_errors = True

[mypy-axolotl.integrations.liger.models.*]
ignore_errors = True

[mypy-axolotl.models.phi.*]
ignore_errors = True
