From c8439e01e819e77ba03004a86e0813a41e2d0a90 Mon Sep 17 00:00:00 2001
From: Nathan Weinberg
Date: Wed, 16 Oct 2024 15:39:57 -0400
Subject: [PATCH 01/48] e2e: replace old small job with new medium job

this commit adds a new E2E job meant to test integration of training
library changes with the CLI's "full" train pipeline to prevent any
regressions

it also updates the relevant mergify configuration

Signed-off-by: Nathan Weinberg
---
 .github/mergify.yml                           |  8 +-
 ...vidia-t4-x1.yml => e2e-nvidia-a10g-x1.yml} | 83 +++++++++----------
 README.md                                     |  1 +
 3 files changed, 46 insertions(+), 46 deletions(-)
 rename .github/workflows/{e2e-nvidia-t4-x1.yml => e2e-nvidia-a10g-x1.yml} (79%)

diff --git a/.github/mergify.yml b/.github/mergify.yml
index 774b6dd4..edd79057 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -28,18 +28,18 @@ pull_request_rules:
       # e2e workflow
       - or:
           - and:
-              # note this should match the triggering criteria in 'e2e-nvidia-t4-x1.yml'
-              - check-success=e2e-workflow-complete
+              # note this should match the triggering criteria in 'e2e-nvidia-a10g-x1.yml'
+              - check-success=e2e-medium-workflow-complete
               - or:
                   - files~=\.py$
                   - files=pyproject.toml
                   - files~=^requirements.*\.txt$
-                  - files=.github/workflows/e2e-nvidia-t4-x1.yml
+                  - files=.github/workflows/e2e-nvidia-a10g-x1.yml
           - and:
               - -files~=\.py$
               - -files=pyproject.toml
              - -files~=^requirements.*\.txt$
-              - -files=.github/workflows/e2e-nvidia-t4-x1.yml
+              - -files=.github/workflows/e2e-nvidia-a10g-x1.yml
 
       # code lint workflow
       - or:
diff --git a/.github/workflows/e2e-nvidia-t4-x1.yml b/.github/workflows/e2e-nvidia-a10g-x1.yml
similarity index 79%
rename from .github/workflows/e2e-nvidia-t4-x1.yml
rename to .github/workflows/e2e-nvidia-a10g-x1.yml
index 959dedd9..97cad6a2 100644
--- a/.github/workflows/e2e-nvidia-t4-x1.yml
+++ b/.github/workflows/e2e-nvidia-a10g-x1.yml
@@ -1,17 +1,15 @@
 # SPDX-License-Identifier: Apache-2.0
 
-name: E2E (NVIDIA Tesla T4 x1)
+name: E2E (NVIDIA A10G x1)
 
 on:
+  # run against every merge commit to 'main' and release branches
   push:
     branches:
       - main
       - release-*
+  # only run on PRs that touch certain regex paths
   pull_request_target:
-    types:
-      - opened
-      - synchronize
-      - reopened
     branches:
       - main
       - release-*
@@ -20,15 +18,24 @@ on:
       - '**.py'
       - 'pyproject.toml'
       - 'requirements**.txt'
-      - '.github/workflows/e2e-nvidia-t4-x1.yml' # Follow-on workflow
+      - '.github/workflows/e2e-nvidia-a10g-x1.yml' # This workflow
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
   cancel-in-progress: true
 
+env:
+  LC_ALL: en_US.UTF-8
+
+defaults:
+  run:
+    shell: bash
+
+permissions:
+  contents: read
+
 jobs:
-  start-runner:
-    name: Start external EC2 runner
+  start-medium-ec2-runner:
     runs-on: ubuntu-latest
     outputs:
       label: ${{ steps.start-ec2-runner.outputs.label }}
       ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
     steps:
       - name: Configure AWS credentials
         uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
         with:
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
           aws-region: ${{ secrets.AWS_REGION }}
@@ -40,6 +47,7 @@ jobs:
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
           aws-region: ${{ secrets.AWS_REGION }}
+
       - name: Start EC2 runner
         id: start-ec2-runner
         uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
@@ -59,10 +67,10 @@ jobs:
             {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
           ]
 
-  e2e:
-    name: E2E Test
-    needs: start-runner
-    runs-on: ${{ needs.start-runner.outputs.label }}
+  e2e-medium-test:
+    needs:
+      - start-medium-ec2-runner
+    runs-on: ${{ needs.start-medium-ec2-runner.outputs.label }}
     # It is important that this job has no write permissions and has
     # no access to any secrets. This part (e2e) is where we are running
@@ -70,12 +78,10 @@ jobs:
     permissions: {}
 
     steps:
-      # for debugging
-      - name: Print environment state
+      - name: Install Packages
         run: |
-          echo "Current Working Directory: $PWD"
-          echo "Files in Local Directory:"
-          ls -l
+          cat /etc/os-release
+          sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel
 
       - name: Checkout instructlab/instructlab
         uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
@@ -93,30 +99,19 @@ jobs:
           # https://github.com/actions/checkout/issues/249
           fetch-depth: 0
 
-      # for debugging
-      - name: Print environment state
-        run: |
-          echo "Current Working Directory: $PWD"
-          echo "Files in Local Directory:"
-          ls -l
-
       - name: Fetch and checkout PR
-        id: fetch_pr
-        if: github.event_name == 'pull_request_target'
+        if: ${{ github.event_name == 'pull_request_target' }}
         working-directory: ./training
         run: |
           git fetch origin pull/${{ github.event.pull_request.number }}/head:pr-${{ github.event.pull_request.number }}
           git checkout pr-${{ github.event.pull_request.number }}
 
-      - name: Install system packages
-        run: |
-          cat /etc/os-release
-          sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel
-
-      - name: Install instructlab
+      - name: Install ilab
         working-directory: ./instructlab
         run: |
-          export PATH="/home/ec2-user/.local/bin:/usr/local/cuda/bin:$PATH"
+          export CUDA_HOME="/usr/local/cuda"
+          export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
+          export PATH="$PATH:$CUDA_HOME/bin"
           python3.11 -m venv --upgrade-deps venv
           . venv/bin/activate
           nvidia-smi
@@ -127,7 +122,7 @@ jobs:
           # https://github.com/instructlab/instructlab/issues/1821
           # install with Torch and build dependencies installed
           python3.11 -m pip install packaging wheel setuptools-scm
-          python3.11 -m pip install .[cuda]
+          python3.11 -m pip install .[cuda] -r requirements-vllm-cuda.txt
 
       - name: Update instructlab-training library
         working-directory: ./training
@@ -136,17 +131,21 @@ jobs:
           pip install .
           pip install .[cuda]
 
+      - name: Check disk
+        run: |
+          df -h
+
       - name: Run e2e test
         working-directory: ./instructlab
         run: |
           . venv/bin/activate
-          ./scripts/e2e-custom.sh -a
+          ./scripts/e2e-ci.sh -m
 
-  stop-runner:
+  stop-medium-ec2-runner:
     name: Stop external EC2 runner
     needs:
-      - start-runner
-      - e2e
+      - start-medium-ec2-runner
+      - e2e-medium-test
     runs-on: ubuntu-latest
     if: ${{ always() }}
     steps:
       - name: Configure AWS credentials
         uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
         with:
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
           aws-region: ${{ secrets.AWS_REGION }}
       - name: Stop EC2 runner
         uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
         with:
           mode: stop
           github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          label: ${{ needs.start-runner.outputs.label }}
-          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
+          label: ${{ needs.start-medium-ec2-runner.outputs.label }}
+          ec2-instance-id: ${{ needs.start-medium-ec2-runner.outputs.ec2-instance-id }}
 
-  e2e-workflow-complete:
+  e2e-medium-workflow-complete:
     # we don't want to block PRs on failed EC2 cleanup
     # so not requiring "stop-runner" as well
-    needs: ["start-runner", "e2e"]
+    needs: ["start-medium-ec2-runner", "e2e-medium-test"]
     runs-on: ubuntu-latest
     steps:
       - name: E2E Workflow Complete
diff --git a/README.md b/README.md
index 645583f3..e9cf199d 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,7 @@
 # InstructLab Training Library
 
 ![Lint](https://github.com/instructlab/training/actions/workflows/lint.yml/badge.svg?branch=main)
+![`e2e-nvidia-a10g-x1.yaml` on `main`](https://github.com/instructlab/training/actions/workflows/e2e-nvidia-a10g-x1.yml/badge.svg?branch=main)
 ![Build](https://github.com/instructlab/training/actions/workflows/pypi.yaml/badge.svg?branch=main)
 ![Release](https://img.shields.io/github/v/release/instructlab/training)
 ![License](https://img.shields.io/github/license/instructlab/training)

From bbb75f305eb62c50a75c2a52a846996e8dc7542a Mon Sep 17 00:00:00 2001
From: Nathan Weinberg
Date: Wed, 16 Oct 2024 16:41:12 -0400
Subject: [PATCH 02/48] fix: incorrect label for AWS medium runner

was being incorrectly labeled as 'small'

Signed-off-by: Nathan Weinberg
---
 .github/workflows/e2e-nvidia-a10g-x1.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/e2e-nvidia-a10g-x1.yml b/.github/workflows/e2e-nvidia-a10g-x1.yml
index 97cad6a2..e9276848 100644
--- a/.github/workflows/e2e-nvidia-a10g-x1.yml
+++ b/.github/workflows/e2e-nvidia-a10g-x1.yml
@@ -61,7 +61,7 @@ jobs:
         iam-role-name: instructlab-ci-runner
         aws-resource-tags: >
           [
-            {"Key": "Name", "Value": "instructlab-ci-github-small-runner"},
+            {"Key": "Name", "Value": "instructlab-ci-github-medium-runner"},
             {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
             {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
             {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
           ]

From 7e3cefbdc98c5cfac671c929db7ac403393021c2 Mon Sep 17 00:00:00 2001
From: Nathan Weinberg
Date: Wed, 16 Oct 2024 16:46:26 -0400
Subject: [PATCH 03/48] fix: add new AMI to job

was still using the old AMI from the previous job

Signed-off-by: Nathan Weinberg
---
 .github/workflows/e2e-nvidia-a10g-x1.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/e2e-nvidia-a10g-x1.yml b/.github/workflows/e2e-nvidia-a10g-x1.yml
index e9276848..18b19952 100644
--- a/.github/workflows/e2e-nvidia-a10g-x1.yml
+++ b/.github/workflows/e2e-nvidia-a10g-x1.yml
@@ -54,7 +54,7 @@ jobs:
         with:
           mode: start
           github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          ec2-image-id: ami-00c51d9c1374eda97
+          ec2-image-id: ami-01a89eee1adde309c
           ec2-instance-type: g4dn.2xlarge
           subnet-id: subnet-02d230cffd9385bd4
           security-group-id: sg-06300447c4a5fbef3

From 036b901dce6335f24ad82109bca8bd91eb4c01fa Mon Sep 17 00:00:00 2001
From: Nathan Weinberg
Date: Wed, 16 Oct 2024 16:52:18 -0400
Subject: [PATCH 04/48] fix: correct instance type being used

was still using the old instance type

Signed-off-by: Nathan Weinberg
---
 .github/workflows/e2e-nvidia-a10g-x1.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/e2e-nvidia-a10g-x1.yml b/.github/workflows/e2e-nvidia-a10g-x1.yml
index 18b19952..3485ad73 100644
--- a/.github/workflows/e2e-nvidia-a10g-x1.yml
+++ b/.github/workflows/e2e-nvidia-a10g-x1.yml
@@ -55,7 +55,7 @@ jobs:
           mode: start
           github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
           ec2-image-id: ami-01a89eee1adde309c
-          ec2-instance-type: g4dn.2xlarge
+          ec2-instance-type: g5.4xlarge
           subnet-id: subnet-02d230cffd9385bd4
           security-group-id: sg-06300447c4a5fbef3
           iam-role-name: instructlab-ci-runner

From 9c899dcc0d85c95e267468bd30d006a0bb4f423c Mon Sep 17 00:00:00 2001
From: Oleg S <97077423+RobotSail@users.noreply.github.com>
Date: Thu, 19 Sep 2024 11:44:56 -0400
Subject: [PATCH 05/48] chore: add exit code & tox fix

Currently, the training library does not exit when an error is
encountered within the training loop (invoked through torchrun).
This commit updates that functionality so we correctly return an
exit code of 1 on child failure.

Additionally, this commit also adds the `make fix` command which
automatically fixes all trivial issues picked up on by ruff

Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com>
---
 src/instructlab/training/main_ds.py | 15 ++++++++++-----
 tox.ini                             |  1 -
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py
index c5cdb2ba..d1ae0e01 100644
--- a/src/instructlab/training/main_ds.py
+++ b/src/instructlab/training/main_ds.py
@@ -761,6 +761,7 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
     print(f"\033[92mRunning training command as subprocess: {' '.join(command)}\033[0m")
     process = None
     interrupt: KeyboardInterrupt | Exception | None = None
+    failure = False
     try:
         process = StreamablePopen(
             f"{train_args.ckpt_output_dir}/full_logs_global{torch_args.node_rank}.log",
@@ -771,19 +772,20 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
         print("Training subprocess interrupted by user.")
         interrupt = e
     except Exception as e:
-        print(f"An error occurred: {str(e)}")
+        print("Unexpected exception received during distributed training")
         interrupt = e
     finally:
         if "process" not in locals() or process is None:
             return
-        if process.poll() == 0:
-            print("\033[92mTraining subprocess exited successfully! 🎉\033[0m")
+
+        failure = process.poll() != 0
+        if not failure:
+            print("\033[92mOperation completed successfully! 🎉\033[0m")
         else:
             print(
                 "\033[91mTraining subprocess has not exited yet. Sending SIGTERM.\033[0m"
             )
-            print("Sending interrupt signal to Training subprocess.")
             process.terminate()
             try:
                 print("Waiting for process to exit, 60s...")
@@ -795,8 +797,11 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
                 process.kill()
 
         if interrupt:
-            print(f"Error caught from training subprocess.: {interrupt}")
             raise interrupt
+        if failure:
+            raise RuntimeError(
+                "Suffered a failure during distributed training. Please see the training logs for more context."
+            )
 
 
 if __name__ == "__main__":
diff --git a/tox.ini b/tox.ini
index b6b625cd..5756b665 100644
--- a/tox.ini
+++ b/tox.ini
@@ -66,7 +66,6 @@ commands =
     sh -c 'git diff --exit-code || (echo "pyproject.toml formatting is incorrect. Please run \"make toml-fmt\" and commit the changes." && exit 1)'
 allowlist_externals = make, sh
 
-
 [testenv:spellcheck]
 description = spell check (needs 'aspell' command)
 basepython = {[testenv:py3]basepython}

From af89deded32582e76663eff4fac3fbccfa14ebe6 Mon Sep 17 00:00:00 2001
From: Nathan Weinberg
Date: Mon, 21 Oct 2024 13:25:39 -0400
Subject: [PATCH 06/48] ci: grant HF_TOKEN access to the medium-size E2E CI job

Signed-off-by: Nathan Weinberg
---
 .github/workflows/e2e-nvidia-a10g-x1.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/e2e-nvidia-a10g-x1.yml b/.github/workflows/e2e-nvidia-a10g-x1.yml
index 3485ad73..ddc4cb81 100644
--- a/.github/workflows/e2e-nvidia-a10g-x1.yml
+++ b/.github/workflows/e2e-nvidia-a10g-x1.yml
@@ -137,6 +137,8 @@ jobs:
       - name: Run e2e test
         working-directory: ./instructlab
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
         run: |
           . venv/bin/activate
           ./scripts/e2e-ci.sh -m
@@ -155,6 +157,7 @@ jobs:
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
           aws-region: ${{ secrets.AWS_REGION }}
+
       - name: Stop EC2 runner
         uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
         with:

From e49809b3bd12d21610b3f9ebd325e4d6004b9c84 Mon Sep 17 00:00:00 2001
From: Nathan Weinberg
Date: Mon, 21 Oct 2024 13:24:43 -0400
Subject: [PATCH 07/48] ci: add large-size E2E CI job

this commit adds a new workflow to the Training repo
it will run a nightly cron job to test the current 'main'
branch of Training against the current 'main' branch of
the CLI (instructlab)

Signed-off-by: Nathan Weinberg
---
 .github/workflows/e2e-nvidia-l40s-x4.yml | 231 +++++++++++++++++++++++
 1 file changed, 231 insertions(+)
 create mode 100644 .github/workflows/e2e-nvidia-l40s-x4.yml

diff --git a/.github/workflows/e2e-nvidia-l40s-x4.yml b/.github/workflows/e2e-nvidia-l40s-x4.yml
new file mode 100644
index 00000000..e7ff227a
--- /dev/null
+++ b/.github/workflows/e2e-nvidia-l40s-x4.yml
@@ -0,0 +1,231 @@
+# SPDX-License-Identifier: Apache-2.0
+
+name: E2E (NVIDIA L40S x4)
+
+on:
+  schedule:
+    - cron: '0 16 * * *' # Runs at 4PM UTC every day
+  workflow_dispatch:
+    inputs:
+      pr_or_branch:
+        description: 'pull request number or branch name'
+        required: true
+        default: 'main'
+
+jobs:
+  start-large-ec2-runner:
+    runs-on: ubuntu-latest
+    outputs:
+      label: ${{ steps.start-ec2-runner.outputs.label }}
+      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
+    steps:
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ secrets.AWS_REGION }}
+
+      - name: Start EC2 runner
+        id: start-ec2-runner
+        uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
+        with:
+          mode: start
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          ec2-image-id: ami-01a89eee1adde309c
+          ec2-instance-type: g6e.12xlarge
+          subnet-id: subnet-024298cefa3bedd61
+          security-group-id: sg-06300447c4a5fbef3
+          iam-role-name: instructlab-ci-runner
+          aws-resource-tags: >
+            [
+              {"Key": "Name", "Value": "instructlab-ci-github-large-runner"},
+              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
+              {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
+              {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
+            ]
+
+  e2e-large-test:
+    needs:
+      - start-large-ec2-runner
+    runs-on: ${{ needs.start-large-ec2-runner.outputs.label }}
+
+    permissions:
+      pull-requests: write
+
+    steps:
+      - name: Install Packages
+        run: |
+          cat /etc/os-release
+          sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel
+
+      - name: Checkout instructlab/instructlab
+        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+        with:
+          repository: "instructlab/instructlab"
+          path: "instructlab"
+          # https://github.com/actions/checkout/issues/249
+          fetch-depth: 0
+
+      - name: Checkout instructlab/training
+        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+        with:
+          repository: "instructlab/training"
+          path: "training"
+          # https://github.com/actions/checkout/issues/249
+          fetch-depth: 0
+
+      - name: Determine if pr_or_branch is a PR number
+        id: check_pr
+        run: |
+          PR_OR_BRANCH=${{ github.event.inputs.pr_or_branch || 'main' }} # Default to 'main' if not set
+          if [[ "$PR_OR_BRANCH" =~ ^[0-9]+$ ]]; then
+            echo "is_pr=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "is_pr=false" >> "$GITHUB_OUTPUT"
+          fi
+          echo "pr_or_branch=$PR_OR_BRANCH" >> "$GITHUB_OUTPUT"
+
+      - name: Check if gh cli is installed
+        id: gh_cli
+        run: |
+          if command -v gh &> /dev/null ; then
+            echo "gh_cli_installed=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "gh_cli_installed=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Install gh CLI
+        if: steps.gh_cli.outputs.gh_cli_installed == 'false'
+        run: |
+          sudo dnf install 'dnf-command(config-manager)' -y
+          sudo dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo
+          sudo dnf install gh --repo gh-cli -y
+
+      - name: test gh CLI
+        run: |
+          gh --version
+
+      - name: set default repo
+        run: |
+          gh repo set-default ${{ github.server_url }}/${{ github.repository }}
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Add comment to PR
+        if: steps.check_pr.outputs.is_pr == 'true'
+        run: |
+          gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "${{ github.workflow }} workflow launched on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})"
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Fetch and checkout PR
+        if: steps.check_pr.outputs.is_pr == 'true'
+        run: |
+          gh pr checkout ${{ steps.check_pr.outputs.pr_or_branch }}
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Checkout branch
+        if: steps.check_pr.outputs.is_pr == 'false'
+        run: |
+          git checkout ${{ steps.check_pr.outputs.pr_or_branch }}
+
+      - name: Install ilab
+        working-directory: ./instructlab
+        run: |
+          export CUDA_HOME="/usr/local/cuda"
+          export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
+          export PATH="$PATH:$CUDA_HOME/bin"
+          python3.11 -m venv --upgrade-deps venv
+          . venv/bin/activate
+          nvidia-smi
+          python3.11 -m pip cache remove llama_cpp_python
+
+          CMAKE_ARGS="-DLLAMA_CUDA=on" python3.11 -m pip install .
+
+          # https://github.com/instructlab/instructlab/issues/1821
+          # install with Torch and build dependencies installed
+          python3.11 -m pip install packaging wheel setuptools-scm
+          python3.11 -m pip install .[cuda] -r requirements-vllm-cuda.txt
+
+      - name: Update instructlab-training library
+        working-directory: ./training
+        run: |
+          . ../instructlab/venv/bin/activate
+          pip install .
+          pip install .[cuda]
+
+      - name: Check disk
+        run: |
+          df -h
+
+      - name: Run e2e test
+        working-directory: ./instructlab
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: |
+          . venv/bin/activate
+          ./scripts/e2e-ci.sh -l
+
+      - name: Add comment to PR if the workflow failed
+        if: failure() && steps.check_pr.outputs.is_pr == 'true'
+        run: |
+          gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow failed on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), please investigate."
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Add comment to PR if the workflow succeeded
+        if: success() && steps.check_pr.outputs.is_pr == 'true'
+        run: |
+          gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow succeeded on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), congrats!"
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Post job results to Slack if the workflow failed
+        if: failure() && steps.check_pr.outputs.is_pr == 'false'
+        id: slack-report-failure
+        uses: slackapi/slack-github-action@37ebaef184d7626c5f204ab8d3baff4262dd30f0 # v1.27.0
+        with:
+          # Slack channel id, channel name, or user id to post message.
+          # See also: https://api.slack.com/methods/chat.postMessage#channels
+          # You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs.
+          channel-id: 'e2e-ci-results'
+          slack-message: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *with failures* :meow_sad-rain: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+        env:
+          SLACK_BOT_TOKEN: ${{ secrets.SON_OF_JEEVES_TOKEN }}
+
+      - name: Post job results to Slack if the workflow succeeded
+        if: success() && steps.check_pr.outputs.is_pr == 'false'
+        id: slack-report-success
+        uses: slackapi/slack-github-action@37ebaef184d7626c5f204ab8d3baff4262dd30f0 # v1.27.0
+        with:
+          # Slack channel id, channel name, or user id to post message.
+          # See also: https://api.slack.com/methods/chat.postMessage#channels
+          # You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs.
+          channel-id: 'e2e-ci-results'
+          slack-message: "*e2e-nvidia-l40s-x4* job in *${{ github.repository }}* running on branch `${{ steps.check_pr.outputs.pr_or_branch }}` completed *successfully* :meow_party: | ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+        env:
+          SLACK_BOT_TOKEN: ${{ secrets.SON_OF_JEEVES_TOKEN }}
+
+  stop-large-ec2-runner:
+    needs:
+      - start-large-ec2-runner
+      - e2e-large-test
+    runs-on: ubuntu-latest
+    if: ${{ always() }}
+    steps:
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ secrets.AWS_REGION }}
+
+      - name: Stop EC2 runner
+        uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
+        with:
+          mode: stop
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          label: ${{ needs.start-large-ec2-runner.outputs.label }}
+          ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }}

From 51893905bcc43769a2c412835473bcbad3d17dd1 Mon Sep 17 00:00:00 2001
From: Nathan Weinberg
Date: Mon, 21 Oct 2024 17:22:43 -0400
Subject: [PATCH 08/48] chore: minor CI tweaks

Signed-off-by: Nathan Weinberg
---
 .github/mergify.yml                      | 4 ++--
 .github/workflows/e2e-nvidia-a10g-x1.yml | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/mergify.yml b/.github/mergify.yml
index edd79057..404565f2 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -25,11 +25,11 @@ pull_request_rules:
           - -files~=^\.github/(actions|workflows)/.*\.ya?ml$
           - -files~=^\.github/workflows/actionlint\.
 
-      # e2e workflow
+      # e2e medium workflow
       - or:
           - and:
               # note this should match the triggering criteria in 'e2e-nvidia-a10g-x1.yml'
-              - check-success=e2e-medium-workflow-complete
+              - check-success~=e2e-medium-workflow-complete
               - or:
                   - files~=\.py$
                   - files=pyproject.toml
diff --git a/.github/workflows/e2e-nvidia-a10g-x1.yml b/.github/workflows/e2e-nvidia-a10g-x1.yml
index 3485ad73..64f1c5d3 100644
--- a/.github/workflows/e2e-nvidia-a10g-x1.yml
+++ b/.github/workflows/e2e-nvidia-a10g-x1.yml
@@ -142,7 +142,6 @@ jobs:
           ./scripts/e2e-ci.sh -m
 
   stop-medium-ec2-runner:
-    name: Stop external EC2 runner
     needs:
       - start-medium-ec2-runner
       - e2e-medium-test
     runs-on: ubuntu-latest
     if: ${{ always() }}
     steps:
@@ -155,6 +154,7 @@ jobs:
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
           aws-region: ${{ secrets.AWS_REGION }}
+
       - name: Stop EC2 runner
         uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6
         with:

From 3a7b7db5e116d049e263a57c7488dd061dd7a81c Mon Sep 17 00:00:00 2001
From: Nathan Weinberg
Date: Wed, 23 Oct 2024 14:57:16 -0400
Subject: [PATCH 09/48] fix: add working directory config to steps in large
 E2E CI job

Signed-off-by: Nathan Weinberg
---
 .github/workflows/e2e-nvidia-l40s-x4.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/e2e-nvidia-l40s-x4.yml b/.github/workflows/e2e-nvidia-l40s-x4.yml
index e7ff227a..e8b578f6 100644
--- a/.github/workflows/e2e-nvidia-l40s-x4.yml
+++ b/.github/workflows/e2e-nvidia-l40s-x4.yml
@@ -107,6 +107,7 @@ jobs:
           gh --version
 
       - name: set default repo
+        working-directory: ./training
         run: |
           gh repo set-default ${{ github.server_url }}/${{ github.repository }}
         env:
           GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 
       - name: Add comment to PR
         if: steps.check_pr.outputs.is_pr == 'true'
+        working-directory: ./training
         run: |
           gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "${{ github.workflow }} workflow launched on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})"
         env:
           GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 
       - name: Fetch and checkout PR
         if: steps.check_pr.outputs.is_pr == 'true'
+        working-directory: ./training
         run: |
           gh pr checkout ${{ steps.check_pr.outputs.pr_or_branch }}
         env:
           GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 
       - name: Checkout branch
         if: steps.check_pr.outputs.is_pr == 'false'
+        working-directory: ./training
         run: |
           git checkout ${{ steps.check_pr.outputs.pr_or_branch }}

From 890ebd91227a62886204d0efe461a5d24d157c25 Mon Sep 17 00:00:00 2001
From: Nathan Weinberg
Date: Wed, 23 Oct 2024 15:46:55 -0400
Subject: [PATCH 10/48] fix: add remaining missing working directory configs

Signed-off-by: Nathan Weinberg
---
 .github/workflows/e2e-nvidia-l40s-x4.yml | 2 ++
 README.md                                | 4 +++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/e2e-nvidia-l40s-x4.yml b/.github/workflows/e2e-nvidia-l40s-x4.yml
index e8b578f6..b75c8cc5 100644
--- a/.github/workflows/e2e-nvidia-l40s-x4.yml
+++ b/.github/workflows/e2e-nvidia-l40s-x4.yml
@@ -174,6 +174,7 @@ jobs:
       - name: Add comment to PR if the workflow failed
         if: failure() && steps.check_pr.outputs.is_pr == 'true'
+        working-directory: ./training
         run: |
           gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow failed on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), please investigate."
         env:
           GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 
       - name: Add comment to PR if the workflow succeeded
         if: success() && steps.check_pr.outputs.is_pr == 'true'
+        working-directory: ./training
         run: |
           gh pr comment "${{ steps.check_pr.outputs.pr_or_branch }}" -b "e2e workflow succeeded on this PR: [View run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}), congrats!"
         env:
           GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/README.md b/README.md
index e9cf199d..be396a64 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,13 @@
 # InstructLab Training Library
 
 ![Lint](https://github.com/instructlab/training/actions/workflows/lint.yml/badge.svg?branch=main)
-![`e2e-nvidia-a10g-x1.yaml` on `main`](https://github.com/instructlab/training/actions/workflows/e2e-nvidia-a10g-x1.yml/badge.svg?branch=main)
 ![Build](https://github.com/instructlab/training/actions/workflows/pypi.yaml/badge.svg?branch=main)
 ![Release](https://img.shields.io/github/v/release/instructlab/training)
 ![License](https://img.shields.io/github/license/instructlab/training)
 
+![`e2e-nvidia-a10g-x1.yml` on `main`](https://github.com/instructlab/training/actions/workflows/e2e-nvidia-a10g-x1.yml/badge.svg?branch=main)
+![`e2e-nvidia-l40s-x4.yml` on `main`](https://github.com/instructlab/training/actions/workflows/e2e-nvidia-l40s-x4.yml/badge.svg?branch=main)
+
 - [Installing](#installing-the-library)
   - [Additional Nvidia packages](#additional-nvidia-packages)
 - [Using the library](#using-the-library)

From c0f3b442bec6c2aebd14a95a6c49e930e3efe648 Mon Sep 17 00:00:00 2001
From: James Kunstle
Date: Wed, 16 Oct 2024 22:59:53 -0700
Subject: [PATCH 11/48] adds basic smoketests for main_ds and data_process
 CLI args

During development it's convenient to be able to run full distributed
training, even on a smaller dataset, just to make sure that nothing
obviously fails. This will also capture support for flash attention
on the machine that it's run on, and for granite models.
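For example, the suite can be run with FSDP as the distributed backend
across 8 GPUs using the invocation documented in tests/README.md below:

    ./smoketest.sh fsdp 8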
Signed-off-by: James Kunstle
---
 tests/README.md    |  20 +++++
 tests/smoketest.sh | 212 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 232 insertions(+)
 create mode 100644 tests/README.md
 create mode 100755 tests/smoketest.sh

diff --git a/tests/README.md b/tests/README.md
new file mode 100644
index 00000000..7af121d2
--- /dev/null
+++ b/tests/README.md
@@ -0,0 +1,20 @@
+## Overview
+
+`smoketest.sh` cd's into the source directory and runs the script-entrypoint for `main_ds.py` and `data_process.py`. Testing will break if file names or locations in the source tree change.
+
+Existing tests are "smoke tests," meant to demonstrate that training completes (returns 0) or not. This is helpful to check whether all required dependencies are installed.
+
+Current tests add features as they go:
+
+1. No Flash Attention or Granite
+2. No Granite but Flash Attention enabled
+3. Granite and Flash Attention enabled
+
+## Usage
+
+The testing script can be run without parameters as `./smoketest.sh`. By default, this will run all tests with `FSDP` as the distributed training backend. To change the distributed training backend to the other available option, one can run the script as `./smoketest.sh deepspeed`.
+
+The second positional argument is the "number of GPUs", e.g. `./smoketest.sh fsdp 8`. This will run the tests on 8 GPUs with FSDP as the distributed backend.
+
+> [!NOTE]
+> You'll need to install the training library to run the test. Inside a virtual environment and inside the repo, please run `pip3 install -e .` to install the package in editable mode.
diff --git a/tests/smoketest.sh b/tests/smoketest.sh
new file mode 100755
index 00000000..e104b233
--- /dev/null
+++ b/tests/smoketest.sh
@@ -0,0 +1,212 @@
+#!/usr/bin/env bash
+set -eux -o pipefail
+
+# ############### Read-only parameters ###############
+MODEL_NAME="instructlab/granite-7b-lab"
+# gets directory of current file.
+SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+CORRECT_WORKING_DIR="${SCRIPT_DIR}/../src/instructlab/training/"
+SAMPLE_DATA_PATH="${SCRIPT_DIR}/../sample-data/train_all_pruned_SDG.jsonl"
+TMP_DIR=$(mktemp -d)
+CHECKPOINTS_DIR="${TMP_DIR}/checkpoints"
+DATA_DIR="${TMP_DIR}/data"
+COMPUTED_DATA_PATH="${DATA_DIR}/data.jsonl"
+DEFAULT_DISTRIB_FRAMEWORK='fsdp'
+DISTRIB_FRAMEWORK="${1:-$DEFAULT_DISTRIB_FRAMEWORK}" # defaults to FSDP
+DEFAULT_GPUS=8
+NUM_GPUS="${2:-$DEFAULT_GPUS}"
+
+# ############### User-modifiable parameters ###############
+# Change these as needed
+MAX_BATCH_LEN=60000
+NUM_SAMPLES_TRAINED_ON=5000 # upper-bound on training dataset size.
+
+# ############### Test Functions ###############
+
+#######################################
+# Creates directories for the precomputed datasets
+# and the checkpoints that are saved during training inside
+# of the temporary storage created for these tests.
+# Globals:
+#   CHECKPOINTS_DIR
+#   DATA_DIR
+# Arguments:
+#   None
+# Returns:
+#   None
+#######################################
+function setup_tmpdir () {
+    mkdir "$CHECKPOINTS_DIR"
+    mkdir "$DATA_DIR"
+}
+
+#######################################
+# Preprocesses the sample .jsonl data and trims it to the
+# number of samples used for training
+# Globals:
+#   SAMPLE_DATA_PATH
+#   DATA_DIR
+#   MODEL_NAME
+#   NUM_SAMPLES_TRAINED_ON
+#   COMPUTED_DATA_PATH
+# Arguments:
+#   None
+# Returns:
+#   echos number of samples trained on to standard out.
+#######################################
+function prepare_data () {
+    # preprocesses .jsonl messages data so that it's a valid
+    # input to the model (inputs tokenized, formatted with mask, etc.)
+    # then, data is trimmed to a determined length to make training
+    # go faster.
+
+    python3 data_process.py \
+        --data_path="$SAMPLE_DATA_PATH" \
+        --data_output_path="$DATA_DIR" \
+        --max_seq_len=4096 \
+        --model_name_or_path="$MODEL_NAME"
+
+    # trim data so we only keep the first 'n' samples.
+    # should be enough data for training to be meaningful but not enough
+    # that training takes a large amount of time.
+    echo "$(head -"$NUM_SAMPLES_TRAINED_ON" "$COMPUTED_DATA_PATH")" > "$COMPUTED_DATA_PATH"
+
+    echo "TRAINING ON $(wc -l "$COMPUTED_DATA_PATH") SAMPLES"
+}
+
+#######################################
+# Clears and remakes the temporary directory where
+# artifacts, such as checkpoints and logs, are stored
+# during training.
+# Globals:
+#   CHECKPOINTS_DIR
+# Arguments:
+#   None
+# Returns:
+#   writes location of checkpoints dir to standard out.
+#######################################
+function _cleanup_saved_checkpoints() {
+    echo "CLEARING CHECKPOINTS: $CHECKPOINTS_DIR"
+    rm -rf "$CHECKPOINTS_DIR"
+    mkdir "$CHECKPOINTS_DIR"
+}
+
+#######################################
+# Test most common training parameters with Granite
+# and Flash Attention enabled
+# Globals:
+#   NUM_GPUS
+#   MODEL_NAME
+#   COMPUTED_DATA_PATH
+#   CHECKPOINTS_DIR
+#   DISTRIBUTED_FRAMEWORK
+#   MAX_BATCH_LEN
+# Arguments:
+#   None
+# Returns:
+#   None
+#######################################
+function test_standard_loop () {
+    torchrun \
+        --standalone \
+        --nproc_per_node="$NUM_GPUS" \
+        main_ds.py \
+        --model_name_or_path="$MODEL_NAME" \
+        --data_path="$COMPUTED_DATA_PATH" \
+        --output_dir="$CHECKPOINTS_DIR" \
+        --num_epochs=1 \
+        --effective_batch_size=128 \
+        --save_samples=0 \
+        --checkpoint_at_epoch \
+        --accelerate_full_state_at_epoch \
+        --distributed_training_framework="$DISTRIB_FRAMEWORK" \
+        --max_batch_len="$MAX_BATCH_LEN" \
+        --is_granite
+}
+
+#######################################
+# Test most common training parameters with Flash Attention
+# enabled but without Granite
+# Globals:
+#   NUM_GPUS
+#   MODEL_NAME
+#   COMPUTED_DATA_PATH
+#   CHECKPOINTS_DIR
+#   DISTRIBUTED_FRAMEWORK
+#   MAX_BATCH_LEN
+# Arguments:
+#   None
+# Returns:
+#   None
+#######################################
+function test_standard_loop_nongranite () {
+    torchrun \
+        --standalone \
+        --nproc_per_node="$NUM_GPUS" \
+        main_ds.py \
+        --model_name_or_path="$MODEL_NAME" \
+        --data_path="$COMPUTED_DATA_PATH" \
+        --output_dir="$CHECKPOINTS_DIR" \
+        --num_epochs=1 \
+        --effective_batch_size=128 \
+        --save_samples=0 \
+        --checkpoint_at_epoch \
+        --accelerate_full_state_at_epoch \
+        --distributed_training_framework="$DISTRIB_FRAMEWORK" \
+        --max_batch_len="$MAX_BATCH_LEN"
+        # --is_granite \
+}
+
+#######################################
+# Test most common training parameters without using
+# Granite or Flash Attention
+# Globals:
+#   NUM_GPUS
+#   MODEL_NAME
+#   COMPUTED_DATA_PATH
+#   CHECKPOINTS_DIR
+#   DISTRIBUTED_FRAMEWORK
+#   MAX_BATCH_LEN
+# Arguments:
+#   None
+# Returns:
+#   None
+#######################################
+function test_standard_loop_noflashattention_nogranite () {
+    torchrun \
+        --standalone \
+        --nproc_per_node="$NUM_GPUS" \
+        main_ds.py \
+        --model_name_or_path="$MODEL_NAME" \
+        --data_path="$COMPUTED_DATA_PATH" \
+        --output_dir="$CHECKPOINTS_DIR" \
+        --num_epochs=1 \
+        --effective_batch_size=128 \
+        --save_samples=0 \
+        --checkpoint_at_epoch \
+        --accelerate_full_state_at_epoch \
+        --distributed_training_framework="$DISTRIB_FRAMEWORK" \
+        --max_batch_len="$MAX_BATCH_LEN" \
+        --disable_flash_attn
+        # --is_granite
+}
+
+function main () {
+
+    setup_tmpdir
+    trap "rm -rf $TMP_DIR" EXIT
+
+    #NOTE (jkunstle): script is run as though it's
+    # in the same source dir as main_ds and data_process.
+    cd "$CORRECT_WORKING_DIR"
+    echo "CURRENT WORKING DIRECTORY: $(pwd)"
+
+    prepare_data
+    test_standard_loop_noflashattention_nogranite
+    _cleanup_saved_checkpoints
+    test_standard_loop_nongranite
+    _cleanup_saved_checkpoints
+    test_standard_loop
+}
+
+main

From 391a9dcce27e2848a6ee6bcd78425f424482bcd9 Mon Sep 17 00:00:00 2001
From: Nathan Weinberg
Date: Thu, 24 Oct 2024 12:58:53 -0400
Subject: [PATCH 12/48] ci: use org variable for AWS EC2 AMI in E2E CI jobs

Signed-off-by: Nathan Weinberg
---
 .github/workflows/e2e-nvidia-a10g-x1.yml | 2 +-
 .github/workflows/e2e-nvidia-l40s-x4.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/e2e-nvidia-a10g-x1.yml b/.github/workflows/e2e-nvidia-a10g-x1.yml
index 075039ba..f145ebd8 100644
--- a/.github/workflows/e2e-nvidia-a10g-x1.yml
+++ b/.github/workflows/e2e-nvidia-a10g-x1.yml
@@ -54,7 +54,7 @@ jobs:
         with:
           mode: start
           github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          ec2-image-id: ami-01a89eee1adde309c
+          ec2-image-id: ${{ vars.AWS_EC2_AMI }}
           ec2-instance-type: g5.4xlarge
           subnet-id: subnet-02d230cffd9385bd4
           security-group-id: sg-06300447c4a5fbef3
diff --git a/.github/workflows/e2e-nvidia-l40s-x4.yml b/.github/workflows/e2e-nvidia-l40s-x4.yml
index b75c8cc5..4620d327 100644
--- a/.github/workflows/e2e-nvidia-l40s-x4.yml
+++ b/.github/workflows/e2e-nvidia-l40s-x4.yml
@@ -32,7 +32,7 @@ jobs:
         with:
           mode: start
           github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          ec2-image-id: ami-01a89eee1adde309c
+          ec2-image-id: ${{ vars.AWS_EC2_AMI }}
           ec2-instance-type: g6e.12xlarge
           subnet-id: subnet-024298cefa3bedd61
           security-group-id: sg-06300447c4a5fbef3

From 97bfd39a33a671cbf4d749560f1911a859cf3ae1 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 25 Oct 2024 12:37:06 +0000
Subject: [PATCH 13/48] build(deps): Bump actions/setup-python from 5.2.0 to
 5.3.0

Bumps [actions/setup-python](https://github.com/actions/setup-python) from 5.2.0 to 5.3.0.
- [Release notes](https://github.com/actions/setup-python/releases)
- [Commits](https://github.com/actions/setup-python/compare/f677139bbe7f9c59b41e40162b753c062f5d49a3...0b93645e9fea7318ecaed2b359559ac225c90a2b)

---
updated-dependencies:
- dependency-name: actions/setup-python
  dependency-type: direct:production
  update-type: version-update:semver-minor
...
Signed-off-by: dependabot[bot]
---
 .github/workflows/lint.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 2261f879..22a5306d 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -69,7 +69,7 @@ jobs:
           fetch-depth: 0
 
       - name: Setup Python 3.11
-        uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
+        uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
         with:
           python-version: 3.11
           cache: pip

From d912c2cc2fa8a41284851944fe4b2066fb85aef9 Mon Sep 17 00:00:00 2001
From: Nathan Weinberg
Date: Fri, 25 Oct 2024 10:41:49 -0400
Subject: [PATCH 14/48] ci: convert med E2E CI job to L4 GPU

also adds '-v' to 'pip install' so we can see
environment variable info for debugging issues
related to installation

Signed-off-by: Nathan Weinberg
---
 .github/mergify.yml                              |  6 +++---
 ...e-nvidia-a10g-x1.yml => e2e-nvidia-l4-x1.yml} | 16 ++++++++--------
 README.md                                        |  2 +-
 3 files changed, 12 insertions(+), 12 deletions(-)
 rename .github/workflows/{e2e-nvidia-a10g-x1.yml => e2e-nvidia-l4-x1.yml} (94%)

diff --git a/.github/mergify.yml b/.github/mergify.yml
index 404565f2..e9b9f510 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -28,18 +28,18 @@ pull_request_rules:
       # e2e medium workflow
       - or:
           - and:
-              # note this should match the triggering criteria in 'e2e-nvidia-a10g-x1.yml'
+              # note this should match the triggering criteria in 'e2e-nvidia-l4-x1.yml'
               - check-success~=e2e-medium-workflow-complete
               - or:
                   - files~=\.py$
                   - files=pyproject.toml
                   - files~=^requirements.*\.txt$
-                  - files=.github/workflows/e2e-nvidia-a10g-x1.yml
+                  - files=.github/workflows/e2e-nvidia-l4-x1.yml
           - and:
               - -files~=\.py$
               - -files=pyproject.toml
               - -files~=^requirements.*\.txt$
-              - -files=.github/workflows/e2e-nvidia-a10g-x1.yml
+              - -files=.github/workflows/e2e-nvidia-l4-x1.yml
 
       # code lint workflow
       - or:
diff --git a/.github/workflows/e2e-nvidia-a10g-x1.yml b/.github/workflows/e2e-nvidia-l4-x1.yml
similarity index 94%
rename from .github/workflows/e2e-nvidia-a10g-x1.yml
rename to .github/workflows/e2e-nvidia-l4-x1.yml
index f145ebd8..5453ff3a 100644
--- a/.github/workflows/e2e-nvidia-a10g-x1.yml
+++ b/.github/workflows/e2e-nvidia-l4-x1.yml
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
-name: E2E (NVIDIA A10G x1)
+name: E2E (NVIDIA L4 x1)
 
 on:
   # run against every merge commit to 'main' and release branches
@@ -18,7 +18,7 @@ on:
       - '**.py'
       - 'pyproject.toml'
       - 'requirements**.txt'
-      - '.github/workflows/e2e-nvidia-a10g-x1.yml' # This workflow
+      - '.github/workflows/e2e-nvidia-l4-x1.yml' # This workflow
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
@@ -55,7 +55,7 @@ jobs:
           mode: start
           github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
           ec2-image-id: ${{ vars.AWS_EC2_AMI }}
-          ec2-instance-type: g5.4xlarge
+          ec2-instance-type: g6.8xlarge
           subnet-id: subnet-02d230cffd9385bd4
           security-group-id: sg-06300447c4a5fbef3
           iam-role-name: instructlab-ci-runner
@@ -117,19 +117,19 @@ jobs:
           nvidia-smi
           python3.11 -m pip cache remove llama_cpp_python
 
-          CMAKE_ARGS="-DLLAMA_CUDA=on" python3.11 -m pip install .
+          CMAKE_ARGS="-DLLAMA_CUDA=on" python3.11 -m pip install -v .
           # https://github.com/instructlab/instructlab/issues/1821
           # install with Torch and build dependencies installed
-          python3.11 -m pip install packaging wheel setuptools-scm
-          python3.11 -m pip install .[cuda] -r requirements-vllm-cuda.txt
+          python3.11 -m pip install -v packaging wheel setuptools-scm
+          python3.11 -m pip install -v .[cuda] -r requirements-vllm-cuda.txt
 
       - name: Update instructlab-training library
         working-directory: ./training
         run: |
           . ../instructlab/venv/bin/activate
-          pip install .
-          pip install .[cuda]
+          pip install -v .
+          pip install -v .[cuda]
 
       - name: Check disk
         run: |
diff --git a/README.md b/README.md
index be396a64..27219af9 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 ![Release](https://img.shields.io/github/v/release/instructlab/training)
 ![License](https://img.shields.io/github/license/instructlab/training)
 
-![`e2e-nvidia-a10g-x1.yml` on `main`](https://github.com/instructlab/training/actions/workflows/e2e-nvidia-a10g-x1.yml/badge.svg?branch=main)
+![`e2e-nvidia-l4-x1.yml` on `main`](https://github.com/instructlab/training/actions/workflows/e2e-nvidia-l4-x1.yml/badge.svg?branch=main)
 ![`e2e-nvidia-l40s-x4.yml` on `main`](https://github.com/instructlab/training/actions/workflows/e2e-nvidia-l40s-x4.yml/badge.svg?branch=main)
 
 - [Installing](#installing-the-library)

From 2cecef77f8548a3a53c908b2fe912253f9a8c41f Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 25 Oct 2024 14:54:57 +0000
Subject: [PATCH 15/48] build(deps): Bump actions/checkout from 4.2.1 to 4.2.2

Bumps [actions/checkout](https://github.com/actions/checkout) from 4.2.1 to 4.2.2.
- [Release notes](https://github.com/actions/checkout/releases)
- [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md)
- [Commits](https://github.com/actions/checkout/compare/eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871...11bd71901bbe5b1630ceea73d27597364c9af683)

---
updated-dependencies:
- dependency-name: actions/checkout
  dependency-type: direct:production
  update-type: version-update:semver-patch
...
Signed-off-by: dependabot[bot]
---
 .github/workflows/actionlint.yml         | 2 +-
 .github/workflows/docs.yml               | 2 +-
 .github/workflows/e2e-nvidia-l4-x1.yml   | 4 ++--
 .github/workflows/e2e-nvidia-l40s-x4.yml | 4 ++--
 .github/workflows/lint.yml               | 2 +-
 .github/workflows/pypi.yaml              | 2 +-
 6 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml
index e0cdafea..2fa5465b 100644
--- a/.github/workflows/actionlint.yml
+++ b/.github/workflows/actionlint.yml
@@ -35,7 +35,7 @@ jobs:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
 
       - name: "Checkout"
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
           fetch-depth: 0
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index d778c928..79e1ac6b 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -37,7 +37,7 @@ jobs:
         with:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
       - name: "Checkout"
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
           fetch-depth: 0
       - name: "Check Markdown documents"
diff --git a/.github/workflows/e2e-nvidia-l4-x1.yml b/.github/workflows/e2e-nvidia-l4-x1.yml
index 5453ff3a..fc701989 100644
--- a/.github/workflows/e2e-nvidia-l4-x1.yml
+++ b/.github/workflows/e2e-nvidia-l4-x1.yml
@@ -84,7 +84,7 @@ jobs:
           sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel
 
       - name: Checkout instructlab/instructlab
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
           repository: "instructlab/instructlab"
           path: "instructlab"
@@ -92,7 +92,7 @@ jobs:
           fetch-depth: 0
 
       - name: Checkout instructlab/training
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
           repository: "instructlab/training"
           path: "training"
diff --git a/.github/workflows/e2e-nvidia-l40s-x4.yml b/.github/workflows/e2e-nvidia-l40s-x4.yml
index 4620d327..c1f8a7f9 100644
--- a/.github/workflows/e2e-nvidia-l40s-x4.yml
+++ b/.github/workflows/e2e-nvidia-l40s-x4.yml
@@ -60,7 +60,7 @@ jobs:
           sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel
 
       - name: Checkout instructlab/instructlab
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
           repository: "instructlab/instructlab"
           path: "instructlab"
@@ -68,7 +68,7 @@ jobs:
           fetch-depth: 0
 
       - name: Checkout instructlab/training
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
           repository: "instructlab/training"
           path: "training"
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 2261f879..425d79da 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -63,7 +63,7 @@ jobs:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
 
       - name: "Checkout"
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
           # https://github.com/actions/checkout/issues/249
           fetch-depth: 0
diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml
index 6b790e16..b5befbb4 100644
--- a/.github/workflows/pypi.yaml
+++ b/.github/workflows/pypi.yaml
@@ -42,7 +42,7 @@ jobs:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
 
       - name: "Checkout"
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
           # for setuptools-scm
           fetch-depth: 0

From 03d1b624ac2dcc4eed00d459f5e134c37a7e23fd Mon Sep 17 00:00:00 2001
From: Aldo Pareja <7622817+aldopareja@users.noreply.github.com>
Date: Fri, 25 Oct 2024 13:00:15 -0400
Subject: [PATCH 16/48] Implementing HF Padding-Free and GraniteLM Support
 (#257)

Updating the data collator for models with HF padding-free support,
adding support for upcoming Granite HF model class, and updating
flags/interface accordingly.

------------------------------------------------

* only compute lengths in the token dataset when it's not already present in the dataset

Signed-off-by: aldo pareja-cardona

* Refactor padding function to support position_ids for FlashAttention

- Added `supports_flash_attention` function to check GPU compatibility for FlashAttention.
- Updated `make_collate_fn` to return `position_ids` instead of `attention_mask` when FlashAttention is supported.
- Integrated the new padding logic into `setup_dataloader` to ensure compatibility with both Granite and non-Granite configurations.
- Ensured backward compatibility by maintaining the original padding logic for GPUs that do not support FlashAttention.
- Updated `main_ds.py` to use the new `supports_flash_attention` check for determining padding strategy.

Signed-off-by: aldo pareja-cardona

* logging the global gradnorm now

Signed-off-by: aldo pareja-cardona

* fixing deepspeed because it's not working with the scheduler we want

Signed-off-by: aldo pareja-cardona

* fixing accelerate lr_scheduler

Signed-off-by: aldo pareja-cardona

* fixing accelerate lr_scheduler

Signed-off-by: aldo pareja-cardona

* samples seen was broken because now the samples are a single line

Signed-off-by: aldo pareja-cardona

* find packing is wrong because when flash attention is supported padding should not be used when building the buckets

Signed-off-by: aldo pareja-cardona

* black formatting

Signed-off-by: aldo pareja-cardona

* it should not fail on granite 8b models anymore

Signed-off-by: aldo pareja-cardona

* linting

Signed-off-by: aldo pareja-cardona

* linting

Signed-off-by: aldo pareja-cardona

* bug on padding when creating the multipack sampler

Signed-off-by: aldo pareja-cardona

* linter

Signed-off-by: aldo pareja-cardona

* linter

Signed-off-by: aldo pareja-cardona

* Change old padding-free and granite flags to use_dolomite

Signed-off-by: Mustafa Eyceoz

* Add safeguards and checks for flash attention when enabled/disabled

Signed-off-by: Mustafa Eyceoz

* Rework flash attention checks for better modularity

Signed-off-by: Mustafa Eyceoz

* Fix arg name

Signed-off-by: Mustafa Eyceoz

* Update transformers to a version with Granite model class

Signed-off-by: Mustafa Eyceoz

* Adding stateguards for dolomite and granite and model path check

Signed-off-by: Mustafa Eyceoz

* Missing update

Signed-off-by: Mustafa Eyceoz

* Clean up early validation checks and move to utils

Signed-off-by: Mustafa Eyceoz

* Fix spelling mistake

Signed-off-by: Mustafa Eyceoz

* Include AMD in flash attn check

Signed-off-by: Mustafa Eyceoz

* Re-add is_padding_free with deprecation warning

Signed-off-by: Mustafa Eyceoz

* Make use_dolomite default false

Signed-off-by: Mustafa Eyceoz

* this is needed because the tag is too common and some datasets will fail

Signed-off-by: Mustafa Eyceoz

* added a warning in case the special tokens used for data processing are present in the dataset

Signed-off-by: Mustafa Eyceoz

* added a warning in case the special tokens used for data processing are present in the dataset

Signed-off-by: Mustafa Eyceoz

* Update valid data filter

Signed-off-by: Mustafa Eyceoz

* Fix ruff formatting

Signed-off-by: Mustafa Eyceoz

* Apply review feedback

Signed-off-by: Mustafa Eyceoz

* Added comments

Signed-off-by: Mustafa Eyceoz

---------

Signed-off-by: aldo pareja-cardona
Signed-off-by: Mustafa Eyceoz
Co-authored-by: aldo pareja-cardona
Co-authored-by: Mustafa Eyceoz
---
 requirements.txt                          |   2 +-
 src/instructlab/training/config.py        |   3 +-
 src/instructlab/training/data_process.py  |  25 ++-
 src/instructlab/training/main_ds.py       |  64 +++---
 src/instructlab/training/token_dataset.py |  25 ++-
 src/instructlab/training/utils.py         | 225 ++++++++++++++++------
 6 files changed, 237 insertions(+), 107 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index b31fbdcb..2d77d1e1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,7 +5,7 @@ py-cpuinfo
 # we set this to be above 0a0 so that it doesn't
 # replace custom pytorch images with the 2.3.0
 torch>=2.3.0a0
-transformers>=4.41.2
+transformers>=4.45.2
 accelerate>=0.34.2
 datasets>=2.15.0
 numba
diff --git a/src/instructlab/training/config.py b/src/instructlab/training/config.py
index 05fe4792..51e9752e 100644
--- a/src/instructlab/training/config.py
+++ b/src/instructlab/training/config.py
@@ -171,8 +171,9 @@ class TrainingArgs(BaseModel):
     save_samples: int
     learning_rate: float
     warmup_steps: int
-    is_padding_free: bool
     random_seed: int = 42
+    use_dolomite: bool = False
+    is_padding_free: bool = False  # TODO: deprecate
     checkpoint_at_epoch: bool = True
     accelerate_full_state_at_epoch: bool = True
diff --git a/src/instructlab/training/data_process.py b/src/instructlab/training/data_process.py
index 4bd7c789..10214e9d 100644
--- a/src/instructlab/training/data_process.py
+++ b/src/instructlab/training/data_process.py
@@ -199,7 +199,7 @@ def print_masked_samples(data, tokenizer, is_pretrain, num_proc):
     def get_masked_and_orig_text(sample):
         labels = sample["labels"]
         input_ids = sample["input_ids"]
-        mask_id = get_sp_token(tokenizer, "<MASK>")[0]
+        mask_id = get_sp_token(tokenizer, "<|MASK|>")[0]
         label = [mask_id if tk == -100 else tk for tk in labels]
         text = tokenizer.decode(label)
         orig_text = tokenizer.decode(input_ids)
@@ -239,7 +239,7 @@ def main(args: DataProcessArgs):
 
     # Adding after tokenizer setup as these are temp tokens, not to be saved
     tokenizer.add_special_tokens(
-        {"additional_special_tokens": ["<|pretrain|>", "<|/pretrain|>", "<MASK>"]}
+        {"additional_special_tokens": ["<|pretrain|>", "<|/pretrain|>", "<|MASK|>"]}
     )
 
     try:
@@ -347,9 +347,26 @@ def main(args: DataProcessArgs):
     )
 
     # extract only labels and messages formatted into a new dataset
-    data_with_labels = data_with_labels.select_columns(["labels", "input_ids"])
+    data_with_labels = data_with_labels.map(
+        lambda x: {
+            "len": len(x["input_ids"]),
+        },
+        num_proc=NUM_PROC,
+    )
+    data_with_labels = data_with_labels.select_columns(["labels", "input_ids", "len"])
+
+    # MASK and both pretrain tokens should not be in the final tokens, those are special tokens added only for data processing purposes.
+    max_id = len(tokenizer) - 3
+    final_valid_data = data_with_labels.filter(
+        lambda x: all(tk < max_id for tk in x["labels"]), num_proc=NUM_PROC
+    )
+
+    # Dropping samples that could break training due to oob ids
+    if len(final_valid_data) < len(data_with_labels):
+        dropped_samples = len(data_with_labels) - len(final_valid_data)
+        print(
+            f"\033[93mWarning: {dropped_samples} samples were dropped because they contained token IDs greater than or equal to {max_id}.\033[0m"
+        )
+
     # use path to get the stem of the file
-    data_with_labels.to_json(Path(args.data_output_path) / f"data.jsonl")
+    final_valid_data.to_json(Path(args.data_output_path) / "data.jsonl")
 
 
 if __name__ == "__main__":
diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py
index d1ae0e01..ab59282f 100644
--- a/src/instructlab/training/main_ds.py
+++ b/src/instructlab/training/main_ds.py
@@ -41,8 +41,10 @@
     StreamablePopen,
     add_noisy_embeddings,
     apply_gradient_checkpointing,
+    check_flash_attn_enabled,
+    check_valid_train_args,
     convert_loss_to_reduce_sum,
-    ensure_loadable_granite_checkpoint,
+    ensure_loadable_dolomite_checkpoint,
     get_projection_layer_names,
     load_latest_full_state,
     prepare_peft_model,
@@ -84,7 +86,7 @@ def setup_optimizer(args, model):
     return optimizer
 
 
-def setup_model(args, tokenizer, train_loader, grad_accum):
+def setup_model(args, tokenizer, train_loader, grad_accum, flash_enabled):
     bnb_config = None
     if args.lora_r > 0 and args.lora_quant_bits == 4:
         # Third Party
@@ -102,15 +104,11 @@ def setup_model(args, tokenizer, train_loader, grad_accum):
         "torch_dtype": torch.bfloat16,
         "quantization_config": bnb_config,
     }
-    if not args.disable_flash_attn:
+    if flash_enabled:
         base_model_args["attn_implementation"] = "flash_attention_2"
-    elif args.is_granite:
-        raise RuntimeError(
-            "ERROR: Trying to use padding-free transformer without flash attention is not supported"
-        )
 
-    if args.is_granite:
-        with ensure_loadable_granite_checkpoint(
+    if args.use_dolomite:
+        with ensure_loadable_dolomite_checkpoint(
             args.model_name_or_path, args.output_dir
         ) as path:
             base_model_args["pretrained_model_name_or_path"] = path
@@ -165,9 +163,10 @@ def setup_model(args, tokenizer, train_loader, grad_accum):
         "Starcoder2ForCausalLM",
         "GemmaForCausalLM",
         "MixtralForCausalLM",
+        "GraniteForCausalLM",
     ], f"Model class name: {model.__class__.__name__} is not supported."
-    model = convert_loss_to_reduce_sum(model, is_granite=args.is_granite)
+    model = convert_loss_to_reduce_sum(model, use_dolomite=args.use_dolomite)
     model = add_noisy_embeddings(model, noise_alpha=args.NEFTune_alpha)
 
     # handling of gradient checkpointing
@@ -212,15 +211,15 @@ def setup_model(args, tokenizer, train_loader, grad_accum):
             target_modules=args.lora_target_modules,
         )
         model = prepare_peft_model(
-            model, peft_config, gradient_checkpointing=not args.is_granite
+            model, peft_config, gradient_checkpointing=not args.use_dolomite
         )
 
-    elif not args.is_granite:
+    elif not args.use_dolomite:
         model.gradient_checkpointing_enable()
 
     # granite gradient checkpointing is handled uniformly
     # for both lora and full here
-    if args.is_granite:
+    if args.use_dolomite:
         block_name = model._no_split_modules[0]
         apply_gradient_checkpointing(
             model,
@@ -252,6 +251,9 @@ def make_inputs_require_grad(module, input, output):
         deepcopy(train_loader),
         lr_scheduler,
     )
+    # Necessary so that Accelerate does not step once per GPU
+    # see https://github.com/huggingface/accelerate/blob/127818fc27ebe5cb236357fff59ff1748326d643/src/accelerate/scheduler.py#L69
+    lr_scheduler.split_batches = True
     return model, lr_scheduler, optimizer, accelerator
 
 
@@ -381,8 +383,8 @@ def train(
             num_loss_counted_tokens = float(
                 torch.tensor([batch.pop("num_loss_counted_tokens")])
             )
-            micro_batch_size = float(len(batch["input_ids"]))
-            if not args.is_granite:
+            micro_batch_size = float(torch.tensor([batch.pop("num_samples")]))
+            if not args.use_dolomite:
                 for k in batch:
                     batch[k] = batch[k].to(local_rank)
             output = model(
@@ -453,7 +455,7 @@ def train(
                         "batch_size": int(micro_batch_size),
                         "total_loss": float(log_loss / num_loss_counted_tokens),
                         "samples_seen": samples_seen,
-                        # "gradnorm": global_grad_norm,
+                        "gradnorm": global_grad_norm,
                         # "weight_norm": weight_norm,
                     }
                 )
@@ -535,6 +537,8 @@ def main(args):
         torch.distributed.all_reduce(tensor)
         torch.distributed.barrier()
 
+    flash_enabled = check_flash_attn_enabled(args.disable_flash_attn, args.use_dolomite)
+
     dataset = setup_dataset(
         args.data_path,
         mock=args.mock_data,
@@ -547,7 +551,7 @@ def main(args):
         avg_sample_len=dataset.get_lengths().mean(),
         effective_batch_size=args.effective_batch_size,
         max_batch_len_per_gpu=args.max_batch_len,
-        is_padding=not args.is_granite,
+        is_padding=not (args.use_dolomite or flash_enabled),
         dataset=dataset,
         seed=args.seed,
     )
@@ -570,7 +574,8 @@ def main(args):
         dataset,
         tokenizer.pad_token_id,
         num_workers=8,
-        is_granite=args.is_granite,
+        use_dolomite=args.use_dolomite,
+        flash_enabled=flash_enabled,
         max_batch_len=args.max_batch_len,
         packing_max_batch_len=packing_max_batch_len,
         samples_per_gpu=args.samples_per_gpu,
@@ -589,7 +594,8 @@ def main(args):
             dataset,
             tokenizer.pad_token_id,
             num_workers=8,
-            is_granite=args.is_granite,
+            use_dolomite=args.use_dolomite,
+            flash_enabled=flash_enabled,
             max_batch_len=args.max_batch_len,
             packing_max_batch_len=packing_max_batch_len,
             samples_per_gpu=args.samples_per_gpu,
@@ -613,7 +619,7 @@ def main(args):
     )
 
     model, lr_scheduler, optimizer, accelerator = setup_model(
-        args, tokenizer, train_loader, grad_accum
+        args, tokenizer, train_loader, grad_accum, flash_enabled
     )
 
     load_latest_full_state(args=args, accelerator=accelerator)
@@ -639,11 +645,7 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
     """
     Wrapper around the main training job that calls torchrun.
""" - # early validation logic here - if train_args.max_batch_len < train_args.max_seq_len: - raise ValueError( - f"the `max_batch_len` cannot be less than `max_seq_len`: {train_args.max_batch_len=} < {train_args.max_seq_len=}" - ) + check_valid_train_args(train_args) if train_args.process_data: dp.main( @@ -697,14 +699,10 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None: if train_args.mock_len: command.append(f"--mock_len={train_args.mock_len}") - if train_args.is_padding_free: - command.append("--is_granite") + if train_args.use_dolomite: + command.append("--use_dolomite") if train_args.disable_flash_attn: - if train_args.is_padding_free: - raise RuntimeError( - "ERROR: Trying to use padding-free transformer without flash attention is not supported" - ) command.append("--disable_flash_attn") if train_args.lora: @@ -888,7 +886,7 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None: default="SHARD_GRAD_OP", help="Sharding strategy to be used for FSDP distributed training.", ) - parser.add_argument("--is_granite", action="store_true") + parser.add_argument("--use_dolomite", action="store_true") parser.add_argument("--lora_r", type=int, default=0) # set to > 0 to activate lora parser.add_argument("--lora_alpha", type=int, default=32) parser.add_argument("--lora_dropout", type=float, default=0.1) @@ -977,7 +975,7 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None: --save_samples=250000 \ --log_level="INFO" \ --fsdp_sharding_strategy="SHARD_GRAD_OP" \ ---is_granite \ +--use_dolomite \ --max_batch_len 70000 \ --seed=42 """ diff --git a/src/instructlab/training/token_dataset.py b/src/instructlab/training/token_dataset.py index 9d46607e..fda9a751 100644 --- a/src/instructlab/training/token_dataset.py +++ b/src/instructlab/training/token_dataset.py @@ -17,12 +17,15 @@ class TokenDataset(Dataset): def __init__(self, data_path): self.data = load_dataset("json", data_files=data_path, split="train") - self.lengths = np.array( - self.data.map( - lambda x: {"len": len(x["input_ids"])}, - num_proc=8, - )["len"] - ) + if "len" not in self.data.column_names: + self.lengths = np.array( + self.data.map( + lambda x: {"len": len(x["input_ids"])}, + num_proc=8, + )["len"] + ) + else: + self.lengths = np.array(self.data["len"]) def __len__(self): return len(self.data) @@ -87,7 +90,8 @@ def setup_dataloader( dataset: Dataset, pad_token_id: int, num_workers: int = 8, - is_granite=False, + use_dolomite=False, + flash_enabled=True, max_batch_len=60000, packing_max_batch_len=60000, samples_per_gpu=None, @@ -95,7 +99,10 @@ def setup_dataloader( seed=47, ) -> DataLoader: collate_fn = make_collate_fn( - pad_token_id, is_granite=is_granite, max_batch_len=max_batch_len + pad_token_id, + use_dolomite=use_dolomite, + flash_enabled=flash_enabled, + max_batch_len=max_batch_len, ) rank = int(os.environ["RANK"]) world_size = int(os.environ["WORLD_SIZE"]) @@ -108,7 +115,7 @@ def setup_dataloader( num_replicas=world_size, rank=rank, seed=seed, - padding=not is_granite, + padding=not flash_enabled, ) sampler = {"batch_sampler": sampler} elif sampler == "distributed": diff --git a/src/instructlab/training/utils.py b/src/instructlab/training/utils.py index 6d79d897..d685d212 100644 --- a/src/instructlab/training/utils.py +++ b/src/instructlab/training/utils.py @@ -10,6 +10,7 @@ from typing import Any, List, Optional import importlib import inspect +import json import logging import os import random @@ -40,6 +41,44 @@ import torch import 
torch.nn.functional as F +# First Party +from instructlab.training.config import TrainingArgs + + +def check_valid_train_args(train_args: TrainingArgs): + # early validation logic here + if train_args.max_batch_len < train_args.max_seq_len: + raise ValueError( + f"the `max_batch_len` cannot be less than `max_seq_len`: {train_args.max_batch_len=} < {train_args.max_seq_len=}" + ) + + if os.path.exists(train_args.model_path): + if not os.path.isdir(train_args.model_path): + raise FileNotFoundError( + "Model path does not appear to be a directory. Please make sure that you're passing a Hugging Face Transformers compatible directory checkpoint." + ) + else: + raise FileNotFoundError( + f"Provided path to model does not exist. Please make sure that you've passed a valid model and that it has appropriate permissions: {train_args.model_path}" + ) + + if train_args.use_dolomite: + with open(Path(train_args.model_path) / "config.json") as conf_json: + model_conf = json.load(conf_json) + if model_conf["model_type"] == "granite": + raise RuntimeError( + "Converting Granite models to Dolomite format is currently unsupported." + ) + if train_args.disable_flash_attn: + raise RuntimeError( + "ERROR: Trying to use dolomite padding-free transformer without flash attention is not supported" + ) + + if train_args.is_padding_free: + print( + "\033[33m WARNING: is_padding_free is being deprecated due to adoption of the default padding-free support in Hugging Face Transformers. As such, this flag is non-functional in 0.6.0 and beyond. If you would like to use the older Dolomite padding-free implementation, please set use_dolomite moving forward.\033[0m" + ) + def retrieve_chat_template(chat_tmpl_path): try: @@ -111,9 +150,39 @@ def listen(self): break -def make_collate_fn(pad_token_id, is_granite=False, max_batch_len=60000): +def supports_flash_attention(device_id=0): + """Check if a GPU supports FlashAttention.""" + major, minor = torch.cuda.get_device_capability(device_id) + # Check if the GPU architecture is Ampere (SM 8.x) or newer (SM 9.0) + is_sm8x = major == 8 and minor >= 0 + is_sm90 = major == 9 and minor == 0 + dev_name = torch.cuda.get_device_properties(device_id).gcnArchName.split(":")[0] + is_compat_amd = dev_name in ("gfx90a", "gfx940", "gfx941", "gfx942") + return is_sm8x or is_sm90 or is_compat_amd + + +def check_flash_attn_enabled(disable_flash_attn: bool, use_dolomite: bool) -> bool: + if not disable_flash_attn: + if supports_flash_attention(): + flash_enabled = True + else: + raise RuntimeError( + "ERROR: Trying to use Flash Attention on unsupported hardware. Please set disable_flash_attn to True." 
+            )
+    elif use_dolomite:
+        raise RuntimeError(
+            "ERROR: Trying to use dolomite padding-free transformer without flash attention is not supported"
+        )
+    else:
+        flash_enabled = False
+    return flash_enabled
+
+
+def make_collate_fn(
+    pad_token_id, use_dolomite=False, flash_enabled=True, max_batch_len=60000
+):
     rank = int(os.environ["RANK"])
-    if is_granite:
+    if use_dolomite:
 
         def pad_collate_fn(batch):
             lens = np.array([len(item["input_ids"]) for item in batch])
@@ -140,70 +209,108 @@ def pad_collate_fn(batch):
                 "input_ids": input_ids,
                 "labels": labels,
                 "num_loss_counted_tokens": num_loss_counted_tokens,
+                "num_samples": len(batch),
             }
 
     else:
+        if flash_enabled:
+
+            def pad_collate_fn(batch):
+                input_ids = []
+                labels = []
+                position_ids = []
+                total_len = 0
+                num_loss_counted_tokens = 0
+
+                for num_samples, item in enumerate(batch):
+                    item_len = len(item["input_ids"])
+                    if total_len + item_len > max_batch_len:
+                        break
+
+                    input_ids.extend(item["input_ids"].tolist())
+                    labels.extend(item["labels"].tolist())
+                    position_ids.extend(range(total_len, total_len + item_len))
+
+                    total_len += item_len
+                    num_loss_counted_tokens += (item["labels"] != -100).sum().item()
+
+                print(
+                    f"\033[96m total length: {total_len} "
+                    f"num samples {len(batch)} - rank: {rank} "
+                    f"num_loss_counted_tokens: {num_loss_counted_tokens}\033[0m"
+                )
-
-        def pad_collate_fn(batch):
-            lens = np.array([len(item["input_ids"]) for item in batch])
-            max_len = max(lens)
-
-            input_ids = torch.stack(
-                [
-                    F.pad(
-                        item["input_ids"],
-                        (max_len - len(item["input_ids"]), 0),
-                        mode="constant",
-                        value=pad_token_id,
-                    )
-                    for item in batch
-                ]
-            )
-            labels = torch.stack(
-                [
-                    F.pad(
-                        item["labels"],
-                        (max_len - len(item["labels"]), 0),
-                        mode="constant",
-                        value=-100,
-                    )
-                    for item in batch
-                ]
-            )
-            num_loss_counted_tokens = (labels != -100).sum()
-
-            attention_mask = torch.stack(
-                [
-                    F.pad(
-                        item["attention_mask"],
-                        (max_len - len(item["attention_mask"]), 0),
-                        mode="constant",
-                        value=0,
-                    )
-                    for item in batch
-                ]
-            )
-            print(
-                f"\033[96m total tokens: {max_len * len(batch)} num samples: {len(batch)} num padding tokens: {max_len * len(batch) - lens.sum()} - rank: {rank} "
-                f"max len: {max_len} min len: {min(lens)} avg len: {lens.mean()} "
-                f"num_loss_counted_tokens: {num_loss_counted_tokens}\033[0m"
-            )
+
+                return {
+                    "input_ids": torch.tensor([input_ids], dtype=torch.long),
+                    "labels": torch.tensor([labels], dtype=torch.long),
+                    "position_ids": torch.tensor([position_ids], dtype=torch.long),
+                    "num_loss_counted_tokens": num_loss_counted_tokens,
+                    "num_samples": num_samples + 1,  # pylint: disable=W0631
+                }
-
-            return {
-                "input_ids": input_ids,
-                "labels": labels,
-                "num_loss_counted_tokens": num_loss_counted_tokens,
-                "attention_mask": attention_mask,
-            }
+        else:
+
+            def pad_collate_fn(batch):
+                lens = np.array([len(item["input_ids"]) for item in batch])
+                max_len = max(lens)
+
+                input_ids = torch.stack(
+                    [
+                        F.pad(
+                            item["input_ids"],
+                            (max_len - len(item["input_ids"]), 0),
+                            mode="constant",
+                            value=pad_token_id,
+                        )
+                        for item in batch
+                    ]
+                )
+                labels = torch.stack(
+                    [
+                        F.pad(
+                            item["labels"],
+                            (max_len - len(item["labels"]), 0),
+                            mode="constant",
+                            value=-100,
+                        )
+                        for item in batch
+                    ]
+                )
+                num_loss_counted_tokens = (labels != -100).sum()
+
+                attention_mask = torch.stack(
+                    [
+                        F.pad(
+                            item["attention_mask"],
+                            (max_len - len(item["attention_mask"]), 0),
+                            mode="constant",
+                            value=0,
+                        )
+                        for item in batch
+                    ]
+                )
+                print(
+                    f"\033[96m total tokens: {max_len * len(batch)} num samples: {len(batch)} num padding tokens: {max_len * len(batch) - lens.sum()} - rank: {rank} "
+                    f"max len: {max_len} min len: {min(lens)} avg len: {lens.mean()} "
+                    f"num_loss_counted_tokens: {num_loss_counted_tokens}\033[0m"
+                )
+
+                return {
+                    "input_ids": input_ids,
+                    "labels": labels,
+                    "num_loss_counted_tokens": num_loss_counted_tokens,
+                    "attention_mask": attention_mask,
+                    "num_samples": len(batch),
+                }
 
     return pad_collate_fn
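For intuition, the `flash_enabled` branch of `make_collate_fn` above packs the whole batch into a single concatenated row instead of padding every sample to the batch maximum. A toy, self-contained sketch of that idea (tensor values and the token budget are illustrative; this is not the patch's exact function):

```python
# Toy sketch of padding-free packing: concatenate samples into one row and
# carry explicit position_ids instead of padding plus an attention_mask.
import torch


def pack_collate(batch, max_batch_len=60000):
    input_ids, labels, position_ids = [], [], []
    total_len = num_loss_counted_tokens = num_samples = 0
    for item in batch:
        item_len = len(item["input_ids"])
        if total_len + item_len > max_batch_len:
            break  # skip samples that would overflow the token budget
        input_ids.extend(item["input_ids"].tolist())
        labels.extend(item["labels"].tolist())
        position_ids.extend(range(total_len, total_len + item_len))
        total_len += item_len
        num_loss_counted_tokens += (item["labels"] != -100).sum().item()
        num_samples += 1
    return {
        "input_ids": torch.tensor([input_ids], dtype=torch.long),
        "labels": torch.tensor([labels], dtype=torch.long),
        "position_ids": torch.tensor([position_ids], dtype=torch.long),
        "num_loss_counted_tokens": num_loss_counted_tokens,
        "num_samples": num_samples,
    }


batch = [
    {"input_ids": torch.tensor([5, 6, 7]), "labels": torch.tensor([-100, 6, 7])},
    {"input_ids": torch.tensor([8, 9]), "labels": torch.tensor([8, 9])},
]
print(pack_collate(batch)["input_ids"].shape)  # torch.Size([1, 5]); no pad tokens
```

Note that a later commit in this series ("Bring HF padding-free into dolomite parity") changes the packed position_ids to restart at zero for each sample rather than running cumulatively.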
 
-def convert_loss_to_reduce_sum(model, is_granite=False):
+def convert_loss_to_reduce_sum(model, use_dolomite=False):
     """
     this is necessary because multipack changes the samples per gpu, which biases the gradients to be larger for batches with less samples but longer lengths.
     """
-    if is_granite:
+    if use_dolomite:
 
         def get_autoregressive_language_modeling_loss(
             lm_logits: torch.Tensor,
@@ -489,7 +596,7 @@ class UniversalCheckpointArgs:
 
 
 @contextmanager
-def ensure_loadable_granite_checkpoint(
+def ensure_loadable_dolomite_checkpoint(
     model_name_or_path: str,
     tmpdir: str,
 ):
@@ -662,7 +769,7 @@ def save_hf_format_accelerate(
     tokenizer,
     accelerator: Accelerator,
     samples_seen,
-    convert_granite=True,
+    convert_dolomite=True,
     is_lora=False,
 ):
     log_rank_0(
@@ -672,7 +779,7 @@ def save_hf_format_accelerate(
     start = time.time()
     final_output_dir = Path(args.output_dir) / "hf_format" / f"samples_{samples_seen}"
 
-    if args.is_granite and convert_granite:
+    if args.use_dolomite and convert_dolomite:
         tmpdir = TemporaryDirectory("w")  # pylint: disable=consider-using-with
         output_dir = Path(tmpdir.name)
     else:
@@ -694,7 +801,7 @@ def _get_state_dict_patched(model, unwrap=False):
         model_state = model.module.state_dict()
 
     output_dir.mkdir(parents=True, exist_ok=True)
-    if not model.module.config.architectures and convert_granite:
+    if not model.module.config.architectures and convert_dolomite:
         model.module.config.architectures = ["LlamaForCausalLM"]
         warnings.warn(
             f"Adding architectures to ckpt: {model.module.config.architectures}",
@@ -720,7 +827,7 @@ def _get_state_dict_patched(model, unwrap=False):
         safe_serialization=True,
     )
 
-    if args.is_granite and convert_granite and accelerator.is_main_process:
+    if args.use_dolomite and convert_dolomite and accelerator.is_main_process:
         # export doesnt like the directory to exist
         if final_output_dir.exists():
             shutil.rmtree(final_output_dir)

From e00cf7ecdfb7ee3a65eac27f99a00a0695c1f510 Mon Sep 17 00:00:00 2001
From: abdullah-ibm
Date: Tue, 22 Oct 2024 12:05:43 +0300
Subject: [PATCH 17/48] added some safeguards to prevent the code from
 proceeding when the necessary imports are not available

Signed-off-by: Harthi7
Signed-off-by: abdullah-ibm
---
 src/instructlab/training/main_ds.py | 34 +++++++++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py
index d1ae0e01..1b017c82 100644
--- a/src/instructlab/training/main_ds.py
+++ b/src/instructlab/training/main_ds.py
@@ -12,10 +12,27 @@
 # Third Party
 from accelerate import Accelerator
-from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam
-from deepspeed.runtime.zero.utils import ZeRORuntimeException
+
+try:
+    from deepspeed.ops.adam import DeepSpeedCPUAdam
+except ImportError:
+    DeepSpeedCPUAdam = None
+    local_rank = int(os.getenv('LOCAL_RANK', '0'))
+    if __name__ == '__main__' and (not local_rank or local_rank == 0):
+        print("DeepSpeed CPU Optimizer is not available. Some features may be unavailable.")
+
+try:
+    from deepspeed.ops.adam import FusedAdam
+    from deepspeed.runtime.zero.utils import ZeRORuntimeException
+except ImportError:
+    FusedAdam = None
+    ZeRORuntimeException = None
+    local_rank = int(os.getenv('LOCAL_RANK', '0'))
+    if __name__ == '__main__' and (not local_rank or local_rank == 0):
+        print("DeepSpeed is not available. Some features may be unavailable.")
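The try/except fallbacks above follow a common optional-dependency pattern: bind the missing symbol to `None` at import time, then raise only when the optional feature is actually requested. A generic sketch of the same shape (the `fancy_backend` module and its optimizer are hypothetical, purely for illustration):

```python
# Generic optional-import guard; fancy_backend is a made-up package name.
import torch

try:
    from fancy_backend import FancyFusedOptimizer  # hypothetical dependency
except ImportError:
    FancyFusedOptimizer = None


def build_optimizer(params, use_fancy: bool = False):
    if use_fancy and FancyFusedOptimizer is None:
        # fail loudly at the point of use rather than at import time
        raise ImportError("the fancy backend was requested but is not installed")
    if use_fancy:
        return FancyFusedOptimizer(params)
    return torch.optim.AdamW(params)
```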
 
 # pylint: disable=no-name-in-module
+from instructlab.training.config import DistributedBackend
 from instructlab.dolomite.hf_models import GPTDolomiteForCausalLM
 from tqdm import tqdm
 from transformers import AutoModelForCausalLM, get_scheduler
@@ -513,6 +530,12 @@ def main(args):
     # Third Party
     import yaml
 
+    if args.distributed_training_framework == 'deepspeed' and not FusedAdam:
+        raise ImportError("DeepSpeed was selected but we cannot import the `FusedAdam` optimizer")
+
+    if args.distributed_training_framework == 'deeppeed' and args.cpu_offload_optimizer and not DeepSpeedCPUAdam:
+        raise ImportError("DeepSpeed was selected and CPU offloading was requested, but DeepSpeedCPUAdam could not be imported. This likely means you need to build DeepSpeed with the CPU adam flags.")
+
     metric_logger = AsyncStructuredLogger(
         args.output_dir
         + f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl"
@@ -733,6 +756,13 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
     )
 
     # deepspeed options
+    if train_args.distributed_backend == DistributedBackend.DeepSpeed:
+        if not FusedAdam:
+            raise ImportError("DeepSpeed was selected as the distributed backend, but FusedAdam could not be imported. Please double-check that DeepSpeed is installed correctly")
+
+        if train_args.deepspeed_options.cpu_offload_optimizer and not DeepSpeedCPUAdam:
+            raise ImportError("DeepSpeed CPU offloading was enabled, but DeepSpeedCPUAdam could not be imported. This is most likely because DeepSpeed was not built with CPU Adam. Please rebuild DeepSpeed to have CPU Adam, or disable CPU offloading.")
+
     if train_args.deepspeed_options.save_samples:
         command.append(f"--save_samples_ds={train_args.deepspeed_options.save_samples}")
     if train_args.deepspeed_options.cpu_offload_optimizer:

From d0e944ffbeb506565cbaba1ec62717afc2675390 Mon Sep 17 00:00:00 2001
From: abdullah-ibm
Date: Wed, 30 Oct 2024 11:54:15 +0300
Subject: [PATCH 18/48] typo fix

Signed-off-by: abdullah-ibm
---
 src/instructlab/training/main_ds.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py
index 1b017c82..3a7e1225 100644
--- a/src/instructlab/training/main_ds.py
+++ b/src/instructlab/training/main_ds.py
@@ -533,7 +533,7 @@ def main(args):
     if args.distributed_training_framework == 'deepspeed' and not FusedAdam:
         raise ImportError("DeepSpeed was selected but we cannot import the `FusedAdam` optimizer")
 
-    if args.distributed_training_framework == 'deeppeed' and args.cpu_offload_optimizer and not DeepSpeedCPUAdam:
+    if args.distributed_training_framework == 'deepspeed' and args.cpu_offload_optimizer and not DeepSpeedCPUAdam:
         raise ImportError("DeepSpeed was selected and CPU offloading was requested, but DeepSpeedCPUAdam could not be imported.
This likely means you need to build DeepSpeed with the CPU adam flags.") metric_logger = AsyncStructuredLogger( From 11935b9467e661c866634a91083bacd56e49c258 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 30 Oct 2024 12:57:06 +0000 Subject: [PATCH 19/48] build(deps): Bump pypa/gh-action-pypi-publish from 1.10.3 to 1.11.0 Bumps [pypa/gh-action-pypi-publish](https://github.com/pypa/gh-action-pypi-publish) from 1.10.3 to 1.11.0. - [Release notes](https://github.com/pypa/gh-action-pypi-publish/releases) - [Commits](https://github.com/pypa/gh-action-pypi-publish/compare/f7600683efdcb7656dec5b29656edb7bc586e597...fb13cb306901256ace3dab689990e13a5550ffaa) --- updated-dependencies: - dependency-name: pypa/gh-action-pypi-publish dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- .github/workflows/pypi.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml index b5befbb4..627588b3 100644 --- a/.github/workflows/pypi.yaml +++ b/.github/workflows/pypi.yaml @@ -77,7 +77,7 @@ jobs: path: dist - name: "Upload to Test PyPI" - uses: pypa/gh-action-pypi-publish@f7600683efdcb7656dec5b29656edb7bc586e597 # v1.10.3 + uses: pypa/gh-action-pypi-publish@fb13cb306901256ace3dab689990e13a5550ffaa # v1.11.0 with: repository-url: https://test.pypi.org/legacy/ @@ -129,4 +129,4 @@ jobs: rm ./dist/*.sigstore.json - name: "Upload to PyPI" - uses: pypa/gh-action-pypi-publish@f7600683efdcb7656dec5b29656edb7bc586e597 # v1.10.3 + uses: pypa/gh-action-pypi-publish@fb13cb306901256ace3dab689990e13a5550ffaa # v1.11.0 From 836fe2fdbd64160d0e2a3d62621e2c7004753297 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 31 Oct 2024 13:02:28 +0000 Subject: [PATCH 20/48] build(deps): Bump hynek/build-and-inspect-python-package Bumps [hynek/build-and-inspect-python-package](https://github.com/hynek/build-and-inspect-python-package) from 2.9.0 to 2.10.0. - [Release notes](https://github.com/hynek/build-and-inspect-python-package/releases) - [Changelog](https://github.com/hynek/build-and-inspect-python-package/blob/main/CHANGELOG.md) - [Commits](https://github.com/hynek/build-and-inspect-python-package/compare/73aea398b9c8de9ea9e4464c6b13cb8b1f3d6294...f01e4d047aadcc0c054c95ec9900da3ec3fc7a0f) --- updated-dependencies: - dependency-name: hynek/build-and-inspect-python-package dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- .github/workflows/pypi.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml index 627588b3..2b8f85ca 100644 --- a/.github/workflows/pypi.yaml +++ b/.github/workflows/pypi.yaml @@ -48,7 +48,7 @@ jobs: fetch-depth: 0 - name: "Build and Inspect" - uses: hynek/build-and-inspect-python-package@73aea398b9c8de9ea9e4464c6b13cb8b1f3d6294 # v2.9.0 + uses: hynek/build-and-inspect-python-package@f01e4d047aadcc0c054c95ec9900da3ec3fc7a0f # v2.10.0 # push to Test PyPI on # - a new GitHub release is published From a6602f99f5fe30bfd34dc9ce7cf03db924b2d38d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 31 Oct 2024 13:02:32 +0000 Subject: [PATCH 21/48] build(deps): Bump machulav/ec2-github-runner from 2.3.6 to 2.3.7 Bumps [machulav/ec2-github-runner](https://github.com/machulav/ec2-github-runner) from 2.3.6 to 2.3.7. - [Release notes](https://github.com/machulav/ec2-github-runner/releases) - [Commits](https://github.com/machulav/ec2-github-runner/compare/fcfb31a5760dad1314a64a0e172b78ec6fc8a17e...1827d6ca7544d7044ddbd2e9360564651b463da2) --- updated-dependencies: - dependency-name: machulav/ec2-github-runner dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- .github/workflows/e2e-nvidia-l4-x1.yml | 4 ++-- .github/workflows/e2e-nvidia-l40s-x4.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/e2e-nvidia-l4-x1.yml b/.github/workflows/e2e-nvidia-l4-x1.yml index fc701989..16574190 100644 --- a/.github/workflows/e2e-nvidia-l4-x1.yml +++ b/.github/workflows/e2e-nvidia-l4-x1.yml @@ -50,7 +50,7 @@ jobs: - name: Start EC2 runner id: start-ec2-runner - uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6 + uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7 with: mode: start github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} @@ -158,7 +158,7 @@ jobs: aws-region: ${{ secrets.AWS_REGION }} - name: Stop EC2 runner - uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6 + uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7 with: mode: stop github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} diff --git a/.github/workflows/e2e-nvidia-l40s-x4.yml b/.github/workflows/e2e-nvidia-l40s-x4.yml index c1f8a7f9..27290c30 100644 --- a/.github/workflows/e2e-nvidia-l40s-x4.yml +++ b/.github/workflows/e2e-nvidia-l40s-x4.yml @@ -28,7 +28,7 @@ jobs: - name: Start EC2 runner id: start-ec2-runner - uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6 + uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7 with: mode: start github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} @@ -229,7 +229,7 @@ jobs: aws-region: ${{ secrets.AWS_REGION }} - name: Stop EC2 runner - uses: machulav/ec2-github-runner@fcfb31a5760dad1314a64a0e172b78ec6fc8a17e # v2.3.6 + uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7 with: mode: stop github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} From 12c67d199ac3cbb147f01845763fc0442fc3ffad Mon Sep 17 00:00:00 2001 From: abdullah-ibm Date: Fri, 1 Nov 2024 17:28:36 +0300 Subject: [PATCH 22/48] fixed pylint error Signed-off-by: Harthi7 Signed-off-by: abdullah-ibm --- src/instructlab/training/main_ds.py | 6 +++--- 1 file changed, 3 
insertions(+), 3 deletions(-) diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py index fb7427fa..973a7e9c 100644 --- a/src/instructlab/training/main_ds.py +++ b/src/instructlab/training/main_ds.py @@ -536,8 +536,8 @@ def main(args): raise ImportError("DeepSpeed was selected but we cannot import the `FusedAdam` optimizer") if args.distributed_training_framework == 'deepspeed' and args.cpu_offload_optimizer and not DeepSpeedCPUAdam: - raise ImportError("DeepSpeed was selected and CPU offloading was requested, but DeepSpeedCPUAdam could not be imported. This likely means you need to build DeepSpeed with the CPU adam flags.") - + raise ImportError("DeepSpeed was selected and CPU offloading was requested, but DeepSpeedCPUAdam could not be imported. This likely means you need to build DeepSpeed with the CPU adam flags.") + metric_logger = AsyncStructuredLogger( args.output_dir + f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" @@ -760,7 +760,7 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None: if train_args.deepspeed_options.cpu_offload_optimizer and not DeepSpeedCPUAdam: raise ImportError("DeepSpeed CPU offloading was enabled, but DeepSpeedCPUAdam could not be imported. This is most likely because DeepSpeed was not built with CPU Adam. Please rebuild DeepSpeed to have CPU Adam, or disable CPU offloading.") - + if train_args.deepspeed_options.save_samples: command.append(f"--save_samples_ds={train_args.deepspeed_options.save_samples}") if train_args.deepspeed_options.cpu_offload_optimizer: From fb5c9960a0c8c62b9210afe7f8d22e0bd41fe027 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Tue, 29 Oct 2024 11:14:58 -0400 Subject: [PATCH 23/48] Adding support for Dolomite + Granite model class Signed-off-by: Mustafa Eyceoz --- src/instructlab/training/main_ds.py | 5 ++++ src/instructlab/training/utils.py | 37 ++++++++++++++++------------- 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py index ab59282f..78068b7b 100644 --- a/src/instructlab/training/main_ds.py +++ b/src/instructlab/training/main_ds.py @@ -4,6 +4,7 @@ from copy import deepcopy from pathlib import Path import argparse +import json import math import os import re @@ -528,6 +529,10 @@ def main(args): tokenizer = setup_tokenizer(args.model_name_or_path, SPECIAL_TOKENS, CHAT_TEMPLATE) # device = torch.device("cuda", args.local_rank) + with open(Path(args.model_path) / "config.json") as conf_json: + model_conf = json.load(conf_json) + args.model_type = model_conf["model_type"] + #### distributed init ##### torch.cuda.set_device(int(os.environ["LOCAL_RANK"])) args.local_rank = int(os.environ["LOCAL_RANK"]) diff --git a/src/instructlab/training/utils.py b/src/instructlab/training/utils.py index d685d212..d6cef034 100644 --- a/src/instructlab/training/utils.py +++ b/src/instructlab/training/utils.py @@ -10,7 +10,6 @@ from typing import Any, List, Optional import importlib import inspect -import json import logging import os import random @@ -62,17 +61,10 @@ def check_valid_train_args(train_args: TrainingArgs): f"Provided path to model does not exist. 
Please make sure that you've passed a valid model and that it has appropriate permissions: {train_args.model_path}" ) - if train_args.use_dolomite: - with open(Path(train_args.model_path) / "config.json") as conf_json: - model_conf = json.load(conf_json) - if model_conf["model_type"] == "granite": - raise RuntimeError( - "Converting Granite models to Dolomite format is currently unsupported." - ) - if train_args.disable_flash_attn: - raise RuntimeError( - "ERROR: Trying to use dolomite padding-free transformer without flash attention is not supported" - ) + if train_args.use_dolomite and train_args.disable_flash_attn: + raise RuntimeError( + "ERROR: Trying to use dolomite padding-free transformer without flash attention is not supported" + ) if train_args.is_padding_free: print( @@ -802,10 +794,21 @@ def _get_state_dict_patched(model, unwrap=False): output_dir.mkdir(parents=True, exist_ok=True) if not model.module.config.architectures and convert_dolomite: - model.module.config.architectures = ["LlamaForCausalLM"] - warnings.warn( - f"Adding architectures to ckpt: {model.module.config.architectures}", - ) + arch_added = False + if args.model_type == "llama": + model.module.config.architectures = ["LlamaForCausalLM"] + arch_added = True + elif args.model_type == "granite": + model.module.config.architectures = ["GraniteForCausalLM"] + arch_added = True + if arch_added: + warnings.warn( + f"Adding architectures to ckpt: {model.module.config.architectures}", + ) + else: + warnings.warn( + f"Converting from dolomite, but no architecture field added to config.json", + ) model.module.config.to_json_file(output_config_file) tokenizer.save_pretrained(output_dir) @@ -834,7 +837,7 @@ def _get_state_dict_patched(model, unwrap=False): export_to_huggingface( pretrained_model_name_or_path=tmpdir.name, save_path=final_output_dir, - model_type="llama", + model_type=args.model_type, ) tmpdir.cleanup() From 2c00ef669b310956e1d48c81ec7aed1eaa6269de Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Tue, 29 Oct 2024 13:36:05 -0400 Subject: [PATCH 24/48] Incorrect arg fix Signed-off-by: Mustafa Eyceoz --- src/instructlab/training/main_ds.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py index 78068b7b..7910c341 100644 --- a/src/instructlab/training/main_ds.py +++ b/src/instructlab/training/main_ds.py @@ -529,7 +529,7 @@ def main(args): tokenizer = setup_tokenizer(args.model_name_or_path, SPECIAL_TOKENS, CHAT_TEMPLATE) # device = torch.device("cuda", args.local_rank) - with open(Path(args.model_path) / "config.json") as conf_json: + with open(Path(args.model_name_or_path) / "config.json") as conf_json: model_conf = json.load(conf_json) args.model_type = model_conf["model_type"] From 8852f48f577aa41dc848909ef6cfe7c920aa75ec Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Wed, 30 Oct 2024 11:23:47 -0400 Subject: [PATCH 25/48] Bring HF padding-free into dolomite parity Signed-off-by: Mustafa Eyceoz --- src/instructlab/training/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/instructlab/training/utils.py b/src/instructlab/training/utils.py index d6cef034..41b410c7 100644 --- a/src/instructlab/training/utils.py +++ b/src/instructlab/training/utils.py @@ -221,7 +221,7 @@ def pad_collate_fn(batch): input_ids.extend(item["input_ids"].tolist()) labels.extend(item["labels"].tolist()) - position_ids.extend(range(total_len, total_len + item_len)) + position_ids.extend(range(item_len)) total_len 
+= item_len num_loss_counted_tokens += (item["labels"] != -100).sum().item() From 2a9626fff10a4694832e70253746a25d3e72e3cc Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Fri, 1 Nov 2024 16:38:16 -0400 Subject: [PATCH 26/48] Update dolomite requirement Signed-off-by: Mustafa Eyceoz --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 2d77d1e1..84da8460 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,7 +17,7 @@ numba numpy>=1.23.5,<2.0.0 ; python_version == '3.10' numpy>=1.26.4,<2.0.0 ; python_version != '3.10' rich -instructlab-dolomite>=0.1.1 +instructlab-dolomite>=0.2.0 trl>=0.9.4 peft pydantic>=2.7.0 From 8147095108674ec2ebe9864b98165fa1f83cdd4b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 4 Nov 2024 12:30:43 +0000 Subject: [PATCH 27/48] build(deps): Bump rhysd/actionlint in /.github/workflows Bumps rhysd/actionlint from 1.7.3 to 1.7.4. --- updated-dependencies: - dependency-name: rhysd/actionlint dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- .github/workflows/actionlint.dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/actionlint.dockerfile b/.github/workflows/actionlint.dockerfile index 6f171d53..9de2e84a 100644 --- a/.github/workflows/actionlint.dockerfile +++ b/.github/workflows/actionlint.dockerfile @@ -1,3 +1,3 @@ # Since dependabot cannot update workflows using docker, # we use this indirection since dependabot can update this file. -FROM rhysd/actionlint:1.7.3@sha256:7617f05bd698cd2f1c3aedc05bc733ccec92cca0738f3e8722c32c5b42c70ae6 +FROM rhysd/actionlint:1.7.4@sha256:82244e1db1c60d82c7792180a48dd0bcb838370bb589d53ff132503fc9485868 From f5e3344634f19466dba02101da456953cc4a8602 Mon Sep 17 00:00:00 2001 From: Lo-Mein Date: Thu, 31 Oct 2024 12:18:53 -0400 Subject: [PATCH 28/48] Added CHANGELOG.md Signed-off-by: Lo-Mein --- CHANGELOG.md | 213 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 213 insertions(+) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..bc0027f1 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,213 @@ +# Changelog + +## v0.5.5 + +### Features +* e2e: replace old small job with new medium job + +### Fixes +* fix: incorrect label for AWS medium runner +* chore: add exit code & tox fix + +### Infrastructure +* ci: grant HF_TOKEN access to the medium-size E2E CI job + +## v0.5.4 + +### Features +* Add rocm extra to pyproject.toml + +## v0.5.3 + +### Fixes +* fix: Add explicit flash_attn requirement for ROCm + +## v0.5.2 - Fix Pretraining Masking + +### Fixes +* fix: improve linting and automation +* Fix pretrain token list->int for masking + +## v0.5.1 + +### Fixes +* fix: updates sorting logic to correctly compare numbers + +## v0.5.0 - FSDP and Full-State Checkpoint Resuming + +### Features +* feat: add e2e test for instructlab CI +* feat: add mergify +* Adding FSDP Support to Training Library by @aldopareja @Maxusmusti @RobotSail +* adds Accelerate full-state (opt, lr_sched, params) +* changes StreamablePopen to return a process and implement listening + +### Fixes +* Fix lint error to make CI happy +* Fix typos +* Ap/fix multipack for non granite models +* Fix generic chat template saved to tokenizer for generation +* Fix linting error and missing quote + +### Infrastructure +* Add license identifiers +* ci: update runner 
labels to uniquely identify instance sizes +* ci: minor cleanup of E2E job +* Fixing e2e to use relative path for working-directory +* switch -T to -a +* github: add stale bot to training repo +* fix: markdown lint error and mergify bug +* Bump actions/checkout from 4.1.7 to 4.2.0 +* Bump step-security/harden-runner from 2.8.1 to 2.9.1 +* Bump pypa/gh-action-pypi-publish from 1.9.0 to 1.10.2 +* Bump actions/setup-python from 5.1.0 to 5.2.0 +* Bump rhysd/actionlint from 1.7.1 to 1.7.2 +* Bump hynek/build-and-inspect-python-package from 2.6.0 to 2.9.0 +* Bump DavidAnson/markdownlint-cli2-action from 16.0.0 to 17.0.0 +* ci: fix lint action +* ci: add AWS tags to show github ref and PR num for all jobs + +## v0.5.0 Alpha 0 - The FSDP Release Pre-release + +### Description +The FSDP Release introduces FSDP support in addition to the existing DeepSpeed support through the accelerate library. + +### Features +* feat: add e2e test for instructlab CI +* feat: add mergify +* Adding FSDP Support to Training Library by @aldopareja @Maxusmusti @RobotSail + +### Fixes +* Fix lint error to make CI happy +* Fix typos +* Ap/fix multipack for non granite models +* Fix linting error and missing quote + +### Infrastructure +* Add license identifiers +* ci: update runner labels to uniquely identify instance sizes +* ci: minor cleanup of E2E job +* Fixing e2e to use relative path for working-directory +* Bump step-security/harden-runner from 2.8.1 to 2.9.1 +* Bump pypa/gh-action-pypi-publish from 1.9.0 to 1.10.2 +* Bump actions/setup-python from 5.1.0 to 5.2.0 +* Bump rhysd/actionlint from 1.7.1 to 1.7.2 +* Bump hynek/build-and-inspect-python-package from 2.6.0 to 2.9.0 +* Bump DavidAnson/markdownlint-cli2-action from 16.0.0 to 17.0.0 +* ci: fix lint action +* ci: add AWS tags to show github ref and PR num for all jobs + +## v0.4.2 + +### Features +* Provide safeguards during training + +## v0.4.1 + +### Changes +* makes saving every save_samples an optional feature + +## v0.4.0 + +### Features +* Adds a flag to save checkpoints at the end of an epoch + +### Changes +* Change success message at end of training + +## v0.3.2 + +### Features +* Accept tuples for lora.target_modules + +### Documentation +* patch some hyper parameter arg descriptions in README + +## v0.3.1 + +### Dependencies +* Update requirements to have bitsandbytes min and dolomite min + +## v0.3.0 + +### Features +* Updating token masking to support pretraining w/ masked special tokens +* Adding weight merging for LoRA/QLoRA ckpts + +### Fixes +* remove dead code +* fix: changes the check to check against both the enum option and enum value + +## v0.2.0 + +### Features +* Fix ckpt save to include architecture for inference runtime consumption +* Logging updates + +### Performance +* Reducing deepspeed timeout to 10mins + +## v0.1.0 + +### Features +* Flash Attention Disable Toggle (Take 2) + +### Performance +* Reduce Unnecessary Multiprocessing + +### Fixes +* 🐛: fix optimizer selection logic so that FusedAdam is never loaded when CPU offloading is enabled +* Add wheel to requirements + +## v0.0.5.1 + +### Fixes +This release includes PR [#121](https://github.com/instructlab/training/pull/121) to overcome an issue where our way of lazily importing the run_training function is being picked up as an error by pylint. + +## v0.0.5 +Minor bugfixes and updates. + +## v0.0.4 +Minor bugfixes and updates. + +## v0.0.3 +Minor bugfixes and updates. 
+ +## v0.0.2 + +### Features +This introduces the instructlab library as a package in the instructlab package namespace. + +To install it: +``` +pip install instructlab-training +``` + +And to install it with flash-attn and other CUDA-dependent packages, you can use +``` +pip install instructlab-training[cuda] +``` + +Here's how to use it: +```python +from instructlab.training.config import TorchrunArgs, TrainingArgs, run_training + +torchrun_args = TorchrunArgs( + nproc_per_node = 1, # 1 GPU + nnodes = 1, # only 1 overall machine in the system + node_rank = 0, # rank of the current machine + rdzv_id = 123, # what ID other nodes will join on + rdzv_endpoint = '0.0.0.0:12345' # address where other nodes will join +) + +training_args = TrainingArgs( + # specify training args here +) + +run_training(torch_args = torchrun_args, train_args = training_args) +``` + +## v0.0.1 + +### Features +Initial release with same features as v0.0.2. \ No newline at end of file From d18e57234e274eec5d57bf334338f347cb4d95a8 Mon Sep 17 00:00:00 2001 From: Lo-Mein Date: Tue, 5 Nov 2024 12:16:34 -0500 Subject: [PATCH 29/48] Fixed linting errors Signed-off-by: Lo-Mein --- CHANGELOG.md | 101 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 69 insertions(+), 32 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bc0027f1..ca1cb720 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,54 +2,64 @@ ## v0.5.5 -### Features +### v0.5.5 Features + * e2e: replace old small job with new medium job -### Fixes +### v0.5.5 Fixes + * fix: incorrect label for AWS medium runner * chore: add exit code & tox fix -### Infrastructure +### v0.5.5 Infrastructure + * ci: grant HF_TOKEN access to the medium-size E2E CI job ## v0.5.4 -### Features +### v0.5.4 Features + * Add rocm extra to pyproject.toml ## v0.5.3 -### Fixes +### v0.5.3 Fixes + * fix: Add explicit flash_attn requirement for ROCm ## v0.5.2 - Fix Pretraining Masking -### Fixes +### v0.5.2 Fixes + * fix: improve linting and automation * Fix pretrain token list->int for masking ## v0.5.1 -### Fixes +### v0.5.1 Fixes + * fix: updates sorting logic to correctly compare numbers ## v0.5.0 - FSDP and Full-State Checkpoint Resuming -### Features +### v0.5.0 Features + * feat: add e2e test for instructlab CI * feat: add mergify * Adding FSDP Support to Training Library by @aldopareja @Maxusmusti @RobotSail * adds Accelerate full-state (opt, lr_sched, params) * changes StreamablePopen to return a process and implement listening -### Fixes +### v0.5.0 Fixes + * Fix lint error to make CI happy * Fix typos * Ap/fix multipack for non granite models * Fix generic chat template saved to tokenizer for generation * Fix linting error and missing quote -### Infrastructure +### v0.5.0 Infrastructure + * Add license identifiers * ci: update runner labels to uniquely identify instance sizes * ci: minor cleanup of E2E job @@ -69,21 +79,25 @@ ## v0.5.0 Alpha 0 - The FSDP Release Pre-release -### Description +### v0.5.0 Alpha Description + The FSDP Release introduces FSDP support in addition to the existing DeepSpeed support through the accelerate library. 
-### Features +### v0.5.0 Alpha Features + * feat: add e2e test for instructlab CI * feat: add mergify * Adding FSDP Support to Training Library by @aldopareja @Maxusmusti @RobotSail -### Fixes +### v0.5.0 Alpha Fixes + * Fix lint error to make CI happy * Fix typos * Ap/fix multipack for non granite models * Fix linting error and missing quote -### Infrastructure +### v0.5.0 Alpha Infrastructure + * Add license identifiers * ci: update runner labels to uniquely identify instance sizes * ci: minor cleanup of E2E job @@ -99,96 +113,118 @@ The FSDP Release introduces FSDP support in addition to the existing DeepSpeed s ## v0.4.2 -### Features +### v0.4.2 Features + * Provide safeguards during training ## v0.4.1 -### Changes +### v0.4.1 Changes + * makes saving every save_samples an optional feature ## v0.4.0 -### Features +### v0.4.0 Features + * Adds a flag to save checkpoints at the end of an epoch -### Changes +### v0.4.0 Changes + * Change success message at end of training ## v0.3.2 -### Features +### v0.3.2 Features + * Accept tuples for lora.target_modules -### Documentation +### v0.3.2 Documentation + * patch some hyper parameter arg descriptions in README ## v0.3.1 -### Dependencies +### v0.3.1 Dependencies + * Update requirements to have bitsandbytes min and dolomite min ## v0.3.0 -### Features +### v0.3.0 Features + * Updating token masking to support pretraining w/ masked special tokens * Adding weight merging for LoRA/QLoRA ckpts -### Fixes +### v0.3.0 Fixes + * remove dead code * fix: changes the check to check against both the enum option and enum value ## v0.2.0 -### Features +### v0.2.0 Features + * Fix ckpt save to include architecture for inference runtime consumption * Logging updates -### Performance +### v0.2.0 Performance + * Reducing deepspeed timeout to 10mins ## v0.1.0 -### Features +### v0.1.0 Features + * Flash Attention Disable Toggle (Take 2) -### Performance +### v0.1.0 Performance + * Reduce Unnecessary Multiprocessing -### Fixes +### v0.1.0 Fixes + * 🐛: fix optimizer selection logic so that FusedAdam is never loaded when CPU offloading is enabled * Add wheel to requirements ## v0.0.5.1 -### Fixes +### v0.0.5.1 Fixes + This release includes PR [#121](https://github.com/instructlab/training/pull/121) to overcome an issue where our way of lazily importing the run_training function is being picked up as an error by pylint. ## v0.0.5 + Minor bugfixes and updates. ## v0.0.4 + Minor bugfixes and updates. ## v0.0.3 + Minor bugfixes and updates. ## v0.0.2 ### Features + This introduces the instructlab library as a package in the instructlab package namespace. To install it: -``` + +```bash pip install instructlab-training ``` And to install it with flash-attn and other CUDA-dependent packages, you can use -``` + +```bash pip install instructlab-training[cuda] ``` Here's how to use it: + ```python from instructlab.training.config import TorchrunArgs, TrainingArgs, run_training @@ -209,5 +245,6 @@ run_training(torch_args = torchrun_args, train_args = training_args) ## v0.0.1 -### Features +### v0.0.1 Features + Initial release with same features as v0.0.2. \ No newline at end of file From 43c6f211e5fc022a4013993a396779aa06839855 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 6 Nov 2024 12:38:29 +0000 Subject: [PATCH 30/48] build(deps): Bump pypa/gh-action-pypi-publish from 1.11.0 to 1.12.0 Bumps [pypa/gh-action-pypi-publish](https://github.com/pypa/gh-action-pypi-publish) from 1.11.0 to 1.12.0. 
- [Release notes](https://github.com/pypa/gh-action-pypi-publish/releases) - [Commits](https://github.com/pypa/gh-action-pypi-publish/compare/fb13cb306901256ace3dab689990e13a5550ffaa...61da13deb5f5124fb1536194f82ed3d9bbc7e8f3) --- updated-dependencies: - dependency-name: pypa/gh-action-pypi-publish dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- .github/workflows/pypi.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml index 2b8f85ca..dedc40b8 100644 --- a/.github/workflows/pypi.yaml +++ b/.github/workflows/pypi.yaml @@ -77,7 +77,7 @@ jobs: path: dist - name: "Upload to Test PyPI" - uses: pypa/gh-action-pypi-publish@fb13cb306901256ace3dab689990e13a5550ffaa # v1.11.0 + uses: pypa/gh-action-pypi-publish@61da13deb5f5124fb1536194f82ed3d9bbc7e8f3 # v1.12.0 with: repository-url: https://test.pypi.org/legacy/ @@ -129,4 +129,4 @@ jobs: rm ./dist/*.sigstore.json - name: "Upload to PyPI" - uses: pypa/gh-action-pypi-publish@fb13cb306901256ace3dab689990e13a5550ffaa # v1.11.0 + uses: pypa/gh-action-pypi-publish@61da13deb5f5124fb1536194f82ed3d9bbc7e8f3 # v1.12.0 From 1d00406098d2a817cc49fb684fce322426016286 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 7 Nov 2024 12:47:07 +0000 Subject: [PATCH 31/48] build(deps): Bump pypa/gh-action-pypi-publish from 1.12.0 to 1.12.2 Bumps [pypa/gh-action-pypi-publish](https://github.com/pypa/gh-action-pypi-publish) from 1.12.0 to 1.12.2. - [Release notes](https://github.com/pypa/gh-action-pypi-publish/releases) - [Commits](https://github.com/pypa/gh-action-pypi-publish/compare/61da13deb5f5124fb1536194f82ed3d9bbc7e8f3...15c56dba361d8335944d31a2ecd17d700fc7bcbc) --- updated-dependencies: - dependency-name: pypa/gh-action-pypi-publish dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- .github/workflows/pypi.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml index dedc40b8..a8be1c1c 100644 --- a/.github/workflows/pypi.yaml +++ b/.github/workflows/pypi.yaml @@ -77,7 +77,7 @@ jobs: path: dist - name: "Upload to Test PyPI" - uses: pypa/gh-action-pypi-publish@61da13deb5f5124fb1536194f82ed3d9bbc7e8f3 # v1.12.0 + uses: pypa/gh-action-pypi-publish@15c56dba361d8335944d31a2ecd17d700fc7bcbc # v1.12.2 with: repository-url: https://test.pypi.org/legacy/ @@ -129,4 +129,4 @@ jobs: rm ./dist/*.sigstore.json - name: "Upload to PyPI" - uses: pypa/gh-action-pypi-publish@61da13deb5f5124fb1536194f82ed3d9bbc7e8f3 # v1.12.0 + uses: pypa/gh-action-pypi-publish@15c56dba361d8335944d31a2ecd17d700fc7bcbc # v1.12.2 From ebd3f73014efbab12cfb1ce3ab2e827bc4a944be Mon Sep 17 00:00:00 2001 From: Oleg S <97077423+RobotSail@users.noreply.github.com> Date: Thu, 7 Nov 2024 10:14:31 -0500 Subject: [PATCH 32/48] enhancement: enhances bash script with proper syntax This commit enhances the existing bash script to encapsulate variables as "${VAR}" so that the bounds are clearly known. 
For reference, see: https://google.github.io/styleguide/shellguide.html\#variable-expansion Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com> --- tests/smoketest.sh | 64 +++++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/tests/smoketest.sh b/tests/smoketest.sh index e104b233..9bdb0df4 100755 --- a/tests/smoketest.sh +++ b/tests/smoketest.sh @@ -12,9 +12,9 @@ CHECKPOINTS_DIR="${TMP_DIR}/checkpoints" DATA_DIR="${TMP_DIR}/data" COMPUTED_DATA_PATH="${DATA_DIR}/data.jsonl" DEFAULT_DISTRIB_FRAMEWORK='fsdp' -DISTRIB_FRAMEWORK="${1:-$DEFAULT_DISTRIB_FRAMEWORK}" # defaults to FSDP +DISTRIB_FRAMEWORK="${1:-${DEFAULT_DISTRIB_FRAMEWORK}}" # defaults to FSDP DEFAULT_GPUS=8 -NUM_GPUS="${2:-$DEFAULT_GPUS}" +NUM_GPUS="${2:-${DEFAULT_GPUS}}" # ############### User-modifiable parameters ############### # Change these as needed @@ -36,8 +36,8 @@ NUM_SAMPLES_TRAINED_ON=5000 # upper-bound on training dataset size. # None ####################################### function setup_tmpdir () { - mkdir "$CHECKPOINTS_DIR" - mkdir "$DATA_DIR" + mkdir "${CHECKPOINTS_DIR}" + mkdir "${DATA_DIR}" } ####################################### @@ -61,17 +61,17 @@ function prepare_data () { # go faster. python3 data_process.py \ - --data_path="$SAMPLE_DATA_PATH" \ - --data_output_path="$DATA_DIR" \ + --data_path="${SAMPLE_DATA_PATH}" \ + --data_output_path="${DATA_DIR}" \ --max_seq_len=4096 \ - --model_name_or_path="$MODEL_NAME" + --model_name_or_path="${MODEL_NAME}" # trim data so we only keep the first 'n' samples. # should be enough data for training to be meaningful but not enough # that training takes a large amount of time. - echo "$(head -"$NUM_SAMPLES_TRAINED_ON" "$COMPUTED_DATA_PATH")" > "$COMPUTED_DATA_PATH" + echo "$(head -"${NUM_SAMPLES_TRAINED_ON}" "${COMPUTED_DATA_PATH}")" > "${COMPUTED_DATA_PATH}" - echo "TRAINING ON $(wc -l "$COMPUTED_DATA_PATH") SAMPLES" + echo "TRAINING ON $(wc -l "${COMPUTED_DATA_PATH}") SAMPLES" } ####################################### @@ -86,9 +86,9 @@ function prepare_data () { # writes location of checkpoints dir to standard out. 
####################################### function _cleanup_saved_checkpoints() { - echo "CLEARING CHECKPOINTS: $CHECKPOINTS_DIR" - rm -rf "$CHECKPOINTS_DIR" - mkdir "$CHECKPOINTS_DIR" + echo "CLEARING CHECKPOINTS: ${CHECKPOINTS_DIR}" + rm -rf "${CHECKPOINTS_DIR}" + mkdir "${CHECKPOINTS_DIR}" } ####################################### @@ -109,18 +109,18 @@ function _cleanup_saved_checkpoints() { function test_standard_loop () { torchrun \ --standalone \ - --nproc_per_node="$NUM_GPUS" \ + --nproc_per_node="${NUM_GPUS}" \ main_ds.py \ - --model_name_or_path="$MODEL_NAME" \ - --data_path="$COMPUTED_DATA_PATH" \ - --output_dir="$CHECKPOINTS_DIR" \ + --model_name_or_path="${MODEL_NAME}" \ + --data_path="${COMPUTED_DATA_PATH}" \ + --output_dir="${CHECKPOINTS_DIR}" \ --num_epochs=1 \ --effective_batch_size=128 \ --save_samples=0 \ --checkpoint_at_epoch \ --accelerate_full_state_at_epoch \ - --distributed_training_framework="$DISTRIB_FRAMEWORK" \ - --max_batch_len="$MAX_BATCH_LEN" \ + --distributed_training_framework="${DISTRIB_FRAMEWORK}" \ + --max_batch_len="${MAX_BATCH_LEN}" \ --is_granite } @@ -142,18 +142,18 @@ function test_standard_loop () { function test_standard_loop_nongranite () { torchrun \ --standalone \ - --nproc_per_node="$NUM_GPUS" \ + --nproc_per_node="${NUM_GPUS}" \ main_ds.py \ - --model_name_or_path="$MODEL_NAME" \ - --data_path="$COMPUTED_DATA_PATH" \ - --output_dir="$CHECKPOINTS_DIR" \ + --model_name_or_path="${MODEL_NAME}" \ + --data_path="${COMPUTED_DATA_PATH}" \ + --output_dir="${CHECKPOINTS_DIR}" \ --num_epochs=1 \ --effective_batch_size=128 \ --save_samples=0 \ --checkpoint_at_epoch \ --accelerate_full_state_at_epoch \ - --distributed_training_framework="$DISTRIB_FRAMEWORK" \ - --max_batch_len="$MAX_BATCH_LEN" + --distributed_training_framework="${DISTRIB_FRAMEWORK}" \ + --max_batch_len="${MAX_BATCH_LEN}" # --is_granite \ } @@ -175,18 +175,18 @@ function test_standard_loop_nongranite () { function test_standard_loop_noflashattention_nogranite () { torchrun \ --standalone \ - --nproc_per_node="$NUM_GPUS" \ + --nproc_per_node="${NUM_GPUS}" \ main_ds.py \ - --model_name_or_path="$MODEL_NAME" \ - --data_path="$COMPUTED_DATA_PATH" \ - --output_dir="$CHECKPOINTS_DIR" \ + --model_name_or_path="${MODEL_NAME}" \ + --data_path="${COMPUTED_DATA_PATH}" \ + --output_dir="${CHECKPOINTS_DIR}" \ --num_epochs=1 \ --effective_batch_size=128 \ --save_samples=0 \ --checkpoint_at_epoch \ --accelerate_full_state_at_epoch \ - --distributed_training_framework="$DISTRIB_FRAMEWORK" \ - --max_batch_len="$MAX_BATCH_LEN" \ + --distributed_training_framework="${DISTRIB_FRAMEWORK}" \ + --max_batch_len="${MAX_BATCH_LEN}" \ --disable_flash_attn # --is_granite } @@ -194,11 +194,11 @@ function test_standard_loop_noflashattention_nogranite () { function main () { setup_tmpdir - trap "rm -rf $TMP_DIR" EXIT + trap 'rm -rf ${TMP_DIR}' EXIT #NOTE (jkunstle): script is run as though it's # in the same source dir as main_ds and data_process. - cd "$CORRECT_WORKING_DIR" + cd "${CORRECT_WORKING_DIR}" echo "CURRENT WORKING DIRECTORY: $(pwd)" prepare_data From fad328fa101197e956772919f14c33150f695708 Mon Sep 17 00:00:00 2001 From: Oleg S <97077423+RobotSail@users.noreply.github.com> Date: Wed, 16 Oct 2024 16:16:04 +0000 Subject: [PATCH 33/48] This PR implements the capability of the training library to save LoRA models when training with FSDP as the distributed backend. 
This is accomplished by creating a copy of the LoRA model on the CPU, loading in the state dict after gathering it from the distributed model, and saving after merging the adapters back into the original model. Afterwards, the CPU copy is discarded and training continues. Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com> --- .pylintrc | 3 +- src/instructlab/training/main_ds.py | 51 ++---- src/instructlab/training/setup_accelerator.py | 38 ++-- src/instructlab/training/utils.py | 162 +++++++++++++++++- 4 files changed, 194 insertions(+), 60 deletions(-) diff --git a/.pylintrc b/.pylintrc index aff36d65..c0fbd95c 100644 --- a/.pylintrc +++ b/.pylintrc @@ -474,7 +474,8 @@ disable=raw-checker-failed, broad-exception-caught, super-init-not-called, duplicate-code, - too-many-positional-arguments + too-many-positional-arguments, + too-many-lines # Enable the message, report, category or checker with the given id(s). You can # either give multiple identifier separated by comma (,) or put this option diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py index 7910c341..e802f28d 100644 --- a/src/instructlab/training/main_ds.py +++ b/src/instructlab/training/main_ds.py @@ -45,8 +45,8 @@ check_flash_attn_enabled, check_valid_train_args, convert_loss_to_reduce_sum, + create_lora_config, ensure_loadable_dolomite_checkpoint, - get_projection_layer_names, load_latest_full_state, prepare_peft_model, prepare_universal_checkpoint_from_latest, @@ -113,13 +113,16 @@ def setup_model(args, tokenizer, train_loader, grad_accum, flash_enabled): args.model_name_or_path, args.output_dir ) as path: base_model_args["pretrained_model_name_or_path"] = path + base_model_args["use_padding_free_transformer"] = True model = GPTDolomiteForCausalLM.from_pretrained( **base_model_args, - use_padding_free_transformer=True, ) else: model = AutoModelForCausalLM.from_pretrained(**base_model_args) + # store the base model args so we can recall them later if saving a LoRA model + args.base_model_args = base_model_args + if len(tokenizer) > model.config.vocab_size: print( f"WARNING: tokenizer has {len(tokenizer)} tokens but model has {model.config.vocab_size} vocab size" @@ -175,46 +178,14 @@ def setup_model(args, tokenizer, train_loader, grad_accum, flash_enabled): # - with the exception of granite, which handles it # in the later stanza if args.lora_r > 0: - # if lora - # Third Party - from peft import LoraConfig - - # ensure we select only the modules that exist in the model - proj_layers = get_projection_layer_names(model) - if not args.lora_target_modules: - print( - f"WARNING: lora_target_modules was not specified, defaulting to all of the model's projection modules" - ) - if not proj_layers: - raise RuntimeError("could not find any projection layers in the model") - args.__dict__["lora_target_modules"] = proj_layers - else: - # when the user specifies the module, we should verify that they align with what's in the model - lora_target_modules_set = set(args.lora_target_modules) - diff = lora_target_modules_set - set(proj_layers) - layers_to_target = lora_target_modules_set - diff - if len(diff) == len(args.lora_target_modules): - raise ValueError( - f"None of the modules you requested exist in the model.\nRequested modules: {args.lora_target_modules}; Available modules: {proj_layers}.\nThis is usually a misconfiuration error. Consider omitting your `lora_target_modules` list to have these discovered automatically." 
-                )
-            if diff:
-                print(
-                    f"\033[33mWARNING: the following modules were targeted for LoRA but are not present in the model: {list(diff)}. Applying LoRA only to {list(layers_to_target)} modules.\033[0m"
-                )
-                args.__dict__["lora_target_modules"] = list(layers_to_target)
-
-        peft_config = LoraConfig(
-            lora_alpha=args.lora_alpha,
-            lora_dropout=args.lora_dropout,
-            r=args.lora_r,
-            bias="none",
-            task_type="CAUSAL_LM",
-            target_modules=args.lora_target_modules,
-        )
+        lora_config = create_lora_config(model, args)
         model = prepare_peft_model(
-            model, peft_config, gradient_checkpointing=not args.use_dolomite
+            model,
+            lora_config,
+            args.distributed_training_framework,
+            gradient_checkpointing=not args.use_dolomite,
         )
-
+        args.lora_config = lora_config
     elif not args.use_dolomite:
         model.gradient_checkpointing_enable()
diff --git a/src/instructlab/training/setup_accelerator.py b/src/instructlab/training/setup_accelerator.py
index 33972b59..239367f4 100644
--- a/src/instructlab/training/setup_accelerator.py
+++ b/src/instructlab/training/setup_accelerator.py
@@ -3,12 +3,10 @@

 # Third Party
 from accelerate import Accelerator
-from torch.distributed.fsdp import (  # FullyShardedDataParallel as FSDP,
-    BackwardPrefetch,
-    MixedPrecision,
-    ShardingStrategy,
-)
+from peft.utils.other import fsdp_auto_wrap_policy
+from torch.distributed.fsdp import BackwardPrefetch, MixedPrecision, ShardingStrategy
 from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
+from transformers import PreTrainedModel
 import torch

 # First Party
@@ -51,34 +49,52 @@ def get_ds_plugin(world_size, samples_per_gpu, grad_accum, opts: DeepSpeedOption
     return ds_plugin


-def get_fsdp_config(args, model):
+def get_fsdp_config(args, model: PreTrainedModel):
     # Third Party
     from accelerate.utils import FullyShardedDataParallelPlugin
     from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload

+    is_lora = args.lora_r > 0
     block_name = model._no_split_modules[0]

-    fsdp_plugin = FullyShardedDataParallelPlugin(
-        auto_wrap_policy=partial(
+    wrap_policy = None
+    if is_lora:
+        wrap_policy = fsdp_auto_wrap_policy(model)
+    else:
+        wrap_policy = partial(
             transformer_auto_wrap_policy,
             transformer_layer_cls={
                 get_module_class_from_name(model, block_name),
             },
-        ),
+        )
+
+    # TODO(osilkin): BACKWARD_POST trades memory utilization for processing time, which is important for systems utilizing LoRA
+    # We should have this be configurable in the future.
+    prefetch_policy = (
+        BackwardPrefetch.BACKWARD_POST if is_lora else BackwardPrefetch.BACKWARD_PRE
+    )
+    fsdp_plugin = FullyShardedDataParallelPlugin(
+        auto_wrap_policy=wrap_policy,
         limit_all_gathers=True,
         mixed_precision_policy=MixedPrecision(
             param_dtype=torch.bfloat16,
             reduce_dtype=torch.bfloat16,
             buffer_dtype=torch.bfloat16,
         ),
-        backward_prefetch=BackwardPrefetch.BACKWARD_PRE,
+        backward_prefetch=prefetch_policy,
         sharding_strategy=ShardingStrategy[args.fsdp_sharding_strategy],
         cpu_offload=CPUOffload(args.cpu_offload_params_fsdp),
     )
+
+    # `use_orig_params` must be disabled when using LoRA and FSDP together
+    # Source: https://huggingface.co/docs/peft/en/accelerate/fsdp#the-important-parts
+    if args.lora_r > 0:
+        fsdp_plugin.use_orig_params = False
+
     return fsdp_plugin


-def setup_accelerator(args, model, grad_accum):
+def setup_accelerator(args, model: PreTrainedModel, grad_accum):
     if args.distributed_training_framework == "deepspeed":
         # Third Party
         from deepspeed import DeepSpeedEngine
diff --git a/src/instructlab/training/utils.py b/src/instructlab/training/utils.py
index 41b410c7..41148c8d 100644
--- a/src/instructlab/training/utils.py
+++ b/src/instructlab/training/utils.py
@@ -1,13 +1,14 @@
 # SPDX-License-Identifier: Apache-2.0

 # Standard
+from argparse import Namespace
 from collections import OrderedDict
 from contextlib import contextmanager
 from copy import deepcopy
 from functools import partial
 from pathlib import Path
 from tempfile import TemporaryDirectory
-from typing import Any, List, Optional
+from typing import Any, List, Optional, Tuple
 import importlib
 import inspect
 import logging
@@ -21,7 +22,7 @@

 # Third Party
 # pylint: disable=no-name-in-module
-from accelerate import Accelerator
+from accelerate import Accelerator, DistributedType
 from instructlab.dolomite.hf_models import (
     GPTDolomiteConfig,
     export_to_huggingface,
@@ -29,19 +30,23 @@
 )
 from rich.logging import RichHandler
 from torch import distributed as dist
+from torch import nn
 from torch.distributed import get_rank, is_initialized
 from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
     CheckpointImpl,
     apply_activation_checkpointing,
     checkpoint_wrapper,
 )
-from transformers import PreTrainedModel
+from torch.distributed.fsdp import FullStateDictConfig
+from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+from torch.distributed.fsdp import StateDictType
+from transformers import AutoModelForCausalLM, PreTrainedModel, PreTrainedTokenizer
 import numpy as np
 import torch
 import torch.nn.functional as F

 # First Party
-from instructlab.training.config import TrainingArgs
+from instructlab.training.config import DistributedBackend, TrainingArgs


 def check_valid_train_args(train_args: TrainingArgs):
@@ -403,9 +408,133 @@ def patch_target_module(
     setattr(source, obj_name_to_patch, replace_with)


+def wraps(module: nn.Module, wrapped_classes: Tuple[Any]) -> bool:
+    """Checks if a module or its children are an instance of one of the provided classes.
+
+    Args:
+        module (nn.Module): A PyTorch module.
+        wrapped_classes (Tuple): A tuple of potential classes the module could be.
+
+    Returns:
+        bool: True if the module or any of its children are instances of one of `wrapped_classes`, False otherwise.
+    """
+    if isinstance(module, wrapped_classes):
+        return True
+
+    for m in module.children():
+        if wraps(m, wrapped_classes):
+            return True
+
+    return False
+
+
+def create_lora_config(model: PreTrainedModel, args: Namespace) -> "peft.LoraConfig":
+    # if lora
+    # Third Party
+    from peft import LoraConfig
+
+    # ensure we select only the modules that exist in the model
+    proj_layers = get_projection_layer_names(model)
+    if not args.lora_target_modules:
+        print(
+            f"WARNING: lora_target_modules was not specified, defaulting to all of the model's projection modules"
+        )
+        if not proj_layers:
+            raise RuntimeError("could not find any projection layers in the model")
+        args.__dict__["lora_target_modules"] = proj_layers
+    else:
+        # when the user specifies the module, we should verify that they align with what's in the model
+        lora_target_modules_set = set(args.lora_target_modules)
+        diff = lora_target_modules_set - set(proj_layers)
+        layers_to_target = lora_target_modules_set - diff
+        if len(diff) == len(args.lora_target_modules):
+            raise ValueError(
+                f"None of the modules you requested exist in the model.\nRequested modules: {args.lora_target_modules}; Available modules: {proj_layers}.\nThis is usually a misconfiguration error. Consider omitting your `lora_target_modules` list to have these discovered automatically."
+            )
+        if diff:
+            print(
+                f"\033[33mWARNING: the following modules were targeted for LoRA but are not present in the model: {list(diff)}. Applying LoRA only to {list(layers_to_target)} modules.\033[0m"
+            )
+            args.__dict__["lora_target_modules"] = list(layers_to_target)
+
+    return LoraConfig(
+        lora_alpha=args.lora_alpha,
+        lora_dropout=args.lora_dropout,
+        r=args.lora_r,
+        bias="none",
+        task_type="CAUSAL_LM",
+        target_modules=args.lora_target_modules,
+    )
+
+
+def save_fsdp_lora_model(
+    args: Namespace,
+    model: FSDP,
+    tokenizer: PreTrainedTokenizer,
+    accelerator: Accelerator,
+    output_dir: Path,
+):
+    """Given a LoRA model wrapped by FSDP and Accelerate, save a full copy of the original
+    model with the trained LoRA adapters merged into the copy.
+
+    This function creates a full copy of the model being trained and stores it in CPU memory.
+    If encountering OOM errors on CPU, this is likely a culprit.
+
+    Args:
+        args (Namespace): Args received by the ArgumentParser.
+        model (FSDP): FSDP model as prepared by `accelerate.Accelerator`
+        accelerator (Accelerator): The given accelerator object.
+    """
+    # Third Party
+    from peft import LoraConfig, LoraModel
+
+    if accelerator.distributed_type != DistributedType.FSDP:
+        raise RuntimeError(
+            "`save_fsdp_lora_model` was called when FSDP was not being used."
+        )
+    if not wraps(model, FSDP):
+        raise RuntimeError(
+            "`save_fsdp_lora_model` was called but provided model is not an FSDP model."
+        )
+    if not wraps(model, LoraModel):
+        raise RuntimeError(
+            "`save_fsdp_lora_model` was called but provided model is not a LoRA model."
+        )
+
+    # okay now that validation is out of the way, we are free to implement saving
+    lora_conf: LoraConfig = args.lora_config
+    sd_config = FullStateDictConfig(offload_to_cpu=True, rank0_only=True)
+    with FSDP.state_dict_type(model, StateDictType.FULL_STATE_DICT, sd_config):
+        state = model.state_dict()
+
+    # When training a LoRA with FSDP and Accelerate, you cannot directly merge the adapters into
+    # the model wrapped by FSDP.
To get around this limitation, we get a copy of the state dict + # create an identical model on CPU, load the state dict into the CPU model, merge the adapters + # and save the model to disk. + if accelerator.is_main_process: + # remove device_map from args list so we can load the model on CPU + old_device_map = args.base_model_args.pop("device_map", None) + model_copy = AutoModelForCausalLM.from_pretrained( + **args.base_model_args, device_map="cpu" + ) + model_copy = LoraModel(model_copy, lora_conf, "default") + model_copy.load_state_dict(state) + model_copy.merge_and_unload(progressbar=True) + model_copy.save_pretrained(output_dir, safe_serialization=True) + model.config.to_json_file(f"{output_dir}/config.json") + tokenizer.save_pretrained(output_dir) + del model_copy + if old_device_map: + # return the previous device_map so it can be used later on if needed + args.base_model_args["device_map"] = old_device_map + + dist.barrier() + + def prepare_peft_model( - model, + model: PreTrainedModel, peft_config, + distributed_backend: str, gradient_checkpointing=True, gradient_checkpointing_kwargs={"use_reentrant": True}, mixed_precision="bf16", @@ -413,6 +542,7 @@ def prepare_peft_model( # will guard this # Third Party from peft import ( + LoraModel, PeftConfig, PeftModel, get_peft_model, @@ -454,7 +584,11 @@ def make_inputs_require_grad(module, input, output): make_inputs_require_grad ) - model = get_peft_model(model, peft_config) + if distributed_backend == DistributedBackend.FSDP.value: + # FSDP doesn't like `get_peft_model` as it leads to dtype mismatches + model = LoraModel(model, peft_config, "default") + else: + model = get_peft_model(model, peft_config) if mixed_precision == "bf16" and getattr(model, "is_loaded_in_4bit", False): peft_module_casting_to_bf16(model) @@ -729,7 +863,7 @@ def _copy_no_lora_dict(state_dict): def save_dict_accelerate( - accelerator, + accelerator: Accelerator, state_to_save, save_directory, max_shard_size="5GB", @@ -780,6 +914,18 @@ def save_hf_format_accelerate( CONFIG_NAME = "config.json" output_config_file = output_dir / CONFIG_NAME + # XXX(osilkin): LoRA + FSDP requires a different saving path than the others + # so we set this variable and use it to avoid those paths further down. + is_fsdp_lora = is_lora and accelerator.distributed_type == DistributedType.FSDP + if is_fsdp_lora: + save_fsdp_lora_model( + args=args, + model=model, + tokenizer=tokenizer, + accelerator=accelerator, + output_dir=output_dir, + ) + get_state_dict_unpatched = accelerator.get_state_dict def _get_state_dict_patched(model, unwrap=False): @@ -787,7 +933,7 @@ def _get_state_dict_patched(model, unwrap=False): accelerator.get_state_dict = _get_state_dict_patched - if accelerator.is_main_process: + if not is_fsdp_lora and accelerator.is_main_process: if is_lora: model.module.merge_adapter() model_state = model.module.state_dict() From 200d70763082d0a86032a36d9db590ed2066ace8 Mon Sep 17 00:00:00 2001 From: Oleg S <97077423+RobotSail@users.noreply.github.com> Date: Tue, 5 Nov 2024 17:37:55 -0500 Subject: [PATCH 34/48] chore: add smoketest for lora+fsdp This commit adds a smoketest for testing LoRA + FSDP. 
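
As a rough illustration (not part of the patch itself): assuming the smoketest
script's first two positional arguments select the distributed framework and
the GPU count, as the `NUM_GPUS="${2:-${DEFAULT_GPUS}}"` line later in this
series suggests, the new case could be exercised locally like so:

```bash
# Hypothetical invocation; the argument order and values are assumptions,
# not stated in this patch. A later patch in this series wires
# test_standard_loop_fsdp_lora into the script's main().
cd tests
./smoketest.sh fsdp 2
```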
Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com> --- tests/smoketest.sh | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/smoketest.sh b/tests/smoketest.sh index 9bdb0df4..a54c9764 100755 --- a/tests/smoketest.sh +++ b/tests/smoketest.sh @@ -191,6 +191,34 @@ function test_standard_loop_noflashattention_nogranite () { # --is_granite } + +############################################################################## +# Validates the pathing logic for FSDP & LoRA. +# A valid run should result in a model with all adapters merged +# with the base model. +############################################################################## +function test_standard_loop_fsdp_lora() { + torchrun \ + --standalone \ + --nproc_per_node="${NUM_GPUS}" \ + main_ds.py \ + --model_name_or_path="${MODEL_NAME}" \ + --is_granite \ + --data_path="${COMPUTED_DATA_PATH}" \ + --output_dir="${CHECKPOINTS_DIR}" \ + --num_epochs=1 \ + --effective_batch_size=128 \ + --save_samples=0 \ + --checkpoint_at_epoch \ + --accelerate_full_state_at_epoch \ + --distributed_training_framework="${DISTRIB_FRAMEWORK}" \ + --max_batch_len="${MAX_BATCH_LEN}" \ + --is_granite \ + --lora_r=4 \ + --lora_alpha=32 \ + --lora_dropout=0.1 +} + function main () { setup_tmpdir From 6ca54fb2f01c49830760db42436a4d95922e6fcd Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Thu, 7 Nov 2024 17:33:23 -0500 Subject: [PATCH 35/48] Update default chat template to Granite 3.0 template and update token processing (#319) Handles the new roles and chat template included in Granite 3.0 models * Add new template: In progress Signed-off-by: Mustafa Eyceoz * First pass role token handling Signed-off-by: Mustafa Eyceoz * Quick list concat Signed-off-by: Mustafa Eyceoz * Add pretraining role Signed-off-by: Mustafa Eyceoz * remove TODOs Signed-off-by: Mustafa Eyceoz * More forgiving newline buffer Signed-off-by: Mustafa Eyceoz * Make sure dolomite conversion isn't always attempted Signed-off-by: Mustafa Eyceoz * Add tool response role handling Signed-off-by: Mustafa Eyceoz * Added tool sp token Signed-off-by: Mustafa Eyceoz * Fix sp token retrieval Signed-off-by: Mustafa Eyceoz --------- Signed-off-by: Mustafa Eyceoz --- .../chat_templates/ibm_generic_tmpl.py | 40 ++++++++++----- .../chat_templates/ibm_legacy_tmpl.py | 30 +++++++++++ src/instructlab/training/data_process.py | 51 +++++++++++++++++-- src/instructlab/training/tokenizer_utils.py | 3 ++ src/instructlab/training/utils.py | 6 ++- 5 files changed, 111 insertions(+), 19 deletions(-) create mode 100644 src/instructlab/training/chat_templates/ibm_legacy_tmpl.py diff --git a/src/instructlab/training/chat_templates/ibm_generic_tmpl.py b/src/instructlab/training/chat_templates/ibm_generic_tmpl.py index 5eaf795e..73a21652 100644 --- a/src/instructlab/training/chat_templates/ibm_generic_tmpl.py +++ b/src/instructlab/training/chat_templates/ibm_generic_tmpl.py @@ -4,27 +4,41 @@ from instructlab.training.tokenizer_utils import SpecialTokens, TokenInfo SPECIAL_TOKENS = SpecialTokens( - system=TokenInfo("<|system|>", add_to_tokenizer=True), - user=TokenInfo("<|user|>", add_to_tokenizer=True), - assistant=TokenInfo("<|assistant|>", add_to_tokenizer=True), - eos=TokenInfo("<|endoftext|>", add_to_tokenizer=True), - pad=TokenInfo("<|pad|>", add_to_tokenizer=True), - bos=TokenInfo("<|begginingoftext|>", add_to_tokenizer=True), + start_role=TokenInfo("<|start_of_role|>", add_to_tokenizer=True), + end_role=TokenInfo("<|end_of_role|>", add_to_tokenizer=True), + 
tool=TokenInfo("<|tool_call|>", add_to_tokenizer=True), + eos=TokenInfo("<|end_of_text|>", add_to_tokenizer=True), + bos=TokenInfo("<|end_of_text|>", add_to_tokenizer=True), + pad=TokenInfo("<|end_of_text|>", add_to_tokenizer=True), ) CHAT_TEMPLATE = ( + "{%- if tools %}" + "{{ '<|start_of_role|>available_tools<|end_of_role|>\n' }}" + "{% for tool in tools %}" + "{{ tool | tojson(indent=4) }}" + "{% if not loop.last %}" + "{{- '\n\n' }}" + "{% endif %}" + "{% endfor %}" + "{{ '<|end_of_text|>\n' }}" + "{% endif %}" "{% for message in messages %}" - "{% if message['role'] == 'pretraining' %}" - "{{'<|pretrain|>' + message['content'] + '<|endoftext|>' + '<|/pretrain|>' }}" - "{% elif message['role'] == 'system' %}" - "{{'<|system|>'+ '\n' + message['content'] + '\n'}}" + "{% if message['role'] == 'system' %}" + "{{ '<|start_of_role|>system<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}" + "{% elif message['role'] == 'pretraining' %}" + "{{ '<|pretrain|>' + message['content'] + '<|end_of_text|>' + '<|/pretrain|>'}}" "{% elif message['role'] == 'user' %}" - "{{'<|user|>' + '\n' + message['content'] + '\n'}}" + "{{ '<|start_of_role|>user<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}" "{% elif message['role'] == 'assistant' %}" - "{{'<|assistant|>' + '\n' + message['content'] + '<|endoftext|>' + ('' if loop.last else '\n')}}" + "{{ '<|start_of_role|>assistant<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}" + "{% elif message['role'] == 'assistant_tool_call' %}" + "{{ '<|start_of_role|>assistant<|end_of_role|><|tool_call|>' + message['content'] + '<|end_of_text|>\n' }}" + "{% elif message['role'] == 'tool_response' %}" + "{{ '<|start_of_role|>tool_response<|end_of_role|>' + message['content'] + '<|end_of_text|>\n' }}" "{% endif %}" "{% if loop.last and add_generation_prompt %}" - "{{ '<|assistant|>' + '\n' }}" + "{{ '<|start_of_role|>assistant<|end_of_role|>' }}" "{% endif %}" "{% endfor %}" ) diff --git a/src/instructlab/training/chat_templates/ibm_legacy_tmpl.py b/src/instructlab/training/chat_templates/ibm_legacy_tmpl.py new file mode 100644 index 00000000..5eaf795e --- /dev/null +++ b/src/instructlab/training/chat_templates/ibm_legacy_tmpl.py @@ -0,0 +1,30 @@ +# SPDX-License-Identifier: Apache-2.0 + +# First Party +from instructlab.training.tokenizer_utils import SpecialTokens, TokenInfo + +SPECIAL_TOKENS = SpecialTokens( + system=TokenInfo("<|system|>", add_to_tokenizer=True), + user=TokenInfo("<|user|>", add_to_tokenizer=True), + assistant=TokenInfo("<|assistant|>", add_to_tokenizer=True), + eos=TokenInfo("<|endoftext|>", add_to_tokenizer=True), + pad=TokenInfo("<|pad|>", add_to_tokenizer=True), + bos=TokenInfo("<|begginingoftext|>", add_to_tokenizer=True), +) + +CHAT_TEMPLATE = ( + "{% for message in messages %}" + "{% if message['role'] == 'pretraining' %}" + "{{'<|pretrain|>' + message['content'] + '<|endoftext|>' + '<|/pretrain|>' }}" + "{% elif message['role'] == 'system' %}" + "{{'<|system|>'+ '\n' + message['content'] + '\n'}}" + "{% elif message['role'] == 'user' %}" + "{{'<|user|>' + '\n' + message['content'] + '\n'}}" + "{% elif message['role'] == 'assistant' %}" + "{{'<|assistant|>' + '\n' + message['content'] + '<|endoftext|>' + ('' if loop.last else '\n')}}" + "{% endif %}" + "{% if loop.last and add_generation_prompt %}" + "{{ '<|assistant|>' + '\n' }}" + "{% endif %}" + "{% endfor %}" +) diff --git a/src/instructlab/training/data_process.py b/src/instructlab/training/data_process.py index 10214e9d..389a8cb0 100644 --- 
a/src/instructlab/training/data_process.py +++ b/src/instructlab/training/data_process.py @@ -28,7 +28,11 @@ def check_valid_sample( if len(whole_sentence_tk) >= max_len or len(whole_sentence_tk) < 20: return False # last token should be eos_token - if not eos_tk[0] in (whole_sentence_tk[-1], whole_sentence_tk[-2]): + if not eos_tk[0] in ( + whole_sentence_tk[-1], + whole_sentence_tk[-2], + whole_sentence_tk[-3], + ): return False # NOTE - below checks are no longer strictly required, but we may want to revisit to make sure there's nothing we need to bring back in validity checking @@ -61,6 +65,7 @@ def unmask_message_content( system_tokens, pretrain_token, pretrain_end_token, + tool_resp_tokens=None, ): """ Create labels for tokens in a sequence with special handling for pretraining tokens and role-specific sequences. @@ -130,6 +135,10 @@ def find_longest_match(start_idx, sequences): default=None, ) + special_sequences = [user_tokens, assist_tokens, system_tokens] + if tool_resp_tokens: + special_sequences.append(tool_resp_tokens) + in_pretraining = False unmasking = False i = 0 @@ -143,7 +152,7 @@ def find_longest_match(start_idx, sequences): i += 1 continue - match = find_longest_match(i, [user_tokens, assist_tokens, system_tokens]) + match = find_longest_match(i, special_sequences) if match: unmasking = match == assist_tokens i += len(match) @@ -167,8 +176,6 @@ def find_longest_match(start_idx, sequences): ] # Assertions - special_sequences = [user_tokens, assist_tokens, system_tokens] - # 1. No special sequence of tokens should be unmasked for i in range(len(final_sentence_tk)): for seq in special_sequences: @@ -229,10 +236,43 @@ def main(args: DataProcessArgs): CHAT_TEMPLATE, SPECIAL_TOKENS = retrieve_chat_template(args.chat_tmpl_path) tokenizer = setup_tokenizer(args.model_path, SPECIAL_TOKENS, CHAT_TEMPLATE) - system_tk, user_tk, assistant_tk, eos_tk, pad_tk, bos_tk = [ + ( + system_tk, + user_tk, + assistant_tk, + eos_tk, + pad_tk, + bos_tk, + start_role_tk, + end_role_tk, + _, + ) = [ get_sp_token(tokenizer, getattr(SPECIAL_TOKENS, sp).token) for sp in SPECIAL_TOKENS.__annotations__.keys() ] + if start_role_tk and end_role_tk: + system_tk = ( + start_role_tk + + tokenizer.encode("system", add_special_tokens=False) + + end_role_tk + ) + user_tk = ( + start_role_tk + + tokenizer.encode("user", add_special_tokens=False) + + end_role_tk + ) + assistant_tk = ( + start_role_tk + + tokenizer.encode("assistant", add_special_tokens=False) + + end_role_tk + ) + tool_resp_tk = ( + start_role_tk + + tokenizer.encode("tool_response", add_special_tokens=False) + + end_role_tk + ) + else: + tool_resp_tk = None log_rank_0( f"Special tokens: eos: {eos_tk}, pad: {pad_tk}, bos: {bos_tk}, system: {system_tk}, user: {user_tk}, assistant: {assistant_tk}" ) @@ -324,6 +364,7 @@ def main(args: DataProcessArgs): system_tokens=system_tk, pretrain_token=get_sp_token(tokenizer, "<|pretrain|>")[0], pretrain_end_token=get_sp_token(tokenizer, "<|/pretrain|>")[0], + tool_resp_tokens=tool_resp_tk, ) print("\033[92munmasking the appropriate message content...\033[0m") data_with_labels = data_with_input_ids.map( diff --git a/src/instructlab/training/tokenizer_utils.py b/src/instructlab/training/tokenizer_utils.py index 45ad4699..f142dec6 100644 --- a/src/instructlab/training/tokenizer_utils.py +++ b/src/instructlab/training/tokenizer_utils.py @@ -25,6 +25,9 @@ class SpecialTokens: eos: TokenInfo = field(default_factory=lambda: TokenInfo("")) pad: TokenInfo = field(default_factory=lambda: TokenInfo("")) bos: 
TokenInfo = field(default_factory=lambda: TokenInfo("")) + start_role: TokenInfo = field(default_factory=lambda: TokenInfo("")) + end_role: TokenInfo = field(default_factory=lambda: TokenInfo("")) + tool: TokenInfo = field(default_factory=lambda: TokenInfo("")) def get_tokens_to_add(self) -> List[str]: return [ diff --git a/src/instructlab/training/utils.py b/src/instructlab/training/utils.py index 41b410c7..b6f655bf 100644 --- a/src/instructlab/training/utils.py +++ b/src/instructlab/training/utils.py @@ -761,7 +761,6 @@ def save_hf_format_accelerate( tokenizer, accelerator: Accelerator, samples_seen, - convert_dolomite=True, is_lora=False, ): log_rank_0( @@ -770,6 +769,11 @@ def save_hf_format_accelerate( ) start = time.time() + if args.model_type in ("gpt_megatron", "gpt_dolomite"): + convert_dolomite = False + else: + convert_dolomite = True + final_output_dir = Path(args.output_dir) / "hf_format" / f"samples_{samples_seen}" if args.use_dolomite and convert_dolomite: tmpdir = TemporaryDirectory("w") # pylint: disable=consider-using-with From 24591a7160cd3af83a11ca2897dd1d7328bb0e0b Mon Sep 17 00:00:00 2001 From: Oleg S <97077423+RobotSail@users.noreply.github.com> Date: Fri, 8 Nov 2024 17:07:47 +0000 Subject: [PATCH 36/48] feat: add total_samples as a field to logs being emitted Today we emit a number of important values in the logs, but we do not explicitly announce how many samples we are planning on iterating through. This commit emits total_samples as a field in both the initial log as well as the subsequent logs during training. This way, this value can be accessed consistently by any program interested in displaying information about its progress for convenience. Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com> --- src/instructlab/training/main_ds.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py index 7910c341..5eabb384 100644 --- a/src/instructlab/training/main_ds.py +++ b/src/instructlab/training/main_ds.py @@ -18,6 +18,7 @@ # pylint: disable=no-name-in-module from instructlab.dolomite.hf_models import GPTDolomiteForCausalLM +from torch.utils.data import DataLoader from tqdm import tqdm from transformers import AutoModelForCausalLM, get_scheduler import torch @@ -325,7 +326,7 @@ def train( lr_scheduler, accelerator: Accelerator, tokenizer, - train_loader, + train_loader: DataLoader, grad_accum, metric_logger, ): @@ -457,6 +458,7 @@ def train( "total_loss": float(log_loss / num_loss_counted_tokens), "samples_seen": samples_seen, "gradnorm": global_grad_norm, + "total_samples": len(train_loader.dataset), # "weight_norm": weight_norm, } ) @@ -620,6 +622,7 @@ def main(args): "num_batches": len(train_loader), "avg_samples_per_batch": len(dataset) / len(train_loader), "samples_per_gpu": args.samples_per_gpu, + "total_samples": len(dataset), # emit the total number of samples } ) From a50929ffd69a865859d2f339a0f51be270ae2f5c Mon Sep 17 00:00:00 2001 From: Jaideep Rao Date: Thu, 7 Nov 2024 10:55:28 -0500 Subject: [PATCH 37/48] chore: move token classes into chat templates Signed-off-by: Jaideep Rao --- .../chat_templates/ibm_generic_tmpl.py | 2 +- .../chat_templates/ibm_legacy_tmpl.py | 2 +- .../training/chat_templates/mistral_tmpl.py | 2 +- .../training/chat_templates/utils.py | 29 ++++++++++++++++++ src/instructlab/training/tokenizer_utils.py | 30 ------------------- 5 files changed, 32 insertions(+), 33 deletions(-) create mode 100644 
src/instructlab/training/chat_templates/utils.py diff --git a/src/instructlab/training/chat_templates/ibm_generic_tmpl.py b/src/instructlab/training/chat_templates/ibm_generic_tmpl.py index 73a21652..1403276b 100644 --- a/src/instructlab/training/chat_templates/ibm_generic_tmpl.py +++ b/src/instructlab/training/chat_templates/ibm_generic_tmpl.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # First Party -from instructlab.training.tokenizer_utils import SpecialTokens, TokenInfo +from instructlab.training.chat_templates.utils import SpecialTokens, TokenInfo SPECIAL_TOKENS = SpecialTokens( start_role=TokenInfo("<|start_of_role|>", add_to_tokenizer=True), diff --git a/src/instructlab/training/chat_templates/ibm_legacy_tmpl.py b/src/instructlab/training/chat_templates/ibm_legacy_tmpl.py index 5eaf795e..0f09468f 100644 --- a/src/instructlab/training/chat_templates/ibm_legacy_tmpl.py +++ b/src/instructlab/training/chat_templates/ibm_legacy_tmpl.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # First Party -from instructlab.training.tokenizer_utils import SpecialTokens, TokenInfo +from instructlab.training.chat_templates.utils import SpecialTokens, TokenInfo SPECIAL_TOKENS = SpecialTokens( system=TokenInfo("<|system|>", add_to_tokenizer=True), diff --git a/src/instructlab/training/chat_templates/mistral_tmpl.py b/src/instructlab/training/chat_templates/mistral_tmpl.py index dda051b3..6c1e8757 100644 --- a/src/instructlab/training/chat_templates/mistral_tmpl.py +++ b/src/instructlab/training/chat_templates/mistral_tmpl.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # First Party -from instructlab.training.tokenizer_utils import SpecialTokens, TokenInfo +from instructlab.training.chat_templates.utils import SpecialTokens, TokenInfo SPECIAL_TOKENS = SpecialTokens( bos=TokenInfo("", add_to_tokenizer=True), diff --git a/src/instructlab/training/chat_templates/utils.py b/src/instructlab/training/chat_templates/utils.py new file mode 100644 index 00000000..c0e62796 --- /dev/null +++ b/src/instructlab/training/chat_templates/utils.py @@ -0,0 +1,29 @@ +# Standard +from dataclasses import dataclass, field +from typing import List + + +@dataclass +class TokenInfo: + token: str + add_to_tokenizer: bool = False + + +@dataclass +class SpecialTokens: + system: TokenInfo = field(default_factory=lambda: TokenInfo("")) + user: TokenInfo = field(default_factory=lambda: TokenInfo("")) + assistant: TokenInfo = field(default_factory=lambda: TokenInfo("")) + eos: TokenInfo = field(default_factory=lambda: TokenInfo("")) + pad: TokenInfo = field(default_factory=lambda: TokenInfo("")) + bos: TokenInfo = field(default_factory=lambda: TokenInfo("")) + start_role: TokenInfo = field(default_factory=lambda: TokenInfo("")) + end_role: TokenInfo = field(default_factory=lambda: TokenInfo("")) + tool: TokenInfo = field(default_factory=lambda: TokenInfo("")) + + def get_tokens_to_add(self) -> List[str]: + return [ + token_info.token + for token_info in self.__dict__.values() + if token_info.add_to_tokenizer and token_info.token + ] diff --git a/src/instructlab/training/tokenizer_utils.py b/src/instructlab/training/tokenizer_utils.py index f142dec6..d6c55e7e 100644 --- a/src/instructlab/training/tokenizer_utils.py +++ b/src/instructlab/training/tokenizer_utils.py @@ -1,9 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -# Standard -from dataclasses import dataclass, field -from typing import List - # Third Party from transformers import AutoTokenizer, PreTrainedTokenizer @@ -11,32 +7,6 @@ from 
instructlab.training.utils import log_rank_0 -@dataclass -class TokenInfo: - token: str - add_to_tokenizer: bool = False - - -@dataclass -class SpecialTokens: - system: TokenInfo = field(default_factory=lambda: TokenInfo("")) - user: TokenInfo = field(default_factory=lambda: TokenInfo("")) - assistant: TokenInfo = field(default_factory=lambda: TokenInfo("")) - eos: TokenInfo = field(default_factory=lambda: TokenInfo("")) - pad: TokenInfo = field(default_factory=lambda: TokenInfo("")) - bos: TokenInfo = field(default_factory=lambda: TokenInfo("")) - start_role: TokenInfo = field(default_factory=lambda: TokenInfo("")) - end_role: TokenInfo = field(default_factory=lambda: TokenInfo("")) - tool: TokenInfo = field(default_factory=lambda: TokenInfo("")) - - def get_tokens_to_add(self) -> List[str]: - return [ - token_info.token - for token_info in self.__dict__.values() - if token_info.add_to_tokenizer and token_info.token - ] - - def setup_tokenizer( model_name_or_path, SPECIAL_TOKENS, CHAT_TEMPLATE ) -> PreTrainedTokenizer: From 5417952a86db74fef902158721d5caa47d18df95 Mon Sep 17 00:00:00 2001 From: abdullah-ibm Date: Thu, 7 Nov 2024 13:38:39 +0300 Subject: [PATCH 38/48] fix: pylint fixes Signed-off-by: abdullah-ibm --- src/instructlab/training/main_ds.py | 44 ++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py index 54d80e5e..72d8357b 100644 --- a/src/instructlab/training/main_ds.py +++ b/src/instructlab/training/main_ds.py @@ -15,25 +15,28 @@ from accelerate import Accelerator try: + # Third Party from deepspeed.ops.adam import DeepSpeedCPUAdam except ImportError: DeepSpeedCPUAdam = None - local_rank = int(os.getenv('LOCAL_RANK', None)) - if __name__ == '__main__' and (not local_rank or local_rank == 0): - print("DeepSpeed CPU Optimizer is not available. Some features may be unavailable.") + local_rank = int(os.getenv("LOCAL_RANK", "0")) + if __name__ == "__main__" and (not local_rank or local_rank == 0): + print( + "DeepSpeed CPU Optimizer is not available. Some features may be unavailable." + ) try: + # Third Party from deepspeed.ops.adam import FusedAdam from deepspeed.runtime.zero.utils import ZeRORuntimeException except ImportError: FusedAdam = None ZeRORuntimeException = None - local_rank = int(os.getenv('LOCAL_RANK', None)) - if __name__ == '__main__' and (not local_rank or local_rank == 0): + local_rank = int(os.getenv("LOCAL_RANK", "0")) + if __name__ == "__main__" and (not local_rank or local_rank == 0): print("DeepSpeed is not available. 
Some features may be unavailable.") -# pylint: disable=no-name-in-module -from instructlab.training.confg import DistributedBackend +# Third Party from instructlab.dolomite.hf_models import GPTDolomiteForCausalLM from tqdm import tqdm from transformers import AutoModelForCausalLM, get_scheduler @@ -43,6 +46,8 @@ # First Party from instructlab.training import config from instructlab.training.async_logger import AsyncStructuredLogger + +# pylint: disable=no-name-in-module from instructlab.training.config import ( DataProcessArgs, DistributedBackend, @@ -533,11 +538,19 @@ def main(args): # Third Party import yaml - if args.distributed_training_framework == 'deepspeed' and not FusedAdam: - raise ImportError("DeepSpeed was selected but we cannot import the `FusedAdam` optimizer") + if args.distributed_training_framework == "deepspeed" and not FusedAdam: + raise ImportError( + "DeepSpeed was selected but we cannot import the `FusedAdam` optimizer" + ) - if args.distributed_training_framework == 'deepspeed' and args.cpu_offload_optimizer and not DeepSpeedCPUAdam: - raise ImportError("DeepSpeed was selected and CPU offloading was requested, but DeepSpeedCPUAdam could not be imported. This likely means you need to build DeepSpeed with the CPU adam flags.") + if ( + args.distributed_training_framework == "deepspeed" + and args.cpu_offload_optimizer + and not DeepSpeedCPUAdam + ): + raise ImportError( + "DeepSpeed was selected and CPU offloading was requested, but DeepSpeedCPUAdam could not be imported. This likely means you need to build DeepSpeed with the CPU adam flags." + ) metric_logger = AsyncStructuredLogger( args.output_dir @@ -761,11 +774,14 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None: # deepspeed options if train_args.distributed_backend == DistributedBackend.DeepSpeed: if not FusedAdam: - raise ImportError("DeepSpeed was selected as the distributed backend, but FusedAdam could not be imported. Please double-check that DeepSpeed is installed correctly") + raise ImportError( + "DeepSpeed was selected as the distributed backend, but FusedAdam could not be imported. Please double-check that DeepSpeed is installed correctly" + ) if train_args.deepspeed_options.cpu_offload_optimizer and not DeepSpeedCPUAdam: - raise ImportError("DeepSpeed CPU offloading was enabled, but DeepSpeedCPUAdam could not be imported. This is most likely because DeepSpeed was not built with CPU Adam. Please rebuild DeepSpeed to have CPU Adam, or disable CPU offloading.") - + raise ImportError( + "DeepSpeed CPU offloading was enabled, but DeepSpeedCPUAdam could not be imported. This is most likely because DeepSpeed was not built with CPU Adam. Please rebuild DeepSpeed to have CPU Adam, or disable CPU offloading." 
+        )

     if train_args.deepspeed_options.save_samples:
         command.append(f"--save_samples_ds={train_args.deepspeed_options.save_samples}")
     if train_args.deepspeed_options.cpu_offload_optimizer:

From ea3ec9c927931ef0dc33a952a33a982ae1e562a2 Mon Sep 17 00:00:00 2001
From: Oleg S <97077423+RobotSail@users.noreply.github.com>
Date: Tue, 12 Nov 2024 14:01:44 +0000
Subject: [PATCH 39/48] fix: incorrect enum selection

Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com>
---
 src/instructlab/training/main_ds.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py
index 63c92784..4c04da0f 100644
--- a/src/instructlab/training/main_ds.py
+++ b/src/instructlab/training/main_ds.py
@@ -775,7 +775,7 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None:
     )

     # deepspeed options
-    if train_args.distributed_backend == DistributedBackend.DeepSpeed:
+    if train_args.distributed_backend == DistributedBackend.DEEPSPEED:
         if not FusedAdam:
             raise ImportError(
                 "DeepSpeed was selected as the distributed backend, but FusedAdam could not be imported. Please double-check that DeepSpeed is installed correctly"

From a6e1f77c0eb79a64fcccc0aacdbb2d4791bbf974 Mon Sep 17 00:00:00 2001
From: Oleg S <97077423+RobotSail@users.noreply.github.com>
Date: Tue, 12 Nov 2024 14:50:35 +0000
Subject: [PATCH 40/48] docs: include docs on installing deepspeed w/ cpuadam

Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com>
---
 README.md | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 27219af9..c6435542 100644
--- a/README.md
+++ b/README.md
@@ -121,7 +121,36 @@ allow you to customize aspects of the ZeRO stage 2 optimizer.

 For more information about DeepSpeed, see
 [deepspeed.ai](https://www.deepspeed.ai/)

-#### `FSDPOptions`
+#### DeepSpeed with CPU Offloading
+
+When using DeepSpeed with CPU offloading, you'll usually encounter an error indicating that the CPU Adam optimizer it needs doesn't exist. To resolve this, follow these steps:
+
+**Rebuild DeepSpeed with CPUAdam**:
+
+You'll need to rebuild DeepSpeed in order for the optimizer to be present:
+
+```bash
+# uninstall deepspeed & reinstall with the flags for installing CPUAdam
+pip uninstall deepspeed
+DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 pip install deepspeed --no-deps
+```
+
+**Ensure `-lcurand` is linked correctly**:
+
+A problem we commonly encounter is that the linker will not find `-lcurand` when
+DeepSpeed recompiles. To resolve this, find the location of the `libcurand.so` file on your machine:
+
+```bash
+find / -name 'libcurand*.so*' 2>/dev/null
+```
+
+If `libcurand.so` is not present in the `/usr/lib64` directory, you'll need to add a symlink. For example:
+
+```bash
+sudo ln -s /usr/local/cuda/lib64/libcurand.so.10 /usr/lib64/libcurand.so
+```
+
+### `FSDPOptions`

 Like DeepSpeed, we only expose a number of parameters for you to modify with
 FSDP. They are listed below:

From c809c736a65335d39e3f09196301d2337c428d0a Mon Sep 17 00:00:00 2001
From: Oleg S <97077423+RobotSail@users.noreply.github.com>
Date: Fri, 25 Oct 2024 18:08:04 -0400
Subject: [PATCH 41/48] feat: add log exporting to e2e tests

Currently, the training library runs through a series of end-to-end tests
which ensure there are no bugs in the code being tested. However, we do not
perform any form of validation to ensure that the training logic and quality
have not diminished. This presents an issue where we can be "correct" in the
sense that no hard errors are hit, while invisible bugs which cause models to
regress in training quality, or other bugs that plague the models themselves,
still seep in.

This commit fixes that problem by introducing the ability to export the
training loss data itself from the test and rendering the loss curve using
matplotlib.

Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com>
---
 .github/workflows/e2e-nvidia-l4-x1.yml   |  66 +++++++-
 .github/workflows/e2e-nvidia-l40s-x4.yml |  69 ++++++++-
 requirements-dev.txt                     |   2 +
 scripts/create-loss-graph.py             | 186 +++++++++++++++++++++++
 4 files changed, 321 insertions(+), 2 deletions(-)
 create mode 100644 scripts/create-loss-graph.py

diff --git a/.github/workflows/e2e-nvidia-l4-x1.yml b/.github/workflows/e2e-nvidia-l4-x1.yml
index 16574190..6bf16187 100644
--- a/.github/workflows/e2e-nvidia-l4-x1.yml
+++ b/.github/workflows/e2e-nvidia-l4-x1.yml
@@ -41,6 +41,11 @@ jobs:
       label: ${{ steps.start-ec2-runner.outputs.label }}
       ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
     steps:
+      - name: "Harden Runner"
+        uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+        with:
+          egress-policy: audit
+
       - name: Configure AWS credentials
         uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
         with:
@@ -78,6 +83,11 @@ jobs:
     permissions: {}
     steps:
+      - name: "Harden Runner"
+        uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+        with:
+          egress-policy: audit
+
       - name: Install Packages
         run: |
           cat /etc/os-release
@@ -141,7 +151,22 @@ jobs:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
         run: |
           . venv/bin/activate
-          ./scripts/e2e-ci.sh -m
+          # set preserve to true so we can retain the logs
+          ./scripts/e2e-ci.sh -mp
+
+          # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
+          # and we know that it will be written into a directory created by `mktemp -d`.
+ # Given this information, we can use the following command to find the file: + log_file=$(find /tmp -name "training_params_and_metrics_global0.jsonl") + mv "${log_file}" training-log.jsonl + + - name: Upload training logs + uses: actions/upload-artifact@v4 + with: + name: training-log.jsonl + path: ./instructlab/training-log.jsonl + retention-days: 1 + overwrite: true stop-medium-ec2-runner: needs: @@ -150,6 +175,11 @@ jobs: runs-on: ubuntu-latest if: ${{ always() }} steps: + - name: "Harden Runner" + uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1 + with: + egress-policy: audit + - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: @@ -164,6 +194,40 @@ jobs: github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} label: ${{ needs.start-medium-ec2-runner.outputs.label }} ec2-instance-id: ${{ needs.start-medium-ec2-runner.outputs.ec2-instance-id }} + + - name: Download loss data + id: download-logs + uses: actions/download-artifact@v4 + with: + name: training-log.jsonl + path: downloaded-data + + - name: Install dependencies + run: | + pip install -r requirements-dev.txt + + - name: Try to upload to s3 + id: upload-s3 + continue-on-error: true + run: | + output_file='./test.md' + python scripts/create-loss-graph.py \ + --log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \ + --output-file "${output_file}" \ + --aws-region "${{ vars.AWS_REGION }}" \ + --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \ + --base-branch "${{ github.event.pull_request.base.ref }}" \ + --pr-number "${{ github.event.pull_request.number }}" \ + --head-sha "${{ github.event.pull_request.head.sha }}" \ + --origin-repository "${{ github.repository }}" + + cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}" + + - name: Check S3 upload status + if: steps.upload-s3.outcome == 'failure' + run: | + echo "::warning::Failed to upload loss graph to S3. This won't block the workflow, but you may want to investigate." + echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}" e2e-medium-workflow-complete: # we don't want to block PRs on failed EC2 cleanup diff --git a/.github/workflows/e2e-nvidia-l40s-x4.yml b/.github/workflows/e2e-nvidia-l40s-x4.yml index 27290c30..52015d5f 100644 --- a/.github/workflows/e2e-nvidia-l40s-x4.yml +++ b/.github/workflows/e2e-nvidia-l40s-x4.yml @@ -19,6 +19,12 @@ jobs: label: ${{ steps.start-ec2-runner.outputs.label }} ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }} steps: + - name: "Harden Runner" + # v2.10.1 + uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 + with: + egress-policy: audit + - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: @@ -54,6 +60,11 @@ jobs: pull-requests: write steps: + - name: "Harden Runner" + # v2.10.1 + uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 + with: + egress-policy: audit - name: Install Packages run: | cat /etc/os-release @@ -170,7 +181,23 @@ jobs: HF_TOKEN: ${{ secrets.HF_TOKEN }} run: | . venv/bin/activate - ./scripts/e2e-ci.sh -l + + # set preserve to true so we can retain the logs + ./scripts/e2e-ci.sh -lp + + # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python + # and we know that it will be written into a directory created by `mktemp -d`. 
+ # Given this information, we can use the following command to find the file: + log_file=$(find /tmp -name "training_params_and_metrics_global0.jsonl") + mv "${log_file}" training-log.jsonl + + - name: Upload training logs + uses: actions/upload-artifact@v4 + with: + name: training-log.jsonl + path: ./instructlab/training-log.jsonl + retention-days: 1 + overwrite: true - name: Add comment to PR if the workflow failed if: failure() && steps.check_pr.outputs.is_pr == 'true' @@ -221,6 +248,12 @@ jobs: runs-on: ubuntu-latest if: ${{ always() }} steps: + - name: "Harden Runner" + # v2.10.1 + uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 + with: + egress-policy: audit + - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2 with: @@ -235,3 +268,37 @@ jobs: github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} label: ${{ needs.start-large-ec2-runner.outputs.label }} ec2-instance-id: ${{ needs.start-large-ec2-runner.outputs.ec2-instance-id }} + + - name: Download loss data + id: download-logs + uses: actions/download-artifact@v4 + with: + name: training-log.jsonl + path: downloaded-data + + - name: Install dependencies + run: | + pip install -r requirements-dev.txt + + - name: Try to upload to s3 + id: upload-s3 + continue-on-error: true + run: | + output_file='./test.md' + python scripts/create-loss-graph.py \ + --log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \ + --output-file "${output_file}" \ + --aws-region "${{ vars.AWS_REGION }}" \ + --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \ + --base-branch "${{ github.event.pull_request.base.ref }}" \ + --pr-number "${{ github.event.pull_request.number }}" \ + --head-sha "${{ github.event.pull_request.head.sha }}" \ + --origin-repository "${{ github.repository }}" + + - name: Check S3 upload status + if: steps.upload-s3.outcome == 'failure' + run: | + echo "::warning::Failed to upload loss graph to S3. This won't block the workflow, but you may want to investigate." 
+ echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}" + + cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}" \ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt index a0dff1ed..f77c807f 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -2,6 +2,8 @@ -r requirements.txt +matplotlib +numpy pre-commit>=3.0.4,<5.0 pylint>=2.16.2,<4.0 pylint-pydantic diff --git a/scripts/create-loss-graph.py b/scripts/create-loss-graph.py new file mode 100644 index 00000000..e4ab1254 --- /dev/null +++ b/scripts/create-loss-graph.py @@ -0,0 +1,186 @@ +# SPDX-License-Identifier: Apache-2.0 +# Standard +from argparse import ArgumentParser +from pathlib import Path +from subprocess import run +from typing import Dict, List +import json + +# Third Party +from matplotlib import pyplot as plt +from pydantic import BaseModel + + +class Arguments(BaseModel): + log_file: str | None = None + output_file: str + aws_region: str + bucket_name: str + base_branch: str + pr_number: str + head_sha: str + origin_repository: str + + +def render_image(loss_data: List[float], outfile: Path) -> str: + # create the plot + plt.figure() + plt.plot(loss_data) + plt.xlabel("Steps") + plt.ylabel("Loss") + plt.title("Training performance over fixed dataset") + + if outfile.exists(): + outfile.unlink() + + plt.savefig(outfile, format="png") + + +def contents_from_file(log_file: Path) -> List[Dict]: + if not log_file.exists(): + raise FileNotFoundError(f"Log file {log_file} does not exist") + if log_file.is_dir(): + raise ValueError(f"Log file {log_file} is a directory") + with open(log_file, "r") as f: + return [json.loads(l) for l in f.read().splitlines()] + + +def read_loss_data(log_file: Path) -> List[float]: + if not log_file: + raise ValueError("log_file must be provided when source is file") + contents = contents_from_file(log_file) + + # select the loss data + loss_data = [item["total_loss"] for item in contents if "total_loss" in item] + + if not loss_data: + raise ValueError("Loss data is empty") + + # ensure that the loss data is valid + if not all(isinstance(l, float) for l in loss_data): + raise ValueError("Loss data must be a list of floats") + + return loss_data + + +def write_to_s3( + file: Path, + bucket_name: str, + destination: str, +): + if not file.exists(): + raise RuntimeError(f"File {file} does not exist") + + s3_path = f"s3://{bucket_name}/{destination}" + results = run( + ["aws", "s3", "cp", str(file), s3_path], capture_output=True, check=True + ) + if results.returncode != 0: + raise RuntimeError(f"failed to upload to s3: {results.stderr.decode('utf-8')}") + else: + print(results.stdout.decode("utf-8")) + + +def get_destination_path(base_ref: str, pr_number: str, head_sha: str): + return f"pulls/{base_ref}/{pr_number}/{head_sha}/loss-graph.png" + + +def write_md_file( + output_file: Path, url: str, pr_number: str, head_sha: str, origin_repository: str +): + commit_url = f"https://github.com/{origin_repository}/commit/{head_sha}" + md_template = f""" +# Loss Graph for PR {args.pr_number} ([{args.head_sha[:7]}]({commit_url})) + +![Loss Graph]({url}) +""" + output_file.write_text(md_template, encoding="utf-8") + + +def get_url(bucket_name: str, destination: str, aws_region: str) -> str: + return f"https://{bucket_name}.s3.{aws_region}.amazonaws.com/{destination}" + + +def main(args: Arguments): + # first things first, we create the png file to upload to S3 + log_file = Path(args.log_file) + loss_data = read_loss_data(log_file=log_file) + output_image = 
Path("/tmp/loss-graph.png") + output_file = Path(args.output_file) + render_image(loss_data=loss_data, outfile=output_image) + destination_path = get_destination_path( + base_ref=args.base_branch, pr_number=args.pr_number, head_sha=args.head_sha + ) + write_to_s3( + file=output_image, bucket_name=args.bucket_name, destination=destination_path + ) + s3_url = get_url( + bucket_name=args.bucket_name, + destination=destination_path, + aws_region=args.aws_region, + ) + write_md_file( + output_file=output_file, + url=s3_url, + pr_number=args.pr_number, + head_sha=args.head_sha, + origin_repository=args.origin_repository, + ) + print(f"Loss graph uploaded to '{s3_url}'") + print(f"Markdown file written to '{output_file}'") + + +if __name__ == "__main__": + parser = ArgumentParser() + + parser.add_argument( + "--log-file", + type=str, + required=True, + help="The log file to read the loss data from.", + ) + parser.add_argument( + "--output-file", + type=str, + required=True, + help="The output file where the resulting markdown will be written.", + ) + parser.add_argument( + "--aws-region", + type=str, + required=True, + help="S3 region to which the bucket belongs.", + ) + parser.add_argument( + "--bucket-name", type=str, required=True, help="The S3 bucket name" + ) + parser.add_argument( + "--base-branch", + type=str, + required=True, + help="The base branch being merged to.", + ) + parser.add_argument("--pr-number", type=str, required=True, help="The PR number") + parser.add_argument( + "--head-sha", type=str, required=True, help="The head SHA of the PR" + ) + parser.add_argument( + "--origin-repository", + type=str, + required=True, + help="The repository to which the originating branch belongs to.", + ) + + args = parser.parse_args() + + arguments = Arguments( + log_file=args.log_file, + output_file=args.output_file, + aws_region=args.aws_region, + bucket_name=args.bucket_name, + base_branch=args.base_branch, + pr_number=args.pr_number, + head_sha=args.head_sha, + origin_repository=args.origin_repository, + ) + main(arguments) From b945f565ef36fa4bb56cfd6b83bd223f2980cfc7 Mon Sep 17 00:00:00 2001 From: Oleg S <97077423+RobotSail@users.noreply.github.com> Date: Thu, 7 Nov 2024 15:20:46 +0000 Subject: [PATCH 42/48] Update tests/smoketest.sh to support FSDP + LoRA as a testing path. Additionally introuce a max_seq_len parameter to support testing on lower-end hardware. Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com> --- src/instructlab/training/utils.py | 25 ++++++++++++++++++++++++- tests/smoketest.sh | 5 ++--- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/src/instructlab/training/utils.py b/src/instructlab/training/utils.py index 41148c8d..e6b62f48 100644 --- a/src/instructlab/training/utils.py +++ b/src/instructlab/training/utils.py @@ -46,7 +46,11 @@ import torch.nn.functional as F # First Party -from instructlab.training.config import DistributedBackend, TrainingArgs +from instructlab.training.config import ( + DistributedBackend, + QuantizeDataType, + TrainingArgs, +) def check_valid_train_args(train_args: TrainingArgs): @@ -76,6 +80,25 @@ def check_valid_train_args(train_args: TrainingArgs): "\033[33m WARNING: is_padding_free is being deprecated due to adoption of the default padding-free support in Hugging Face Transformers. As such, this flag is non-functional in 0.6.0 and beyond. 
If you would like to use the older Dolomite padding-free implementation, please set use_dolomite moving forward.\033[0m" ) + if ( + train_args.accelerate_full_state_at_epoch + and train_args.lora + and train_args.lora.rank > 0 + ): + raise ValueError( + "`accelerate_full_state_at_epoch` is not currently supported when training LoRA models." + ) + + if ( + train_args.lora + and train_args.lora.rank > 0 + and train_args.lora.quantize_data_type != QuantizeDataType.NONE + and train_args.distributed_backend == DistributedBackend.FSDP.value + ): + raise ValueError( + "Quantization is not supported when training LoRA models with FSDP. For quantized LoRA training, please switch to DeepSpeed." + ) + def retrieve_chat_template(chat_tmpl_path): try: diff --git a/tests/smoketest.sh b/tests/smoketest.sh index a54c9764..6918fb03 100755 --- a/tests/smoketest.sh +++ b/tests/smoketest.sh @@ -19,6 +19,7 @@ NUM_GPUS="${2:-${DEFAULT_GPUS}}" # ############### User-modifiable parameters ############### # Change these as needed MAX_BATCH_LEN=60000 +MAX_SEQ_LEN=4096 NUM_SAMPLES_TRAINED_ON=5000 # upper-bound on training dataset size. # ############### Test Functions ############### @@ -203,17 +204,14 @@ function test_standard_loop_fsdp_lora() { --nproc_per_node="${NUM_GPUS}" \ main_ds.py \ --model_name_or_path="${MODEL_NAME}" \ - --is_granite \ --data_path="${COMPUTED_DATA_PATH}" \ --output_dir="${CHECKPOINTS_DIR}" \ --num_epochs=1 \ --effective_batch_size=128 \ --save_samples=0 \ --checkpoint_at_epoch \ - --accelerate_full_state_at_epoch \ --distributed_training_framework="${DISTRIB_FRAMEWORK}" \ --max_batch_len="${MAX_BATCH_LEN}" \ - --is_granite \ --lora_r=4 \ --lora_alpha=32 \ --lora_dropout=0.1 @@ -235,6 +233,7 @@ function main () { test_standard_loop_nongranite _cleanup_saved_checkpoints test_standard_loop + test_standard_loop_fsdp_lora } main From 3679098b9094ef9468b1854129c56c73772c9721 Mon Sep 17 00:00:00 2001 From: Jaideep Rao Date: Wed, 13 Nov 2024 15:45:09 -0500 Subject: [PATCH 43/48] feat: add toggle to pick legacy chat tmpl for granite (#336) Signed-off-by: Jaideep Rao --- src/instructlab/training/config.py | 3 +++ src/instructlab/training/main_ds.py | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/src/instructlab/training/config.py b/src/instructlab/training/config.py index 51e9752e..bcc06fa2 100644 --- a/src/instructlab/training/config.py +++ b/src/instructlab/training/config.py @@ -156,6 +156,9 @@ class TrainingArgs(BaseModel): os.path.dirname(__file__), "chat_templates/ibm_generic_tmpl.py" ) + # this field determines if ibm_legacy_tmpl should be used instead + use_legacy_tmpl: bool = False + # this field specifies the filepath to the training dataset before processing data_path: str ckpt_output_dir: str diff --git a/src/instructlab/training/main_ds.py b/src/instructlab/training/main_ds.py index 4c04da0f..d6f9eeb2 100644 --- a/src/instructlab/training/main_ds.py +++ b/src/instructlab/training/main_ds.py @@ -691,6 +691,12 @@ def run_training(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None: """ check_valid_train_args(train_args) + # switch out generic tmpl for legacy tmpl if requested + if train_args.use_legacy_tmpl: + train_args.chat_tmpl_path = os.path.join( + os.path.dirname(__file__), "chat_templates/ibm_legacy_tmpl.py" + ) + if train_args.process_data: dp.main( DataProcessArgs( From d4570ae0ca6768c4846901ce7d51222198fdb957 Mon Sep 17 00:00:00 2001 From: James Kunstle Date: Tue, 5 Nov 2024 13:29:22 -0800 Subject: [PATCH 44/48] deps: change hf accelerate bound, add 
requirements-hpu.txt Signed-off-by: James Kunstle --- pyproject.toml | 1 + requirements-cuda.txt | 3 +++ requirements-hpu.txt | 2 ++ requirements-rocm.txt | 2 ++ requirements.txt | 4 ++-- 5 files changed, 10 insertions(+), 2 deletions(-) create mode 100644 requirements-hpu.txt diff --git a/pyproject.toml b/pyproject.toml index 93387840..ca053385 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,6 +46,7 @@ package-dir = { "" = "src" } dependencies = { file = ["requirements.txt"] } optional-dependencies.cuda = { file = ["requirements-cuda.txt"] } optional-dependencies.rocm = { file = ["requirements-rocm.txt"] } +optional-dependencies.hpu = { file = ["requirements-hpu.txt"] } [tool.setuptools.packages.find] where = ["src"] diff --git a/requirements-cuda.txt b/requirements-cuda.txt index 1bed61eb..6230797e 100644 --- a/requirements-cuda.txt +++ b/requirements-cuda.txt @@ -1,2 +1,5 @@ flash-attn>=2.4.0 bitsandbytes>=0.43.1 + +# required for FSDP updates +accelerate>=0.34.2 diff --git a/requirements-hpu.txt b/requirements-hpu.txt new file mode 100644 index 00000000..6c9ca0c2 --- /dev/null +++ b/requirements-hpu.txt @@ -0,0 +1,2 @@ +# required for optimum-habana's deps +accelerate>=0.33.0 \ No newline at end of file diff --git a/requirements-rocm.txt b/requirements-rocm.txt index 21612c36..9d72f4d5 100644 --- a/requirements-rocm.txt +++ b/requirements-rocm.txt @@ -1 +1,3 @@ flash-attn>=2.6.2,<2.7.0 +# required for FSDP updates +accelerate>=0.34.2 diff --git a/requirements.txt b/requirements.txt index 84da8460..7c6a00ca 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ py-cpuinfo # replace custom pytorch images with the 2.3.0 torch>=2.3.0a0 transformers>=4.45.2 -accelerate>=0.34.2 + datasets>=2.15.0 numba # Note: numpy ranges copied from instructlab/instructlab @@ -22,6 +22,6 @@ trl>=0.9.4 peft pydantic>=2.7.0 -# deepspeed needs to be at the bottom or it'll break during installation +# deepspeed needs to be at the end or it'll break stuff. deepspeed>=0.14.3 aiofiles>=23.2.1 From 64ee152f6b45789f284962d2b5b55ab5a0aaf2bf Mon Sep 17 00:00:00 2001 From: Oleg <97077423+RobotSail@users.noreply.github.com> Date: Thu, 14 Nov 2024 12:59:52 -0500 Subject: [PATCH 45/48] fix: disable loss exporting for medium training job (#347) The medium training job is not currently running through the training library and therefore does not emit the same logs. This commit disables the log exporting logic as it is currently breaking. We intend to re-introduce this logic into CI once the medium job is aligned with it. Signed-off-by: Oleg S <97077423+RobotSail@users.noreply.github.com> --- .github/workflows/e2e-nvidia-l4-x1.yml | 84 +++++++++++++------------- 1 file changed, 43 insertions(+), 41 deletions(-) diff --git a/.github/workflows/e2e-nvidia-l4-x1.yml b/.github/workflows/e2e-nvidia-l4-x1.yml index 6bf16187..ef511319 100644 --- a/.github/workflows/e2e-nvidia-l4-x1.yml +++ b/.github/workflows/e2e-nvidia-l4-x1.yml @@ -154,19 +154,21 @@ jobs: # set preserve to true so we can retain the logs ./scripts/e2e-ci.sh -mp + # HACK(osilkin): The above test runs the medium workflow test which does not actually test the training library. + # Therefore we must disable the upload of the training logs, as they will not exist in the same location. # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python # and we know that it will be written into a directory created by `mktemp -d`. 
           # Given this information, we can use the following command to find the file:
-          log_file=$(find /tmp -name "training_params_and_metrics_global0.jsonl")
-          mv "${log_file}" training-log.jsonl
+          # log_file=$(find /tmp -name "training_params_and_metrics_global0.jsonl")
+          # mv "${log_file}" training-log.jsonl
 
-      - name: Upload training logs
-        uses: actions/upload-artifact@v4
-        with:
-          name: training-log.jsonl
-          path: ./instructlab/training-log.jsonl
-          retention-days: 1
-          overwrite: true
+      # - name: Upload training logs
+      #   uses: actions/upload-artifact@v4
+      #   with:
+      #     name: training-log.jsonl
+      #     path: ./instructlab/training-log.jsonl
+      #     retention-days: 1
+      #     overwrite: true
 
   stop-medium-ec2-runner:
     needs:
@@ -195,39 +197,39 @@ jobs:
         label: ${{ needs.start-medium-ec2-runner.outputs.label }}
         ec2-instance-id: ${{ needs.start-medium-ec2-runner.outputs.ec2-instance-id }}
 
-      - name: Download loss data
-        id: download-logs
-        uses: actions/download-artifact@v4
-        with:
-          name: training-log.jsonl
-          path: downloaded-data
-
-      - name: Install dependencies
-        run: |
-          pip install -r requirements-dev.txt
+      # - name: Download loss data
+      #   id: download-logs
+      #   uses: actions/download-artifact@v4
+      #   with:
+      #     name: training-log.jsonl
+      #     path: downloaded-data
+
+      # - name: Install dependencies
+      #   run: |
+      #     pip install -r requirements-dev.txt
 
-      - name: Try to upload to s3
-        id: upload-s3
-        continue-on-error: true
-        run: |
-          output_file='./test.md'
-          python scripts/create-loss-graph.py \
-            --log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \
-            --output-file "${output_file}" \
-            --aws-region "${{ vars.AWS_REGION }}" \
-            --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
-            --base-branch "${{ github.event.pull_request.base.ref }}" \
-            --pr-number "${{ github.event.pull_request.number }}" \
-            --head-sha "${{ github.event.pull_request.head.sha }}" \
-            --origin-repository "${{ github.repository }}"
-
-          cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}"
-
-      - name: Check S3 upload status
-        if: steps.upload-s3.outcome == 'failure'
-        run: |
-          echo "::warning::Failed to upload loss graph to S3. This won't block the workflow, but you may want to investigate."
-          echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
+      # - name: Try to upload to s3
+      #   id: upload-s3
+      #   continue-on-error: true
+      #   run: |
+      #     output_file='./test.md'
+      #     python scripts/create-loss-graph.py \
+      #       --log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \
+      #       --output-file "${output_file}" \
+      #       --aws-region "${{ vars.AWS_REGION }}" \
+      #       --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
+      #       --base-branch "${{ github.event.pull_request.base.ref }}" \
+      #       --pr-number "${{ github.event.pull_request.number }}" \
+      #       --head-sha "${{ github.event.pull_request.head.sha }}" \
+      #       --origin-repository "${{ github.repository }}"
+
+      #     cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}"
+
+      # - name: Check S3 upload status
+      #   if: steps.upload-s3.outcome == 'failure'
+      #   run: |
+      #     echo "::warning::Failed to upload loss graph to S3. This won't block the workflow, but you may want to investigate."
+      #     echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
 
   e2e-medium-workflow-complete:
     # we don't want to block PRs on failed EC2 cleanup

From af8d76c2a1b2df9a21896978f5d6cf72fe6e36ff Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 15 Nov 2024 12:16:58 +0000
Subject: [PATCH 46/48] build(deps): Bump DavidAnson/markdownlint-cli2-action

Bumps [DavidAnson/markdownlint-cli2-action](https://github.com/davidanson/markdownlint-cli2-action) from 17.0.0 to 18.0.0.
- [Release notes](https://github.com/davidanson/markdownlint-cli2-action/releases)
- [Commits](https://github.com/davidanson/markdownlint-cli2-action/compare/db43aef879112c3119a410d69f66701e0d530809...eb5ca3ab411449c66620fe7f1b3c9e10547144b0)

---
updated-dependencies:
- dependency-name: DavidAnson/markdownlint-cli2-action
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot]
---
 .github/workflows/docs.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 79e1ac6b..34e2afbb 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -41,6 +41,6 @@ jobs:
         with:
           fetch-depth: 0
       - name: "Check Markdown documents"
-        uses: DavidAnson/markdownlint-cli2-action@db43aef879112c3119a410d69f66701e0d530809 # v17.0.0
+        uses: DavidAnson/markdownlint-cli2-action@eb5ca3ab411449c66620fe7f1b3c9e10547144b0 # v18.0.0
         with:
           globs: '**/*.md'

From 5643e042f3f3b6e98ce6409f66ad9c90069ad6b6 Mon Sep 17 00:00:00 2001
From: Mustafa Eyceoz
Date: Fri, 15 Nov 2024 11:00:23 -0500
Subject: [PATCH 47/48] Update Dependencies to Move DeepSpeed to CUDA Extras

Signed-off-by: Mustafa Eyceoz
---
 requirements-cuda.txt                         |  3 +++
 requirements.txt                              |  3 ---
 src/instructlab/training/config.py            |  2 +-
 src/instructlab/training/setup_accelerator.py |  9 +++++--
 src/instructlab/training/utils.py             | 25 +++++++++++--------
 5 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/requirements-cuda.txt b/requirements-cuda.txt
index 6230797e..245aa0df 100644
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -1,5 +1,8 @@
 flash-attn>=2.4.0
 bitsandbytes>=0.43.1
 
+# available as an option for NVIDIA; FSDP is still the default
+deepspeed>=0.14.3
+
 # required for FSDP updates
 accelerate>=0.34.2
diff --git a/requirements.txt b/requirements.txt
index 7c6a00ca..d4b7760c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -21,7 +21,4 @@ instructlab-dolomite>=0.2.0
 trl>=0.9.4
 peft
 pydantic>=2.7.0
-
-# deepspeed needs to be at the end or it'll break the installation
-deepspeed>=0.14.3
 aiofiles>=23.2.1
diff --git a/src/instructlab/training/config.py b/src/instructlab/training/config.py
index bcc06fa2..bf43f2eb 100644
--- a/src/instructlab/training/config.py
+++ b/src/instructlab/training/config.py
@@ -195,7 +195,7 @@ class TrainingArgs(BaseModel):
             cpu_offload_params=False, sharding_strategy=ShardingStrategies.SHARD_GRAD_OP
         )
     )
-    distributed_backend: DistributedBackend = DistributedBackend.DEEPSPEED
+    distributed_backend: DistributedBackend = DistributedBackend.FSDP
 
     disable_flash_attn: Optional[bool] = False
diff --git a/src/instructlab/training/setup_accelerator.py b/src/instructlab/training/setup_accelerator.py
index 239367f4..c7d079e6 100644
--- a/src/instructlab/training/setup_accelerator.py
+++ b/src/instructlab/training/setup_accelerator.py
@@ -96,8 +96,13 @@ def get_fsdp_config(args, model: PreTrainedModel):
 
 def setup_accelerator(args, model: PreTrainedModel, grad_accum):
     if args.distributed_training_framework == "deepspeed":
-        # Third Party
-        from deepspeed import DeepSpeedEngine
+        try:
+            # Third Party
+            from deepspeed import DeepSpeedEngine
+        except ImportError as exc:
+            raise ImportError(
+                "DeepSpeed selected as distributed framework, but not installed"
+            ) from exc
 
         # patch deepspeed to work with quantized models.
         if args.lora_quant_bits is not None:
diff --git a/src/instructlab/training/utils.py b/src/instructlab/training/utils.py
index 6a9d6f84..41ec413f 100644
--- a/src/instructlab/training/utils.py
+++ b/src/instructlab/training/utils.py
@@ -653,16 +653,21 @@ def prepare_universal_checkpoint_from_latest(output_dir):
     start = time.time()
 
     if torch.distributed.get_rank() == 0:
-        # Third Party
-        from deepspeed.checkpoint import DeepSpeedCheckpoint
-        from deepspeed.checkpoint.ds_to_universal import (
-            PARAM_SHAPES,
-            UNIVERSAL_CHECKPOINT_INFO,
-            _check_for_required_state,
-            _extract_zero_shard_files,
-            _merge_tp_slice_files,
-            _save_optimizer_state,
-        )
+        try:
+            # Third Party
+            from deepspeed.checkpoint import DeepSpeedCheckpoint
+            from deepspeed.checkpoint.ds_to_universal import (
+                PARAM_SHAPES,
+                UNIVERSAL_CHECKPOINT_INFO,
+                _check_for_required_state,
+                _extract_zero_shard_files,
+                _merge_tp_slice_files,
+                _save_optimizer_state,
+            )
+        except ImportError as exc:
+            raise ImportError(
+                "DeepSpeed-specific checkpoints cannot be saved without DeepSpeed>=0.14.3 installed"
+            ) from exc
 
         # read the latest file to get the step folder
         latest_file = output_dir / "latest"
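With the patch above, FSDP becomes the default distributed backend and DeepSpeed is imported only once it is explicitly selected, so installs without the CUDA extra never need it. A minimal standalone sketch of the guarded-import pattern follows; require_deepspeed is a hypothetical helper, and the pip hint assumes the library is published as instructlab-training.

    def require_deepspeed():
        """Import DeepSpeed lazily, mirroring the try/except blocks the patch
        adds to setup_accelerator.py and utils.py."""
        try:
            # Third Party
            import deepspeed
        except ImportError as exc:
            raise ImportError(
                "DeepSpeed selected as distributed framework, but not installed; "
                "assuming the CUDA extra is published, it can be pulled in with "
                "pip install 'instructlab-training[cuda]'"
            ) from exc
        return deepspeed

The same pattern keeps prepare_universal_checkpoint_from_latest importable everywhere and fails with an actionable message only when a DeepSpeed checkpoint conversion is actually requested.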
From fdfb4fd2805298bb99aa77b427c3c519ab77643b Mon Sep 17 00:00:00 2001
From: Mustafa Eyceoz
Date: Fri, 15 Nov 2024 11:18:06 -0500
Subject: [PATCH 48/48] Cap accelerate to avoid breaking changes

Signed-off-by: Mustafa Eyceoz
---
 requirements-cuda.txt | 2 +-
 requirements-hpu.txt  | 2 +-
 requirements-rocm.txt | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/requirements-cuda.txt b/requirements-cuda.txt
index 245aa0df..d030d340 100644
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -5,4 +5,4 @@ bitsandbytes>=0.43.1
 deepspeed>=0.14.3
 
 # required for FSDP updates
-accelerate>=0.34.2
+accelerate>=0.34.2,<1.1.0
diff --git a/requirements-hpu.txt b/requirements-hpu.txt
index 6c9ca0c2..73db0ef6 100644
--- a/requirements-hpu.txt
+++ b/requirements-hpu.txt
@@ -1,2 +1,2 @@
 # required for optimum-habana's deps
-accelerate>=0.33.0
\ No newline at end of file
+accelerate>=0.33.0,<1.1.0
\ No newline at end of file
diff --git a/requirements-rocm.txt b/requirements-rocm.txt
index 9d72f4d5..bb9d572e 100644
--- a/requirements-rocm.txt
+++ b/requirements-rocm.txt
@@ -1,3 +1,3 @@
 flash-attn>=2.6.2,<2.7.0
 # required for FSDP updates
-accelerate>=0.34.2
+accelerate>=0.34.2,<1.1.0
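Taken together, the last two patches leave the following dependency picture; the distribution name instructlab-training is an assumption, while the version bounds are copied from the requirements files above. The helper below is an illustrative way to check an existing environment against the new accelerate cap.

    # Assumed install matrix after patches 47 and 48:
    #   pip install instructlab-training          -> requirements.txt (no deepspeed, no direct accelerate pin)
    #   pip install 'instructlab-training[cuda]'  -> adds deepspeed>=0.14.3 and accelerate>=0.34.2,<1.1.0
    #   pip install 'instructlab-training[rocm]'  -> adds accelerate>=0.34.2,<1.1.0
    #   pip install 'instructlab-training[hpu]'   -> adds accelerate>=0.33.0,<1.1.0
    # Standard
    import importlib.metadata

    def accelerate_cap_respected() -> bool:
        """Return True if the installed accelerate honors the <1.1.0 cap."""
        try:
            version = importlib.metadata.version("accelerate")
        except importlib.metadata.PackageNotFoundError:
            return True  # the base install no longer pins accelerate directly
        major, minor = (int(part) for part in version.split(".")[:2])
        return (major, minor) < (1, 1)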