Merge branch 'instructlab:main' into main

ashna000 · Nov 22, 2024 · 405c8d7 · 405c8d7
2 parents 1a9678f + 84c0f72
commit 405c8d7
Show file tree

Hide file tree

Showing 33 changed files with 2,000 additions and 434 deletions.
diff --git a/.github/mergify.yml b/.github/mergify.yml
@@ -25,21 +25,21 @@ pull_request_rules:
         - -files~=^\.github/(actions|workflows)/.*\.ya?ml$
         - -files~=^\.github/workflows/actionlint\.
 
-    # e2e workflow
+    # e2e medium workflow
     - or:
       - and:
-        # note this should match the triggering criteria in 'e2e-nvidia-t4-x1.yml'
-        - check-success=e2e-workflow-complete
+        # note this should match the triggering criteria in 'e2e-nvidia-l4-x1.yml'
+        - check-success~=e2e-medium-workflow-complete
         - or:
           - files~=\.py$
           - files=pyproject.toml
           - files~=^requirements.*\.txt$
-          - files=.github/workflows/e2e-nvidia-t4-x1.yml
+          - files=.github/workflows/e2e-nvidia-l4-x1.yml
       - and:
         - -files~=\.py$
         - -files=pyproject.toml
         - -files~=^requirements.*\.txt$
-        - -files=.github/workflows/e2e-nvidia-t4-x1.yml
+        - -files=.github/workflows/e2e-nvidia-l4-x1.yml
 
     # code lint workflow
     - or:

diff --git a/.github/workflows/actionlint.dockerfile b/.github/workflows/actionlint.dockerfile
@@ -1,3 +1,3 @@
 # Since dependabot cannot update workflows using docker,
 # we use this indirection since dependabot can update this file.
-FROM rhysd/actionlint:1.7.3@sha256:7617f05bd698cd2f1c3aedc05bc733ccec92cca0738f3e8722c32c5b42c70ae6
+FROM rhysd/actionlint:1.7.4@sha256:82244e1db1c60d82c7792180a48dd0bcb838370bb589d53ff132503fc9485868
diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml
@@ -35,7 +35,7 @@ jobs:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
 
       - name: "Checkout"
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
           fetch-depth: 0
 

diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
@@ -37,10 +37,10 @@ jobs:
         with:
           egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
       - name: "Checkout"
-        uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
           fetch-depth: 0
       - name: "Check Markdown documents"
-        uses: DavidAnson/markdownlint-cli2-action@db43aef879112c3119a410d69f66701e0d530809 # v17.0.0
+        uses: DavidAnson/markdownlint-cli2-action@eb5ca3ab411449c66620fe7f1b3c9e10547144b0 # v18.0.0
         with:
           globs: '**/*.md'
diff --git a/.github/workflows/e2e-nvidia-l4-x1.yml b/.github/workflows/e2e-nvidia-l4-x1.yml
@@ -0,0 +1,241 @@
+# SPDX-License-Identifier: Apache-2.0
+
+# Workflow configuration for the 'medium' e2e test (triggers, concurrency,
+# defaults and token permissions shared by every job below).
+name: E2E (NVIDIA L4 x1)
+
+on:
+  # run against every merge commit to 'main' and release branches
+  push:
+    branches:
+      - main
+      - release-*
+  # only run on PRs that touch files matching the path globs below
+  pull_request_target:
+    branches:
+      - main
+      - release-*
+    paths:
+      # note this should match the merging criteria in 'mergify.yml'
+      - '**.py'
+      - 'pyproject.toml'
+      - 'requirements**.txt'
+      - '.github/workflows/e2e-nvidia-l4-x1.yml' # This workflow
+
+# One group per PR number (or per ref when the event carries no PR, e.g. a
+# push); a newer run in the same group cancels the one still in progress.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+# Environment variables set for every job in this workflow.
+env:
+  LC_ALL: en_US.UTF-8
+
+# 'run' steps use bash unless a step says otherwise.
+defaults:
+  run:
+    shell: bash
+
+# Workflow-wide token permissions: read-only repository contents.
+# 'e2e-medium-test' narrows this further with 'permissions: {}'.
+permissions:
+  contents: read
+jobs:
+  # Runs ec2-github-runner in 'start' mode and publishes the resulting runner
+  # label and EC2 instance id for the jobs that follow.
+  start-medium-ec2-runner:
+    runs-on: ubuntu-latest
+    outputs:
+      # 'label' is what 'e2e-medium-test' uses as its 'runs-on'; both values are
+      # passed back to the action by 'stop-medium-ec2-runner'.
+      label: ${{ steps.start-ec2-runner.outputs.label }}
+      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
+    steps:
+      - name: "Harden Runner"
+        uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+        with:
+          egress-policy: audit
+
+      # Credentials and region all come from repository secrets.
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ secrets.AWS_REGION }}
+
+      - name: Start EC2 runner
+        id: start-ec2-runner
+        uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7
+        with:
+          mode: start
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          # The AMI is a repository variable; instance type, subnet, security
+          # group and IAM role are hard-coded here.
+          ec2-image-id: ${{ vars.AWS_EC2_AMI }}
+          ec2-instance-type: g6.8xlarge
+          subnet-id: subnet-02d230cffd9385bd4
+          security-group-id: sg-06300447c4a5fbef3
+          iam-role-name: instructlab-ci-runner
+          # JSON array of tags handed to the action as one string value.
+          # NOTE(review): 'github.event.number' is presumably empty on 'push'
+          # events, leaving the GitHubPR tag value blank - confirm.
+          aws-resource-tags: >
+            [
+              {"Key": "Name", "Value": "instructlab-ci-github-medium-runner"},
+              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
+              {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
+              {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
+            ]
+  
+  # Runs the instructlab e2e script ('-mp' flags) on the runner whose label was
+  # published by 'start-medium-ec2-runner'. The training library (the PR head on
+  # 'pull_request_target' events) is installed into the instructlab venv first.
+  e2e-medium-test:
+    needs:
+      - start-medium-ec2-runner
+    runs-on: ${{ needs.start-medium-ec2-runner.outputs.label }}
+
+    # It is important that this job has no write permissions and has
+    # no access to any secrets. This part (e2e) is where we are running
+    # untrusted code from PRs.
+    # NOTE(review): the 'Run e2e test' step below exports secrets.HF_TOKEN into
+    # the environment in which that PR code runs, so the token IS reachable
+    # from untrusted code. Confirm the token is minimally scoped, or gate this
+    # job behind an approval.
+    permissions: {}
+
+    steps:
+      - name: "Harden Runner"
+        uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+        with:
+          egress-policy: audit
+
+      - name: Install Packages
+        run: |
+          cat /etc/os-release
+          sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel
+
+      - name: Checkout instructlab/instructlab
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          repository: "instructlab/instructlab"
+          path: "instructlab"
+          # https://github.com/actions/checkout/issues/249
+          fetch-depth: 0
+
+      - name: Checkout instructlab/training
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          repository: "instructlab/training"
+          path: "training"
+          # https://github.com/actions/checkout/issues/249
+          fetch-depth: 0
+
+      # On PR events, replace the default checkout in ./training with the PR head.
+      - name: Fetch and checkout PR
+        if: ${{ github.event_name == 'pull_request_target' }}
+        working-directory: ./training
+        run: |
+          git fetch origin pull/${{ github.event.pull_request.number }}/head:pr-${{ github.event.pull_request.number }}
+          git checkout pr-${{ github.event.pull_request.number }}
+
+      - name: Install ilab
+        working-directory: ./instructlab
+        run: |
+          export CUDA_HOME="/usr/local/cuda"
+          export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
+          export PATH="$PATH:$CUDA_HOME/bin"
+          python3.11 -m venv --upgrade-deps venv
+          . venv/bin/activate
+          nvidia-smi
+          python3.11 -m pip cache remove llama_cpp_python
+
+          CMAKE_ARGS="-DLLAMA_CUDA=on" python3.11 -m pip install -v .
+
+          # https://github.com/instructlab/instructlab/issues/1821
+          # install with Torch and build dependencies installed
+          python3.11 -m pip install -v packaging wheel setuptools-scm
+          # quote the extras spec so bash never treats '[cuda]' as a glob
+          python3.11 -m pip install -v '.[cuda]' -r requirements-vllm-cuda.txt
+
+      # Install the ./training checkout into the venv created by 'Install ilab'.
+      - name: Update instructlab-training library
+        working-directory: ./training
+        run: |
+          . ../instructlab/venv/bin/activate
+          pip install -v .
+          # quote the extras spec so bash never treats '[cuda]' as a glob
+          pip install -v '.[cuda]'
+
+      - name: Check disk
+        run: |
+          df -h
+
+      - name: Run e2e test
+        working-directory: ./instructlab
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: |
+          . venv/bin/activate
+          # set preserve to true so we can retain the logs
+          ./scripts/e2e-ci.sh -mp
+
+          # HACK(osilkin): The above test runs the medium workflow test which does not actually test the training library.
+          #                Therefore we must disable the upload of the training logs, as they will not exist in the same location.
+          # we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
+          # and we know that it will be written into a directory created by `mktemp -d`.
+          # Given this information, we can use the following command to find the file:
+          # log_file=$(find /tmp -name "training_params_and_metrics_global0.jsonl")
+          # mv "${log_file}" training-log.jsonl
+
+      # Disabled together with the log-collection commands above.
+      # - name: Upload training logs
+      #   uses: actions/upload-artifact@v4
+      #   with:
+      #     name: training-log.jsonl
+      #     path: ./instructlab/training-log.jsonl
+      #     retention-days: 1
+      #     overwrite: true
+
+  # Runs ec2-github-runner in 'stop' mode for the runner label / instance id
+  # that 'start-medium-ec2-runner' published.
+  stop-medium-ec2-runner:
+    needs:
+      - start-medium-ec2-runner
+      - e2e-medium-test
+    runs-on: ubuntu-latest
+    # always(): run this cleanup whatever the outcome of the jobs in 'needs'.
+    if: ${{ always() }}
+    steps:
+      - name: "Harden Runner"
+        uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7 # v2.10.1
+        with:
+          egress-policy: audit
+
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: ${{ secrets.AWS_REGION }}
+
+      - name: Stop EC2 runner
+        uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2 # v2.3.7
+        with:
+          mode: stop
+          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
+          label: ${{ needs.start-medium-ec2-runner.outputs.label }}
+          ec2-instance-id: ${{ needs.start-medium-ec2-runner.outputs.ec2-instance-id }}
+
+      # The loss-graph steps below are disabled: the 'Upload training logs' step
+      # in 'e2e-medium-test' is commented out, so no 'training-log.jsonl'
+      # artifact exists to download.
+      # - name: Download loss data
+      #   id: download-logs
+      #   uses: actions/download-artifact@v4
+      #   with:
+      #     name: training-log.jsonl
+      #     path: downloaded-data
+
+      # - name: Install dependencies
+      #   run: |
+      #     pip install -r requirements-dev.txt
+
+      # - name: Try to upload to s3
+      #   id: upload-s3
+      #   continue-on-error: true
+      #   run: |
+      #     output_file='./test.md'
+      #     python scripts/create-loss-graph.py  \
+      #       --log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \
+      #       --output-file "${output_file}" \
+      #       --aws-region "${{ vars.AWS_REGION }}" \
+      #       --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
+      #       --base-branch "${{ github.event.pull_request.base.ref }}" \
+      #       --pr-number "${{ github.event.pull_request.number }}" \
+      #       --head-sha "${{ github.event.pull_request.head.sha }}" \
+      #       --origin-repository "${{ github.repository }}"
+
+      #     cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}"
+
+      # - name: Check S3 upload status
+      #   if: steps.upload-s3.outcome == 'failure'
+      #   run: |
+      #     echo "::warning::Failed to upload loss graph to S3. This won't block the workflow, but you may want to investigate."
+      #     echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"
+
+  # Sentinel job: it only runs once the runner was started and the e2e test
+  # succeeded, giving '.github/mergify.yml' a single check name to match.
+  e2e-medium-workflow-complete:
+    runs-on: ubuntu-latest
+    # 'stop-medium-ec2-runner' is deliberately absent from this list so that a
+    # failed EC2 cleanup does not block PRs.
+    needs:
+      - start-medium-ec2-runner
+      - e2e-medium-test
+    steps:
+      - name: E2E Workflow Complete
+        run: echo "E2E Workflow Complete"