Skip to content

Commit

Permalink
Merge branch 'instructlab:main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
ashna000 authored Nov 22, 2024
2 parents 1a9678f + 84c0f72 commit 405c8d7
Show file tree
Hide file tree
Showing 33 changed files with 2,000 additions and 434 deletions.
10 changes: 5 additions & 5 deletions .github/mergify.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,21 +25,21 @@ pull_request_rules:
- -files~=^\.github/(actions|workflows)/.*\.ya?ml$
- -files~=^\.github/workflows/actionlint\.

# e2e workflow
# e2e medium workflow
- or:
- and:
# note this should match the triggering criteria in 'e2e-nvidia-t4-x1.yml'
- check-success=e2e-workflow-complete
# note this should match the triggering criteria in 'e2e-nvidia-l4-x1.yml'
- check-success~=e2e-medium-workflow-complete
- or:
- files~=\.py$
- files=pyproject.toml
- files~=^requirements.*\.txt$
- files=.github/workflows/e2e-nvidia-t4-x1.yml
- files=.github/workflows/e2e-nvidia-l4-x1.yml
- and:
- -files~=\.py$
- -files=pyproject.toml
- -files~=^requirements.*\.txt$
- -files=.github/workflows/e2e-nvidia-t4-x1.yml
- -files=.github/workflows/e2e-nvidia-l4-x1.yml

# code lint workflow
- or:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/actionlint.dockerfile
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Since dependabot cannot update workflows using docker,
# we use this indirection since dependabot can update this file.
FROM rhysd/actionlint:1.7.3@sha256:7617f05bd698cd2f1c3aedc05bc733ccec92cca0738f3e8722c32c5b42c70ae6
FROM rhysd/actionlint:1.7.4@sha256:82244e1db1c60d82c7792180a48dd0bcb838370bb589d53ff132503fc9485868
2 changes: 1 addition & 1 deletion .github/workflows/actionlint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ jobs:
egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs

- name: "Checkout"
uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0

Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,10 @@ jobs:
with:
egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs
- name: "Checkout"
uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: "Check Markdown documents"
uses: DavidAnson/markdownlint-cli2-action@db43aef879112c3119a410d69f66701e0d530809 # v17.0.0
uses: DavidAnson/markdownlint-cli2-action@eb5ca3ab411449c66620fe7f1b3c9e10547144b0 # v18.0.0
with:
globs: '**/*.md'
241 changes: 241 additions & 0 deletions .github/workflows/e2e-nvidia-l4-x1.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
# SPDX-License-Identifier: Apache-2.0

name: E2E (NVIDIA L4 x1)

on:
  # Every merge commit landing on 'main' or on a release branch starts a run.
  push:
    branches:
      - main
      - release-*
  # Pull requests against those branches start a run only when they change
  # at least one of the paths listed below.
  pull_request_target:
    branches:
      - main
      - release-*
    paths:
      # Keep this list aligned with the merging criteria in 'mergify.yml'.
      - "**.py"
      - "pyproject.toml"
      - "requirements**.txt"
      - ".github/workflows/e2e-nvidia-l4-x1.yml"  # this workflow itself

# One group per workflow and PR (or per ref when there is no PR); a newer run
# in the same group cancels the one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

env:
  LC_ALL: en_US.UTF-8

# Every 'run:' step executes under bash unless a step says otherwise.
defaults:
  run:
    shell: bash

# Workflow-wide token permissions; individual jobs may narrow them further.
permissions:
  contents: read

jobs:
start-medium-ec2-runner:
  # Starts the EC2 instance used for the e2e run. The 'label' output is what
  # 'e2e-medium-test' passes to 'runs-on', and both outputs are handed to
  # 'stop-medium-ec2-runner' for cleanup.
  runs-on: ubuntu-latest
  outputs:
    label: ${{ steps.start-ec2-runner.outputs.label }}
    ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
  steps:
    - name: Harden Runner
      uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7  # v2.10.1
      with:
        egress-policy: audit

    - name: Configure AWS credentials
      uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502  # v4.0.2
      with:
        aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        aws-region: ${{ secrets.AWS_REGION }}

    - name: Start EC2 runner
      id: start-ec2-runner
      uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2  # v2.3.7
      with:
        mode: start
        github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
        ec2-image-id: ${{ vars.AWS_EC2_AMI }}
        ec2-instance-type: g6.8xlarge
        subnet-id: subnet-02d230cffd9385bd4
        security-group-id: sg-06300447c4a5fbef3
        iam-role-name: instructlab-ci-runner
        # JSON list of tags attached to the AWS resources created for this run.
        aws-resource-tags: >
          [
            {"Key": "Name", "Value": "instructlab-ci-github-medium-runner"},
            {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
            {"Key": "GitHubRef", "Value": "${{ github.ref }}"},
            {"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
          ]

e2e-medium-test:
  # Runs the medium e2e script on the EC2 runner started by
  # 'start-medium-ec2-runner'.
  needs:
    - start-medium-ec2-runner
  runs-on: ${{ needs.start-medium-ec2-runner.outputs.label }}

  # This job (e2e) is where untrusted code from PRs is executed, so it must
  # hold no write permissions. The only secret it receives is HF_TOKEN, and
  # only in the "Run e2e test" step below; do not expose any other secret to
  # this job.
  permissions: {}

  steps:
    - name: "Harden Runner"
      uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7  # v2.10.1
      with:
        egress-policy: audit

    - name: Install Packages
      run: |
        cat /etc/os-release
        sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel

    - name: Checkout instructlab/instructlab
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683  # v4.2.2
      with:
        repository: "instructlab/instructlab"
        path: "instructlab"
        # https://github.com/actions/checkout/issues/249
        fetch-depth: 0

    - name: Checkout instructlab/training
      uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683  # v4.2.2
      with:
        repository: "instructlab/training"
        path: "training"
        # https://github.com/actions/checkout/issues/249
        fetch-depth: 0

    - name: Fetch and checkout PR
      if: ${{ github.event_name == 'pull_request_target' }}
      working-directory: ./training
      env:
        # Pass the PR number through the environment instead of expanding the
        # expression inside the script text, so the shell only ever sees it
        # as data.
        PR_NUMBER: ${{ github.event.pull_request.number }}
      run: |
        git fetch origin "pull/${PR_NUMBER}/head:pr-${PR_NUMBER}"
        git checkout "pr-${PR_NUMBER}"

    - name: Install ilab
      working-directory: ./instructlab
      run: |
        export CUDA_HOME="/usr/local/cuda"
        export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
        export PATH="$PATH:$CUDA_HOME/bin"
        python3.11 -m venv --upgrade-deps venv
        . venv/bin/activate
        nvidia-smi
        python3.11 -m pip cache remove llama_cpp_python
        CMAKE_ARGS="-DLLAMA_CUDA=on" python3.11 -m pip install -v .
        # https://github.com/instructlab/instructlab/issues/1821
        # install with Torch and build dependencies installed
        python3.11 -m pip install -v packaging wheel setuptools-scm
        # The extras spec is quoted so bash cannot treat '[cuda]' as a glob
        # character class and expand it against dotfiles in this directory.
        python3.11 -m pip install -v ".[cuda]" -r requirements-vllm-cuda.txt

    - name: Update instructlab-training library
      working-directory: ./training
      run: |
        # Reuse the virtualenv created by the "Install ilab" step.
        . ../instructlab/venv/bin/activate
        pip install -v .
        # Quoted for the same glob-expansion reason as in "Install ilab".
        pip install -v ".[cuda]"

    - name: Check disk
      run: |
        df -h

    - name: Run e2e test
      working-directory: ./instructlab
      env:
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      run: |
        . venv/bin/activate
        # set preserve to true so we can retain the logs
        ./scripts/e2e-ci.sh -mp
# HACK(osilkin): The above test runs the medium workflow test which does not actually test the training library.
# Therefore we must disable the upload of the training logs, as they will not exist in the same location.
# we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
# and we know that it will be written into a directory created by `mktemp -d`.
# Given this information, we can use the following command to find the file:
# log_file=$(find /tmp -name "training_params_and_metrics_global0.jsonl")
# mv "${log_file}" training-log.jsonl
# - name: Upload training logs
# uses: actions/upload-artifact@v4
# with:
# name: training-log.jsonl
# path: ./instructlab/training-log.jsonl
# retention-days: 1
# overwrite: true

stop-medium-ec2-runner:
  # Cleanup job: 'always()' makes it run whatever the result of the two jobs
  # it waits on, and it stops the runner identified by the outputs of
  # 'start-medium-ec2-runner'.
  if: ${{ always() }}
  needs:
    - start-medium-ec2-runner
    - e2e-medium-test
  runs-on: ubuntu-latest
  steps:
    - name: Harden Runner
      uses: step-security/harden-runner@91182cccc01eb5e619899d80e4e971d6181294a7  # v2.10.1
      with:
        egress-policy: audit

    - name: Configure AWS credentials
      uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502  # v4.0.2
      with:
        aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
        aws-region: ${{ secrets.AWS_REGION }}

    - name: Stop EC2 runner
      uses: machulav/ec2-github-runner@1827d6ca7544d7044ddbd2e9360564651b463da2  # v2.3.7
      with:
        mode: stop
        github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
        # Both values come from the outputs of the start job.
        label: ${{ needs.start-medium-ec2-runner.outputs.label }}
        ec2-instance-id: ${{ needs.start-medium-ec2-runner.outputs.ec2-instance-id }}

# - name: Download loss data
# id: download-logs
# uses: actions/download-artifact@v4
# with:
# name: training-log.jsonl
# path: downloaded-data

# - name: Install dependencies
# run: |
# pip install -r requirements-dev.txt

# - name: Try to upload to s3
# id: upload-s3
# continue-on-error: true
# run: |
# output_file='./test.md'
# python scripts/create-loss-graph.py \
# --log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \
# --output-file "${output_file}" \
# --aws-region "${{ vars.AWS_REGION }}" \
# --bucket-name "${{ vars.AWS_S3_LOSS_GRAPHS_BUCKET_NAME }}" \
# --base-branch "${{ github.event.pull_request.base.ref }}" \
# --pr-number "${{ github.event.pull_request.number }}" \
# --head-sha "${{ github.event.pull_request.head.sha }}" \
# --origin-repository "${{ github.repository }}"

# cat "${output_file}" >> "${GITHUB_STEP_SUMMARY}"

# - name: Check S3 upload status
# if: steps.upload-s3.outcome == 'failure'
# run: |
# echo "::warning::Failed to upload loss graph to S3. This won't block the workflow, but you may want to investigate."
# echo "Loss graph upload failed" >> "${GITHUB_STEP_SUMMARY}"

e2e-medium-workflow-complete:
  # Final marker job for the workflow. 'stop-medium-ec2-runner' is left out of
  # 'needs' on purpose: a failed EC2 cleanup should not block PRs.
  needs:
    - start-medium-ec2-runner
    - e2e-medium-test
  runs-on: ubuntu-latest
  steps:
    - name: E2E Workflow Complete
      run: echo "E2E Workflow Complete"
Loading

0 comments on commit 405c8d7

Please sign in to comment.