Merge pull request #64 from EleutherAI/main

merge upstream
CERC-AAI · Apr 4, 2024 · 5790435 · 5790435
2 parents 8125ea3 + 01657aa
commit 5790435
Show file tree

Hide file tree

Showing 189 changed files with 108,893 additions and 3,398 deletions.
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
@@ -1 +1 @@
-* @EleutherAI/pm-gptneo
+* @Quentin-Anthony
diff --git a/.github/workflows/coverity_scan.yml b/.github/workflows/coverity_scan.yml
@@ -0,0 +1,60 @@
+name: Coverity  # workflow that captures the GPT-NeoX sources with Coverity and submits them to scan.coverity.com
+on:
+  workflow_dispatch:  # manual trigger only; both inputs are optional and fall back to the defaults below
+    inputs:
+      build_version:  # sent as the `version` form field of the upload
+        description: "Version of GPT-NeoX being submitted for scan"
+        required: false
+        default: "GPT-NeoX build version"
+      build_description:  # sent as the `description` form field of the upload
+        description: "Description of the current build"
+        required: false
+        default: "Current build of GPT-NeoX"
+
+jobs:
+  coverity:
+
+    runs-on: ubuntu-latest
+
+    env:  # Coverity credentials read from `secrets`; the download and upload steps reference them as shell variables
+      COV_USER: ${{ secrets.COV_USER }}
+      COVERITY_PROJECT: ${{ secrets.COVERITY_PROJECT }}
+      COVERITY_TOKEN: ${{ secrets.COVERITY_TOKEN }}
+    steps:
+    - uses: actions/checkout@v2
+      with:
+        path: gpt-neox
+
+    - name: Install utils
+      run: |
+        sudo apt update -y && sudo apt upgrade -y
+        sudo apt install curl jq wget -y
+
+    - name: Coverity Download  # fetch the tool archive with the project token, unpack it into $GITHUB_WORKSPACE/coverity, then run cov-configure for Python and gcc
+      run: |
+        wget https://scan.coverity.com/download/linux64 --post-data "token=$COVERITY_TOKEN&project=$COVERITY_PROJECT" -O coverity_tool.tgz --no-verbose
+        mkdir $GITHUB_WORKSPACE/coverity && tar xvf coverity_tool.tgz -C $GITHUB_WORKSPACE/coverity --strip-components=1
+        $GITHUB_WORKSPACE/coverity/bin/cov-configure --python
+        $GITHUB_WORKSPACE/coverity/bin/cov-configure --gcc
+
+    - name: Coverity Scan and Upload  # capture the checkout into cov-int (no build command), archive it as build-results.bz2, and POST it to Coverity Scan
+      run: |
+        set -x
+        pushd $GITHUB_WORKSPACE
+        cd $GITHUB_WORKSPACE/gpt-neox
+        $GITHUB_WORKSPACE/coverity/bin/cov-build --dir $GITHUB_WORKSPACE/cov-int --no-command --fs-capture-search ./
+        popd
+        tar caf build-results.bz2 cov-int
+        curl --form token="$COVERITY_TOKEN" \
+          --form email="$COV_USER" \
+          --form file=@build-results.bz2 \
+          --form version="${{ inputs.build_version }}" \
+          --form description="${{ inputs.build_description }}" \
+          "https://scan.coverity.com/builds?project=$COVERITY_PROJECT"
+
+    - name: Upload Scan Build as Artifact  # also keep the submitted archive as a workflow artifact named after the commit SHA
+      uses: actions/upload-artifact@v3
+      with:
+        name: coverity-build-${{ github.sha }}
+        path: build-results.bz2
diff --git a/.github/workflows/cpu_ci.yml b/.github/workflows/cpu_ci.yml
@@ -4,7 +4,8 @@ on: "push"
 
 jobs:
   run-tests:
-    runs-on: ubuntu-latest
+    #runs-on: ubuntu-latest
+    runs-on: [ 'test', 'self-hosted' ]
     steps:
       - uses: actions/checkout@v3
 

diff --git a/.github/workflows/cpu_ci_on_pr.yml b/.github/workflows/cpu_ci_on_pr.yml
@@ -0,0 +1,69 @@
+name: "Pull Request CPU Tests"
+
+on:
+  pull_request:
+    paths: # job only triggers when the PR changes files under megatron directory
+      - "megatron/**"
+
+jobs:
+  run-tests:
+    runs-on: [ 'test', 'self-hosted' ]
+    steps:
+      - name: Checkout Repo
+        uses: actions/checkout@v4
+
+      - name: Install Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.8"
+          cache: "pip"
+          cache-dependency-path: "**/requirements*.txt"
+
+      - name: Upgrade Pip
+        run: python -m pip install --upgrade pip
+
+      - name: Set up Docker repository # this should possibly be done by the worker before the job starts in the interest of execution time?
+        run: |
+          # Add Docker's official GPG key:
+          sudo apt-get update -y
+          sudo apt-get install ca-certificates curl -y
+          sudo install -m 0755 -d /etc/apt/keyrings
+          sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
+          sudo chmod a+r /etc/apt/keyrings/docker.asc
+
+          # Add the repository to Apt sources:
+          echo \
+            "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
+            $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
+            sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
+          sudo apt-get update
+      - name: Docker installation # this should possibly be done by the worker before the job starts in the interest of execution time?
+        run: |
+          sudo apt-get install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin -y
+          sudo docker run hello-world
+      - name: Prepare data
+        run: |
+          python prepare_data.py -d ./data
+      - name: Remove previous container  # force-removes $CONTAINER when `docker ps -a` output matches it. NOTE(review): CONTAINER is not set anywhere in this workflow — presumably supplied by the runner environment; confirm
+        run: |
+          if docker ps -a | grep -q "$CONTAINER"; then
+            echo "Container already exists, deleting it..."
+            docker rm -f $CONTAINER
+          fi
+      - name: Create container  # builds and starts the gpt-neox compose service detached, kept alive by `tail -f /dev/null` so later steps can `docker exec` into it
+        run: |
+          export NEOX_DATA_PATH='./data/enwik8'
+          export NEOX_CHECKPOINT_PATH='/mnt/sda/checkpoints' #todo: where do I get this?
+          docker compose run -d --build --name $CONTAINER gpt-neox tail -f /dev/null
+      - name: Install test requirements
+        run: |
+          docker exec $CONTAINER pip install -r /workspace/requirements-dev.txt
+      - name: Execute CPU tests 1  # runs the tests selected by the `cpu` pytest marker inside the container
+        run: |
+          docker exec $CONTAINER sh -c "cd gpt-neox && pytest tests -m cpu"
+      - name: Execute CPU tests 2  # same selection again, with PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python set for the run
+        run: |
+          docker exec $CONTAINER sh -c "cd gpt-neox && PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python pytest tests -m cpu"
+      - name: Generate report  # serves the container's htmlcov directory over HTTP on port 8000. NOTE(review): http.server runs in the foreground until stopped, so this step likely never completes — confirm intent
+        run: |
+          docker exec $CONTAINER python -m http.server --directory htmlcov 8000
diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml
@@ -4,18 +4,27 @@ on: [pull_request]
 
 jobs:
   pre-commit:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
     steps:
       - uses: actions/checkout@v2
       - uses: actions/setup-python@v4
         with:
-          python-version: 3.8
+          python-version: "3.10"  # must be quoted: a bare 3.10 is a YAML float and is read as 3.1
           cache: "pip"
           cache-dependency-path: "**/requirements*.txt"
+      # Need the right version of clang-format
+      - run: pip install -r requirements/requirements-dev.txt
       - uses: pre-commit/action@v2.0.3  # NOTE(review): tag reconstructed from e-mail-obfuscated text — confirm against the repository
+      -
+        name: Set up Docker Buildx  # prepares a Buildx builder for the build step that follows
+        uses: docker/setup-buildx-action@v1
+      -
+        name: Docker build  # no `with:` inputs are given, so the action runs entirely on its defaults
+        id: docker_build
+        uses: docker/build-push-action@v2
 
   update-documentation:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
     steps:
       - uses: actions/checkout@v3
         with:

diff --git a/.gitignore b/.gitignore
@@ -137,6 +137,7 @@ data/**/*.bin
 data/**/*.json*
 data/**/*.txt
 data/**/*.gz
+data/**/*.zip
 data/**/*.np*
 data/**/*.npy
 checkpoints/
@@ -150,3 +151,7 @@ test_logs/
 logs/
 tensorboard/
 src/
+
+# test data files
+tests/data/*.bin
+tests/data/*.idx
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -8,18 +8,19 @@ repos:
           - id: check-yaml
           - id: destroyed-symlinks
           - id: end-of-file-fixer
-            exclude: docs/CNAME
+            exclude: ^(docs/CNAME|configs/neox_arguments\.md)  # no trailing slash after CNAME: "docs/CNAME/" would never match the docs/CNAME file itself
           - id: fix-byte-order-marker
           - id: fix-encoding-pragma
             args: [--remove]
           - id: mixed-line-ending
             args: [--fix=lf]
           - id: requirements-txt-fixer
           - id: trailing-whitespace
-    - repo: https://gitlab.com/daverona/pre-commit-cpp
+            exclude: ^(docs/CNAME|configs/neox_arguments\.md)  # no trailing slash after CNAME: "docs/CNAME/" would never match the docs/CNAME file itself
+    - repo: https://gitlab.com/daverona/pre-commit/cpp
       rev: 0.8.0
       hooks:
-          - id: clang-format # formatter of C/C++ code based on a style guide: LLVM, Google, Chromium, Mozilla, and WebKit available
+          - id: clang-format  # formatter of C/C++ code based on a style guide: LLVM, Google, Chromium, Mozilla, and WebKit available
             args: []
 
     - repo: https://github.com/psf/black
@@ -36,3 +37,4 @@ repos:
               --check-filenames,
               --check-hidden,
           ]
+        exclude: tests/data/hf_cache/tokenizer/gpt2.json
diff --git a/CITATION.cff b/CITATION.cff
@@ -4,6 +4,9 @@ authors:
   - affiliation: EleutherAI
     family-names: Andonian
     given-names: Alex
+  - affiliation: EleutherAI
+    family-names: Anthony
+    given-names: Quentin
   - affiliation: EleutherAI
     family-names: Biderman
     given-names: Stella
@@ -34,15 +37,30 @@ authors:
   - affiliation: EleutherAI
     family-names: Pieler
     given-names: Michael
+  - affiliation: EleutherAI
+    family-names: Phang
+    given-names: Jason
   - affiliation: EleutherAI
     family-names: Purohit
     given-names: Shivanshu
+  - affiliation: EleutherAI
+    family-names: Schoelkopf
+    given-names: Hailey
+  - affiliation: EleutherAI
+    family-names: Stander
+    given-names: Dashiell
   - affiliation: EleutherAI
     family-names: Songz
     given-names: Tri
   - affiliation: EleutherAI
-    family-names: Phil
-    given-names: Wang
+    family-names: Tigges
+    given-names: Curt
+  - affiliation: EleutherAI
+    family-names: Thérien
+    given-names: Benjamin
+  - affiliation: EleutherAI
+    family-names: Wang
+    given-names: Phil
   - affiliation: EleutherAI
     family-names: Weinbach
     given-names: Samuel
@@ -55,7 +73,7 @@ license: "Apache-2.0"
 message: "If you use this software, please cite it using these metadata."
 repository-code: "https://www.github.com/eleutherai/gpt-neox"
 title: "GPT-NeoX: Large Scale Autoregressive Language Modeling in PyTorch"
-version: "0.0.1"
+version: "2.0.0"
 doi: "10.5281/zenodo.5879544"
 date-released: 2021-08-23
 ...
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -0,0 +1,86 @@
+# Contributing
+GPT-NeoX welcomes your contributions!
+
+## Prerequisites
+GPT-NeoX uses [pre-commit](https://pre-commit.com/) to ensure that formatting is
+consistent across GPT-NeoX. First, ensure that `pre-commit` is installed with
+`pip install pre-commit`. Next, the pre-commit hooks must be installed once
+before commits can be made:
+```bash
+pre-commit install
+```
+Please install `clang-format` from Conda:
+```bash
+conda install clang-format
+```
+
+Afterwards, our suite of formatting tests runs automatically before each `git commit`. You
+can also run these manually:
+```bash
+pre-commit run --all-files
+```
+If a formatting test fails, it will fix the modified code in place and abort
+the `git commit`. After looking over the changes, you can `git add <modified files>`
+and then repeat the previous `git commit` command.
+
+
+## Testing
+GPT-NeoX tracks two types of tests: unit tests and more costly model convergence tests.
+Unit tests are found in `tests/unit/` and the model convergence tests are found in
+`tests/model/`.
+
+### Unit Tests
+[PyTest](https://docs.pytest.org/en/latest/) is used to execute tests. PyTest can be
+installed from PyPI via `pip install pytest`. Simply invoke `pytest --forked` to run the
+unit tests:
+```bash
+pytest --forked tests/unit/
+```
+You can also provide the `-v` flag to `pytest` to see additional information about the
+tests. Note that [pytest-forked](https://github.com/pytest-dev/pytest-forked) and the
+`--forked` flag are required to test CUDA functionality in distributed tests.
+
+### Model Tests
+To execute model tests, first install GPT-NeoX. Next, execute the model test driver:
+```bash
+cd tests/model/
+pytest run_sanity_check.py
+```
+Note that the `--forked` flag is not necessary for the model tests.
+
+## Contributor License Agreement
+This project welcomes contributions and suggestions. Most contributions require you to
+agree to a Contributor License Agreement (CLA) declaring that you have the right to, and
+actually do, grant us the rights to use your contribution. For details, visit
+https://cla-assistant.io/EleutherAI/gpt-neox.
+
+When you submit a pull request, a CLA bot will automatically determine whether you need
+to provide a CLA and decorate the PR appropriately (e.g., status check, comment). Simply
+follow the instructions provided by the bot. You will only need to do this once across
+all repos using our CLA.
+
+## New Feature Contribution Guidelines
+Unlike a bug fix or an improvement to an existing feature (where users usually submit a PR directly and we review it), adding a new feature to GPT-NeoX requires several steps: (1) proposal and discussion, (2) implementation and verification, (3) release and maintenance. This general guideline applies to all new feature contributions. Core GPT-NeoX team member contributions may complete step 1 internally.
+
+### Step 1: Proposal and Discussion
+We ask that you first post your intended feature in an issue. This issue needs to include:
+
+* A description of the proposed feature.
+* An explanation of why it will be useful to GPT-NeoX users.
+* A rough design of how you will implement the feature inside GPT-NeoX.
+* (Important) Results or planned experiments to demonstrate the effectiveness and correctness of the feature.
+  * If the feature only affects performance and does not affect training convergence, we require testing on a fraction of training to demonstrate that the training/validation losses are consistent with the baseline, and that the performance is better than the baseline.
+  * If the feature does affect training convergence, we require testing the whole training to demonstrate that the feature achieves better/on-par final model quality and training performance compared to baseline.
+
+Based on the issue we shall discuss the merit of the new feature and decide whether to accept or decline the proposal. Once accepted and after we confirm the design and implementation plan, we are ready for step 2.
+
+### Step 2: Implementation and Verification
+The contributor will proceed to implement the feature, and the GPT-NeoX team will provide guidance and help as needed. The required deliverables include:
+
+* A PR to [EleutherAI/GPT-NeoX](https://github.com/EleutherAI/gpt-neox) including (1) the feature implementation (2) unit tests (3) documentation (4) example usage.
+* In the implementation (code, documentation, tutorial), we require the feature author to record their GitHub username as a contact method for future questions/maintenance.
+
+After receiving the PRs, we will review them and merge them after necessary tests/fixes.
+
+### Step 3: Release and Maintenance
+After the PRs are merged, we will announce the feature on our website (with credit to the feature author). We ask the feature author to commit to the maintenance of the feature.