sync to head #771

Merged · 759 commits · Jul 15, 2024

Commits
993f931
Merge pull request #588 from google:vertex_flag
a-googler Apr 11, 2024
d676993
Move tpu end-to-end test scripts to tpu folder
NinaCai Apr 11, 2024
a80c435
Merge pull request #575 from prrathi:main
a-googler Apr 11, 2024
0cdcdd1
Merge branch 'main' into nina/mv_tpu_end_to_end_scripts
NinaCai Apr 11, 2024
28a3279
Merge pull request #589 from google:nina/mv_tpu_end_to_end_scripts
a-googler Apr 11, 2024
16a05c0
unify WORKDIR to /deps
michelle-yooh Apr 11, 2024
240e25d
Merge branch 'main' into yiinho-prebuilt-te
chajath Apr 12, 2024
5b8a3c3
Share GCS path between Gemma-7b tests
A9isha Apr 12, 2024
3bba7a6
Merge pull request #583 from google:anisha-run-multicluster-test
a-googler Apr 12, 2024
efb6d9e
Add README for llama2-7B
michelle-yooh Apr 9, 2024
71b78a1
Merge pull request #582 from google:yooh-llama2-7b-config-readme
a-googler Apr 12, 2024
e947d62
Merge pull request #587 from google:yooh/fix_wordir
a-googler Apr 12, 2024
5d12cce
Merge remote-tracking branch 'origin/main' into yiinho-prebuilt-te
chajath Apr 12, 2024
dd8fef5
Merge branch 'yiinho-prebuilt-te' of https://github.com/google/maxtex…
chajath Apr 12, 2024
ebd39aa
Merge pull request #547 from google:yiinho-prebuilt-te
a-googler Apr 12, 2024
9343eec
adding script to fix the style and adding modified/fixed files with l…
ssusie Apr 16, 2024
3e316bd
Merge pull request #596 from google:ssusie-pylint-125
a-googler Apr 16, 2024
cee962b
Move apt install from `rto_setup.sh` to `setup.sh`
tonyjohnchen Apr 16, 2024
de9715d
Merge pull request #598 from google:rto
a-googler Apr 17, 2024
57c9905
Update instructions for installing snap.
RoshaniN Apr 18, 2024
f52e6f7
Merge pull request #601 from google:RoshaniN-patch-1
a-googler Apr 18, 2024
f9498fa
Removes batch size from prefill attention calculation.
patemotter Apr 17, 2024
8a8e642
Fixes for inf testing.
patemotter Apr 18, 2024
c2c3bce
Revert "Fixes for inf testing."
patemotter Apr 18, 2024
6c355d7
Fixes
patemotter Apr 18, 2024
20f2a0d
Fix subset of hosts dataloading
khatwanimohit Apr 11, 2024
6ec7556
Merge pull request #586 from google:mohit/subset_v4
a-googler Apr 19, 2024
b46783c
inference microbenchmark
morgandu Apr 19, 2024
2806017
Merge branch 'main' into patemotter_attn_calc_fix
patemotter Apr 19, 2024
0e1c078
Merge pull request #597 from google:mor--inference
a-googler Apr 19, 2024
1377756
Merge pull request #600 from google:patemotter_attn_calc_fix
a-googler Apr 19, 2024
d3f488a
Update Run_MaxText_via_xpk.md
RoshaniN Apr 19, 2024
25adb3d
Merge pull request #608 from google:RoshaniN-patch-1
a-googler Apr 19, 2024
c2e5b5e
inference_microbenchmark:
morgandu Apr 20, 2024
5a3d5bd
Mark nvidia devtools repo as trusted
chajath Apr 22, 2024
c0bef1c
Merge pull request #610 from google:mor--inference
a-googler Apr 22, 2024
49be620
Merge pull request #611 from google:yiinho-nvidia-devtools-trust
a-googler Apr 22, 2024
a3d6d8f
Explicitly set AQT Freezer mode in MaxText.
cdh4696 Apr 23, 2024
3a14e47
Move aqtp pin up
gobbleturk Apr 23, 2024
718d9e7
Merge pull request #615 from google:mattdavidow-move-aqtp-pin
a-googler Apr 23, 2024
ad072dd
Pre-commit config
khatwanimohit Apr 18, 2024
62b5012
Update 128B config on v5e to use qkv_proj_offloaded remat_policy
raymondzouu Apr 24, 2024
206e84c
Merge pull request #619 from google:raymondzou-128b-offload-fix
a-googler Apr 24, 2024
35f2b8a
Merge pull request #602 from google:mohit/codespell
a-googler Apr 24, 2024
db31dd4
[MaxText] Rename llama2_7b_single_host_gpu.yml to make it clear that …
bixia1 Apr 24, 2024
548f639
Split Mixtral test into two scripts
RissyRan Apr 22, 2024
f6060b0
Merge pull request #616 from RissyRan:separate_mixtral
a-googler Apr 25, 2024
967941b
Update jax.tree_map to jax.tree_util.tree_map
RissyRan Apr 25, 2024
18ba1a7
Merge pull request #621 from RissyRan:tree_map
a-googler Apr 25, 2024
9feab51
change norm sharding
ZhiyuLi-goog Apr 26, 2024
6570445
Merge pull request #623 from google:lizhiyu/change_norm_sharding
a-googler Apr 26, 2024
50ba12e
Change l2norm to use jnp.sqrt
raymondzouu Apr 24, 2024
29e59de
Merge pull request #618 from google:raymondzou-l2norm
a-googler Apr 29, 2024
4fc85d1
Fix test_tokenize
khatwanimohit Apr 30, 2024
dab1ee1
Streamlined setup.sh to have fewer apt install calls
Apr 26, 2024
4062f51
Merge pull request #629 from google:mohit/fix_token
a-googler Apr 30, 2024
75c4b03
loosen tolerance in assert_params_sufficiently_sharded
ZhiyuLi-goog Apr 30, 2024
21177d4
Enable entropy on multihost CPUs.
RoshaniN Apr 30, 2024
b098e6a
Merge pull request #628 from google:lizhiyu/fix_params_sufficiently_s…
a-googler Apr 30, 2024
92f3abf
Merge pull request #631 from google:roshanin_entropy_2
a-googler Apr 30, 2024
d9ccd1e
Merge branch 'main' into rwitten_setup_streamlined
Apr 30, 2024
8f132d6
Merge pull request #633 from google:rwitten_setup_streamlined
a-googler Apr 30, 2024
8a6f30d
Add tests to GPU runner
michelle-yooh Apr 30, 2024
3075bbe
Replace deprecated np.product with np.prod
gobbleturk May 1, 2024
fcf48fe
Merge pull request #630 from google:yooh/gpu-unit-tests
a-googler May 1, 2024
bc36642
fix norm sharding
ZhiyuLi-goog May 2, 2024
2ac0af9
Merge pull request #637 from google:lizhiyu/miss_sharding
a-googler May 2, 2024
982fe59
Add Llama2-70b test
A9isha May 3, 2024
ef7cf7a
Merge pull request #639 from google:anisha-llama2-70b-test
a-googler May 3, 2024
e8b53e5
Internal change only.
a-googler May 3, 2024
ffcd34c
Add more tests for Mixtral
RissyRan Apr 27, 2024
bdb1b3e
Merge pull request #627 from google:ranran_more_moe
a-googler May 3, 2024
a519018
Make some AQT dataclasses to use keyword-only fields (1/N)
a-googler May 6, 2024
a28f518
Reverts e8b53e5862286847111554e1db49551db4a845e5
golechwierowicz May 7, 2024
d590328
Merge pull request #635 from google:mattdavidow-numpy-prod
a-googler May 7, 2024
353fb83
Update tflops calculation
RissyRan May 8, 2024
f53203a
fix sharding on generate cache in prefill results.
jwyang-google May 9, 2024
e84114d
Remove async XLA_FLAGS from A3 configs.
reedwm May 10, 2024
ef00f3b
Update llama2_7b_gpu.yml
a-googler May 8, 2024
f9c9dd8
Merge pull request #644 from google:moe_param
a-googler May 10, 2024
e327805
Merge pull request #641 from google:prefill-optimize
a-googler May 10, 2024
7434359
Add forward pass logit check test for Llama2-7b
A9isha May 10, 2024
4f3a0d3
Merge pull request #643 from google:anisha-debug-decoding
a-googler May 10, 2024
4f526a8
Eval the command string from XPK for GPU script
michelle-yooh May 6, 2024
87c6430
Merge pull request #640 from google:yooh/add_oss_tests
a-googler May 14, 2024
618450b
Remove cases where the deprecated --xla_gpu_simplify_all_fp_conversio…
dimitar-asenov May 14, 2024
2347382
streamline CI test structure
michelle-yooh May 7, 2024
2d2c3c4
Merge branch 'main' into xla_flags_fix
reedwm May 15, 2024
d9aaaab
Merge pull request #645 from reedwm:xla_flags_fix
a-googler May 15, 2024
0a626ed
fix pylint
aireenmei May 15, 2024
cd48a14
Merge pull request #642 from google:yooh/streamline-unit-tests
a-googler May 15, 2024
3e65eb6
Merge pull request #649 from google:aireen/fix_pylint
a-googler May 15, 2024
b98ad6c
Remove async XLA_FLAGS from A3 configs
michelle-yooh May 16, 2024
fa92a21
Add llama-70b gpu config.
golechwierowicz May 16, 2024
8930215
Merge pull request #652 from google:yooh/xla-flags-fix
a-googler May 17, 2024
3face14
Support data input from HuggingFace
aireenmei May 14, 2024
65fbd7d
Update the NCCL flags for A3+.
yangyuwei May 15, 2024
e60cabf
Merge pull request #650 from google:yangyuwei-maxtext
a-googler May 20, 2024
cc063cf
add gemma logit test
A9isha May 17, 2024
3519271
Merge pull request #654 from google:anisha-gemma2b-fwd
a-googler May 20, 2024
1f34741
Integrate orbax logger in maxtext for structured logging.
abhinavclemson May 20, 2024
6669c00
Merge pull request #646 from google:aireen/hf_data_pr2
a-googler May 21, 2024
6542d25
Merge pull request #658 from google:orbax-logger-abhinav
a-googler May 21, 2024
76be151
fix hf input pipeline
aireenmei May 22, 2024
15f11c5
Merge pull request #664 from google:aireen/fix_hf
a-googler May 22, 2024
91c73e4
Fix prefill assertion
RissyRan May 21, 2024
b70ec2a
Merge pull request #661 from google:fix_prefill
a-googler May 22, 2024
ed15011
Remove decode asserts from Gemma test files
khatwanimohit May 23, 2024
af36df3
Merge pull request #668 from google:mohit/fix_gemma2b
a-googler May 23, 2024
3da83da
add single controller flag
sadikneipp May 23, 2024
e2e91f9
fix OOM issue running inference microbenchmark with llama13b on v5e4
jwyang-google May 24, 2024
6168580
Merge pull request #669 from google:ksadi/patch-1
a-googler May 24, 2024
e36ea81
Merge branch 'main' into inference-microbenchmark-fix
jwyang-google May 25, 2024
ac7abce
Merge pull request #670 from google:inference-microbenchmark-fix
a-googler May 26, 2024
c2fb0fd
Add Llama2 13B Tests
morgandu May 28, 2024
313d31b
Merge pull request #565 from google:mor--llama2-13b-test
a-googler May 28, 2024
b061356
Don't clip fp8 stats
anfals May 29, 2024
c47a656
Merge pull request #672 from google:anfals/fp8_clipping_fix
a-googler May 29, 2024
d05052e
Integrate nsys profiler
michelle-yooh May 16, 2024
48f2524
Merge pull request #663 from google:yooh/integrate-nsys
a-googler May 29, 2024
851d048
Add MoE matmul implementation
RissyRan May 21, 2024
410d237
Merge pull request #659 from google:ranran_moe
a-googler May 31, 2024
3687526
fix OUTPUT_PATH in v5e/128b.sh
ZhiyuLi-goog May 30, 2024
52b954b
Merge pull request #676 from google:lizhiyu/fix_path
a-googler May 31, 2024
ef96d6d
squash
Bslabe123 May 31, 2024
3a441e8
Merge pull request #665 from google:prometheus-port-flag
a-googler May 31, 2024
288912b
Update flops calculation to active experts in moe
RissyRan May 31, 2024
f12ba54
Merge pull request #678 from google:tflop_moe
a-googler May 31, 2024
57429da
Enable kv cache layout control
morgandu May 30, 2024
34412d4
Merge pull request #667 from google:mor--kv-cache-layout
a-googler Jun 3, 2024
d6b721c
Fix Gemma Readme link
khatwanimohit Jun 3, 2024
ca55917
Internal change only.
a-googler Jun 3, 2024
5723009
Upgrade Pinned Base Image for GPU
anfals Jun 3, 2024
83159ab
Metrics bug: server_lib should be config_lib
Bslabe123 Jun 3, 2024
b7f5864
Merge pull request #681 from google:prometheus-flag-import-fix
a-googler Jun 4, 2024
a77e867
Merge pull request #682 from google:anfals/up_gpu_base_image
a-googler Jun 4, 2024
cdb4853
Fix MoE matmul scale issue
RissyRan Jun 4, 2024
2a6154f
Removed unused Pallas import from layers/attentions.py
superbobry Jun 5, 2024
cb2c69b
Change norm sharding for llama2-7b to fsdp.
golechwierowicz Jun 5, 2024
d185209
Copybara import of the project:
RissyRan Jun 5, 2024
eafc6af
Set additional flags for a3 and a3plus
michelle-yooh Jun 4, 2024
c76e92b
Use run_id instead of sha for docker tag
jonb377 Jun 5, 2024
16aeac4
Merge pull request #685 from google:fix_matmul_scale
a-googler Jun 5, 2024
a4fddec
Merge branch 'main' into jonbolin/gha
jonb377 Jun 5, 2024
c0ad118
Merge pull request #684 from google:yooh/a3plus-flags
a-googler Jun 5, 2024
24836aa
Merge branch 'main' into jonbolin/gha
jonb377 Jun 5, 2024
d6fae01
refactor data input pipeline and add perf data
aireenmei Jun 3, 2024
3e34b16
Merge pull request #688 from google:jonbolin/gha
a-googler Jun 7, 2024
a8dd031
Add gpt3 175b on v5e config
raymondzouu Jun 5, 2024
3ba16a9
Merge pull request #687 from google:raymondzou-gpt3-175b-config
a-googler Jun 7, 2024
316dbca
Merge pull request #680 from google:aireen/grain_improve
a-googler Jun 11, 2024
5ae05cc
Pipeline parallelism support (linear only)
gobbleturk May 30, 2024
a2fdf29
Turn on layer scanning for llama2-7b on GPU.
golechwierowicz Jun 12, 2024
7cdca96
Merge pull request #691 from google:mattdavidow-pipeline-linear
a-googler Jun 12, 2024
02b681e
reshape q
morgandu Jun 5, 2024
d0701b0
Merge pull request #690 from google:mor--reshape-q
a-googler Jun 12, 2024
e682651
Add profiler flags to JetStream server
JoeZijunZhou Jun 10, 2024
3d64c10
fix tfds instruction
aireenmei Jun 12, 2024
0fc492d
Add vanilla megablox to MoE
RissyRan Jun 5, 2024
e898606
Merge pull request #689 from google:vanilla_megablox
a-googler Jun 13, 2024
f5ede57
Add llama2 70b training config for v5e
raymondzouu Jun 11, 2024
8c11811
Merge pull request #692 from google:zijun/inference-profiler
a-googler Jun 13, 2024
d6e332c
Merge pull request #695 from google:raymondzou-llama2-70b-config
a-googler Jun 13, 2024
f50f5ed
Merge pull request #702 from google:aireen/doc_fix
a-googler Jun 13, 2024
a75d9a9
base.yml changes
gobbleturk Jun 12, 2024
5c9e569
Account for new mesh axes for llama2-7b, and llama2-70b on GPUs.
golechwierowicz Jun 17, 2024
75b3a5e
Merge pull request #701 from google:mattdavidow-pipeline-circular
a-googler Jun 17, 2024
8bf9f8e
Sharding the llama2 70b on v5e-16 more efficiently.
Jun 10, 2024
180a780
Merge pull request #706 from google:zhihaoshan_dev
a-googler Jun 18, 2024
5cf2e57
add compute_axis_order
morgandu Jun 12, 2024
fe4bfdc
Merge pull request #709 from google:mor--compute-axis-order
a-googler Jun 18, 2024
8a22374
Add maxengine_server configs to base.yml
gobbleturk Jun 18, 2024
2d86110
Merge pull request #713 from google:mattdavidow-add-max-engine-flags-…
a-googler Jun 18, 2024
c7cf774
Add FSDP + Megablox
RissyRan Jun 14, 2024
5d7b037
Merge pull request #705 from google:fsdp_megablox
a-googler Jun 19, 2024
6e90b9e
Llama3-8b model config
khatwanimohit Apr 24, 2024
f7ee8c6
Merge pull request #653 from google:mohit/llama3
a-googler Jun 20, 2024
e44e07a
MaxText package
khatwanimohit Jun 14, 2024
d82a44f
Merge pull request #707 from google:mohit/package
a-googler Jun 20, 2024
ea4d32a
fix data loading from HF hub
aireenmei Jun 21, 2024
0ea5f38
Fix llama2-{7,70}b sharding on GPU.
golechwierowicz Jun 21, 2024
8f49ace
Move stage to second axis in mesh
gobbleturk Jun 21, 2024
592f45e
Merge pull request #717 from google:mattdavidow-stage-second
a-googler Jun 21, 2024
9df5761
Merge pull request #716 from google:aireen/fix_hf_hub
a-googler Jun 21, 2024
7a40096
Copybara import of the project:
RissyRan Jun 22, 2024
24dc66c
Fix Mesh setup for multiprocess CPUs.
RoshaniN Jun 23, 2024
5db6d73
Merge pull request #723 from google:fix_jax_coordinator
a-googler Jun 24, 2024
8c79a64
add kv_quant_axis
morgandu Jun 19, 2024
a76f9d4
Merge pull request #708 from google:mor--compute-axis-order-n-quantiz…
a-googler Jun 24, 2024
41f9af1
Add a directory check for the . If it fails, attempt to check a path …
Jun 20, 2024
465f67a
Add mistral tokenizer to maxtext/assets
vipannalla Jun 21, 2024
7a872f9
Merge pull request #715 from google:mxla-updates
a-googler Jun 24, 2024
580d922
Merge branch 'main' into mistral_tok
vipannalla Jun 24, 2024
6b7d88c
Merge pull request #719 from google:mistral_tok
a-googler Jun 24, 2024
91af148
Update the dependencies to prepare for integration of emergency check…
xuefgu Jun 24, 2024
e7c1f01
Merge pull request #724 from google:requirements
a-googler Jun 25, 2024
9482bf1
Make broadcasting from one replica to all more memory efficient
a-googler Jun 25, 2024
9606e62
Inference Microbenchmark Sweep
yeandy Jun 25, 2024
5a215db
Merge pull request #662 from google:mor--kv-cache-layout-reformat-output
a-googler Jun 25, 2024
679ec8c
Fix mesh_axes and data_sharding for LLaMA 2 GPU configs.
golechwierowicz Jun 26, 2024
ff2061f
Allow owners to have any approver
gobbleturk Jun 23, 2024
d4e6f25
Enable saving using Orbax's emergency checkpoint manager
xuefgu Jun 21, 2024
16903de
Merge pull request #722 from google:mattdavidow-fix-owners
a-googler Jun 27, 2024
fe0dee1
Merge pull request #720 from google:emergency-checkpoint
a-googler Jun 27, 2024
b5160d8
Add Llama2 7B, 13B high performance training configs
raymondzouu Jun 24, 2024
965c8af
Load/Save Aqt quantized checkpoint.
singh-mitali Jun 27, 2024
8dedf9f
Merge pull request #726 from google:raymondzou-llama2
a-googler Jun 27, 2024
e926938
Merge pull request #731 from google:msingh-aqt-ckpt
a-googler Jun 27, 2024
8900fe3
modify prefill to return first token
jwyang-google Jun 17, 2024
d9138b1
Merge pull request #727 from google:prefill-return-first-token
a-googler Jun 28, 2024
1128ed5
Fix and protect simple_layer
gobbleturk Jun 28, 2024
93efadf
Merge pull request #732 from google:mattdavidow-support-simple
a-googler Jun 28, 2024
4378403
Adding option for int4 quantization to kvcache.
singh-mitali Jun 28, 2024
77f079f
support eval dataset and refactor
aireenmei Jul 1, 2024
bdeab2b
Merge pull request #737 from google:msingh-kv
a-googler Jul 1, 2024
818cd06
Support partial overrides for logical_axis_rules.
golechwierowicz Jul 1, 2024
f8ae413
Merge pull request #738 from google:aireen/tfds-eval
a-googler Jul 2, 2024
ada70ff
Fix simple test step count
gobbleturk Jul 2, 2024
0868add
Merge pull request #745 from google:mattdavidow-fix-simple-test
a-googler Jul 2, 2024
68b72b9
Clean up MoE brute force implementation
RissyRan Jul 1, 2024
e7c019a
Preliminary restore with lots of hardcoding and hacking
xuefgu Jun 27, 2024
c51ee0f
Merge pull request #741 from google:loop_clean
a-googler Jul 2, 2024
1429d6c
Merge branch 'main' into emergency-restore
xuefgu Jul 3, 2024
24a62b6
Add convergence tests on A3 GPU
michelle-yooh Jun 24, 2024
410dcad
Merge pull request #740 from google:emergency-restore
a-googler Jul 3, 2024
290dc66
Merge pull request #746 from google:yooh/convergence-test
a-googler Jul 3, 2024
4e13ea3
Update tile size
RissyRan Jul 3, 2024
3000396
Merge pull request #747 from google:tile_size
a-googler Jul 3, 2024
225f197
Handle cases where memstats are not available for the device.
lukebaumann Jul 1, 2024
89edace
Merge pull request #739 from google:memstats
a-googler Jul 3, 2024
eb5cf82
Fix validation error for other models
RissyRan Jul 3, 2024
b83a7a4
Merge pull request #750 from google:fix_mega_validation
a-googler Jul 4, 2024
85c105e
Fix decode.py to also use first_token from prefill_call
vipannalla Jul 8, 2024
1b4cd15
Merge pull request #756 from google:decode_prefill_fix
a-googler Jul 8, 2024
7534e5e
Add moe perf number
RissyRan Jul 4, 2024
5bc4029
Merge pull request #751 from google:moe_result
a-googler Jul 9, 2024
a3d3260
Merge pull request #730 from google:smarter-sharding-parsing
a-googler Jul 9, 2024
2cebd1e
move num_experts pyconfig assertion to fix tests
gobbleturk Jul 9, 2024
d0c70e9
Merge pull request #760 from google:mattdavidow-fix-assert
a-googler Jul 10, 2024
a26715e
Cast type for inputs before kernel call
RissyRan Jul 9, 2024
84d8d05
Merge pull request #759 from google:moe_weight_type
a-googler Jul 10, 2024
b506266
Move sharding overrides to models/ directory.
golechwierowicz Jul 10, 2024
6a0e570
Enable quantization for MoE Matmul implementation
RissyRan Jul 8, 2024
704ab1c
Merge pull request #757 from google:moe_quantization
a-googler Jul 10, 2024
f27f70a
Integrate and test Goodput Monitor with MaxText
dipannita08 Jun 26, 2024
0af4ee2
Merge pull request #749 from google:integrate-goodput-monitor
a-googler Jul 10, 2024
cae41ec
Adding Tokens/s/device to the log.
tonyjohnchen Jul 9, 2024
5f15eba
Merge pull request #761 from google:token_per_s
a-googler Jul 10, 2024
8fb12fc
Adding support for mixed precision quantization configs.
singh-mitali Jul 8, 2024
8143e2d
Merge pull request #762 from google:msingh-mp
a-googler Jul 11, 2024
2 changes: 1 addition & 1 deletion .github/CODEOWNERS
@@ -1,2 +1,2 @@
# Changes in this file should match with requiredReviewers in .github/workflows/AddLabel.yml
* @rwitten
* @gobbleturk
31 changes: 22 additions & 9 deletions .github/workflows/AddLabel.yml
@@ -15,10 +15,8 @@
name: Add Label

on:
workflow_call:
workflow_run:
# workflows: [Unit Test, CodeQL]
workflows: [CodeQL]
workflows: [Unit Test, CodeQL]
types:
- completed
pull_request_review:
@@ -54,23 +52,33 @@ jobs:
}

// This list should match with CODEOWNERS
let requiredReviewers = { rwitten: "" }
let requiredReviewers = { gobbleturk: "" }
const reviews = await github.rest.pulls.listReviews({
owner,
repo,
pull_number,
})

const pullRequest = await github.rest.pulls.get({
owner,
repo,
pull_number,
});
const pullRequester = pullRequest.data.user.login;

if (reviews.data.length === 0) {
console.log("Not adding pull ready because the PR is not approved yet")
console.log("Not adding pull ready because the PR is not approved yet.")
process.exit()
}
let is_approved=false
for (const review of reviews.data) {
if (review.state === "APPROVED") {
delete requiredReviewers[review.user.login]
if (review.state === "APPROVED" && (review.user.login in requiredReviewers || pullRequester in requiredReviewers)) {
is_approved=true
break;
}
}
if (Object.keys(requiredReviewers).length !== 0) {
console.log("Not adding pull ready because the PR is not approved yet")
if (!is_approved) {
console.log("Not adding pull ready because the PR is not approved yet by a code owner.")
process.exit()
}

@@ -80,6 +88,11 @@
pull_number,
per_page: 100,
})
// Check that the number of commits in the PR is 1.
if (commits.data.length !== 1) {
console.log("Not adding pull ready because the PR has more than one commit. Please squash your commits.")
process.exit(1)
}
const ref = commits.data.slice(-1)[0].sha
const checkRuns = await github.rest.checks.listForRef({
owner,
38 changes: 38 additions & 0 deletions .github/workflows/CPUTests.yml
@@ -0,0 +1,38 @@
name: Linter

on:
push:
branches:
- '**'

jobs:
cpu:
name: "CPU tests"
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-20.04]
python-version: ['3.10']
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install Dependencies
run: |
python -m pip install --upgrade pip
pip install pylint pyink pytype==2024.2.27
- name: Typecheck the code with pytype
run: |
pytype --jobs auto --disable import-error MaxText/
- name: Analysing the code with pylint in Maxtext/
run: |
pylint MaxText/ && \
echo 'Maxtext PyLint check successful' || { echo \
'PyLint check has failed. Please run bash code_style.sh to fix issues'; exit 20; }
- name: Analysing the code with pylint in pedagogical_examples/
run: |
pylint pedagogical_examples/ && \
echo 'PyLint check on pedagogical_examples/ is successful' || { echo \
'PyLint check has failed. Please run bash code_style.sh to fix issues'; exit 20; }
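The checks in this Linter workflow can be reproduced locally before pushing. A minimal sketch, assuming Python 3.10 and the repository root as the working directory; the tool versions and commands are copied from the workflow above, and code_style.sh is the auto-fix script its failure message points to:

```bash
# Minimal local equivalent of the CPU "Linter" workflow (assumes Python 3.10, repo root).
python -m pip install --upgrade pip
pip install pylint pyink pytype==2024.2.27

# Type-check MaxText/ exactly as the workflow does.
pytype --jobs auto --disable import-error MaxText/

# Lint both source trees; on failure, code_style.sh (referenced by the workflow) applies the fixes.
pylint MaxText/ pedagogical_examples/ || bash code_style.sh
```
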
170 changes: 137 additions & 33 deletions .github/workflows/UnitTests.yml
@@ -23,47 +23,151 @@ on:
branches: [ "main" ]
workflow_dispatch:
schedule:
# Run the job every 60 mins
- cron: '*/60 * * * *'
# Run the job every 2 hours
- cron: '0 */2 * * *'

jobs:
build:
build_and_upload_image:
strategy:
fail-fast: false
matrix:
tpu-type: ["v4-8"]
name: "TPU test (${{ matrix.tpu-type }})"
runs-on: ["self-hosted", "tpu", "${{ matrix.tpu-type }}"]
fail-fast: false
matrix:
device:
- type: tpu
name: v4-8
mode: stable
- type: gpu
name: a100-40gb-4
mode: pinned
name: Build and upload image (${{ matrix.device.name }})
runs-on: ["self-hosted", "${{ matrix.device.type }}", "${{ matrix.device.name }}"]
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Cleanup old docker images
run: docker system prune --all --force
- name: Build an image
run: |
docker system prune --all --force
- name: Install dependencies
bash docker_build_dependency_image.sh MODE=${{ matrix.device.mode }} DEVICE=${{ matrix.device.type }}
- name: Tag the image
run: |
bash docker_build_dependency_image.sh
- name: Analysing the code with pylint
docker tag maxtext_base_image gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:${{ matrix.device.type }}
- name: Upload the image
run: |
docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c "pylint MaxText/"
docker push gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:${{ matrix.device.type }}

common:
needs: build_and_upload_image
strategy:
fail-fast: False
matrix:
device:
- type: tpu
name: v4-8
attention: autoselected
pytest_marker: ''
container_env:
XLA_PYTHON_CLIENT_MEM_FRACTION: 0.75
TF_FORCE_GPU_ALLOW_GROWTH: false
container_resource_option: "--privileged"
- type: gpu
name: a100-40gb-4
image_suffix: gpu_jax_pinned
attention: dot_product
pytest_marker: -m 'not tpu'
container_env:
XLA_PYTHON_CLIENT_MEM_FRACTION: 0.65
TF_FORCE_GPU_ALLOW_GROWTH: true
container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
name: Common test (${{ matrix.device.name }})
runs-on: ["self-hosted", "${{ matrix.device.type }}", "${{ matrix.device.name }}"]
container:
image: gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:${{ matrix.device.type }}
volumes:
- /home/runner/actions-runner/_work/maxtext/maxtext:/deps
env:
XLA_PYTHON_CLIENT_MEM_FRACTION: ${{ matrix.device.container_env.XLA_PYTHON_CLIENT_MEM_FRACTION }}
TF_FORCE_GPU_ALLOW_GROWTH: ${{ matrix.device.container_env.TF_FORCE_GPU_ALLOW_GROWTH }}
options: ${{ matrix.device.container_resource_option }}
steps:
- uses: actions/checkout@v4
- name: Test gsutil installation
run: which gsutil >/dev/null 2>&1 || { echo >&2 "gsutil is required but not installed. Aborting"; exit 24;}
- name: Test with pytest
run: |
docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c 'cd MaxText;python3 -m pytest'
- name: Test train.py
run: |
docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \
'python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2'
run: cd MaxText;python3 -m pytest ${{ matrix.device.pytest_marker }}
- name: Test train.py with TFDS c4
run: python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 enable_checkpointing=false attention=${{ matrix.device.attention }}
- name: Test train.py with HF c4
run: python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs hf_train_files=gs://maxtext-dataset/hf/c4/c4-train-00000-of-01637.parquet hf_path=parquet dataset_type=hf steps=2 tokenizer_path=google-t5/t5-large attention=${{ matrix.device.attention }} enable_checkpointing=false
- name: Test train.py with synthetic data
run: python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 enable_checkpointing=false attention=${{ matrix.device.attention }} dataset_type=synthetic
- name: Test train.py with per_device_batch_size < 1
run: python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 per_device_batch_size=0.25 ici_tensor_parallelism=4 enable_checkpointing=false attention=${{ matrix.device.attention }}
- name: Test decode.py
run: |
docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \
'python3 MaxText/decode.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 ici_tensor_parallelism=4'
run: python3 MaxText/decode.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 ici_tensor_parallelism=4 attention=${{ matrix.device.attention }} enable_checkpointing=false max_target_length=128 per_device_batch_size=1
- name: Test decode.py with per_device_batch_size < 1
run: python3 MaxText/decode.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 ici_tensor_parallelism=4 attention=${{ matrix.device.attention }} enable_checkpointing=false max_target_length=128 per_device_batch_size=.25
- name: Test int8_training
run: |
docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \
'python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset int8_training=true steps=2'
add_pull_ready:
if: github.ref != 'refs/heads/main'
permissions:
checks: read
pull-requests: write
needs: build
uses: ./.github/workflows/AddLabel.yml
run: python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset quantization=int8 steps=2 enable_checkpointing=false attention=${{ matrix.device.attention }}
- name: Test fp8_training
run: python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset quantization=fp8 steps=2 enable_checkpointing=false attention=${{ matrix.device.attention }}
- name: Test generate_param_only_checkpoint
run: bash end_to_end/test_generate_param_only_checkpoint.sh -r runner_$(date +%Y-%m-%d-%H-%M-%S) -o gs://runner-maxtext-logs -d gs://maxtext-dataset -i 4 -a ${{ matrix.device.attention }}
- name: Test generate_param_only_checkpoint with int8 quantization
run: bash end_to_end/test_generate_param_only_checkpoint.sh -r runner_$(date +%Y-%m-%d-%H-%M-%S) -o gs://runner-maxtext-logs -d gs://maxtext-dataset -i 4 -q int8 -a ${{ matrix.device.attention }}
- name: Test grain checkpoint determinism
run: bash end_to_end/test_checkpointing.sh runner_$(date +%Y-%m-%d-%H-%M-%S) gs://runner-maxtext-logs gs://maxtext-dataset False grain ${{ matrix.device.attention }}
- name: Test checkpoint compatibility
run: bash end_to_end/test_checkpoint_compatibility.sh runner_$(date +%Y-%m-%d-%H-%M-%S) gs://runner-maxtext-logs gs://maxtext-dataset ${{ matrix.device.attention }}

tpu:
needs: build_and_upload_image
strategy:
fail-fast: false
matrix:
device-type: ["v4-8"]
name: "TPU test (${{ matrix.device-type }})"
runs-on: ["self-hosted", "tpu", "${{ matrix.device-type }}"]
container:
image: gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:tpu
volumes:
- /home/runner/actions-runner/_work/maxtext/maxtext:/deps
options: "--privileged"
steps:
- uses: actions/checkout@v4
- name: Validate Pedagogical Example, Shmap_collective_matmul
run: python3 pedagogical_examples/shmap_collective_matmul.py

gpu:
needs: build_and_upload_image
strategy:
fail-fast: false
matrix:
device-type: ["a100-40gb-4"]
build-mode: ["pinned"]
name: "GPU test (${{ matrix.device-type }}, ${{ matrix.build-mode }})"
runs-on: ["self-hosted", "gpu", "${{ matrix.device-type }}"]
container:
image: gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:gpu
volumes:
- /home/runner/actions-runner/_work/maxtext/maxtext:/deps
env:
XLA_PYTHON_CLIENT_MEM_FRACTION: 0.65
TF_FORCE_GPU_ALLOW_GROWTH: true
options: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
steps:
- uses: actions/checkout@v4
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Test train.py with flash attention
run: python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 enable_checkpointing=false attention=cudnn_flash_te

clean_up:
if: ${{ always() }}
needs: [common, gpu, tpu]
name: "Clean up"
runs-on: ["self-hosted"]
steps:
- name: Delete GPU image
run: gcloud container images delete gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:gpu --force-delete-tags --quiet
- name: Delete TPU image
run: gcloud container images delete gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:tpu --force-delete-tags --quiet
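To summarize the new structure: build_and_upload_image pushes one per-run image per device type, the common/tpu/gpu jobs run their tests inside that image, and clean_up deletes it when they finish. A minimal sketch of that lifecycle outside of Actions, where RUN_ID is a placeholder standing in for github.run_id and the registry path and commands are taken from the workflow above:

```bash
# Sketch of the per-run image lifecycle implemented by the workflow above.
# RUN_ID is a placeholder for the GitHub Actions run id; the registry path comes from the workflow.
RUN_ID=1234567890
IMAGE=gcr.io/tpu-prod-env-multipod/maxtext_${RUN_ID}:tpu

# build_and_upload_image: build the dependency image, tag it per run, and push it.
bash docker_build_dependency_image.sh MODE=stable DEVICE=tpu
docker tag maxtext_base_image "${IMAGE}"
docker push "${IMAGE}"

# clean_up: delete the per-run image once the test jobs have finished.
gcloud container images delete "${IMAGE}" --force-delete-tags --quiet
```
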

56 changes: 56 additions & 0 deletions .github/workflows/UploadDockerImages.yml
@@ -0,0 +1,56 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: Build Images

on:
schedule:
# Run the job daily at 12AM UTC
- cron: '0 0 * * *'

jobs:
tpu:
strategy:
fail-fast: false
matrix:
device-type: ["v4-8"]
runs-on: ["self-hosted", "tpu", "${{ matrix.device-type }}"]
steps:
- uses: actions/checkout@v3
- name: build jax stable image
run : |
bash .github/workflows/build_and_upload_images.sh CLOUD_IMAGE_NAME=maxtext_jax_stable MODE=stable DEVICE=tpu PROJECT=tpu-prod-env-multipod LOCAL_IMAGE_NAME=maxtext_jax_stable
- name: build jax nightly image
run : |
bash .github/workflows/build_and_upload_images.sh CLOUD_IMAGE_NAME=maxtext_jax_nightly MODE=nightly DEVICE=tpu PROJECT=tpu-prod-env-multipod LOCAL_IMAGE_NAME=maxtext_jax_nightly
gpu:
strategy:
fail-fast: false
matrix:
device-type: ["a100-40gb-4"]
runs-on: ["self-hosted", "gpu", "${{ matrix.device-type }}"]
steps:
- uses: actions/checkout@v3
- name: build jax stable image
run : |
bash .github/workflows/build_and_upload_images.sh CLOUD_IMAGE_NAME=maxtext_gpu_jax_stable MODE=stable DEVICE=gpu PROJECT=tpu-prod-env-multipod LOCAL_IMAGE_NAME=maxtext_gpu_local_jax_stable
- name: build jax nightly image
run : |
bash .github/workflows/build_and_upload_images.sh CLOUD_IMAGE_NAME=maxtext_gpu_jax_nightly MODE=nightly DEVICE=gpu PROJECT=tpu-prod-env-multipod LOCAL_IMAGE_NAME=maxtext_gpu_local_jax_nightly
- name: build jax pinned image
run : |
bash .github/workflows/build_and_upload_images.sh CLOUD_IMAGE_NAME=maxtext_gpu_jax_pinned MODE=pinned DEVICE=gpu PROJECT=tpu-prod-env-multipod LOCAL_IMAGE_NAME=maxtext_gpu_local_jax_pinned
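Each step above calls the same helper script with key=value arguments. For reference, a sketch of running one of these scheduled builds by hand, with the GPU pinned-mode arguments copied verbatim from the workflow (project and image names as given there):

```bash
# Manual run of one of the scheduled image builds (arguments copied from the workflow above).
bash .github/workflows/build_and_upload_images.sh \
  CLOUD_IMAGE_NAME=maxtext_gpu_jax_pinned \
  MODE=pinned \
  DEVICE=gpu \
  PROJECT=tpu-prod-env-multipod \
  LOCAL_IMAGE_NAME=maxtext_gpu_local_jax_pinned
```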