From eec6bb01f834782d33a34eed5db52540f6199b80 Mon Sep 17 00:00:00 2001 From: gobbleturk Date: Tue, 27 Feb 2024 01:34:40 +0000 Subject: [PATCH 1/5] docker build --- .github/workflows/UnitTests.yml | 260 ++++++++++++------------ .github/workflows/UploadDockerBuild.yml | 55 +++++ 2 files changed, 185 insertions(+), 130 deletions(-) create mode 100644 .github/workflows/UploadDockerBuild.yml diff --git a/.github/workflows/UnitTests.yml b/.github/workflows/UnitTests.yml index a5012b3cd..398bb47d3 100644 --- a/.github/workflows/UnitTests.yml +++ b/.github/workflows/UnitTests.yml @@ -15,138 +15,138 @@ # This workflow will install Python dependencies, run tests and lint with a variety of Python versions # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python -name: Unit Test +# name: Unit Test -on: - pull_request: - push: - branches: [ "main" ] - workflow_dispatch: - schedule: - # Run the job every 2 hours - - cron: '0 */2 * * *' +# on: +# pull_request: +# push: +# branches: [ "main" ] +# workflow_dispatch: +# schedule: +# # Run the job every 2 hours +# - cron: '0 */2 * * *' -jobs: - cpu: - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [ubuntu-20.04] - python-version: ['3.10'] - steps: - - uses: actions/checkout@v3 - - name: setup python - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - - name: Install Dependencies - run: | - pip install pytype - pip install pylint - - name: Typecheck the code with pytype - run: | - pytype --jobs auto --disable import-error MaxText/ - - name: Analysing the code with pylint - run: | - pylint MaxText/ +# jobs: +# cpu: +# runs-on: ${{ matrix.os }} +# strategy: +# matrix: +# os: [ubuntu-20.04] +# python-version: ['3.10'] +# steps: +# - uses: actions/checkout@v3 +# - name: setup python +# uses: actions/setup-python@v4 +# with: +# python-version: ${{ matrix.python-version }} +# - name: Install Dependencies +# run: | +# pip install pytype 
+# pip install pylint +# - name: Typecheck the code with pytype +# run: | +# pytype --jobs auto --disable import-error MaxText/ +# - name: Analysing the code with pylint +# run: | +# pylint MaxText/ - # IF YOU MODIFY THIS, YOU SHOULD ALSO ADD CORRESPONDING MODICATIONS TO 'gpu' job - tpu: - strategy: - fail-fast: false - matrix: - device-type: ["v4-8"] - name: "TPU test (${{ matrix.device-type }})" - runs-on: ["self-hosted", "tpu", "${{ matrix.device-type }}"] - steps: - - uses: actions/checkout@v3 - - name: Cleanup old docker images - run: | - docker system prune --all --force - - name: Install dependencies - run: | - bash docker_build_dependency_image.sh - - name: Test gsutil installation - run: | - docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \ - 'which gsutil >/dev/null 2>&1 || { echo >&2 "gsutil is required but not installed. Aborting"; exit 24;}' - - name: Test with pytest - run: | - docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c 'cd MaxText;python3 -m pytest' - - name: Test train.py - run: | - docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \ - 'python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 enable_checkpointing=false' - - name: Test train.py with per_device_batch_size < 1 - run: | - docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \ - 'python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 per_device_batch_size=0.25 ici_tensor_parallelism=4 enable_checkpointing=false' - - name: Test decode.py - run: | - docker run -v 
/home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \ - 'python3 MaxText/decode.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 ici_tensor_parallelism=4 attention=dot_product enable_checkpointing=false max_target_length=128 per_device_batch_size=1' - - name: Test decode.py with per_device_batch_size < 1 - run: | - docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \ - 'python3 MaxText/decode.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 ici_tensor_parallelism=4 attention=dot_product enable_checkpointing=false max_target_length=128 per_device_batch_size=.25' - - name: Test standalone_dataloader.py - run: | - docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \ - 'python3 MaxText/standalone_dataloader.py MaxText/configs/base.yml run_name=standalone_dataloader_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=100 enable_checkpointing=false' - - name: Test standalone_checkpointer.py - run: | - docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \ - 'python3 MaxText/standalone_checkpointer.py MaxText/configs/base.yml run_name=standalone_checkpointer_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=200 checkpoint_period=50 enable_checkpointing=True async_checkpointing=False' - - name: Test int8_training - run: | - docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \ - 'python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date 
+%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset quantization=int8 steps=2 enable_checkpointing=false' - - name: Test generate_param_only_checkpoint - run: | - docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \ - 'bash end_to_end/test_generate_param_only_checkpoint.sh -r runner_$(date +%Y-%m-%d-%H-%M) -o gs://runner-maxtext-logs -d gs://maxtext-dataset -i 4' +# # IF YOU MODIFY THIS, YOU SHOULD ALSO ADD CORRESPONDING MODIFICATIONS TO 'gpu' job +# tpu: +# strategy: +# fail-fast: false +# matrix: +# device-type: ["v4-8"] +# name: "TPU test (${{ matrix.device-type }})" +# runs-on: ["self-hosted", "tpu", "${{ matrix.device-type }}"] +# steps: +# - uses: actions/checkout@v3 +# - name: Cleanup old docker images +# run: | +# docker system prune --all --force +# - name: Install dependencies +# run: | +# bash docker_build_dependency_image.sh +# - name: Test gsutil installation +# run: | +# docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \ +# 'which gsutil >/dev/null 2>&1 || { echo >&2 "gsutil is required but not installed. 
Aborting"; exit 24;}' +# - name: Test with pytest +# run: | +# docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c 'cd MaxText;python3 -m pytest' +# - name: Test train.py +# run: | +# docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \ +# 'python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 enable_checkpointing=false' +# - name: Test train.py with per_device_batch_size < 1 +# run: | +# docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \ +# 'python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 per_device_batch_size=0.25 ici_tensor_parallelism=4 enable_checkpointing=false' +# - name: Test decode.py +# run: | +# docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \ +# 'python3 MaxText/decode.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 ici_tensor_parallelism=4 attention=dot_product enable_checkpointing=false max_target_length=128 per_device_batch_size=1' +# - name: Test decode.py with per_device_batch_size < 1 +# run: | +# docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \ +# 'python3 MaxText/decode.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 ici_tensor_parallelism=4 attention=dot_product enable_checkpointing=false max_target_length=128 per_device_batch_size=.25' +# - name: Test 
standalone_dataloader.py +# run: | +# docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \ +# 'python3 MaxText/standalone_dataloader.py MaxText/configs/base.yml run_name=standalone_dataloader_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=100 enable_checkpointing=false' +# - name: Test standalone_checkpointer.py +# run: | +# docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \ +# 'python3 MaxText/standalone_checkpointer.py MaxText/configs/base.yml run_name=standalone_checkpointer_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=200 checkpoint_period=50 enable_checkpointing=True async_checkpointing=False' +# - name: Test int8_training +# run: | +# docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \ +# 'python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset quantization=int8 steps=2 enable_checkpointing=false' +# - name: Test generate_param_only_checkpoint +# run: | +# docker run -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \ +# 'bash end_to_end/test_generate_param_only_checkpoint.sh -r runner_$(date +%Y-%m-%d-%H-%M) -o gs://runner-maxtext-logs -d gs://maxtext-dataset -i 4' - # IF YOU MODIFY THIS, YOU SHOULD ALSO ADD CORRESPONDING MODICATIONS TO 'tpu' job - gpu: - strategy: - fail-fast: false - matrix: - device-type: ["a100-40gb-4"] - name: "GPU test (${{ matrix.device-type }})" - runs-on: ["self-hosted", "gpu", "${{ matrix.device-type }}"] - steps: - - uses: actions/checkout@v3 - - name: Cleanup old docker images - run: | - docker system prune --all --force - - name: Install 
dependencies - run: | - bash docker_build_dependency_image.sh DEVICE=gpu - - name: Test gsutil installation - run: | - docker run --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \ - 'which gsutil >/dev/null 2>&1 || { echo >&2 "gsutil is required but not installed. Aborting"; exit 24;}' - - name: Test with pytest - run: | - docker run --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c 'cd MaxText;python3 -m pytest -m "not tpu"' - - name: Test train.py - run: | - docker run --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \ - 'python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 enable_checkpointing=false attention=dot_product' - - name: Test train.py with per_device_batch_size < 1 - run: | - docker run --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \ - 'python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 per_device_batch_size=0.25 ici_tensor_parallelism=4 enable_checkpointing=false attention=dot_product' - - name: Test int8_training - run: | - docker run --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \ - 'python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset quantization=int8 steps=2 enable_checkpointing=false attention=dot_product' - - name: Test decode.py - run: | - docker run --runtime=nvidia 
--gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \ - 'python3 MaxText/decode.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 ici_tensor_parallelism=4 attention=dot_product enable_checkpointing=false max_target_length=128 per_device_batch_size=1' - - name: Test decode.py with per_device_batch_size < 1 - run: | - docker run --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \ - 'python3 MaxText/decode.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 ici_tensor_parallelism=4 attention=dot_product enable_checkpointing=false max_target_length=128 per_device_batch_size=.25' +# # IF YOU MODIFY THIS, YOU SHOULD ALSO ADD CORRESPONDING MODIFICATIONS TO 'tpu' job +# gpu: +# strategy: +# fail-fast: false +# matrix: +# device-type: ["a100-40gb-4"] +# name: "GPU test (${{ matrix.device-type }})" +# runs-on: ["self-hosted", "gpu", "${{ matrix.device-type }}"] +# steps: +# - uses: actions/checkout@v3 +# - name: Cleanup old docker images +# run: | +# docker system prune --all --force +# - name: Install dependencies +# run: | +# bash docker_build_dependency_image.sh DEVICE=gpu +# - name: Test gsutil installation +# run: | +# docker run --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \ +# 'which gsutil >/dev/null 2>&1 || { echo >&2 "gsutil is required but not installed. 
Aborting"; exit 24;}' +# - name: Test with pytest +# run: | +# docker run --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c 'cd MaxText;python3 -m pytest -m "not tpu"' +# - name: Test train.py +# run: | +# docker run --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \ +# 'python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 enable_checkpointing=false attention=dot_product' +# - name: Test train.py with per_device_batch_size < 1 +# run: | +# docker run --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \ +# 'python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 per_device_batch_size=0.25 ici_tensor_parallelism=4 enable_checkpointing=false attention=dot_product' +# - name: Test int8_training +# run: | +# docker run --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \ +# 'python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset quantization=int8 steps=2 enable_checkpointing=false attention=dot_product' +# - name: Test decode.py +# run: | +# docker run --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \ +# 'python3 MaxText/decode.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 ici_tensor_parallelism=4 
attention=dot_product enable_checkpointing=false max_target_length=128 per_device_batch_size=1' +# - name: Test decode.py with per_device_batch_size < 1 +# run: | +# docker run --runtime=nvidia --gpus all -v /home/runner/actions-runner/_work/maxtext/maxtext:/app --rm --privileged maxtext_base_image bash -c \ +# 'python3 MaxText/decode.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 ici_tensor_parallelism=4 attention=dot_product enable_checkpointing=false max_target_length=128 per_device_batch_size=.25' diff --git a/.github/workflows/UploadDockerBuild.yml b/.github/workflows/UploadDockerBuild.yml new file mode 100644 index 000000000..e5db3e2f8 --- /dev/null +++ b/.github/workflows/UploadDockerBuild.yml @@ -0,0 +1,55 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# This workflow builds the MaxText JAX stable and JAX nightly Docker images and pushes them to gcr.io +# under both the `latest` tag and a date tag (YYYY-MM-DD). + +name: Build Images + +on: + schedule: + # Run the job daily at 10PM PST (6AM UTC) + - cron: '0 6 * * *' + +jobs: + build_images: + strategy: + fail-fast: false + matrix: + device-type: ["v4-8"] + runs-on: ["self-hosted", "tpu", "${{ matrix.device-type }}"] + steps: + - uses: actions/checkout@v3 + - name: build jax stable image + run : | + project=tpu-prod-env-multipod + local_image_name=maxtext_local_jax_stable + cloud_image_name=maxtext_jax_stable + bash docker_build_fake_image.sh LOCAL_IMAGE_NAME=$local_image_name MODE=stable && + docker tag ${local_image_name} gcr.io/$project/${cloud_image_name}:latest + gcloud auth configure-docker --quiet + docker push gcr.io/$project/${cloud_image_name}:latest + image_date=$(date +%Y-%m-%d) + docker push gcr.io/$project/${cloud_image_name}:${image_date} + - name: build jax stable image + run : | + project=tpu-prod-env-multipod + local_image_name=maxtext_local_jax_nightly + cloud_image_name=maxtext_jax_nightly + bash docker_build_fake_image.sh LOCAL_IMAGE_NAME=$local_image_name MODE=nightly && + docker tag ${local_image_name} gcr.io/$project/${cloud_image_name}:latest + gcloud auth configure-docker --quiet + docker push gcr.io/$project/${cloud_image_name}:latest + image_date=$(date +%Y-%m-%d) + docker push gcr.io/$project/${cloud_image_name}:${image_date} \ No newline at end of file From aca5c7cf147c9d51784fb96b1dc3bab41202917f Mon Sep 17 00:00:00 2001 From: gobbleturk Date: Tue, 27 Feb 2024 01:36:39 +0000 Subject: [PATCH 2/5] Test via pr --- .github/workflows/UploadDockerBuild.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/UploadDockerBuild.yml b/.github/workflows/UploadDockerBuild.yml index e5db3e2f8..aa22cbd50 100644 --- a/.github/workflows/UploadDockerBuild.yml +++ 
b/.github/workflows/UploadDockerBuild.yml @@ -18,6 +18,10 @@ name: Build Images on: + pull_request: + push: + branches: [ "main" ] + workflow_dispatch: schedule: # Run the job daily at 10PM PST (6AM UTC) - cron: '0 6 * * *' From 646f4839628b4d97063700a793341cf2ffad5387 Mon Sep 17 00:00:00 2001 From: gobbleturk Date: Tue, 27 Feb 2024 01:38:12 +0000 Subject: [PATCH 3/5] Test via pr --- docker_build_fake_image.sh | 77 ++++++++++++++++++++++++++++ maxtext_fake_dependencies.Dockerfile | 11 ++++ 2 files changed, 88 insertions(+) create mode 100644 docker_build_fake_image.sh create mode 100644 maxtext_fake_dependencies.Dockerfile diff --git a/docker_build_fake_image.sh b/docker_build_fake_image.sh new file mode 100644 index 000000000..6f16b3b14 --- /dev/null +++ b/docker_build_fake_image.sh @@ -0,0 +1,77 @@ +#!/bin/bash + +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Example commands: +# bash docker_build_fake_image.sh MODE=stable +# bash docker_build_fake_image.sh MODE=nightly +# bash docker_build_fake_image.sh MODE=stable JAX_VERSION=0.4.13 + +# Enable "exit immediately if any command fails" option +set -e + +export LOCAL_IMAGE_NAME=maxtext_base_image + +echo "Starting to build your docker image. This will take a few minutes but the image can be reused as you iterate." 
+ +# Set environment variables +for ARGUMENT in "$@"; do + IFS='=' read -r KEY VALUE <<< "$ARGUMENT" + export "$KEY"="$VALUE" + echo "$KEY"="$VALUE" +done + + +if [[ -z ${JAX_VERSION+x} ]] ; then + export JAX_VERSION=NONE + echo "Default JAX_VERSION=${JAX_VERSION}" +fi + +if [[ -z ${MODE} ]]; then + export MODE=stable + echo "Default MODE=${MODE}" + +fi + +if [[ -z ${DEVICE} ]]; then + export DEVICE=tpu + echo "Default DEVICE=${DEVICE}" +fi + +if [[ -z ${LIBTPU_GCS_PATH+x} ]] ; then + export LIBTPU_GCS_PATH=NONE + echo "Default LIBTPU_GCS_PATH=${LIBTPU_GCS_PATH}" + if [[ ${DEVICE} == "gpu" ]]; then + docker build --network host --build-arg MODE=${MODE} --build-arg JAX_VERSION=$JAX_VERSION --build-arg DEVICE=$DEVICE -f ./maxtext_gpu_dependencies.Dockerfile -t ${LOCAL_IMAGE_NAME} . + else + docker build --network host --build-arg MODE=${MODE} --build-arg JAX_VERSION=$JAX_VERSION --build-arg LIBTPU_GCS_PATH=$LIBTPU_GCS_PATH --build-arg DEVICE=$DEVICE -f ./maxtext_fake_dependencies.Dockerfile -t ${LOCAL_IMAGE_NAME} . + fi +else + docker build --network host --build-arg MODE=${MODE} --build-arg JAX_VERSION=$JAX_VERSION --build-arg LIBTPU_GCS_PATH=$LIBTPU_GCS_PATH -f ./maxtext_fake_dependencies.Dockerfile -t ${LOCAL_IMAGE_NAME} . + docker build --network host --build-arg CUSTOM_LIBTPU=true -f ./maxtext_libtpu_path.Dockerfile -t ${LOCAL_IMAGE_NAME} . +fi + +echo "" +echo "*************************" +echo "" + +echo "Built your base docker image and named it ${LOCAL_IMAGE_NAME}. +It only has the dependencies installed. Assuming you're on a TPUVM, to run the +docker image locally and mirror your local working directory run:" +echo "docker run -v $(pwd):/app --rm -it --privileged --entrypoint bash ${LOCAL_IMAGE_NAME}" +echo "" +echo "You can run MaxText and your development tests inside of the docker image. Changes to your workspace will automatically +be reflected inside the docker container." 
+echo "Once you want you upload your docker container to GCR, take a look at docker_upload_runner.sh" diff --git a/maxtext_fake_dependencies.Dockerfile b/maxtext_fake_dependencies.Dockerfile new file mode 100644 index 000000000..e762e1a53 --- /dev/null +++ b/maxtext_fake_dependencies.Dockerfile @@ -0,0 +1,11 @@ + +# Use Python 3.10 as the base image +FROM python:3.10-slim-bullseye + + +# Copy all files from local workspace into docker container +COPY . . +RUN ls . + + +WORKDIR /app \ No newline at end of file From f01f3969d9c3cd4901adc1b40d1f714cf9bd1337 Mon Sep 17 00:00:00 2001 From: gobbleturk Date: Tue, 27 Feb 2024 01:42:47 +0000 Subject: [PATCH 4/5] tag the image --- .github/workflows/UploadDockerBuild.yml | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/workflows/UploadDockerBuild.yml b/.github/workflows/UploadDockerBuild.yml index aa22cbd50..70f731969 100644 --- a/.github/workflows/UploadDockerBuild.yml +++ b/.github/workflows/UploadDockerBuild.yml @@ -40,20 +40,22 @@ jobs: project=tpu-prod-env-multipod local_image_name=maxtext_local_jax_stable cloud_image_name=maxtext_jax_stable - bash docker_build_fake_image.sh LOCAL_IMAGE_NAME=$local_image_name MODE=stable && - docker tag ${local_image_name} gcr.io/$project/${cloud_image_name}:latest + bash docker_build_fake_image.sh LOCAL_IMAGE_NAME=$local_image_name MODE=stable gcloud auth configure-docker --quiet + docker tag ${local_image_name} gcr.io/$project/${cloud_image_name}:latest docker push gcr.io/$project/${cloud_image_name}:latest image_date=$(date +%Y-%m-%d) + docker tag ${local_image_name} gcr.io/$project/${cloud_image_name}:${image_date} docker push gcr.io/$project/${cloud_image_name}:${image_date} - - name: build jax stable image + - name: build jax nightly image run : | project=tpu-prod-env-multipod local_image_name=maxtext_local_jax_nightly cloud_image_name=maxtext_jax_nightly - bash docker_build_fake_image.sh LOCAL_IMAGE_NAME=$local_image_name MODE=nightly && - 
docker tag ${local_image_name} gcr.io/$project/${cloud_image_name}:latest + bash docker_build_fake_image.sh LOCAL_IMAGE_NAME=$local_image_name MODE=nightly gcloud auth configure-docker --quiet + docker tag ${local_image_name} gcr.io/$project/${cloud_image_name}:latest docker push gcr.io/$project/${cloud_image_name}:latest image_date=$(date +%Y-%m-%d) + docker tag ${local_image_name} gcr.io/$project/${cloud_image_name}:${image_date} docker push gcr.io/$project/${cloud_image_name}:${image_date} \ No newline at end of file From 607554fa9fd60ff98c2c3948592a0cd682eaeadf Mon Sep 17 00:00:00 2001 From: gobbleturk Date: Tue, 27 Feb 2024 01:44:32 +0000 Subject: [PATCH 5/5] real dependencies --- .github/workflows/UploadDockerBuild.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/UploadDockerBuild.yml b/.github/workflows/UploadDockerBuild.yml index 70f731969..d31f8030a 100644 --- a/.github/workflows/UploadDockerBuild.yml +++ b/.github/workflows/UploadDockerBuild.yml @@ -40,7 +40,7 @@ jobs: project=tpu-prod-env-multipod local_image_name=maxtext_local_jax_stable cloud_image_name=maxtext_jax_stable - bash docker_build_fake_image.sh LOCAL_IMAGE_NAME=$local_image_name MODE=stable + bash docker_build_dependency_image.sh LOCAL_IMAGE_NAME=$local_image_name MODE=stable gcloud auth configure-docker --quiet docker tag ${local_image_name} gcr.io/$project/${cloud_image_name}:latest docker push gcr.io/$project/${cloud_image_name}:latest @@ -52,7 +52,7 @@ jobs: project=tpu-prod-env-multipod local_image_name=maxtext_local_jax_nightly cloud_image_name=maxtext_jax_nightly - bash docker_build_fake_image.sh LOCAL_IMAGE_NAME=$local_image_name MODE=nightly + bash docker_build_dependency_image.sh LOCAL_IMAGE_NAME=$local_image_name MODE=nightly gcloud auth configure-docker --quiet docker tag ${local_image_name} gcr.io/$project/${cloud_image_name}:latest docker push gcr.io/$project/${cloud_image_name}:latest