Integrate Badput monitoring with MaxText #18689

Workflow file for this run

.github/workflows/UnitTests.yml at d82ec42

	# Copyright 2023 Google LLC
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# https://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
	# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

	name: Unit Test

	# Triggers: every pull request, pushes to main, manual dispatch from the
	# Actions UI, and a recurring schedule.
	on:
	  pull_request:
	  push:
	    branches: ["main"]
	  workflow_dispatch:
	  schedule:
	    # Run the job every 2 hours (minute 0 of every second hour).
	    # A cron expression needs five fields: minute hour day-of-month month day-of-week.
	    - cron: '0 */2 * * *'

	jobs:
	  # Builds the dependency image once per device type and pushes it to a
	  # repository named after this run (github.run_id), tagged with the device
	  # type. The test jobs below pull that image as their container.
	  build_and_upload_image:
	    strategy:
	      fail-fast: false
	      matrix:
	        device:
	          - type: tpu
	            name: v4-8
	            mode: stable
	          - type: gpu
	            name: a100-40gb-4
	            mode: pinned
	    name: Build and upload image (${{ matrix.device.name }})
	    runs-on: ["self-hosted", "${{ matrix.device.type }}", "${{ matrix.device.name }}"]
	    steps:
	      - uses: actions/checkout@v4
	      - name: Cleanup old docker images
	        run: docker system prune --all --force
	      - name: Build an image
	        run: |
	          bash docker_build_dependency_image.sh MODE=${{ matrix.device.mode }} DEVICE=${{ matrix.device.type }}
	      - name: Tag the image
	        run: |
	          docker tag maxtext_base_image gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:${{ matrix.device.type }}
	      - name: Upload the image
	        run: |
	          docker push gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:${{ matrix.device.type }}

	  # Tests that run on both device types. Per-device differences (attention
	  # kernel, pytest marker, container env and docker options) come from the
	  # matrix entries.
	  common:
	    needs: build_and_upload_image
	    strategy:
	      fail-fast: false
	      matrix:
	        device:
	          - type: tpu
	            name: v4-8
	            attention: autoselected
	            pytest_marker: ''
	            container_env:
	              XLA_PYTHON_CLIENT_MEM_FRACTION: 0.75
	              TF_FORCE_GPU_ALLOW_GROWTH: false
	            container_resource_option: "--privileged"
	          - type: gpu
	            name: a100-40gb-4
	            # NOTE(review): image_suffix is not referenced anywhere in this
	            # workflow; the container image tag uses matrix.device.type.
	            image_suffix: gpu_jax_pinned
	            attention: dot_product
	            pytest_marker: -m 'not tpu'
	            container_env:
	              XLA_PYTHON_CLIENT_MEM_FRACTION: 0.65
	              TF_FORCE_GPU_ALLOW_GROWTH: true
	            container_resource_option: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
	    name: Common test (${{ matrix.device.name }})
	    runs-on: ["self-hosted", "${{ matrix.device.type }}", "${{ matrix.device.name }}"]
	    container:
	      # Image pushed by build_and_upload_image for this run and device type.
	      image: gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:${{ matrix.device.type }}
	      volumes:
	        - /home/runner/actions-runner/_work/maxtext/maxtext:/deps
	      env:
	        XLA_PYTHON_CLIENT_MEM_FRACTION: ${{ matrix.device.container_env.XLA_PYTHON_CLIENT_MEM_FRACTION }}
	        TF_FORCE_GPU_ALLOW_GROWTH: ${{ matrix.device.container_env.TF_FORCE_GPU_ALLOW_GROWTH }}
	      options: ${{ matrix.device.container_resource_option }}
	    steps:
	      - uses: actions/checkout@v4
	      - name: Test gsutil installation
	        # Fail fast with exit code 24 when gsutil is not on PATH.
	        run: which gsutil >/dev/null 2>&1 || { echo >&2 "gsutil is required but not installed. Aborting"; exit 24;}
	      - name: Test with pytest
	        run: cd MaxText;python3 -m pytest ${{ matrix.device.pytest_marker }}
	      - name: Test train.py with TFDS c4
	        run: python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 enable_checkpointing=false attention=${{ matrix.device.attention }}
	      - name: Test train.py with HF c4
	        run: python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs hf_train_files=gs://maxtext-dataset/hf/c4/c4-train-00000-of-01637.parquet hf_path=parquet dataset_type=hf steps=2 tokenizer_path=google-t5/t5-large attention=${{ matrix.device.attention }} enable_checkpointing=false
	      - name: Test train.py with synthetic data
	        run: python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 enable_checkpointing=false attention=${{ matrix.device.attention }} dataset_type=synthetic
	      - name: Test train.py with per_device_batch_size < 1
	        run: python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 per_device_batch_size=0.25 ici_tensor_parallelism=4 enable_checkpointing=false attention=${{ matrix.device.attention }}
	      - name: Test decode.py
	        run: python3 MaxText/decode.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 ici_tensor_parallelism=4 attention=${{ matrix.device.attention }} enable_checkpointing=false max_target_length=128 per_device_batch_size=1
	      - name: Test decode.py with per_device_batch_size < 1
	        run: python3 MaxText/decode.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 ici_tensor_parallelism=4 attention=${{ matrix.device.attention }} enable_checkpointing=false max_target_length=128 per_device_batch_size=.25
	      - name: Test int8_training
	        run: python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset quantization=int8 steps=2 enable_checkpointing=false attention=${{ matrix.device.attention }}
	      - name: Test fp8_training
	        run: python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset quantization=fp8 steps=2 enable_checkpointing=false attention=${{ matrix.device.attention }}
	      - name: Test generate_param_only_checkpoint
	        run: bash end_to_end/test_generate_param_only_checkpoint.sh -r runner_$(date +%Y-%m-%d-%H-%M-%S) -o gs://runner-maxtext-logs -d gs://maxtext-dataset -i 4 -a ${{ matrix.device.attention }}
	      - name: Test generate_param_only_checkpoint with int8 quantization
	        run: bash end_to_end/test_generate_param_only_checkpoint.sh -r runner_$(date +%Y-%m-%d-%H-%M-%S) -o gs://runner-maxtext-logs -d gs://maxtext-dataset -i 4 -q int8 -a ${{ matrix.device.attention }}
	      - name: Test grain checkpoint determinism
	        run: bash end_to_end/test_checkpointing.sh runner_$(date +%Y-%m-%d-%H-%M-%S) gs://runner-maxtext-logs gs://maxtext-dataset False grain ${{ matrix.device.attention }}
	      - name: Test checkpoint compatibility
	        run: bash end_to_end/test_checkpoint_compatibility.sh runner_$(date +%Y-%m-%d-%H-%M-%S) gs://runner-maxtext-logs gs://maxtext-dataset ${{ matrix.device.attention }}

	  # TPU-only checks, run inside the :tpu image built for this run.
	  tpu:
	    needs: build_and_upload_image
	    strategy:
	      fail-fast: false
	      matrix:
	        device-type: ["v4-8"]
	    name: "TPU test (${{ matrix.device-type }})"
	    runs-on: ["self-hosted", "tpu", "${{ matrix.device-type }}"]
	    container:
	      image: gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:tpu
	      volumes:
	        - /home/runner/actions-runner/_work/maxtext/maxtext:/deps
	      options: "--privileged"
	    steps:
	      - uses: actions/checkout@v4
	      - name: Validate Pedagogical Example, Shmap_collective_matmul
	        run: python3 pedagogical_examples/shmap_collective_matmul.py

	  # GPU-only checks, run inside the :gpu image built for this run.
	  gpu:
	    needs: build_and_upload_image
	    strategy:
	      fail-fast: false
	      matrix:
	        device-type: ["a100-40gb-4"]
	        build-mode: ["pinned"]
	    name: "GPU test (${{ matrix.device-type }}, ${{ matrix.build-mode }})"
	    runs-on: ["self-hosted", "gpu", "${{ matrix.device-type }}"]
	    container:
	      image: gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:gpu
	      volumes:
	        - /home/runner/actions-runner/_work/maxtext/maxtext:/deps
	      env:
	        XLA_PYTHON_CLIENT_MEM_FRACTION: 0.65
	        TF_FORCE_GPU_ALLOW_GROWTH: true
	      options: "--shm-size 2g --runtime=nvidia --gpus all --privileged"
	    steps:
	      - uses: actions/checkout@v4
	      - name: Set up Docker Buildx
	        uses: docker/setup-buildx-action@v3
	      - name: Test train.py with flash attention
	        run: python3 MaxText/train.py MaxText/configs/base.yml run_name=runner_$(date +%Y-%m-%d-%H-%M-%S) base_output_directory=gs://runner-maxtext-logs dataset_path=gs://maxtext-dataset steps=2 enable_checkpointing=false attention=cudnn_flash_te

	  # Deletes the per-run images. always() makes this job run even when one of
	  # the test jobs it waits on has failed, so images are not left behind.
	  clean_up:
	    if: ${{ always() }}
	    needs: [common, gpu, tpu]
	    name: "Clean up"
	    runs-on: ["self-hosted"]
	    steps:
	      - name: Delete GPU image
	        run: gcloud container images delete gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:gpu --force-delete-tags --quiet
	      - name: Delete TPU image
	        run: gcloud container images delete gcr.io/tpu-prod-env-multipod/maxtext_${{ github.run_id }}:tpu --force-delete-tags --quiet

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Integrate Badput monitoring with MaxText #18689

Workflow file

Integrate Badput monitoring with MaxText #18689

Jobs

Run details

Workflow file for this run