Skip to content

Commit

Permalink
Merge pull request #749 from google:integrate-goodput-monitor
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 651091628
  • Loading branch information
maxtext authors committed Jul 10, 2024
2 parents 704ab1c + f27f70a commit 0af4ee2
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 0 deletions.
2 changes: 2 additions & 0 deletions MaxText/configs/base.yml
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,8 @@ target_eval_loss: 0. # early stop once reaching target eval_loss

# Goodput parameters
enable_goodput_recording: False
# When True, train.py creates a monitoring.GoodputMonitor and starts its Goodput uploader.
monitor_goodput: False
# Passed to GoodputMonitor as `upload_interval` (seconds, per the key name).
goodput_upload_interval_seconds: 60

# Vertex AI Tensorboard Configurations - https://github.com/google/maxtext/tree/main/getting_started/Use_Vertex_AI_Tensorboard.md
# Set to True for GCE, False if running via XPK
Expand Down
11 changes: 11 additions & 0 deletions MaxText/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
from layers import quantizations

from ml_goodput_measurement import goodput
from ml_goodput_measurement import monitoring

Transformer = models.Transformer
EPS = 1e-8
Expand Down Expand Up @@ -597,6 +598,16 @@ def main(argv: Sequence[str]) -> None:
# Configure Vertex AI Tensorboard when enabled in the config, or when the
# UPLOAD_DATA_TO_TENSORBOARD environment variable is set to a non-empty value.
if config.use_vertex_tensorboard or os.environ.get("UPLOAD_DATA_TO_TENSORBOARD"):
vertex_tensorboard_manager.configure_vertex_tensorboard(config)

# Start the Goodput uploader on exactly one host (JAX process 0) so metrics are
# not uploaded once per process. NOTE: jax.process_index is a function and must
# be *called*; comparing the function object itself to 0 is always False, which
# would silently disable monitoring even when monitor_goodput is True.
if config.monitor_goodput and jax.process_index() == 0:
  logger_name = f'goodput_{config.run_name}'
  goodput_monitor = monitoring.GoodputMonitor(
      job_name=config.run_name,
      logger_name=logger_name,
      tensorboard_dir=config.tensorboard_dir,
      upload_interval=config.goodput_upload_interval_seconds,
      monitoring_enabled=True,
  )
  goodput_monitor.start_goodput_uploader()
debug_config = debug_configuration.DebugConfig(
stack_trace_config=stack_trace_configuration.StackTraceConfig(
collect_stack_trace=config.collect_stack_trace,
Expand Down

0 comments on commit 0af4ee2

Please sign in to comment.