diff --git a/MaxText/configs/base.yml b/MaxText/configs/base.yml index cf9ce4259..8979b8dfa 100644 --- a/MaxText/configs/base.yml +++ b/MaxText/configs/base.yml @@ -178,5 +178,5 @@ use_iota_embed: False #Monitoring parameters - Export in-workload metrics to Cloud monitoring enable_cloud_monitoring: False -cloud_monitoring_dashboard: "https://pantheon.corp.google.com/monitoring/dashboards?project=" +cloud_monitoring_dashboard: "https://pantheon.corp.google.com/monitoring/dashboards?" cloud_zone: "" # zone name for cloud jobs - used for cloud metrics emitting diff --git a/MaxText/monitoring_api.py b/MaxText/monitoring_api.py index e244ac934..91b4f161e 100644 --- a/MaxText/monitoring_api.py +++ b/MaxText/monitoring_api.py @@ -8,29 +8,11 @@ from google.cloud import monitoring_v3 from google.cloud import compute_v1 from google.api import metric_pb2 -import requests import time import os import max_logging -def get_metadata(project_id, zone, instance_id): - """ - Fetches metadata - - Args: - project_id - zone - instance_id - - Returns: - metadata as json - """ - r = requests.get(url="https://compute.googleapis.com/compute/v1/projects/\ - {project_id}/zones/{zone}/instances/{instance_id}") - metadata = r.json() - return metadata - def create_custom_metric(metric_name, description): """ Creates a custom metric @@ -99,13 +81,7 @@ def write_time_series_step(metric_name, monitoring_enabled, pyconfig, step=1): "%d %b %Y %H:%M:%S UTC", time.gmtime(seconds_since_epoch_utc) ) max_logging.log( - "Emitting metric ", - metric_name, - " for step = ", - step, - " at: ", - event_time, - ) + f"Emitting metric {metric_name} for step = {step} at: {event_time}") instance_id = get_instance_id(project_id, zone) @@ -126,18 +102,11 @@ def write_time_series_step(metric_name, monitoring_enabled, pyconfig, step=1): ) ] - client.create_time_series(name=project_name, time_series=[series], metadata=get_metadata(project_id, zone, instance_id)) + client.create_time_series(name=project_name, time_series=[series]) dashboard_link = pyconfig.config.cloud_monitoring_dashboard+project_name max_logging.log( - "Time series added for step", - step, - "and instance_id ", - instance_id, - " and zone ", - zone, - "\nView dashboards or use metrics: ", - dashboard_link, - ) + f"Time series added for step {step} and instance_id {instance_id} and zone {zone}\ + \n View dashboards or use metrics: {dashboard_link}") return [series] def get_time_series_step_data(metric_name): diff --git a/MaxText/tests/cloud_monitoring_test.py b/MaxText/tests/cloud_monitoring_test.py index 582389017..f62e75172 100644 --- a/MaxText/tests/cloud_monitoring_test.py +++ b/MaxText/tests/cloud_monitoring_test.py @@ -30,6 +30,8 @@ def test_write_time_series_step(self): pyconfig.initialize(sys.argv + ['configs/base.yml'], per_device_batch_size=1, run_name='test', mesh_axes = ['data'], logical_axis_rules = [['batch', 'data']], data_sharding = ['data'], + base_output_directory = "gs://max-experiments/", + dataset_path = "gs://maxtext-dataset/", enable_cloud_monitoring=True, cloud_zone='us-central2-b') monitoring_api.create_custom_metric('test_metric', "This is an example metric")