From bfce4bd94c909df1ad030657542b9ffe430ea587 Mon Sep 17 00:00:00 2001 From: zackycao Date: Fri, 23 Aug 2024 14:32:31 -0700 Subject: [PATCH] Add appdef metadata to torchx event Differential Revision: D61632621 Pull Request resolved: https://github.com/pytorch/torchx/pull/947 --- torchx/runner/api.py | 5 ++++- torchx/runner/events/__init__.py | 6 +++++- torchx/runner/events/api.py | 6 +++++- torchx/runner/events/test/lib_test.py | 17 +++++++++++++++-- 4 files changed, 29 insertions(+), 5 deletions(-) diff --git a/torchx/runner/api.py b/torchx/runner/api.py index ea7968db..c33e70fc 100644 --- a/torchx/runner/api.py +++ b/torchx/runner/api.py @@ -198,10 +198,12 @@ def run_component( parent_run_id=parent_run_id, ) handle = self.schedule(dryrun_info) + app = none_throws(dryrun_info._app) ctx._torchx_event.workspace = workspace ctx._torchx_event.scheduler = none_throws(dryrun_info._scheduler) - ctx._torchx_event.app_image = none_throws(dryrun_info._app).roles[0].image + ctx._torchx_event.app_image = app.roles[0].image ctx._torchx_event.app_id = parse_app_handle(handle)[2] + ctx._torchx_event.app_metadata = app.metadata return handle def dryrun_component( @@ -263,6 +265,7 @@ def run( ctx._torchx_event.scheduler = none_throws(dryrun_info._scheduler) ctx._torchx_event.app_image = none_throws(dryrun_info._app).roles[0].image ctx._torchx_event.app_id = parse_app_handle(handle)[2] + ctx._torchx_event.app_metadata = app.metadata return handle def schedule(self, dryrun_info: AppDryRunInfo) -> AppHandle: diff --git a/torchx/runner/events/__init__.py b/torchx/runner/events/__init__.py index cedba10d..360cb3e7 100644 --- a/torchx/runner/events/__init__.py +++ b/torchx/runner/events/__init__.py @@ -24,7 +24,7 @@ import time import traceback from types import TracebackType -from typing import Optional, Type +from typing import Dict, Optional, Type from torchx.runner.events.handlers import get_logging_handler @@ -84,6 +84,7 @@ def __init__( scheduler: Optional[str] = None, app_id: Optional[str] = None, app_image: Optional[str] = None, + app_metadata: Optional[Dict[str, str]] = None, runcfg: Optional[str] = None, workspace: Optional[str] = None, ) -> None: @@ -92,6 +93,7 @@ def __init__( scheduler or "", app_id, app_image=app_image, + app_metadata=app_metadata, runcfg=runcfg, workspace=workspace, ) @@ -128,6 +130,7 @@ def _generate_torchx_event( scheduler: str, app_id: Optional[str] = None, app_image: Optional[str] = None, + app_metadata: Optional[Dict[str, str]] = None, runcfg: Optional[str] = None, source: SourceType = SourceType.UNKNOWN, workspace: Optional[str] = None, @@ -138,6 +141,7 @@ def _generate_torchx_event( api=api, app_id=app_id, app_image=app_image, + app_metadata=app_metadata, runcfg=runcfg, source=source, workspace=workspace, diff --git a/torchx/runner/events/api.py b/torchx/runner/events/api.py index 5cb5f11a..ce5bc899 100644 --- a/torchx/runner/events/api.py +++ b/torchx/runner/events/api.py @@ -10,7 +10,7 @@ import json from dataclasses import asdict, dataclass from enum import Enum -from typing import Optional, Union +from typing import Dict, Optional, Union class SourceType(str, Enum): @@ -30,10 +30,13 @@ class TorchxEvent: api: Api name app_id: Unique id that is set by the underlying scheduler image: Image/container bundle that is used to execute request. + app_metadata: metadata to the app (treatment of metadata is scheduler dependent) runcfg: Run config that was used to schedule app. source: Type of source the event is generated. cpu_time_usec: CPU time spent in usec wall_time_usec: Wall time spent in usec + start_epoch_time_usec: Epoch time in usec when runner event starts + Workspace: Track how different workspaces/no workspace affects build and scheduler """ session: str @@ -41,6 +44,7 @@ class TorchxEvent: api: str app_id: Optional[str] = None app_image: Optional[str] = None + app_metadata: Optional[Dict[str, str]] = None runcfg: Optional[str] = None raw_exception: Optional[str] = None source: SourceType = SourceType.UNKNOWN diff --git a/torchx/runner/events/test/lib_test.py b/torchx/runner/events/test/lib_test.py index 14c025ad..92bb3c82 100644 --- a/torchx/runner/events/test/lib_test.py +++ b/torchx/runner/events/test/lib_test.py @@ -31,6 +31,7 @@ def assert_event( self.assertEqual(actual_event.app_image, expected_event.app_image) self.assertEqual(actual_event.runcfg, expected_event.runcfg) self.assertEqual(actual_event.source, expected_event.source) + self.assertEqual(actual_event.app_metadata, expected_event.app_metadata) @patch("torchx.runner.events.get_logging_handler") def test_get_or_create_logger(self, logging_handler_mock: MagicMock) -> None: @@ -41,11 +42,13 @@ def test_get_or_create_logger(self, logging_handler_mock: MagicMock) -> None: self.assertIsInstance(logger.handlers[0], logging.NullHandler) def test_event_created(self) -> None: + test_metadata = {"test_key": "test_value"} event = TorchxEvent( session="test_session", scheduler="test_scheduler", api="test_api", app_image="test_app_image", + app_metadata=test_metadata, workspace="test_workspace", ) self.assertEqual("test_session", event.session) @@ -54,13 +57,16 @@ def test_event_created(self) -> None: self.assertEqual("test_app_image", event.app_image) self.assertEqual(SourceType.UNKNOWN, event.source) self.assertEqual("test_workspace", event.workspace) + self.assertEqual(test_metadata, event.app_metadata) def test_event_deser(self) -> None: + test_metadata = {"test_key": "test_value"} event = TorchxEvent( session="test_session", scheduler="test_scheduler", api="test_api", app_image="test_app_image", + app_metadata=test_metadata, workspace="test_workspace", source=SourceType.EXTERNAL, ) @@ -78,14 +84,17 @@ def assert_torchx_event(self, expected: TorchxEvent, actual: TorchxEvent) -> Non self.assertEqual(expected.app_image, actual.app_image) self.assertEqual(expected.source, actual.source) self.assertEqual(expected.workspace, actual.workspace) + self.assertEqual(expected.app_metadata, actual.app_metadata) def test_create_context(self, _) -> None: - cfg = json.dumps({"test_key": "test_value"}) + test_dict = {"test_key": "test_value"} + cfg = json.dumps(test_dict) context = log_event( "test_call", "local", "test_app_id", app_image="test_app_image_id", + app_metadata=test_dict, runcfg=cfg, workspace="test_workspace", ) @@ -95,6 +104,7 @@ def test_create_context(self, _) -> None: "test_call", "test_app_id", app_image="test_app_image_id", + app_metadata=test_dict, runcfg=cfg, workspace="test_workspace", ) @@ -102,12 +112,14 @@ def test_create_context(self, _) -> None: self.assert_torchx_event(expected_torchx_event, context._torchx_event) def test_record_event(self, record_mock: MagicMock) -> None: - cfg = json.dumps({"test_key": "test_value"}) + test_dict = {"test_key": "test_value"} + cfg = json.dumps(test_dict) with log_event( "test_call", "local", "test_app_id", app_image="test_app_image_id", + app_metadata=test_dict, runcfg=cfg, workspace="test_workspace", ) as ctx: @@ -119,6 +131,7 @@ def test_record_event(self, record_mock: MagicMock) -> None: "test_call", "test_app_id", app_image="test_app_image_id", + app_metadata=test_dict, runcfg=cfg, workspace="test_workspace", cpu_time_usec=ctx._torchx_event.cpu_time_usec,