From 1522339caa90e3c8d2640451c37b76615b593677 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?=
Date: Fri, 27 Dec 2024 13:32:39 +0100
Subject: [PATCH 1/9] chore(tableau): set ingestion stage report and perf timers

---
 .../ingestion/source/tableau/tableau.py       | 75 +++++++++++++++----
 1 file changed, 62 insertions(+), 13 deletions(-)

diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py
index df59cae3fad232..cc1a07020e8d18 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py
@@ -3,7 +3,7 @@
 import re
 import time
 from collections import OrderedDict
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from datetime import datetime
 from functools import lru_cache
 from typing import (
@@ -117,6 +117,7 @@
 )
 from datahub.ingestion.source.tableau.tableau_server_wrapper import UserInfo
 from datahub.ingestion.source.tableau.tableau_validation import check_user_role
+from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport
 from datahub.metadata.com.linkedin.pegasus2avro.common import (
     AuditStamp,
     ChangeAuditStamps,
@@ -169,6 +170,8 @@
     create_lineage_sql_parsed_result,
 )
 from datahub.utilities import config_clean
+from datahub.utilities.perf_timer import PerfTimer
+from datahub.utilities.stats_collections import TopKDict
 from datahub.utilities.urns.dataset_urn import DatasetUrn
 
 try:
@@ -636,12 +639,27 @@ class SiteIdContentUrl:
     site_content_url: str
 
 
-class TableauSourceReport(StaleEntityRemovalSourceReport):
+class TableauSourceReport(
+    StaleEntityRemovalSourceReport,
+    IngestionStageReport,
+):
     get_all_datasources_query_failed: bool = False
     num_get_datasource_query_failures: int = 0
     num_datasource_field_skipped_no_name: int = 0
     num_csql_field_skipped_no_name: int = 0
     num_table_field_skipped_no_name: int = 0
+    # timers
+    extract_usage_stats_timer: Dict[str, float] = field(default_factory=TopKDict)
+    fetch_groups_timer: Dict[str, float] = field(default_factory=TopKDict)
+    populate_database_server_hostname_map_timer: Dict[str, float] = field(default_factory=TopKDict)
+    populate_projects_registry_timer: Dict[str, float] = field(default_factory=TopKDict)
+    emit_workbooks_timer: Dict[str, float] = field(default_factory=TopKDict)
+    emit_sheets_timer: Dict[str, float] = field(default_factory=TopKDict)
+    emit_dashboards_timer: Dict[str, float] = field(default_factory=TopKDict)
+    emit_embedded_datasources_timer: Dict[str, float] = field(default_factory=TopKDict)
+    emit_published_datasources_timer: Dict[str, float] = field(default_factory=TopKDict)
+    emit_custom_sql_datasources_timer: Dict[str, float] = field(default_factory=TopKDict)
+    emit_upstream_tables_timer: Dict[str, float] = field(default_factory=TopKDict)
     # lineage
     num_tables_with_upstream_lineage: int = 0
     num_upstream_table_lineage: int = 0
@@ -3457,33 +3475,64 @@ def _create_workbook_properties(
         return {"permissions": json.dumps(groups)} if len(groups) > 0 else None
 
     def ingest_tableau_site(self):
+        self.report.report_ingestion_stage_start(f"Ingesting Tableau Site: {self.site_id} {self.site_content_url}")
+
         # Initialise the dictionary to later look-up for chart and dashboard stat
        if self.config.extract_usage_stats:
-            self._populate_usage_stat_registry()
+            with PerfTimer() as timer:
+                self._populate_usage_stat_registry()
+            self.report.extract_usage_stats_timer[self.site_id] = 
round(timer.elapsed_seconds(), 2) if self.config.permission_ingestion: - self._fetch_groups() + with PerfTimer() as timer: + self._fetch_groups() + self.report.fetch_groups_timer[self.site_id] = round(timer.elapsed_seconds(), 2) # Populate the map of database names and database hostnames to be used later to map # databases to platform instances. if self.config.database_hostname_to_platform_instance_map: - self._populate_database_server_hostname_map() + with PerfTimer() as timer: + self._populate_database_server_hostname_map() + self.report.populate_database_server_hostname_map_timer[self.site_id] = round(timer.elapsed_seconds(), 2) - self._populate_projects_registry() + with PerfTimer() as timer: + self._populate_projects_registry() + self.report.populate_projects_registry_timer[self.site_id] = round(timer.elapsed_seconds(), 2) if self.config.add_site_container: yield from self.emit_site_container() yield from self.emit_project_containers() - yield from self.emit_workbooks() + + with PerfTimer() as timer: + yield from self.emit_workbooks() + self.report.emit_workbooks_timer[self.site_id] = round(timer.elapsed_seconds(), 2) + if self.sheet_ids: - yield from self.emit_sheets() + with PerfTimer() as timer: + yield from self.emit_sheets() + self.report.emit_sheets_timer[self.site_id] = round(timer.elapsed_seconds(), 2) + if self.dashboard_ids: - yield from self.emit_dashboards() + with PerfTimer() as timer: + yield from self.emit_dashboards() + self.report.emit_dashboards_timer[self.site_id] = round(timer.elapsed_seconds(), 2) + if self.embedded_datasource_ids_being_used: - yield from self.emit_embedded_datasources() + with PerfTimer() as timer: + yield from self.emit_embedded_datasources() + self.report.emit_embedded_datasources_timer[self.site_id] = round(timer.elapsed_seconds(), 2) + if self.datasource_ids_being_used: - yield from self.emit_published_datasources() + with PerfTimer() as timer: + yield from self.emit_published_datasources() + self.report.emit_published_datasources_timer[self.site_id] = round(timer.elapsed_seconds(), 2) + if self.custom_sql_ids_being_used: - yield from self.emit_custom_sql_datasources() + with PerfTimer() as timer: + yield from self.emit_custom_sql_datasources() + self.report.emit_custom_sql_datasources_timer[self.site_id] = round(timer.elapsed_seconds(), 2) + if self.database_tables: - yield from self.emit_upstream_tables() + with PerfTimer() as timer: + yield from self.emit_upstream_tables() + self.report.emit_upstream_tables_timer[self.site_id] = round(timer.elapsed_seconds(), 2) From 1afb4e7bebba43845cd48e9edc1fd2836246f39b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Fri, 27 Dec 2024 13:58:04 +0100 Subject: [PATCH 2/9] fixup --- .../src/datahub/ingestion/source/tableau/tableau.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index cc1a07020e8d18..62ee721db9316a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -639,6 +639,7 @@ class SiteIdContentUrl: site_content_url: str +@dataclass class TableauSourceReport( StaleEntityRemovalSourceReport, IngestionStageReport, @@ -670,7 +671,7 @@ class TableauSourceReport( num_upstream_table_lineage_failed_parse_sql: int = 0 num_upstream_fine_grained_lineage_failed_parse_sql: int = 0 num_hidden_assets_skipped: int = 0 - 
logged_in_user: List[UserInfo] = [] + logged_in_user: List[UserInfo] = field(default_factory=list) def report_user_role(report: TableauSourceReport, server: Server) -> None: @@ -834,6 +835,9 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: platform=self.platform, ) yield from site_source.ingest_tableau_site() + + self.report.report_ingestion_stage_start("End") + except MetadataQueryException as md_exception: self.report.failure( title="Failed to Retrieve Tableau Metadata", @@ -3535,4 +3539,4 @@ def ingest_tableau_site(self): if self.database_tables: with PerfTimer() as timer: yield from self.emit_upstream_tables() - self.report.emit_upstream_tables_timer[self.site_id] = round(timer.elapsed_seconds(), 2) + self.report.emit_upstream_tables_timer[self.site_id] = round(timer.elapsed_seconds(), 2) \ No newline at end of file From fbc6d8e7d31fb22ddae705758d175a717d2556a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Fri, 27 Dec 2024 15:16:17 +0100 Subject: [PATCH 3/9] fix lint --- .../ingestion/source/tableau/tableau.py | 90 +++++++++++++------ 1 file changed, 64 insertions(+), 26 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index 62ee721db9316a..72c104147a00c8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -3,7 +3,7 @@ import re import time from collections import OrderedDict -from dataclasses import dataclass, field +from dataclasses import dataclass, field as dataclass_field from datetime import datetime from functools import lru_cache from typing import ( @@ -650,17 +650,31 @@ class TableauSourceReport( num_csql_field_skipped_no_name: int = 0 num_table_field_skipped_no_name: int = 0 # timers - extract_usage_stats_timer: Dict[str, float] = field(default_factory=TopKDict) - fetch_groups_timer: Dict[str, float] = field(default_factory=TopKDict) - populate_database_server_hostname_map_timer: Dict[str, float] = field(default_factory=TopKDict) - populate_projects_registry_timer: Dict[str, float] = field(default_factory=TopKDict) - emit_workbooks_timer: Dict[str, float] = field(default_factory=TopKDict) - emit_sheets_timer: Dict[str, float] = field(default_factory=TopKDict) - emit_dashboards_timer: Dict[str, float] = field(default_factory=TopKDict) - emit_embedded_datasources_timer: Dict[str, float] = field(default_factory=TopKDict) - emit_published_datasources_timer: Dict[str, float] = field(default_factory=TopKDict) - emit_custom_sql_datasources_timer: Dict[str, float] = field(default_factory=TopKDict) - emit_upstream_tables_timer: Dict[str, float] = field(default_factory=TopKDict) + extract_usage_stats_timer: Dict[str, float] = dataclass_field( + default_factory=TopKDict + ) + fetch_groups_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict) + populate_database_server_hostname_map_timer: Dict[str, float] = dataclass_field( + default_factory=TopKDict + ) + populate_projects_registry_timer: Dict[str, float] = dataclass_field( + default_factory=TopKDict + ) + emit_workbooks_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict) + emit_sheets_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict) + emit_dashboards_timer: Dict[str, float] = dataclass_field(default_factory=TopKDict) + emit_embedded_datasources_timer: Dict[str, float] = dataclass_field( + default_factory=TopKDict + ) + 
emit_published_datasources_timer: Dict[str, float] = dataclass_field( + default_factory=TopKDict + ) + emit_custom_sql_datasources_timer: Dict[str, float] = dataclass_field( + default_factory=TopKDict + ) + emit_upstream_tables_timer: Dict[str, float] = dataclass_field( + default_factory=TopKDict + ) # lineage num_tables_with_upstream_lineage: int = 0 num_upstream_table_lineage: int = 0 @@ -671,7 +685,7 @@ class TableauSourceReport( num_upstream_table_lineage_failed_parse_sql: int = 0 num_upstream_fine_grained_lineage_failed_parse_sql: int = 0 num_hidden_assets_skipped: int = 0 - logged_in_user: List[UserInfo] = field(default_factory=list) + logged_in_user: List[UserInfo] = dataclass_field(default_factory=list) def report_user_role(report: TableauSourceReport, server: Server) -> None: @@ -837,7 +851,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: yield from site_source.ingest_tableau_site() self.report.report_ingestion_stage_start("End") - + except MetadataQueryException as md_exception: self.report.failure( title="Failed to Retrieve Tableau Metadata", @@ -3479,29 +3493,39 @@ def _create_workbook_properties( return {"permissions": json.dumps(groups)} if len(groups) > 0 else None def ingest_tableau_site(self): - self.report.report_ingestion_stage_start(f"Ingesting Tableau Site: {self.site_id} {self.site_content_url}") + self.report.report_ingestion_stage_start( + f"Ingesting Tableau Site: {self.site_id} {self.site_content_url}" + ) # Initialise the dictionary to later look-up for chart and dashboard stat if self.config.extract_usage_stats: with PerfTimer() as timer: self._populate_usage_stat_registry() - self.report.extract_usage_stats_timer[self.site_id] = round(timer.elapsed_seconds(), 2) + self.report.extract_usage_stats_timer[self.site_id] = round( + timer.elapsed_seconds(), 2 + ) if self.config.permission_ingestion: with PerfTimer() as timer: self._fetch_groups() - self.report.fetch_groups_timer[self.site_id] = round(timer.elapsed_seconds(), 2) + self.report.fetch_groups_timer[self.site_id] = round( + timer.elapsed_seconds(), 2 + ) # Populate the map of database names and database hostnames to be used later to map # databases to platform instances. 
if self.config.database_hostname_to_platform_instance_map: with PerfTimer() as timer: self._populate_database_server_hostname_map() - self.report.populate_database_server_hostname_map_timer[self.site_id] = round(timer.elapsed_seconds(), 2) + self.report.populate_database_server_hostname_map_timer[ + self.site_id + ] = round(timer.elapsed_seconds(), 2) with PerfTimer() as timer: self._populate_projects_registry() - self.report.populate_projects_registry_timer[self.site_id] = round(timer.elapsed_seconds(), 2) + self.report.populate_projects_registry_timer[self.site_id] = round( + timer.elapsed_seconds(), 2 + ) if self.config.add_site_container: yield from self.emit_site_container() @@ -3509,34 +3533,48 @@ def ingest_tableau_site(self): with PerfTimer() as timer: yield from self.emit_workbooks() - self.report.emit_workbooks_timer[self.site_id] = round(timer.elapsed_seconds(), 2) + self.report.emit_workbooks_timer[self.site_id] = round( + timer.elapsed_seconds(), 2 + ) if self.sheet_ids: with PerfTimer() as timer: yield from self.emit_sheets() - self.report.emit_sheets_timer[self.site_id] = round(timer.elapsed_seconds(), 2) + self.report.emit_sheets_timer[self.site_id] = round( + timer.elapsed_seconds(), 2 + ) if self.dashboard_ids: with PerfTimer() as timer: yield from self.emit_dashboards() - self.report.emit_dashboards_timer[self.site_id] = round(timer.elapsed_seconds(), 2) + self.report.emit_dashboards_timer[self.site_id] = round( + timer.elapsed_seconds(), 2 + ) if self.embedded_datasource_ids_being_used: with PerfTimer() as timer: yield from self.emit_embedded_datasources() - self.report.emit_embedded_datasources_timer[self.site_id] = round(timer.elapsed_seconds(), 2) + self.report.emit_embedded_datasources_timer[self.site_id] = round( + timer.elapsed_seconds(), 2 + ) if self.datasource_ids_being_used: with PerfTimer() as timer: yield from self.emit_published_datasources() - self.report.emit_published_datasources_timer[self.site_id] = round(timer.elapsed_seconds(), 2) + self.report.emit_published_datasources_timer[self.site_id] = round( + timer.elapsed_seconds(), 2 + ) if self.custom_sql_ids_being_used: with PerfTimer() as timer: yield from self.emit_custom_sql_datasources() - self.report.emit_custom_sql_datasources_timer[self.site_id] = round(timer.elapsed_seconds(), 2) + self.report.emit_custom_sql_datasources_timer[self.site_id] = round( + timer.elapsed_seconds(), 2 + ) if self.database_tables: with PerfTimer() as timer: yield from self.emit_upstream_tables() - self.report.emit_upstream_tables_timer[self.site_id] = round(timer.elapsed_seconds(), 2) \ No newline at end of file + self.report.emit_upstream_tables_timer[self.site_id] = round( + timer.elapsed_seconds(), 2 + ) From 332953fa94f4716fc8be5eadf660f7bdab0a47ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Thu, 2 Jan 2025 10:52:56 +0100 Subject: [PATCH 4/9] automatically close latest ongoing stage if any --- .../src/datahub/ingestion/api/source.py | 2 ++ .../datahub/ingestion/source/tableau/tableau.py | 5 +++-- .../ingestion/source_report/ingestion_stage.py | 17 ++++++++++------- .../src/datahub/utilities/perf_timer.py | 9 ++++++--- 4 files changed, 21 insertions(+), 12 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/api/source.py b/metadata-ingestion/src/datahub/ingestion/api/source.py index c3638635b19aac..d3adfe0069b2a6 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/source.py +++ b/metadata-ingestion/src/datahub/ingestion/api/source.py @@ -331,6 +331,8 @@ def 
as_obj(self) -> dict: } def compute_stats(self) -> None: + super().compute_stats() + duration = datetime.datetime.now() - self.start_time workunits_produced = self.events_produced if duration.total_seconds() > 0: diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index 72c104147a00c8..c9ca383a09c57c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -687,6 +687,9 @@ class TableauSourceReport( num_hidden_assets_skipped: int = 0 logged_in_user: List[UserInfo] = dataclass_field(default_factory=list) + def compute_stats(self) -> None: + self.close_stage() + def report_user_role(report: TableauSourceReport, server: Server) -> None: title: str = "Insufficient Permissions" @@ -850,8 +853,6 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: ) yield from site_source.ingest_tableau_site() - self.report.report_ingestion_stage_start("End") - except MetadataQueryException as md_exception: self.report.failure( title="Failed to Retrieve Tableau Metadata", diff --git a/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py b/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py index ce683e64b3f468..81972b62074c4c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py +++ b/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py @@ -25,12 +25,10 @@ class IngestionStageReport: ingestion_stage: Optional[str] = None ingestion_stage_durations: TopKDict[str, float] = field(default_factory=TopKDict) - _timer: Optional[PerfTimer] = field( - default=None, init=False, repr=False, compare=False - ) + _timer: PerfTimer = PerfTimer() - def report_ingestion_stage_start(self, stage: str) -> None: - if self._timer: + def _close_stage(self) -> None: + if self._timer.is_running(): elapsed = round(self._timer.elapsed_seconds(), 2) logger.info( f"Time spent in stage <{self.ingestion_stage}>: {elapsed} seconds", @@ -38,9 +36,14 @@ def report_ingestion_stage_start(self, stage: str) -> None: ) if self.ingestion_stage: self.ingestion_stage_durations[self.ingestion_stage] = elapsed - else: - self._timer = PerfTimer() + + def report_ingestion_stage_start(self, stage: str) -> None: + self._close_stage() self.ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}" logger.info(f"Stage started: {self.ingestion_stage}") self._timer.start() + + def close_stage(self) -> None: + # just close ongoing stage if any + self._close_stage() diff --git a/metadata-ingestion/src/datahub/utilities/perf_timer.py b/metadata-ingestion/src/datahub/utilities/perf_timer.py index 9488683d6d8cac..66cd67062a7a49 100644 --- a/metadata-ingestion/src/datahub/utilities/perf_timer.py +++ b/metadata-ingestion/src/datahub/utilities/perf_timer.py @@ -70,6 +70,11 @@ def elapsed_seconds(self) -> float: return (self.end_time - self.start_time) + self._past_active_time def assert_timer_is_running(self) -> None: + if not self.is_running(): + self._error_state = True + logger.warning("Did you forget to start the timer ?") + + def is_running(self) -> bool: """ Returns true if timer is in running state. Timer is in NOT in running state if @@ -77,9 +82,7 @@ def assert_timer_is_running(self) -> None: 2. it is in paused state. 3. it had been started and finished in the past but not started again. 
""" - if self.start_time is None or self.paused or self.end_time: - self._error_state = True - logger.warning("Did you forget to start the timer ?") + return self.start_time is not None and not self.paused and self.end_time is None def __repr__(self) -> str: return repr(self.as_obj()) From 0b13bfc397a04a9e024bdfa5672654d062acc5fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Thu, 2 Jan 2025 11:08:58 +0100 Subject: [PATCH 5/9] use site_content_url and round digits parameter to perftimer method --- .../ingestion/source/tableau/tableau.py | 64 +++++++++---------- .../src/datahub/utilities/perf_timer.py | 8 ++- 2 files changed, 37 insertions(+), 35 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index c9ca383a09c57c..5c0378eff77b27 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -3502,16 +3502,16 @@ def ingest_tableau_site(self): if self.config.extract_usage_stats: with PerfTimer() as timer: self._populate_usage_stat_registry() - self.report.extract_usage_stats_timer[self.site_id] = round( - timer.elapsed_seconds(), 2 - ) + self.report.extract_usage_stats_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) if self.config.permission_ingestion: with PerfTimer() as timer: self._fetch_groups() - self.report.fetch_groups_timer[self.site_id] = round( - timer.elapsed_seconds(), 2 - ) + self.report.fetch_groups_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) # Populate the map of database names and database hostnames to be used later to map # databases to platform instances. 
@@ -3519,14 +3519,14 @@ def ingest_tableau_site(self): with PerfTimer() as timer: self._populate_database_server_hostname_map() self.report.populate_database_server_hostname_map_timer[ - self.site_id - ] = round(timer.elapsed_seconds(), 2) + self.site_content_url + ] = timer.elapsed_seconds(digits=2) with PerfTimer() as timer: self._populate_projects_registry() - self.report.populate_projects_registry_timer[self.site_id] = round( - timer.elapsed_seconds(), 2 - ) + self.report.populate_projects_registry_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) if self.config.add_site_container: yield from self.emit_site_container() @@ -3534,48 +3534,48 @@ def ingest_tableau_site(self): with PerfTimer() as timer: yield from self.emit_workbooks() - self.report.emit_workbooks_timer[self.site_id] = round( - timer.elapsed_seconds(), 2 - ) + self.report.emit_workbooks_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) if self.sheet_ids: with PerfTimer() as timer: yield from self.emit_sheets() - self.report.emit_sheets_timer[self.site_id] = round( - timer.elapsed_seconds(), 2 - ) + self.report.emit_sheets_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) if self.dashboard_ids: with PerfTimer() as timer: yield from self.emit_dashboards() - self.report.emit_dashboards_timer[self.site_id] = round( - timer.elapsed_seconds(), 2 - ) + self.report.emit_dashboards_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) if self.embedded_datasource_ids_being_used: with PerfTimer() as timer: yield from self.emit_embedded_datasources() - self.report.emit_embedded_datasources_timer[self.site_id] = round( - timer.elapsed_seconds(), 2 - ) + self.report.emit_embedded_datasources_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) if self.datasource_ids_being_used: with PerfTimer() as timer: yield from self.emit_published_datasources() - self.report.emit_published_datasources_timer[self.site_id] = round( - timer.elapsed_seconds(), 2 - ) + self.report.emit_published_datasources_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) if self.custom_sql_ids_being_used: with PerfTimer() as timer: yield from self.emit_custom_sql_datasources() - self.report.emit_custom_sql_datasources_timer[self.site_id] = round( - timer.elapsed_seconds(), 2 - ) + self.report.emit_custom_sql_datasources_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) if self.database_tables: with PerfTimer() as timer: yield from self.emit_upstream_tables() - self.report.emit_upstream_tables_timer[self.site_id] = round( - timer.elapsed_seconds(), 2 - ) + self.report.emit_upstream_tables_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) diff --git a/metadata-ingestion/src/datahub/utilities/perf_timer.py b/metadata-ingestion/src/datahub/utilities/perf_timer.py index 66cd67062a7a49..99bd1ca0058c9f 100644 --- a/metadata-ingestion/src/datahub/utilities/perf_timer.py +++ b/metadata-ingestion/src/datahub/utilities/perf_timer.py @@ -57,7 +57,7 @@ def __exit__( self.finish() return None - def elapsed_seconds(self) -> float: + def elapsed_seconds(self, digits: Optional[int] = 5) -> float: """ Returns the elapsed time in seconds. 
""" @@ -65,9 +65,11 @@ def elapsed_seconds(self) -> float: return self._past_active_time if self.end_time is None: - return (time.perf_counter() - self.start_time) + (self._past_active_time) + elapsed = (time.perf_counter() - self.start_time) + (self._past_active_time) else: - return (self.end_time - self.start_time) + self._past_active_time + elapsed = (self.end_time - self.start_time) + self._past_active_time + + return round(elapsed, digits) if digits else elapsed def assert_timer_is_running(self) -> None: if not self.is_running(): From 5d42781e4c7cbed7c23e0bdc66f63666d115d439 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Thu, 2 Jan 2025 15:15:33 +0100 Subject: [PATCH 6/9] fix default --- metadata-ingestion/src/datahub/utilities/perf_timer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/utilities/perf_timer.py b/metadata-ingestion/src/datahub/utilities/perf_timer.py index 99bd1ca0058c9f..b91b93b0eb3089 100644 --- a/metadata-ingestion/src/datahub/utilities/perf_timer.py +++ b/metadata-ingestion/src/datahub/utilities/perf_timer.py @@ -57,7 +57,7 @@ def __exit__( self.finish() return None - def elapsed_seconds(self, digits: Optional[int] = 5) -> float: + def elapsed_seconds(self, digits: Optional[int] = None) -> float: """ Returns the elapsed time in seconds. """ From 2fe9a1ca5328ebea7ebd38f9c7964a99229a1110 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Fri, 3 Jan 2025 11:02:08 +0100 Subject: [PATCH 7/9] revert updates in IngestionStageReport and implement a new one using ContextManager --- .../ingestion/source/tableau/tableau.py | 154 +++++++++--------- .../source_report/ingestion_stage.py | 46 +++++- .../unit/reporting/test_ingestion_stage.py | 42 +++++ 3 files changed, 154 insertions(+), 88 deletions(-) create mode 100644 metadata-ingestion/tests/unit/reporting/test_ingestion_stage.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index bd52fbdfa1e11d..b1c5d1c79f082c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -117,7 +117,7 @@ ) from datahub.ingestion.source.tableau.tableau_server_wrapper import UserInfo from datahub.ingestion.source.tableau.tableau_validation import check_user_role -from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport +from datahub.ingestion.source_report.ingestion_stage import IngestionStageContextReport from datahub.metadata.com.linkedin.pegasus2avro.common import ( AuditStamp, ChangeAuditStamps, @@ -642,7 +642,7 @@ class SiteIdContentUrl: @dataclass class TableauSourceReport( StaleEntityRemovalSourceReport, - IngestionStageReport, + IngestionStageContextReport, ): get_all_datasources_query_failed: bool = False num_get_datasource_query_failures: int = 0 @@ -687,9 +687,6 @@ class TableauSourceReport( num_hidden_assets_skipped: int = 0 logged_in_user: List[UserInfo] = dataclass_field(default_factory=list) - def compute_stats(self) -> None: - self.close_stage() - def report_user_role(report: TableauSourceReport, server: Server) -> None: title: str = "Insufficient Permissions" @@ -3491,88 +3488,87 @@ def _create_workbook_properties( return {"permissions": json.dumps(groups)} if len(groups) > 0 else None def ingest_tableau_site(self): - self.report.report_ingestion_stage_start( + with self.report.new_stage( 
f"Ingesting Tableau Site: {self.site_id} {self.site_content_url}" - ) - - # Initialise the dictionary to later look-up for chart and dashboard stat - if self.config.extract_usage_stats: - with PerfTimer() as timer: - self._populate_usage_stat_registry() - self.report.extract_usage_stats_timer[ - self.site_content_url - ] = timer.elapsed_seconds(digits=2) - - if self.config.permission_ingestion: - with PerfTimer() as timer: - self._fetch_groups() - self.report.fetch_groups_timer[ - self.site_content_url - ] = timer.elapsed_seconds(digits=2) - - # Populate the map of database names and database hostnames to be used later to map - # databases to platform instances. - if self.config.database_hostname_to_platform_instance_map: - with PerfTimer() as timer: - self._populate_database_server_hostname_map() - self.report.populate_database_server_hostname_map_timer[ - self.site_content_url - ] = timer.elapsed_seconds(digits=2) - - with PerfTimer() as timer: - self._populate_projects_registry() - self.report.populate_projects_registry_timer[ - self.site_content_url - ] = timer.elapsed_seconds(digits=2) - - if self.config.add_site_container: - yield from self.emit_site_container() - yield from self.emit_project_containers() - - with PerfTimer() as timer: - yield from self.emit_workbooks() - self.report.emit_workbooks_timer[ - self.site_content_url - ] = timer.elapsed_seconds(digits=2) - - if self.sheet_ids: - with PerfTimer() as timer: - yield from self.emit_sheets() - self.report.emit_sheets_timer[ - self.site_content_url - ] = timer.elapsed_seconds(digits=2) - - if self.dashboard_ids: - with PerfTimer() as timer: - yield from self.emit_dashboards() - self.report.emit_dashboards_timer[ - self.site_content_url - ] = timer.elapsed_seconds(digits=2) + ): + # Initialise the dictionary to later look-up for chart and dashboard stat + if self.config.extract_usage_stats: + with PerfTimer() as timer: + self._populate_usage_stat_registry() + self.report.extract_usage_stats_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) + + if self.config.permission_ingestion: + with PerfTimer() as timer: + self._fetch_groups() + self.report.fetch_groups_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) + + # Populate the map of database names and database hostnames to be used later to map + # databases to platform instances. 
+ if self.config.database_hostname_to_platform_instance_map: + with PerfTimer() as timer: + self._populate_database_server_hostname_map() + self.report.populate_database_server_hostname_map_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) - if self.embedded_datasource_ids_being_used: with PerfTimer() as timer: - yield from self.emit_embedded_datasources() - self.report.emit_embedded_datasources_timer[ + self._populate_projects_registry() + self.report.populate_projects_registry_timer[ self.site_content_url ] = timer.elapsed_seconds(digits=2) - if self.datasource_ids_being_used: - with PerfTimer() as timer: - yield from self.emit_published_datasources() - self.report.emit_published_datasources_timer[ - self.site_content_url - ] = timer.elapsed_seconds(digits=2) + if self.config.add_site_container: + yield from self.emit_site_container() + yield from self.emit_project_containers() - if self.custom_sql_ids_being_used: with PerfTimer() as timer: - yield from self.emit_custom_sql_datasources() - self.report.emit_custom_sql_datasources_timer[ + yield from self.emit_workbooks() + self.report.emit_workbooks_timer[ self.site_content_url ] = timer.elapsed_seconds(digits=2) - if self.database_tables: - with PerfTimer() as timer: - yield from self.emit_upstream_tables() - self.report.emit_upstream_tables_timer[ - self.site_content_url - ] = timer.elapsed_seconds(digits=2) + if self.sheet_ids: + with PerfTimer() as timer: + yield from self.emit_sheets() + self.report.emit_sheets_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) + + if self.dashboard_ids: + with PerfTimer() as timer: + yield from self.emit_dashboards() + self.report.emit_dashboards_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) + + if self.embedded_datasource_ids_being_used: + with PerfTimer() as timer: + yield from self.emit_embedded_datasources() + self.report.emit_embedded_datasources_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) + + if self.datasource_ids_being_used: + with PerfTimer() as timer: + yield from self.emit_published_datasources() + self.report.emit_published_datasources_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) + + if self.custom_sql_ids_being_used: + with PerfTimer() as timer: + yield from self.emit_custom_sql_datasources() + self.report.emit_custom_sql_datasources_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) + + if self.database_tables: + with PerfTimer() as timer: + yield from self.emit_upstream_tables() + self.report.emit_upstream_tables_timer[ + self.site_content_url + ] = timer.elapsed_seconds(digits=2) diff --git a/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py b/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py index 81972b62074c4c..5d2870d41c686f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py +++ b/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py @@ -1,4 +1,5 @@ import logging +from contextlib import AbstractContextManager from dataclasses import dataclass, field from datetime import datetime, timezone from typing import Optional @@ -25,10 +26,12 @@ class IngestionStageReport: ingestion_stage: Optional[str] = None ingestion_stage_durations: TopKDict[str, float] = field(default_factory=TopKDict) - _timer: PerfTimer = PerfTimer() + _timer: Optional[PerfTimer] = field( + default=None, init=False, repr=False, compare=False + ) - def _close_stage(self) -> None: - if 
self._timer.is_running(): + def report_ingestion_stage_start(self, stage: str) -> None: + if self._timer: elapsed = round(self._timer.elapsed_seconds(), 2) logger.info( f"Time spent in stage <{self.ingestion_stage}>: {elapsed} seconds", @@ -36,14 +39,39 @@ def _close_stage(self) -> None: ) if self.ingestion_stage: self.ingestion_stage_durations[self.ingestion_stage] = elapsed - - def report_ingestion_stage_start(self, stage: str) -> None: - self._close_stage() + else: + self._timer = PerfTimer() self.ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}" logger.info(f"Stage started: {self.ingestion_stage}") self._timer.start() - def close_stage(self) -> None: - # just close ongoing stage if any - self._close_stage() + +@dataclass +class IngestionStageContextReport: + ingestion_stage_durations: TopKDict[str, float] = field(default_factory=TopKDict) + + def new_stage(self, stage: str) -> "IngestionStageContext": + return IngestionStageContext(stage, self) + + +@dataclass +class IngestionStageContext(AbstractContextManager): + def __init__(self, stage: str, report: IngestionStageContextReport): + self._ingestion_stage = f"{stage} at {datetime.now(timezone.utc)}" + self._timer: PerfTimer = PerfTimer() + self._report = report + + def __enter__(self) -> "IngestionStageContext": + logger.info(f"Stage started: {self._ingestion_stage}") + self._timer.start() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + elapsed = self._timer.elapsed_seconds(digits=2) + logger.info( + f"Time spent in stage <{self._ingestion_stage}>: {elapsed} seconds", + stacklevel=2, + ) + self._report.ingestion_stage_durations[self._ingestion_stage] = elapsed + return None diff --git a/metadata-ingestion/tests/unit/reporting/test_ingestion_stage.py b/metadata-ingestion/tests/unit/reporting/test_ingestion_stage.py new file mode 100644 index 00000000000000..7c62214323bdad --- /dev/null +++ b/metadata-ingestion/tests/unit/reporting/test_ingestion_stage.py @@ -0,0 +1,42 @@ +import time + +from datahub.ingestion.source_report.ingestion_stage import IngestionStageContextReport + + +def test_ingestion_stage_context_records_duration(): + report = IngestionStageContextReport() + with report.new_stage(stage="Test Stage"): + pass + assert len(report._ingestion_stage_durations) == 1 + assert "Test Stage" in next(iter(report._ingestion_stage_durations.keys())) + + +def test_ingestion_stage_context_handles_exceptions(): + report = IngestionStageContextReport() + try: + with report.new_stage(stage="Test Stage"): + raise ValueError("Test Exception") + except ValueError: + pass + assert len(report._ingestion_stage_durations) == 1 + assert "Test Stage" in next(iter(report._ingestion_stage_durations)) + + +def test_ingestion_stage_context_report_handles_multiple_stages(): + report = IngestionStageContextReport() + with report.new_stage(stage="Test Stage 1"): + time.sleep(0.1) + with report.new_stage(stage="Test Stage 2"): + time.sleep(0.1) + with report.new_stage(stage="Test Stage 3"): + time.sleep(0.1) + assert len(report._ingestion_stage_durations) == 3 + assert all( + isinstance(duration, float) and duration > 0.0 + for duration in report._ingestion_stage_durations.values() + ) + + sorted_stages = list(sorted(report._ingestion_stage_durations.keys())) + assert "Test Stage 1" in sorted_stages[0] + assert "Test Stage 2" in sorted_stages[1] + assert "Test Stage 3" in sorted_stages[2] From cc73d57450dfd30826a346e805d169a8b1fc7f9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Fri, 3 Jan 2025 
11:15:59 +0100 Subject: [PATCH 8/9] fixup --- .../tests/unit/reporting/test_ingestion_stage.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/metadata-ingestion/tests/unit/reporting/test_ingestion_stage.py b/metadata-ingestion/tests/unit/reporting/test_ingestion_stage.py index 7c62214323bdad..4b791a2c83d85f 100644 --- a/metadata-ingestion/tests/unit/reporting/test_ingestion_stage.py +++ b/metadata-ingestion/tests/unit/reporting/test_ingestion_stage.py @@ -7,8 +7,8 @@ def test_ingestion_stage_context_records_duration(): report = IngestionStageContextReport() with report.new_stage(stage="Test Stage"): pass - assert len(report._ingestion_stage_durations) == 1 - assert "Test Stage" in next(iter(report._ingestion_stage_durations.keys())) + assert len(report.ingestion_stage_durations) == 1 + assert "Test Stage" in next(iter(report.ingestion_stage_durations.keys())) def test_ingestion_stage_context_handles_exceptions(): @@ -18,8 +18,8 @@ def test_ingestion_stage_context_handles_exceptions(): raise ValueError("Test Exception") except ValueError: pass - assert len(report._ingestion_stage_durations) == 1 - assert "Test Stage" in next(iter(report._ingestion_stage_durations)) + assert len(report.ingestion_stage_durations) == 1 + assert "Test Stage" in next(iter(report.ingestion_stage_durations)) def test_ingestion_stage_context_report_handles_multiple_stages(): @@ -30,13 +30,13 @@ def test_ingestion_stage_context_report_handles_multiple_stages(): time.sleep(0.1) with report.new_stage(stage="Test Stage 3"): time.sleep(0.1) - assert len(report._ingestion_stage_durations) == 3 + assert len(report.ingestion_stage_durations) == 3 assert all( isinstance(duration, float) and duration > 0.0 - for duration in report._ingestion_stage_durations.values() + for duration in report.ingestion_stage_durations.values() ) - sorted_stages = list(sorted(report._ingestion_stage_durations.keys())) + sorted_stages = list(sorted(report.ingestion_stage_durations.keys())) assert "Test Stage 1" in sorted_stages[0] assert "Test Stage 2" in sorted_stages[1] assert "Test Stage 3" in sorted_stages[2] From 986fb1dee6eee51109486d6c391c7054cad0d300 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Fri, 3 Jan 2025 11:31:07 +0100 Subject: [PATCH 9/9] timer elapsed time default digits = 4 --- .../source/bigquery_v2/bigquery_schema_gen.py | 6 +++--- .../ingestion/source/bigquery_v2/lineage.py | 4 ++-- .../ingestion/source/bigquery_v2/usage.py | 4 ++-- .../ingestion/source/redshift/redshift.py | 16 +++++++--------- .../datahub/ingestion/source/redshift/usage.py | 2 +- .../source/snowflake/snowflake_usage_v2.py | 6 +++--- .../ingestion/source_report/ingestion_stage.py | 2 +- .../src/datahub/utilities/perf_timer.py | 4 ++-- .../performance/bigquery/test_bigquery_usage.py | 2 +- .../tests/performance/databricks/test_unity.py | 2 +- .../performance/snowflake/test_snowflake.py | 2 +- .../tests/performance/sql/test_sql_formatter.py | 6 ++++-- 12 files changed, 28 insertions(+), 28 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py index 4a3b47f6b543a6..4f8c1b9f7fd273 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py @@ -1209,9 +1209,9 @@ def get_tables_for_dataset( report=self.report, ) - 
self.report.metadata_extraction_sec[f"{project_id}.{dataset.name}"] = round( - timer.elapsed_seconds(), 2 - ) + self.report.metadata_extraction_sec[ + f"{project_id}.{dataset.name}" + ] = timer.elapsed_seconds(digits=2) def get_core_table_details( self, dataset_name: str, project_id: str, temp_table_dataset_prefix: str diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index 321b1b6207fabf..008adc42ca79c8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -381,8 +381,8 @@ def generate_lineage( self.report.lineage_metadata_entries[project_id] = len(lineage) logger.info(f"Built lineage map containing {len(lineage)} entries.") logger.debug(f"lineage metadata is {lineage}") - self.report.lineage_extraction_sec[project_id] = round( - timer.elapsed_seconds(), 2 + self.report.lineage_extraction_sec[project_id] = timer.elapsed_seconds( + digits=2 ) self.report.lineage_mem_size[project_id] = humanfriendly.format_size( memory_footprint.total_size(lineage) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py index 876ffab85ba311..b9ca8deb68d3a9 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py @@ -572,8 +572,8 @@ def _get_usage_events(self, projects: Iterable[str]) -> Iterable[AuditEvent]: ) self.report_status(f"usage-extraction-{project_id}", False) - self.report.usage_extraction_sec[project_id] = round( - timer.elapsed_seconds(), 2 + self.report.usage_extraction_sec[project_id] = timer.elapsed_seconds( + digits=2 ) def _store_usage_event( diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py index 49f7941563c1a7..a8ddda201f9c57 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py @@ -633,8 +633,8 @@ def process_schema( else: logger.info("View processing disabled, skipping") - self.report.metadata_extraction_sec[report_key] = round( - timer.elapsed_seconds(), 2 + self.report.metadata_extraction_sec[report_key] = timer.elapsed_seconds( + digits=2 ) def _process_table( @@ -986,9 +986,7 @@ def extract_usage( yield from usage_extractor.get_usage_workunits(all_tables=all_tables) - self.report.usage_extraction_sec[database] = round( - timer.elapsed_seconds(), 2 - ) + self.report.usage_extraction_sec[database] = timer.elapsed_seconds(digits=2) def extract_lineage( self, @@ -1011,8 +1009,8 @@ def extract_lineage( database=database, connection=connection, all_tables=all_tables ) - self.report.lineage_extraction_sec[f"{database}"] = round( - timer.elapsed_seconds(), 2 + self.report.lineage_extraction_sec[f"{database}"] = timer.elapsed_seconds( + digits=2 ) yield from self.generate_lineage( database, lineage_extractor=lineage_extractor @@ -1042,8 +1040,8 @@ def extract_lineage_v2( yield from lineage_extractor.generate() - self.report.lineage_extraction_sec[f"{database}"] = round( - timer.elapsed_seconds(), 2 + self.report.lineage_extraction_sec[f"{database}"] = timer.elapsed_seconds( + digits=2 ) if self.redundant_lineage_run_skip_handler: diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py index e0bf8b23dd0f7d..7c6affd50321fb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/usage.py @@ -190,7 +190,7 @@ def _get_workunits_internal( ) self.report.operational_metadata_extraction_sec[ self.config.database - ] = round(timer.elapsed_seconds(), 2) + ] = timer.elapsed_seconds(digits=2) # Generate aggregate events self.report.report_ingestion_stage_start(USAGE_EXTRACTION_USAGE_AGGREGATION) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py index 4bdf559f293b51..73565ebf30593e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_usage_v2.py @@ -386,7 +386,7 @@ def _get_snowflake_history(self) -> Iterable[SnowflakeJoinedAccessEvent]: ) self.report_status(USAGE_EXTRACTION_OPERATIONAL_STATS, False) return - self.report.access_history_query_secs = round(timer.elapsed_seconds(), 2) + self.report.access_history_query_secs = timer.elapsed_seconds(digits=2) for row in results: yield from self._process_snowflake_history_row(row) @@ -434,8 +434,8 @@ def _check_usage_date_ranges(self) -> None: self.report.max_access_history_time = db_row["MAX_TIME"].astimezone( tz=timezone.utc ) - self.report.access_history_range_query_secs = round( - timer.elapsed_seconds(), 2 + self.report.access_history_range_query_secs = timer.elapsed_seconds( + digits=2 ) def _get_operation_aspect_work_unit( diff --git a/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py b/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py index 5d2870d41c686f..40959af60ed2b0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py +++ b/metadata-ingestion/src/datahub/ingestion/source_report/ingestion_stage.py @@ -32,7 +32,7 @@ class IngestionStageReport: def report_ingestion_stage_start(self, stage: str) -> None: if self._timer: - elapsed = round(self._timer.elapsed_seconds(), 2) + elapsed = self._timer.elapsed_seconds(digits=2) logger.info( f"Time spent in stage <{self.ingestion_stage}>: {elapsed} seconds", stacklevel=2, diff --git a/metadata-ingestion/src/datahub/utilities/perf_timer.py b/metadata-ingestion/src/datahub/utilities/perf_timer.py index b91b93b0eb3089..fc1b1ed58244c3 100644 --- a/metadata-ingestion/src/datahub/utilities/perf_timer.py +++ b/metadata-ingestion/src/datahub/utilities/perf_timer.py @@ -57,7 +57,7 @@ def __exit__( self.finish() return None - def elapsed_seconds(self, digits: Optional[int] = None) -> float: + def elapsed_seconds(self, digits: int = 4) -> float: """ Returns the elapsed time in seconds. 
""" @@ -69,7 +69,7 @@ def elapsed_seconds(self, digits: Optional[int] = None) -> float: else: elapsed = (self.end_time - self.start_time) + self._past_active_time - return round(elapsed, digits) if digits else elapsed + return round(elapsed, digits) def assert_timer_is_running(self) -> None: if not self.is_running(): diff --git a/metadata-ingestion/tests/performance/bigquery/test_bigquery_usage.py b/metadata-ingestion/tests/performance/bigquery/test_bigquery_usage.py index 9cb80ff02657bb..ee5baacf2441f5 100644 --- a/metadata-ingestion/tests/performance/bigquery/test_bigquery_usage.py +++ b/metadata-ingestion/tests/performance/bigquery/test_bigquery_usage.py @@ -80,7 +80,7 @@ def run_test(): num_workunits, peak_memory_usage = workunit_sink(workunits) report.set_ingestion_stage("All", "Done") print(f"Workunits Generated: {num_workunits}") - print(f"Seconds Elapsed: {timer.elapsed_seconds():.2f} seconds") + print(f"Seconds Elapsed: {timer.elapsed_seconds(digits=2)} seconds") print( f"Peak Memory Used: {humanfriendly.format_size(peak_memory_usage - pre_mem_usage)}" diff --git a/metadata-ingestion/tests/performance/databricks/test_unity.py b/metadata-ingestion/tests/performance/databricks/test_unity.py index ddd19804ba1841..71192dc5b509bc 100644 --- a/metadata-ingestion/tests/performance/databricks/test_unity.py +++ b/metadata-ingestion/tests/performance/databricks/test_unity.py @@ -59,7 +59,7 @@ def run_test(): workunits = source.get_workunits() num_workunits, peak_memory_usage = workunit_sink(workunits) print(f"Workunits Generated: {num_workunits}") - print(f"Seconds Elapsed: {timer.elapsed_seconds():.2f} seconds") + print(f"Seconds Elapsed: {timer.elapsed_seconds(digits=2)} seconds") print( f"Peak Memory Used: {humanfriendly.format_size(peak_memory_usage - pre_mem_usage)}" diff --git a/metadata-ingestion/tests/performance/snowflake/test_snowflake.py b/metadata-ingestion/tests/performance/snowflake/test_snowflake.py index 984d9e42957452..a940cce46a8f74 100644 --- a/metadata-ingestion/tests/performance/snowflake/test_snowflake.py +++ b/metadata-ingestion/tests/performance/snowflake/test_snowflake.py @@ -53,7 +53,7 @@ def run_test(): workunits = source.get_workunits() num_workunits, peak_memory_usage = workunit_sink(workunits) logging.info(f"Workunits Generated: {num_workunits}") - logging.info(f"Seconds Elapsed: {timer.elapsed_seconds():.2f} seconds") + logging.info(f"Seconds Elapsed: {timer.elapsed_seconds(digits=2)} seconds") logging.info(source.get_report().as_string()) logging.info( diff --git a/metadata-ingestion/tests/performance/sql/test_sql_formatter.py b/metadata-ingestion/tests/performance/sql/test_sql_formatter.py index 5f783efc559bc9..f09047c0ec4a4f 100644 --- a/metadata-ingestion/tests/performance/sql/test_sql_formatter.py +++ b/metadata-ingestion/tests/performance/sql/test_sql_formatter.py @@ -12,12 +12,14 @@ def run_test() -> None: for i in range(N): if i % 50 == 0: print( - f"Running iteration {i}, elapsed time: {timer.elapsed_seconds():.2f} seconds" + f"Running iteration {i}, elapsed time: {timer.elapsed_seconds(digits=2)} seconds" ) try_format_query.__wrapped__(large_sql_query, platform="snowflake") - print(f"Total time taken for {N} iterations: {timer.elapsed_seconds():.2f} seconds") + print( + f"Total time taken for {N} iterations: {timer.elapsed_seconds(digits=2)} seconds" + ) if __name__ == "__main__":