Skip to content

Commit

Permalink
Visualize Dataset statistics in metadata panel (#1472)
Browse files Browse the repository at this point in the history
* initial draft with hooks

* modify test for formatFileSize

* add file size to the api response using fsspec

* update unit test for metadata panel

* remove print statements and update stats file

* update get file size to not consider empty dir

* fixing linting and format errors

* fix format and lint errors

* fix pytest errors

* add test cases and add fix for circle ci builds

* resolve PR comments

* fixing PR comments and add additional support for MemoryDataset

* update stats and modify file_size extraction

* fix lint and format errors

* fix lint errors

* fix lint errors

* fix lint errors

* fix lint errors

* fix lint errors

* fix for PR comments

* add test coverage for transcoded data node

* address PR comments

* fix lint errors

* modify test cases for hooks and utils

* add matplotlib in requirements file for e2e tests

* add design change for overflow

* add design change for overflow

* remove matplotlib from requirements and fix metadata suggestions

* add release notes for visualizing dataset stats

* add release notes for displaying dataset stats

* hooks update based on Nok's comments

* fix lint and format checks

* modify stats based on Nok's comments

* fix lint and format

* fixed failing unit test

* update code based on Nok's suggestion

---------

Co-authored-by: Tynan DeBold <[email protected]>
  • Loading branch information
ravi-kumar-pilla and tynandebold authored Aug 14, 2023
1 parent 6e1d127 commit 3c50980
Show file tree
Hide file tree
Showing 27 changed files with 766 additions and 37 deletions.
5 changes: 5 additions & 0 deletions RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@ Please follow the established format:
- Use present tense (e.g. 'Add new feature')
- Include the ID number for the related PR (or PRs) in parentheses
-->
## Major features and improvements

- Add support for displaying dataset statistics in the metadata panel. (#1472)

# Release 6.3.5

## Bug fixes and other changes

Expand Down
24 changes: 24 additions & 0 deletions cypress/tests/ui/flowchart/flowchart.cy.js
Original file line number Diff line number Diff line change
Expand Up @@ -192,4 +192,28 @@ describe('Flowchart DAG', () => {
.should('exist')
.and('have.text', `Oops, there's nothing to see here`);
});

it('verifies that users can open and see the dataset statistics in the metadata panel for datasets. #TC-51', () => {
  const dataNodeText = 'Companies';

  // Assert before action: no statistics section is rendered before a node is clicked.
  // The attribute value must be closed with a double quote, otherwise the
  // selector is malformed and does not match the one asserted after the click.
  cy.get('[data-label="Dataset statistics:"]').should('not.exist');

  // Action: open the metadata panel by clicking the dataset node.
  cy.get('.pipeline-node > .pipeline-node__text')
    .contains(dataNodeText)
    .click({ force: true });

  // Assert after action: the section exists and shows the formatted values.
  // NOTE(review): the expected values correspond to the `companies` entry of
  // demo-project/stats.json (77096 rows, 5 columns, 1810602 bytes) — confirm
  // that the demo project is the fixture served to this test.
  cy.get('[data-label="Dataset statistics:"]').should('exist');
  cy.get('[data-test=stats-value-rows]')
    .invoke('text')
    .should((rowsValue) => expect(rowsValue).to.be.eq('77,096'));
  cy.get('[data-test=stats-value-columns]')
    .invoke('text')
    .should((colsValue) => expect(parseInt(colsValue, 10)).to.be.eq(5));
  cy.get('[data-test=stats-value-file_size]')
    .invoke('text')
    .should((fileSizeValue) => expect(fileSizeValue).to.be.eq('1.8MB'));
});
});
39 changes: 39 additions & 0 deletions demo-project/stats.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
{
"companies": { "rows": 77096, "columns": 5, "file_size": 1810602 },
"ingestion.int_typed_companies": {
"rows": 77096,
"columns": 5,
"file_size": 550616
},
"reviews": { "rows": 77096, "columns": 10, "file_size": 2937144 },
"ingestion.int_typed_reviews": {
"rows": 55790,
"columns": 11,
"file_size": 1335600
},
"shuttles": { "rows": 77096, "columns": 13, "file_size": 4195290 },
"ingestion.int_typed_shuttles": {
"rows": 77096,
"columns": 13,
"file_size": 1235685
},
"ingestion.prm_agg_companies": { "rows": 50098, "columns": 5 },
"prm_shuttle_company_reviews": {
"rows": 29768,
"columns": 27,
"file_size": 1020356
},
"prm_spine_table": { "rows": 29768, "columns": 3, "file_size": 655994 },
"feature_engineering.feat_derived_features": { "rows": 29768, "columns": 3 },
"feature_importance_output": { "rows": 15, "columns": 2, "file_size": 460 },
"feature_engineering.feat_static_features": { "rows": 29768, "columns": 12 },
"ingestion.prm_spine_table_clone": { "rows": 29768, "columns": 3 },
"reporting.cancellation_policy_breakdown": {
"rows": 21,
"columns": 3,
"file_size": 8744
},
"model_input_table": { "rows": 29768, "columns": 12, "file_size": 787351 },
"X_train": { "rows": 23814, "columns": 11 },
"X_test": { "rows": 5954, "columns": 11 }
}
2 changes: 2 additions & 0 deletions package/kedro_viz/api/rest/responses.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ class DataNodeMetadataAPIResponse(BaseAPIResponse):
tracking_data: Optional[Dict]
run_command: Optional[str]
preview: Optional[Dict]
stats: Optional[Dict]

class Config:
schema_extra = {
Expand All @@ -130,6 +131,7 @@ class TranscodedDataNodeMetadataAPIReponse(BaseAPIResponse):
original_type: str
transcoded_types: List[str]
run_command: Optional[str]
stats: Optional[Dict]


class ParametersNodeMetadataAPIResponse(BaseAPIResponse):
Expand Down
6 changes: 4 additions & 2 deletions package/kedro_viz/api/rest/router.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,12 @@ async def get_single_node_metadata(node_id: str):
return TaskNodeMetadata(node)

if isinstance(node, DataNode):
return DataNodeMetadata(node)
dataset_stats = data_access_manager.get_stats_for_data_node(node)
return DataNodeMetadata(node, dataset_stats)

if isinstance(node, TranscodedDataNode):
return TranscodedDataNodeMetadata(node)
dataset_stats = data_access_manager.get_stats_for_data_node(node)
return TranscodedDataNodeMetadata(node, dataset_stats)

return ParametersNodeMetadata(node)

Expand Down
23 changes: 23 additions & 0 deletions package/kedro_viz/data_access/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ def __init__(self):
)
self.runs = RunsRepository()
self.tracking_datasets = TrackingDatasetsRepository()
self.dataset_stats = {}

def set_db_session(self, db_session_class: sessionmaker):
"""Set db session on repositories that need it."""
Expand Down Expand Up @@ -91,6 +92,28 @@ def add_pipelines(self, pipelines: Dict[str, KedroPipeline]):
# Add the registered pipeline and its components to their repositories
self.add_pipeline(registered_pipeline_id, pipeline)

def add_dataset_stats(self, stats_dict: Dict):
    """Register the dataset statistics (eg. rows, columns, file_size) on the manager.

    The dictionary is kept as-is on ``self.dataset_stats`` so that the relevant
    stats can later be shown in the metadata panel.

    Args:
        stats_dict: A dictionary object loaded from stats.json file in the
            kedro project
    """
    # Replace (not merge) whatever statistics were stored before.
    self.dataset_stats = stats_dict

def get_stats_for_data_node(
    self, data_node: Union[DataNode, TranscodedDataNode]
) -> Dict:
    """Look up the dataset statistics recorded for a data node.

    Args:
        data_node: The data node for which we need the statistics. Its
            ``name`` attribute is used as the lookup key.

    Returns:
        The statistics stored under the node's name, or an empty dictionary
        when nothing is stored for that name.
    """
    dataset_name = data_node.name
    return self.dataset_stats.get(dataset_name, {})

def add_pipeline(self, registered_pipeline_id: str, pipeline: KedroPipeline):
"""Iterate through all the nodes and datasets in a "registered" pipeline
and add them to relevant repositories. Take care of extracting other relevant information
Expand Down
54 changes: 44 additions & 10 deletions package/kedro_viz/integrations/kedro/data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
# pylint: disable=missing-function-docstring, no-else-return

import base64
import json
import logging
from pathlib import Path
from typing import Any, Dict, Optional, Tuple

Expand All @@ -14,23 +16,25 @@

try:
from kedro_datasets import ( # isort:skip
json,
json as json_dataset,
matplotlib,
plotly,
tracking,
)
except ImportError:
from kedro.extras.datasets import ( # Safe since ImportErrors are suppressed within kedro.
json,
json as json_dataset,
matplotlib,
plotly,
tracking,
)

from kedro.io import DataCatalog
from kedro.io.core import get_filepath_str
from kedro.pipeline import Pipeline
from semver import VersionInfo

logger = logging.getLogger(__name__)
KEDRO_VERSION = VersionInfo.parse(__version__)


Expand All @@ -54,11 +58,37 @@ def _bootstrap(project_path: Path):
return


def get_dataset_stats(project_path: Path) -> Dict:
    """Load the dataset statistics saved in the project's ``stats.json``.

    Args:
        project_path: the path where the Kedro project is located.

    Returns:
        The deserialized content of ``<project_path>/stats.json``. An empty
        dictionary is returned when the file does not exist or when anything
        goes wrong while locating, reading or parsing it.
    """
    try:
        stats_json = project_path / "stats.json"

        if stats_json.exists():
            with open(stats_json, encoding="utf8") as handle:
                return json.load(handle)

        # No statistics file in this project.
        return {}

    except Exception as exc:  # pylint: disable=broad-exception-caught
        # Best effort: a failure here is logged as a warning and the caller
        # simply gets no statistics.
        logger.warning(
            "Unable to get dataset statistics from project path %s : %s",
            project_path,
            exc,
        )
        return {}


def load_data(
project_path: Path,
env: Optional[str] = None,
extra_params: Optional[Dict[str, Any]] = None,
) -> Tuple[DataCatalog, Dict[str, Pipeline], BaseSessionStore]:
) -> Tuple[DataCatalog, Dict[str, Pipeline], BaseSessionStore, Dict]:
"""Load data from a Kedro project.
Args:
project_path: the path whether the Kedro project is located.
Expand Down Expand Up @@ -91,7 +121,9 @@ def load_data(
# in case user doesn't have an active session down the line when it's first accessed.
# Useful for users who have `get_current_session` in their `register_pipelines()`.
pipelines_dict = dict(pipelines)
return catalog, pipelines_dict, session_store
stats_dict = get_dataset_stats(project_path)

return catalog, pipelines_dict, session_store, stats_dict
elif KEDRO_VERSION.match(">=0.17.1"):
from kedro.framework.session import KedroSession

Expand All @@ -103,8 +135,9 @@ def load_data(
) as session:
context = session.load_context()
session_store = session._store
stats_dict = get_dataset_stats(project_path)

return context.catalog, context.pipelines, session_store
return context.catalog, context.pipelines, session_store, stats_dict
else:
# Since Viz is only compatible with kedro>=0.17.0, this just matches 0.17.0
from kedro.framework.session import KedroSession
Expand All @@ -120,8 +153,9 @@ def load_data(
) as session:
context = session.load_context()
session_store = session._store
stats_dict = get_dataset_stats(project_path)

return context.catalog, context.pipelines, session_store
return context.catalog, context.pipelines, session_store, stats_dict


# The dataset type is available as an attribute if and only if the import from kedro
Expand All @@ -140,13 +174,13 @@ def matplotlib_writer_load(dataset: matplotlib.MatplotlibWriter) -> str:
matplotlib.MatplotlibWriter._load = matplotlib_writer_load

if hasattr(plotly, "JSONDataSet"):
plotly.JSONDataSet._load = json.JSONDataSet._load
plotly.JSONDataSet._load = json_dataset.JSONDataSet._load

if hasattr(plotly, "PlotlyDataSet"):
plotly.PlotlyDataSet._load = json.JSONDataSet._load
plotly.PlotlyDataSet._load = json_dataset.JSONDataSet._load

if hasattr(tracking, "JSONDataSet"):
tracking.JSONDataSet._load = json.JSONDataSet._load
tracking.JSONDataSet._load = json_dataset.JSONDataSet._load

if hasattr(tracking, "MetricsDataSet"):
tracking.MetricsDataSet._load = json.JSONDataSet._load
tracking.MetricsDataSet._load = json_dataset.JSONDataSet._load
Loading

0 comments on commit 3c50980

Please sign in to comment.