Visualize Dataset statistics in metadata panel #1472
Changes from 34 commits
New file (`@@ -0,0 +1,17 @@`) — an example of the `stats.json` output the new hook writes:

```json
{
  "companies": { "rows": 77096, "columns": 5 },
  "reviews": { "rows": 77096, "columns": 10 },
  "shuttles": { "rows": 77096, "columns": 13 },
  "ingestion.int_typed_companies": { "rows": 77096, "columns": 5 },
  "ingestion.int_typed_shuttles": { "rows": 77096, "columns": 13 },
  "ingestion.prm_agg_companies": { "rows": 50098, "columns": 5 },
  "ingestion.int_typed_reviews": { "rows": 55790, "columns": 11 },
  "prm_spine_table": { "rows": 29768, "columns": 3 },
  "prm_shuttle_company_reviews": { "rows": 29768, "columns": 27 },
  "feature_engineering.feat_static_features": { "rows": 29768, "columns": 12 },
  "feature_engineering.feat_derived_features": { "rows": 29768, "columns": 3 },
  "feature_importance_output": { "rows": 15, "columns": 2 },
  "model_input_table": { "rows": 29768, "columns": 12 },
  "X_train": { "rows": 23814, "columns": 11 },
  "X_test": { "rows": 5954, "columns": 11 }
}
```
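For context (not part of the diff), a minimal sketch of reading this file back; it assumes `stats.json` sits in the current working directory, where the hook writes it:

```python
import json

# Load the statistics the hook wrote and print one line per dataset.
with open("stats.json", encoding="utf8") as f:
    stats = json.load(f)

for dataset_name, dataset_stats in stats.items():
    print(
        f"{dataset_name}: "
        f"{dataset_stats.get('rows')} rows x {dataset_stats.get('columns')} columns"
    )
```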
New file (`@@ -0,0 +1,68 @@`) — the hooks module, `kedro_viz/integrations/kedro/hooks.py`:

```python
# pylint: disable=broad-exception-caught
"""`kedro_viz.integrations.kedro.hooks` defines hooks to add additional
functionalities for a kedro run."""

import json
import logging
from collections import defaultdict
from typing import Any

import pandas as pd
from kedro.framework.hooks import hook_impl

from kedro_viz.integrations.kedro.utils import get_stats_dataset_name, stats_order

logger = logging.getLogger(__name__)


class DatasetStatsHook:
    """Class to collect dataset statistics during a kedro run
    and save them to a JSON file. The class currently supports
    (pd.DataFrame) dataset instances."""

    def __init__(self):
        self._stats = defaultdict(dict)

    @hook_impl
    def after_dataset_loaded(self, dataset_name: str, data: Any):
        """Hook to be invoked after a dataset is loaded from the catalog.
        Once the dataset is loaded, extract the required dataset statistics.
        The hook currently supports (pd.DataFrame) dataset instances.

        Args:
            dataset_name: name of the dataset that was loaded from the catalog.
            data: the actual data that was loaded from the catalog.
        """
        try:
            stats_dataset_name = get_stats_dataset_name(dataset_name)
            if isinstance(data, pd.DataFrame):
                self._stats[stats_dataset_name]["rows"] = int(data.shape[0])
                self._stats[stats_dataset_name]["columns"] = int(data.shape[1])
        except Exception as exc:  # pragma: no cover
            logger.warning(
                "Unable to create statistics for the dataset %s : %s", dataset_name, exc
            )

    @hook_impl
    def after_pipeline_run(self):
        """Hook to be invoked after a pipeline run.
        Once the pipeline run completes, write the dataset
        statistics to a stats.json file.
        """
        try:
            with open("stats.json", "w", encoding="utf8") as file:
                sorted_stats_data = {
                    dataset_name: stats_order(stats)
                    for dataset_name, stats in self._stats.items()
                }
                json.dump(sorted_stats_data, file)
        except Exception as exc:  # pragma: no cover
            logger.warning(
                "Unable to write dataset statistics for the pipeline: %s", exc
            )


dataset_stats_hook = DatasetStatsHook()
```

Review thread on the `stats_order(stats)` line:

Reviewer: Is it necessary to order the JSON file?

Author: It will help when reading the JSON in the frontend. Also, it will be helpful if we have structured data for readability.

Reviewer: I found the name …

Reviewer: And can it be just a helper method that stays within the class of the …

Reviewer: Same applies to the other functions. Can we not have the …
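The diff doesn't show how the hook gets registered, but the module-level `dataset_stats_hook` singleton suggests the usual Kedro wiring. A hedged sketch, assuming a user project enables it through the `HOOKS` tuple in `settings.py` (Kedro-Viz itself may register it differently):

```python
# settings.py -- hypothetical project wiring, not taken from this PR.
from kedro_viz.integrations.kedro.hooks import dataset_stats_hook

# Kedro invokes every hook instance listed in HOOKS during a run.
HOOKS = (dataset_stats_hook,)
```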
New file (`@@ -0,0 +1,37 @@`) — the utilities module, `kedro_viz/integrations/kedro/utils.py`:

```python
"""`kedro_viz.integrations.kedro.utils` contains utility
functions used in the `kedro_viz.integrations.kedro` package."""

from kedro.pipeline.pipeline import TRANSCODING_SEPARATOR, _strip_transcoding


def stats_order(stats: dict) -> dict:
    """Sort the stats extracted from the datasets using the sort order.

    Args:
        stats: A dictionary of statistics for a dataset.

    Returns: A sorted dictionary based on the sort_order.
    """
    # Custom sort order
    sort_order = ["rows", "columns", "file_size"]
    return {stat: stats.get(stat) for stat in sort_order if stat in stats}


def get_stats_dataset_name(dataset_name: str) -> str:
    """Get the dataset name for assigning stat values in the dictionary.
    If the dataset name contains transcoding information, strip the transcoding.

    Args:
        dataset_name: name of the dataset.

    Returns: Dataset name without any transcoding information.
    """
    stats_dataset_name = dataset_name

    # Strip transcoding
    is_transcoded_dataset = TRANSCODING_SEPARATOR in dataset_name
    if is_transcoded_dataset:
        stats_dataset_name = _strip_transcoding(dataset_name)

    return stats_dataset_name
```
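A quick illustration of both helpers (assuming Kedro's `TRANSCODING_SEPARATOR` is `@`, as in a catalog entry like `companies@pandas`):

```python
from kedro_viz.integrations.kedro.utils import get_stats_dataset_name, stats_order

# Transcoded variants of a dataset collapse to a single stats key.
assert get_stats_dataset_name("companies@pandas") == "companies"
assert get_stats_dataset_name("companies") == "companies"

# Keys come back in the fixed rows/columns/file_size order;
# anything outside the sort order is dropped.
print(stats_order({"columns": 5, "rows": 77096, "extra": 1}))
# {'rows': 77096, 'columns': 5}
```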
Review thread:

Reviewer: Should it be `None` or `{}`?

Author: It should be an empty dict. This method is called in `router.py`, where I was checking for `None` and initializing the stats to an empty dict. Instead, this way we always get a dict for a data node.
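The pattern the author describes, as a minimal hedged sketch — the function name `get_stats_for_data_node` and the in-memory `stats` mapping are assumptions for illustration; the actual `router.py` code is not part of this diff:

```python
from typing import Dict, Union

# Assumed in-memory mapping, e.g. loaded once from stats.json.
stats: Dict[str, Dict[str, Union[int, float]]] = {}


def get_stats_for_data_node(data_node_name: str) -> Dict[str, Union[int, float]]:
    # Returning {} instead of None means callers never need a None check:
    # every data node yields a dict, whether or not statistics were collected.
    return stats.get(data_node_name, {})
```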