From 517cf217f430cc9706fbc8273e405f04308a0424 Mon Sep 17 00:00:00 2001 From: Brian Wylie Date: Thu, 28 Dec 2023 13:53:38 -0700 Subject: [PATCH] fixing docstring formatting for mkdocstrings --- docs/api_classes/feature_set.md | 10 ++++++-- ...{featureset_stats.py => featureset_eda.py} | 4 ++++ src/sageworks/algorithms/sql/column_stats.py | 2 +- src/sageworks/algorithms/sql/outliers.py | 8 +++---- src/sageworks/core/artifacts/athena_source.py | 24 +++++++++---------- .../core/artifacts/data_source_abstract.py | 20 ++++++++-------- src/sageworks/core/artifacts/endpoint_core.py | 2 +- .../core/artifacts/feature_set_core.py | 12 +++++----- 8 files changed, 46 insertions(+), 36 deletions(-) rename examples/{featureset_stats.py => featureset_eda.py} (88%) diff --git a/docs/api_classes/feature_set.md b/docs/api_classes/feature_set.md index 92a624952..c0fd61e64 100644 --- a/docs/api_classes/feature_set.md +++ b/docs/api_classes/feature_set.md @@ -19,9 +19,9 @@ my_features = test_data.to_features() print(my_features.details()) ``` -**Use/Show some of the EDA Statistics** +**FeatureSet EDA Statistics** -```py title="featureset_stats.py" +```py title="featureset_eda.py" from sageworks.api.feature_set import FeatureSet import pandas as pd @@ -36,6 +36,10 @@ print(corr_df) # Get some outliers outliers = my_features.outliers() pprint(outliers.head()) + +# Full set of EDA Stats +eda_stats = my_features.column_stats() +pprint(eda_stats) ``` **Output** @@ -55,6 +59,8 @@ iq_score -0.295513 0.395378 0.076477 -0.435033 0.033364 -0.655210 1 Person 68 73.918663 189.527313 219994.000000 80 100.000000 0 0 0 1 0 iq_score_low 2 Person 49 70.381790 261.237000 175633.703125 49 107.933998 0 0 0 1 0 iq_score_low 3 Person 90 73.488739 193.840698 227760.000000 72 110.821541 1 0 0 0 0 salary_high + + ``` diff --git a/examples/featureset_stats.py b/examples/featureset_eda.py similarity index 88% rename from examples/featureset_stats.py rename to examples/featureset_eda.py index 43ac3e308..1e3405e99 
100644 --- a/examples/featureset_stats.py +++ b/examples/featureset_eda.py @@ -22,3 +22,7 @@ # Get some outliers outliers = my_features.outliers() pprint(outliers.head()) + +# Full set of EDA Stats +eda_stats = my_features.column_stats() +pprint(eda_stats) diff --git a/src/sageworks/algorithms/sql/column_stats.py b/src/sageworks/algorithms/sql/column_stats.py index 4d7fe9aac..621083076 100644 --- a/src/sageworks/algorithms/sql/column_stats.py +++ b/src/sageworks/algorithms/sql/column_stats.py @@ -53,7 +53,7 @@ def column_stats(data_source: DataSourceAbstract, recompute: bool = False) -> di """SQL based Column Statistics: Compute Column Statistics for a DataSource using SQL Args: data_source(DataSource): The DataSource that we're computing column stats on - recompute(bool): Whether or not to recompute the column stats (default: False) + recompute (bool): Whether or not to recompute the column stats (default: False) Returns: dict(dict): A dictionary of stats for each column this format NB: String columns will have value_counts but NOT have num_zeros and descriptive stats diff --git a/src/sageworks/algorithms/sql/outliers.py b/src/sageworks/algorithms/sql/outliers.py index 039413307..29ade9e09 100644 --- a/src/sageworks/algorithms/sql/outliers.py +++ b/src/sageworks/algorithms/sql/outliers.py @@ -24,8 +24,8 @@ def compute_outliers( """Compute outliers for all the numeric columns in a DataSource Args: data_source(DataSource): The DataSource that we're computing outliers on - scale(float): The scale to use for either the IQR or stddev outlier calculation (default: 1.5) - use_stddev(bool): Option to use the standard deviation for the outlier calculation (default: False) + scale (float): The scale to use for either the IQR or stddev outlier calculation (default: 1.5) + use_stddev (bool): Option to use the standard deviation for the outlier calculation (default: False) Returns: pd.DataFrame: A DataFrame of outliers for this DataSource Notes: @@ -67,8 +67,8 @@ def 
_numeric_outliers(self, data_source: DataSourceAbstract, scale: float, use_s """Internal method to compute outliers for all numeric columns Args: data_source(DataSource): The DataSource that we're computing outliers on - scale(float): The scale to use for the IQR outlier calculation - use_stddev(bool): Option to use the standard deviation for the outlier calculation (default: False) + scale (float): The scale to use for the IQR outlier calculation + use_stddev (bool): Option to use the standard deviation for the outlier calculation (default: False) Returns: pd.DataFrame: A DataFrame of all the outliers combined """ diff --git a/src/sageworks/core/artifacts/athena_source.py b/src/sageworks/core/artifacts/athena_source.py index 38ac3709c..02d6c8d55 100644 --- a/src/sageworks/core/artifacts/athena_source.py +++ b/src/sageworks/core/artifacts/athena_source.py @@ -244,7 +244,7 @@ def descriptive_stats(self, recompute: bool = False) -> dict[dict]: """Compute Descriptive Stats for all the numeric columns in a DataSource Args: - recompute(bool): Recompute the descriptive stats (default: False) + recompute (bool): Recompute the descriptive stats (default: False) Returns: dict(dict): A dictionary of descriptive stats for each column in the form @@ -270,9 +270,9 @@ def outliers_impl(self, scale: float = 1.5, use_stddev=False, recompute: bool = """Compute outliers for all the numeric columns in a DataSource Args: - scale(float): The scale to use for the IQR (default: 1.5) - use_stddev(bool): Use Standard Deviation instead of IQR (default: False) - recompute(bool): Recompute the outliers (default: False) + scale (float): The scale to use for the IQR (default: 1.5) + use_stddev (bool): Use Standard Deviation instead of IQR (default: False) + recompute (bool): Recompute the outliers (default: False) Returns: pd.DataFrame: A DataFrame of outliers from this DataSource @@ -311,7 +311,7 @@ def correlations(self, recompute: bool = False) -> dict[dict]: """Compute Correlations for all 
the numeric columns in a DataSource Args: - recompute(bool): Recompute the column stats (default: False) + recompute (bool): Recompute the column stats (default: False) Returns: dict(dict): A dictionary of correlations for each column in this format @@ -337,15 +337,15 @@ def column_stats(self, recompute: bool = False) -> dict[dict]: """Compute Column Stats for all the columns in a DataSource Args: - recompute(bool): Recompute the column stats (default: False) + recompute (bool): Recompute the column stats (default: False) Returns: dict(dict): A dictionary of stats for each column this format NB: String columns will NOT have num_zeros, descriptive_stats or correlation data - {'col1': {'dtype': 'string', 'unique': 4321, 'nulls': 12}, - 'col2': {'dtype': 'int', 'unique': 4321, 'nulls': 12, 'num_zeros': 100, - 'descriptive_stats': {...}, 'correlations': {...}}, - ...} + {'col1': {'dtype': 'string', 'unique': 4321, 'nulls': 12}, + 'col2': {'dtype': 'int', 'unique': 4321, 'nulls': 12, 'num_zeros': 100, + 'descriptive_stats': {...}, 'correlations': {...}}, + ...} """ # First check if we have already computed the column stats @@ -366,7 +366,7 @@ def value_counts(self, recompute: bool = False) -> dict[dict]: """Compute 'value_counts' for all the string columns in a DataSource Args: - recompute(bool): Recompute the value counts (default: False) + recompute (bool): Recompute the value counts (default: False) Returns: dict(dict): A dictionary of value counts for each column in the form @@ -392,7 +392,7 @@ def details(self, recompute: bool = False) -> dict[dict]: """Additional Details about this AthenaSource Artifact Args: - recompute(bool): Recompute the details (default: False) + recompute (bool): Recompute the details (default: False) Returns: dict(dict): A dictionary of details about this AthenaSource diff --git a/src/sageworks/core/artifacts/data_source_abstract.py b/src/sageworks/core/artifacts/data_source_abstract.py index d95343f57..df30867fc 100644 --- 
a/src/sageworks/core/artifacts/data_source_abstract.py +++ b/src/sageworks/core/artifacts/data_source_abstract.py @@ -58,7 +58,7 @@ def column_types(self) -> list[str]: def column_details(self, view: str = "all") -> dict: """Return the column details for this Data Source Args: - view(str): The view to get column details for (default: "all") + view (str): The view to get column details for (default: "all") Returns: dict: The column details for this Data Source """ @@ -106,7 +106,7 @@ def get_display_columns(self) -> list[str]: def set_display_columns(self, display_columns: list[str]): """Set the display columns for this Data Source Args: - display_columns(list[str]): The display columns for this Data Source + display_columns (list[str]): The display columns for this Data Source """ self._display_columns = display_columns self.upsert_sageworks_meta({"sageworks_display_columns": self._display_columns}) @@ -144,7 +144,7 @@ def execute_statement(self, query: str): def sample(self, recompute: bool = False) -> pd.DataFrame: """Return a sample DataFrame from this DataSource Args: - recompute(bool): Recompute the sample (default: False) + recompute (bool): Recompute the sample (default: False) Returns: pd.DataFrame: A sample DataFrame from this DataSource """ @@ -172,7 +172,7 @@ def sample_impl(self) -> pd.DataFrame: def descriptive_stats(self, recompute: bool = False) -> dict[dict]: """Compute Descriptive Stats for all the numeric columns in a DataSource Args: - recompute(bool): Recompute the descriptive stats (default: False) + recompute (bool): Recompute the descriptive stats (default: False) Returns: dict(dict): A dictionary of descriptive stats for each column in the form {'col1': {'min': 0, 'q1': 1, 'median': 2, 'q3': 3, 'max': 4}, @@ -183,8 +183,8 @@ def descriptive_stats(self, recompute: bool = False) -> dict[dict]: def outliers(self, scale: float = 1.5, recompute: bool = False) -> pd.DataFrame: """Return a DataFrame of outliers from this DataSource Args: - 
scale(float): The scale to use for the IQR (default: 1.5) - recompute(bool): Recompute the outliers (default: False) + scale (float): The scale to use for the IQR (default: 1.5) + recompute (bool): Recompute the outliers (default: False) Returns: pd.DataFrame: A DataFrame of outliers from this DataSource Notes: @@ -207,8 +207,8 @@ def outliers(self, scale: float = 1.5, recompute: bool = False) -> pd.DataFrame: def outliers_impl(self, scale: float = 1.5, recompute: bool = False) -> pd.DataFrame: """Return a DataFrame of outliers from this DataSource Args: - scale(float): The scale to use for the IQR (default: 1.5) - recompute(bool): Recompute the outliers (default: False) + scale (float): The scale to use for the IQR (default: 1.5) + recompute (bool): Recompute the outliers (default: False) Returns: pd.DataFrame: A DataFrame of outliers from this DataSource Notes: @@ -229,7 +229,7 @@ def smart_sample(self) -> pd.DataFrame: def value_counts(self, recompute: bool = False) -> dict[dict]: """Compute 'value_counts' for all the string columns in a DataSource Args: - recompute(bool): Recompute the value counts (default: False) + recompute (bool): Recompute the value counts (default: False) Returns: dict(dict): A dictionary of value counts for each column in the form {'col1': {'value_1': X, 'value_2': Y, 'value_3': Z,...}, @@ -241,7 +241,7 @@ def value_counts(self, recompute: bool = False) -> dict[dict]: def column_stats(self, recompute: bool = False) -> dict[dict]: """Compute Column Stats for all the columns in a DataSource Args: - recompute(bool): Recompute the column stats (default: False) + recompute (bool): Recompute the column stats (default: False) Returns: dict(dict): A dictionary of stats for each column this format NB: String columns will NOT have num_zeros and descriptive stats diff --git a/src/sageworks/core/artifacts/endpoint_core.py b/src/sageworks/core/artifacts/endpoint_core.py index 463b97483..6d355f266 100644 --- 
a/src/sageworks/core/artifacts/endpoint_core.py +++ b/src/sageworks/core/artifacts/endpoint_core.py @@ -280,7 +280,7 @@ def endpoint_metrics(self) -> pd.DataFrame: def details(self, recompute: bool = False) -> dict: """Additional Details about this Endpoint Args: - recompute(bool): Recompute the details (default: False) + recompute (bool): Recompute the details (default: False) Returns: dict(dict): A dictionary of details about this Endpoint """ diff --git a/src/sageworks/core/artifacts/feature_set_core.py b/src/sageworks/core/artifacts/feature_set_core.py index 8c1542485..0927ee295 100644 --- a/src/sageworks/core/artifacts/feature_set_core.py +++ b/src/sageworks/core/artifacts/feature_set_core.py @@ -124,7 +124,7 @@ def column_details(self, view: str = "all") -> dict: """Return the column details of the Feature Set Args: - view(str): The view to get column details for (default: "all") + view (str): The view to get column details for (default: "all") Returns: dict: The column details of the Feature Set @@ -167,7 +167,7 @@ def set_display_columns(self, display_columns: list[str]): """Set the display columns for this FeatureSet Args: - display_columns(list[str]): The display columns for this FeatureSet + display_columns (list[str]): The display columns for this FeatureSet Notes: This just sets the display columns for the underlying DataSource @@ -278,7 +278,7 @@ def details(self, recompute: bool = False) -> dict[dict]: """Additional Details about this FeatureSet Artifact Args: - recompute(bool): Recompute the details (default: False) + recompute (bool): Recompute the details (default: False) Returns: dict(dict): A dictionary of details about this FeatureSet @@ -465,8 +465,8 @@ def sample(self, recompute: bool = False) -> pd.DataFrame: def outliers(self, scale: float = 1.5, recompute: bool = False) -> pd.DataFrame: """Compute outliers for all the numeric columns in a DataSource Args: - scale(float): The scale to use for the IQR (default: 1.5) - recompute(bool): 
Recompute the outliers (default: False) + scale (float): The scale to use for the IQR (default: 1.5) + recompute (bool): Recompute the outliers (default: False) Returns: pd.DataFrame: A DataFrame of outliers from this DataSource Notes: @@ -517,7 +517,7 @@ def correlations(self, recompute: bool = False) -> dict: def column_stats(self, recompute: bool = False) -> dict[dict]: """Compute Column Stats for all the columns in the FeatureSets underlying DataSource Args: - recompute(bool): Recompute the column stats (default: False) + recompute (bool): Recompute the column stats (default: False) Returns: dict(dict): A dictionary of stats for each column this format NB: String columns will NOT have num_zeros and descriptive_stats