-
Notifications
You must be signed in to change notification settings - Fork 1.7k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add string type for text variables (#1282)
* chore(actions): fix docs publishing ci * update type inferring - added new data type String - added describe function for string - added render function for string * add describe string for spark - same as category * add word cloud to requirements * update tests - replace Category with String, where needed * change type hint - update type hint at string_to_bool function * change word cloud size to same ratio as fig size * format string render - update string render to same format as other renders * update describe_string_spark * change import order * update 'String' type name to 'Text' - 'Text' title is more accurate * resolve pre-commit hooks
- Loading branch information
Showing
19 changed files
with
546 additions
and
82 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
from typing import Tuple | ||
|
||
import pandas as pd | ||
|
||
from ydata_profiling.config import Settings | ||
from ydata_profiling.model.pandas.describe_categorical_pandas import ( | ||
length_summary_vc, | ||
unicode_summary_vc, | ||
word_summary_vc, | ||
) | ||
from ydata_profiling.model.summary_algorithms import ( | ||
describe_text_1d, | ||
histogram_compute, | ||
series_handle_nulls, | ||
series_hashable, | ||
) | ||
|
||
|
||
@describe_text_1d.register
@series_hashable
@series_handle_nulls
def pandas_describe_text_1d(
    config: Settings,
    series: pd.Series,
    summary: dict,
) -> Tuple[Settings, pd.Series, dict]:
    """Describe a text (string) series.

    Args:
        config: report Settings object. The ``config.vars.text`` flags
            (``redact``, ``length``, ``characters``, ``words``) select which
            parts of the description are produced.
        series: The Series to describe.
        summary: The dict containing the series description so far. It must
            already contain the ``"value_counts_without_nan"`` entry.

    Returns:
        A ``(config, series, summary)`` tuple. ``series`` is the input cast
        to ``str`` and ``summary`` is the input dict, updated in place.
    """

    series = series.astype(str)

    # The summaries below are computed from the value counts, not from the
    # raw series. NOTE: this replaces the index of the value counts stored in
    # ``summary`` with its string form (the object is shared, not copied).
    value_counts = summary["value_counts_without_nan"]
    value_counts.index = value_counts.index.astype(str)

    # Keep raw sample values out of the summary when redaction is requested,
    # so the text description honours ``config.vars.text.redact``.
    if not config.vars.text.redact:
        summary.update({"first_rows": series.head(5)})

    if config.vars.text.length:
        summary.update(length_summary_vc(value_counts))
        # NOTE(review): ``length_histogram`` is presumably added to the
        # summary by ``length_summary_vc`` just above - confirm.
        length_histogram = summary["length_histogram"]
        summary.update(
            histogram_compute(
                config,
                length_histogram.index.values,
                len(length_histogram),
                name="histogram_length",
                weights=length_histogram.values,
            )
        )

    if config.vars.text.characters:
        summary.update(unicode_summary_vc(value_counts))

    if config.vars.text.words:
        # The stop-word list is read from the categorical settings.
        summary.update(word_summary_vc(value_counts, config.vars.cat.stop_words))

    return config, series, summary
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
from typing import Tuple | ||
|
||
from pyspark.sql import DataFrame | ||
|
||
from ydata_profiling.config import Settings | ||
from ydata_profiling.model.summary_algorithms import describe_text_1d | ||
|
||
|
||
@describe_text_1d.register
def describe_text_1d_spark(
    config: Settings, df: DataFrame, summary: dict
) -> Tuple[Settings, DataFrame, dict]:
    """Describe a text column held in a Spark DataFrame.

    Args:
        config: report Settings object. ``config.vars.text.redact`` decides
            whether sample rows are stored in the summary.
        df: The DataFrame to describe.
        summary: The dict containing the description so far.

    Returns:
        A ``(config, df, summary)`` tuple. ``summary`` is the input dict,
        updated in place with ``"first_rows"`` unless redaction is enabled.
    """

    if config.vars.text.redact:
        # Redaction requested: copy no raw values into the summary.
        return config, df, summary

    # Collect at most five rows on the driver, then collapse the column axis.
    # NOTE(review): presumably ``df`` has a single column, so the squeeze
    # yields a Series - confirm against the caller.
    sample_rows = df.limit(5).toPandas()
    summary["first_rows"] = sample_rows.squeeze("columns")

    return config, df, summary
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.