feat: add string type for text variables (#1282)

* chore(actions): fix docs publishing ci * update type inferring - added new data type String - added describe function for string - added render function for string * add describe string for spark - same as category * add word cloud to requirements * update tests - replace Category with String, where needed * change type hint - update type hint at string_to_bool function * change word cloud size to same ratio as fig size * format string render - update string render to same format as other renders * update describe_string_spark * change import order * update 'String' type name to 'Text' - 'Text' title is more accurate * resolve pre-commit hooks
ydataai · May 3, 2023 · 8d97234 · 8d97234
1 parent 3a4b118
commit 8d97234
Show file tree

Hide file tree

Showing 19 changed files with 546 additions and 82 deletions.
diff --git a/requirements.txt b/requirements.txt
@@ -22,4 +22,5 @@ statsmodels>=0.13.2, <0.14
 # type checking
 typeguard>=2.13.2, <2.14
 imagehash==4.3.1
-dacite>=1.8
+wordcloud>=1.9.1
+dacite>=1.8
diff --git a/src/ydata_profiling/config.py b/src/ydata_profiling/config.py
@@ -45,6 +45,17 @@ class NumVars(BaseModel):
     chi_squared_threshold: float = 0.999
 
 
+class TextVars(BaseModel):  # settings for Text variables (Univariate.text)
+    length: bool = True
+    words: bool = True
+    characters: bool = True
+    redact: bool = False
+    # if text has more than threshold categories, it's not a category
+    categorical_threshold: int = 50
+    # if text has more than threshold % distinct values, it's not a category
+    percentage_cat_threshold: float = 0.5
+
+
 class CatVars(BaseModel):
     length: bool = True
     characters: bool = True
@@ -106,6 +117,7 @@ class TimeseriesVars(BaseModel):
 
 class Univariate(BaseModel):
     num: NumVars = NumVars()
+    text: TextVars = TextVars()
     cat: CatVars = CatVars()
     image: ImageVars = ImageVars()
     bool: BoolVars = BoolVars()
@@ -395,7 +407,7 @@ class Config:
         "sensitive": {
             "samples": None,
             "duplicates": None,
-            "vars": {"cat": {"redact": True}},
+            "vars": {"cat": {"redact": True}, "text": {"redact": True}},
         },
         "dark_mode": {
             "html": {

diff --git a/src/ydata_profiling/expectations_report.py b/src/ydata_profiling/expectations_report.py
@@ -15,6 +15,7 @@ class ExpectationHandler(Handler):
     def __init__(self, typeset: VisionsTypeset, *args, **kwargs):
         mapping = {
             "Unsupported": [expectation_algorithms.generic_expectations],
+            "Text": [expectation_algorithms.categorical_expectations],
             "Categorical": [expectation_algorithms.categorical_expectations],
             "Boolean": [expectation_algorithms.categorical_expectations],
             "Numeric": [expectation_algorithms.numeric_expectations],

diff --git a/src/ydata_profiling/model/handler.py b/src/ydata_profiling/model/handler.py
@@ -69,6 +69,7 @@ def get_render_map() -> Dict[str, Callable]:
         "Boolean": render_algorithms.render_boolean,
         "Numeric": render_algorithms.render_real,
         "Complex": render_algorithms.render_complex,
+        "Text": render_algorithms.render_text,
         "DateTime": render_algorithms.render_date,
         "Categorical": render_algorithms.render_categorical,
         "URL": render_algorithms.render_url,

diff --git a/src/ydata_profiling/model/pandas/__init__.py b/src/ydata_profiling/model/pandas/__init__.py
@@ -11,6 +11,7 @@
     describe_numeric_pandas,
     describe_path_pandas,
     describe_supported_pandas,
+    describe_text_pandas,
     describe_timeseries_pandas,
     describe_url_pandas,
     duplicates_pandas,
@@ -33,6 +34,7 @@
     "describe_numeric_pandas",
     "describe_path_pandas",
     "describe_supported_pandas",
+    "describe_text_pandas",
     "describe_timeseries_pandas",
     "describe_url_pandas",
     "duplicates_pandas",

diff --git a/src/ydata_profiling/model/pandas/describe_text_pandas.py b/src/ydata_profiling/model/pandas/describe_text_pandas.py
@@ -0,0 +1,64 @@
+from typing import Tuple
+
+import pandas as pd
+
+from ydata_profiling.config import Settings
+from ydata_profiling.model.pandas.describe_categorical_pandas import (
+    length_summary_vc,
+    unicode_summary_vc,
+    word_summary_vc,
+)
+from ydata_profiling.model.summary_algorithms import (
+    describe_text_1d,
+    histogram_compute,
+    series_handle_nulls,
+    series_hashable,
+)
+
+
+@describe_text_1d.register
+@series_hashable
+@series_handle_nulls
+def pandas_describe_text_1d(
+    config: Settings,
+    series: pd.Series,
+    summary: dict,
+) -> Tuple[Settings, pd.Series, dict]:
+    """Describe a text (string) series.
+
+    Args:
+        config: report Settings object
+        series: The Series to describe.
+        summary: The dict containing the series description so far.
+
+    Returns:
+        The (config, series, summary) tuple, with summary updated in place.
+    """
+    series = series.astype(str)
+
+    # Reuse the value counts already stored in summary, with a string index
+    value_counts = summary["value_counts_without_nan"]
+    value_counts.index = value_counts.index.astype(str)
+
+    # NOTE(review): stored even when text.redact is set, unlike the Spark variant
+    summary.update({"first_rows": series.head(5)})
+
+    if config.vars.text.length:
+        summary.update(length_summary_vc(value_counts))
+        summary.update(
+            histogram_compute(
+                config,
+                summary["length_histogram"].index.values,
+                len(summary["length_histogram"]),
+                name="histogram_length",
+                weights=summary["length_histogram"].values,
+            )
+        )
+
+    if config.vars.text.characters:
+        summary.update(unicode_summary_vc(value_counts))
+
+    if config.vars.text.words:
+        summary.update(word_summary_vc(value_counts, config.vars.cat.stop_words))
+
+    return config, series, summary
diff --git a/src/ydata_profiling/model/spark/describe_text_spark.py b/src/ydata_profiling/model/spark/describe_text_spark.py
@@ -0,0 +1,27 @@
+from typing import Tuple
+
+from pyspark.sql import DataFrame
+
+from ydata_profiling.config import Settings
+from ydata_profiling.model.summary_algorithms import describe_text_1d
+
+
+@describe_text_1d.register
+def describe_text_1d_spark(
+    config: Settings, df: DataFrame, summary: dict
+) -> Tuple[Settings, DataFrame, dict]:
+    """Describe a text variable from a Spark DataFrame.
+
+    Args:
+        config: report Settings object
+        df: The Spark DataFrame to describe; its first 5 rows become the sample.
+        summary: The dict containing the series description so far.
+
+    Returns:
+        The (config, df, summary) tuple; "first_rows" is set unless redact is on.
+    """
+    redact = config.vars.text.redact
+    if not redact:
+        summary["first_rows"] = df.limit(5).toPandas().squeeze("columns")
+
+    return config, df, summary
diff --git a/src/ydata_profiling/model/summarizer.py b/src/ydata_profiling/model/summarizer.py
@@ -19,6 +19,7 @@
     describe_numeric_1d,
     describe_path_1d,
     describe_supported,
+    describe_text_1d,
     describe_timeseries_1d,
     describe_url_1d,
 )
@@ -58,6 +59,9 @@ def __init__(self, typeset: VisionsTypeset, *args, **kwargs):
             "DateTime": [
                 describe_date_1d,
             ],
+            "Text": [
+                describe_text_1d,
+            ],
             "Categorical": [
                 describe_categorical_1d,
             ],

diff --git a/src/ydata_profiling/model/summary_algorithms.py b/src/ydata_profiling/model/summary_algorithms.py
@@ -126,6 +126,13 @@ def describe_numeric_1d(
     raise NotImplementedError()
 
 
+@multimethod
+def describe_text_1d(
+    config: Settings, series: Any, summary: dict
+) -> Tuple[Settings, Any, dict]:
+    raise NotImplementedError()  # generic stub; backends register typed overloads
+
+
 @multimethod
 def describe_date_1d(
     config: Settings, series: Any, summary: dict