Skip to content

Commit

Permalink
feat: add string type for text variables (#1282)
Browse files Browse the repository at this point in the history
* chore(actions): fix docs publishing ci

* update type inference

-  added new data type String
  - added describe function for string
  - added render function for string

* add describe string for spark

- same as category

* add word cloud to requirements

* update tests

- replace Category with String where needed

* change type hint

- update type hint of the string_to_bool function

* change word cloud size to same ratio as fig size

* format string render

- update string render to same format as other renders

* update describe_string_spark

* change import order

* update 'String' type name to 'Text'

- 'Text' title is more accurate

* resolve pre-commit hooks
  • Loading branch information
vorel99 authored May 3, 2023
1 parent 3a4b118 commit 8d97234
Show file tree
Hide file tree
Showing 19 changed files with 546 additions and 82 deletions.
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,5 @@ statsmodels>=0.13.2, <0.14
# type checking
typeguard>=2.13.2, <2.14
imagehash==4.3.1
dacite>=1.8
wordcloud>=1.9.1
dacite>=1.8
14 changes: 13 additions & 1 deletion src/ydata_profiling/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,17 @@ class NumVars(BaseModel):
chi_squared_threshold: float = 0.999


class TextVars(BaseModel):
# Per-variable settings for columns profiled as text.
# compute the length summary and length histogram of the values
length: bool = True
# compute the word summary of the values
words: bool = True
# compute the unicode/character summary of the values
characters: bool = True
# when True, sample values (e.g. the first rows) are kept out of the report
redact: bool = False
# if a text column has more than this many categories, it is not
# considered categorical
categorical_threshold: int = 50
# if the share of distinct values in a text column exceeds this threshold,
# it is not considered categorical
# NOTE(review): the default 0.5 suggests a fraction in [0, 1] rather than a
# percentage — confirm against the type-inference code that reads it
percentage_cat_threshold: float = 0.5


class CatVars(BaseModel):
length: bool = True
characters: bool = True
Expand Down Expand Up @@ -106,6 +117,7 @@ class TimeseriesVars(BaseModel):

class Univariate(BaseModel):
num: NumVars = NumVars()
text: TextVars = TextVars()
cat: CatVars = CatVars()
image: ImageVars = ImageVars()
bool: BoolVars = BoolVars()
Expand Down Expand Up @@ -395,7 +407,7 @@ class Config:
"sensitive": {
"samples": None,
"duplicates": None,
"vars": {"cat": {"redact": True}},
"vars": {"cat": {"redact": True}, "text": {"redact": True}},
},
"dark_mode": {
"html": {
Expand Down
1 change: 1 addition & 0 deletions src/ydata_profiling/expectations_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ class ExpectationHandler(Handler):
def __init__(self, typeset: VisionsTypeset, *args, **kwargs):
mapping = {
"Unsupported": [expectation_algorithms.generic_expectations],
"Text": [expectation_algorithms.categorical_expectations],
"Categorical": [expectation_algorithms.categorical_expectations],
"Boolean": [expectation_algorithms.categorical_expectations],
"Numeric": [expectation_algorithms.numeric_expectations],
Expand Down
1 change: 1 addition & 0 deletions src/ydata_profiling/model/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ def get_render_map() -> Dict[str, Callable]:
"Boolean": render_algorithms.render_boolean,
"Numeric": render_algorithms.render_real,
"Complex": render_algorithms.render_complex,
"Text": render_algorithms.render_text,
"DateTime": render_algorithms.render_date,
"Categorical": render_algorithms.render_categorical,
"URL": render_algorithms.render_url,
Expand Down
2 changes: 2 additions & 0 deletions src/ydata_profiling/model/pandas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
describe_numeric_pandas,
describe_path_pandas,
describe_supported_pandas,
describe_text_pandas,
describe_timeseries_pandas,
describe_url_pandas,
duplicates_pandas,
Expand All @@ -33,6 +34,7 @@
"describe_numeric_pandas",
"describe_path_pandas",
"describe_supported_pandas",
"describe_text_pandas",
"describe_timeseries_pandas",
"describe_url_pandas",
"duplicates_pandas",
Expand Down
64 changes: 64 additions & 0 deletions src/ydata_profiling/model/pandas/describe_text_pandas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
from typing import Tuple

import pandas as pd

from ydata_profiling.config import Settings
from ydata_profiling.model.pandas.describe_categorical_pandas import (
length_summary_vc,
unicode_summary_vc,
word_summary_vc,
)
from ydata_profiling.model.summary_algorithms import (
describe_text_1d,
histogram_compute,
series_handle_nulls,
series_hashable,
)


@describe_text_1d.register
@series_hashable
@series_handle_nulls
def pandas_describe_text_1d(
    config: Settings,
    series: pd.Series,
    summary: dict,
) -> Tuple[Settings, pd.Series, dict]:
    """Describe a text (string) series.

    Args:
        config: report Settings object
        series: The Series to describe.
        summary: The dict containing the series description so far. It must
            already hold the ``"value_counts_without_nan"`` entry.

    Returns:
        The ``(config, series, summary)`` tuple: ``series`` cast to ``str``
        and ``summary`` extended with the text statistics enabled in
        ``config.vars.text``.
    """
    series = series.astype(str)

    # All summaries below are derived from the value counts. Note that the
    # index is cast to str on the object stored in ``summary`` itself, so the
    # "value_counts_without_nan" entry is modified in place.
    value_counts = summary["value_counts_without_nan"]
    value_counts.index = value_counts.index.astype(str)

    # Keep raw sample values out of the summary when redaction is requested
    # (e.g. by the "sensitive" configuration preset); this mirrors what the
    # Spark backend's describe_text_1d does.
    if not config.vars.text.redact:
        summary.update({"first_rows": series.head(5)})

    if config.vars.text.length:
        summary.update(length_summary_vc(value_counts))
        # "length_histogram" is read back from the summary right after the
        # length summary was merged in; its index/values are used as the
        # histogram's data and weights.
        length_histogram = summary["length_histogram"]
        summary.update(
            histogram_compute(
                config,
                length_histogram.index.values,
                len(length_histogram),
                name="histogram_length",
                weights=length_histogram.values,
            )
        )

    if config.vars.text.characters:
        summary.update(unicode_summary_vc(value_counts))

    if config.vars.text.words:
        # Text variables have no stop-word setting of their own; the
        # categorical stop-word list is reused here.
        summary.update(word_summary_vc(value_counts, config.vars.cat.stop_words))

    return config, series, summary
27 changes: 27 additions & 0 deletions src/ydata_profiling/model/spark/describe_text_spark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from typing import Tuple

from pyspark.sql import DataFrame

from ydata_profiling.config import Settings
from ydata_profiling.model.summary_algorithms import describe_text_1d


@describe_text_1d.register
def describe_text_1d_spark(
    config: Settings, df: DataFrame, summary: dict
) -> Tuple[Settings, DataFrame, dict]:
    """Describe a text variable held in a Spark DataFrame.

    Args:
        config: report Settings object.
        df: The DataFrame to describe.
        summary: The dict containing the description so far.

    Returns:
        The ``(config, df, summary)`` tuple. Unless
        ``config.vars.text.redact`` is set, ``summary["first_rows"]`` holds
        the first five rows converted to pandas.
    """
    # Redacted reports must not carry sample values: leave summary untouched.
    if config.vars.text.redact:
        return config, df, summary

    first_five = df.limit(5).toPandas()
    # squeeze("columns") collapses a one-column frame into a Series
    summary["first_rows"] = first_five.squeeze("columns")
    return config, df, summary
4 changes: 4 additions & 0 deletions src/ydata_profiling/model/summarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
describe_numeric_1d,
describe_path_1d,
describe_supported,
describe_text_1d,
describe_timeseries_1d,
describe_url_1d,
)
Expand Down Expand Up @@ -58,6 +59,9 @@ def __init__(self, typeset: VisionsTypeset, *args, **kwargs):
"DateTime": [
describe_date_1d,
],
"Text": [
describe_text_1d,
],
"Categorical": [
describe_categorical_1d,
],
Expand Down
7 changes: 7 additions & 0 deletions src/ydata_profiling/model/summary_algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,13 @@ def describe_numeric_1d(
raise NotImplementedError()


@multimethod
def describe_text_1d(
    config: Settings, series: Any, summary: dict
) -> Tuple[Settings, Any, dict]:
    """Dispatch point for describing a text variable.

    Backend-specific implementations attach themselves through
    ``describe_text_1d.register``; this fallback is what runs when no
    registered implementation matches the argument types.

    Args:
        config: report Settings object.
        series: The backend-specific data to describe.
        summary: The dict containing the description so far.

    Returns:
        Implementations return the ``(config, series, summary)`` tuple.

    Raises:
        NotImplementedError: always, in this fallback.
    """
    raise NotImplementedError()


@multimethod
def describe_date_1d(
config: Settings, series: Any, summary: dict
Expand Down
Loading

0 comments on commit 8d97234

Please sign in to comment.