From 8d97234a765a846eecde1273c23e8bf32ed7ecf9 Mon Sep 17 00:00:00 2001
From: Jan Cap <71695857+vorel99@users.noreply.github.com>
Date: Wed, 3 May 2023 14:42:40 +0200
Subject: [PATCH] feat: add string type for text variables (#1282)
* chore(actions): fix docs publishing ci
* update type inference
- added new data type String
- added describe function for string
- added render function for string
* add describe string for spark
- same as category
* add word cloud to requirements
* update tests
- replace Category with String, where needed
* change type hint
- update type hint at string_to_bool function
* change word cloud size to same ratio as fig size
* format string render
- update string render to same format as other renders
* update describe_string_spark
* change import order
* update 'String' type name to 'Text'
- 'Text' title is more accurate
* resolve pre-commit hooks
---
requirements.txt | 3 +-
src/ydata_profiling/config.py | 14 +-
src/ydata_profiling/expectations_report.py | 1 +
src/ydata_profiling/model/handler.py | 1 +
src/ydata_profiling/model/pandas/__init__.py | 2 +
.../model/pandas/describe_text_pandas.py | 64 ++++++
.../model/spark/describe_text_spark.py | 27 +++
src/ydata_profiling/model/summarizer.py | 4 +
.../model/summary_algorithms.py | 7 +
src/ydata_profiling/model/typeset.py | 111 +++++++++--
.../model/typeset_relations.py | 39 +++-
.../report/structure/variables/__init__.py | 2 +
.../report/structure/variables/render_text.py | 184 ++++++++++++++++++
src/ydata_profiling/visualisation/plot.py | 21 ++
tests/issues/test_issue397.py | 2 +-
tests/issues/test_issue72.py | 4 +-
tests/unit/test_example.py | 2 +-
tests/unit/test_typeset_custom.py | 74 ++++---
tests/unit/test_typeset_default.py | 66 ++++---
19 files changed, 546 insertions(+), 82 deletions(-)
create mode 100644 src/ydata_profiling/model/pandas/describe_text_pandas.py
create mode 100644 src/ydata_profiling/model/spark/describe_text_spark.py
create mode 100644 src/ydata_profiling/report/structure/variables/render_text.py
diff --git a/requirements.txt b/requirements.txt
index 28ba2bc04..a3c139bfd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -22,4 +22,5 @@ statsmodels>=0.13.2, <0.14
# type checking
typeguard>=2.13.2, <2.14
imagehash==4.3.1
-dacite>=1.8
\ No newline at end of file
+wordcloud>=1.9.1
+dacite>=1.8
diff --git a/src/ydata_profiling/config.py b/src/ydata_profiling/config.py
index 0b7e4f8d6..a82e2dd3e 100644
--- a/src/ydata_profiling/config.py
+++ b/src/ydata_profiling/config.py
@@ -45,6 +45,17 @@ class NumVars(BaseModel):
chi_squared_threshold: float = 0.999
+class TextVars(BaseModel):
+ length: bool = True
+ words: bool = True
+ characters: bool = True
+ redact: bool = False
+ # if text has more than this many distinct values, it is not treated as categorical
+ categorical_threshold: int = 50
+ # if the fraction of distinct values is at least this threshold, it is not treated as categorical
+ percentage_cat_threshold: float = 0.5
+
+
class CatVars(BaseModel):
length: bool = True
characters: bool = True
@@ -106,6 +117,7 @@ class TimeseriesVars(BaseModel):
class Univariate(BaseModel):
num: NumVars = NumVars()
+ text: TextVars = TextVars()
cat: CatVars = CatVars()
image: ImageVars = ImageVars()
bool: BoolVars = BoolVars()
@@ -395,7 +407,7 @@ class Config:
"sensitive": {
"samples": None,
"duplicates": None,
- "vars": {"cat": {"redact": True}},
+ "vars": {"cat": {"redact": True}, "text": {"redact": True}},
},
"dark_mode": {
"html": {
diff --git a/src/ydata_profiling/expectations_report.py b/src/ydata_profiling/expectations_report.py
index c8583fbe2..7979e510b 100644
--- a/src/ydata_profiling/expectations_report.py
+++ b/src/ydata_profiling/expectations_report.py
@@ -15,6 +15,7 @@ class ExpectationHandler(Handler):
def __init__(self, typeset: VisionsTypeset, *args, **kwargs):
mapping = {
"Unsupported": [expectation_algorithms.generic_expectations],
+ "Text": [expectation_algorithms.categorical_expectations],
"Categorical": [expectation_algorithms.categorical_expectations],
"Boolean": [expectation_algorithms.categorical_expectations],
"Numeric": [expectation_algorithms.numeric_expectations],
diff --git a/src/ydata_profiling/model/handler.py b/src/ydata_profiling/model/handler.py
index 9d12dd705..5db948fd6 100644
--- a/src/ydata_profiling/model/handler.py
+++ b/src/ydata_profiling/model/handler.py
@@ -69,6 +69,7 @@ def get_render_map() -> Dict[str, Callable]:
"Boolean": render_algorithms.render_boolean,
"Numeric": render_algorithms.render_real,
"Complex": render_algorithms.render_complex,
+ "Text": render_algorithms.render_text,
"DateTime": render_algorithms.render_date,
"Categorical": render_algorithms.render_categorical,
"URL": render_algorithms.render_url,
diff --git a/src/ydata_profiling/model/pandas/__init__.py b/src/ydata_profiling/model/pandas/__init__.py
index 290fddace..b895f46e2 100644
--- a/src/ydata_profiling/model/pandas/__init__.py
+++ b/src/ydata_profiling/model/pandas/__init__.py
@@ -11,6 +11,7 @@
describe_numeric_pandas,
describe_path_pandas,
describe_supported_pandas,
+ describe_text_pandas,
describe_timeseries_pandas,
describe_url_pandas,
duplicates_pandas,
@@ -33,6 +34,7 @@
"describe_numeric_pandas",
"describe_path_pandas",
"describe_supported_pandas",
+ "describe_text_pandas",
"describe_timeseries_pandas",
"describe_url_pandas",
"duplicates_pandas",
diff --git a/src/ydata_profiling/model/pandas/describe_text_pandas.py b/src/ydata_profiling/model/pandas/describe_text_pandas.py
new file mode 100644
index 000000000..2701b9760
--- /dev/null
+++ b/src/ydata_profiling/model/pandas/describe_text_pandas.py
@@ -0,0 +1,64 @@
+from typing import Tuple
+
+import pandas as pd
+
+from ydata_profiling.config import Settings
+from ydata_profiling.model.pandas.describe_categorical_pandas import (
+ length_summary_vc,
+ unicode_summary_vc,
+ word_summary_vc,
+)
+from ydata_profiling.model.summary_algorithms import (
+ describe_text_1d,
+ histogram_compute,
+ series_handle_nulls,
+ series_hashable,
+)
+
+
+@describe_text_1d.register
+@series_hashable
+@series_handle_nulls
+def pandas_describe_text_1d(
+ config: Settings,
+ series: pd.Series,
+ summary: dict,
+) -> Tuple[Settings, pd.Series, dict]:
+ """Describe string series.
+
+ Args:
+ config: report Settings object
+ series: The Series to describe.
+ summary: The dict containing the series description so far.
+
+ Returns:
+ A tuple of the config, the series (cast to str) and the updated summary dict.
+ """
+
+ series = series.astype(str)
+
+ # Value counts with missing values excluded; index cast to str for the string summaries
+ value_counts = summary["value_counts_without_nan"]
+ value_counts.index = value_counts.index.astype(str)
+
+ summary.update({"first_rows": series.head(5)})
+
+ if config.vars.text.length:
+ summary.update(length_summary_vc(value_counts))
+ summary.update(
+ histogram_compute(
+ config,
+ summary["length_histogram"].index.values,
+ len(summary["length_histogram"]),
+ name="histogram_length",
+ weights=summary["length_histogram"].values,
+ )
+ )
+
+ if config.vars.text.characters:
+ summary.update(unicode_summary_vc(value_counts))
+
+ if config.vars.text.words:
+ summary.update(word_summary_vc(value_counts, config.vars.cat.stop_words))
+
+ return config, series, summary
diff --git a/src/ydata_profiling/model/spark/describe_text_spark.py b/src/ydata_profiling/model/spark/describe_text_spark.py
new file mode 100644
index 000000000..b5e27f615
--- /dev/null
+++ b/src/ydata_profiling/model/spark/describe_text_spark.py
@@ -0,0 +1,27 @@
+from typing import Tuple
+
+from pyspark.sql import DataFrame
+
+from ydata_profiling.config import Settings
+from ydata_profiling.model.summary_algorithms import describe_text_1d
+
+
+@describe_text_1d.register
+def describe_text_1d_spark(
+ config: Settings, df: DataFrame, summary: dict
+) -> Tuple[Settings, DataFrame, dict]:
+ """Describe a text column.
+
+ Args:
+ df: The Spark DataFrame to describe.
+ summary: The dict containing the series description so far.
+
+ Returns:
+ The config, the DataFrame and the updated summary dict.
+ """
+
+ redact = config.vars.text.redact
+ if not redact:
+ summary["first_rows"] = df.limit(5).toPandas().squeeze("columns")
+
+ return config, df, summary
diff --git a/src/ydata_profiling/model/summarizer.py b/src/ydata_profiling/model/summarizer.py
index c112d14ed..f7b7d1b91 100644
--- a/src/ydata_profiling/model/summarizer.py
+++ b/src/ydata_profiling/model/summarizer.py
@@ -19,6 +19,7 @@
describe_numeric_1d,
describe_path_1d,
describe_supported,
+ describe_text_1d,
describe_timeseries_1d,
describe_url_1d,
)
@@ -58,6 +59,9 @@ def __init__(self, typeset: VisionsTypeset, *args, **kwargs):
"DateTime": [
describe_date_1d,
],
+ "Text": [
+ describe_text_1d,
+ ],
"Categorical": [
describe_categorical_1d,
],
diff --git a/src/ydata_profiling/model/summary_algorithms.py b/src/ydata_profiling/model/summary_algorithms.py
index fbb944681..ad64f8aae 100644
--- a/src/ydata_profiling/model/summary_algorithms.py
+++ b/src/ydata_profiling/model/summary_algorithms.py
@@ -126,6 +126,13 @@ def describe_numeric_1d(
raise NotImplementedError()
+@multimethod
+def describe_text_1d(
+    config: Settings, series: Any, summary: dict
+) -> Tuple[Settings, Any, dict]:
+    """Describe a text variable.
+
+    Fallback for the multimethod: backend-specific implementations are
+    attached with ``describe_text_1d.register`` and return the config,
+    the series and the updated summary.
+    """
+    raise NotImplementedError()
+
+
@multimethod
def describe_date_1d(
config: Settings, series: Any, summary: dict
diff --git a/src/ydata_profiling/model/typeset.py b/src/ydata_profiling/model/typeset.py
index 489c1d5e5..ee1c42e78 100644
--- a/src/ydata_profiling/model/typeset.py
+++ b/src/ydata_profiling/model/typeset.py
@@ -14,12 +14,15 @@
from ydata_profiling.config import Settings
from ydata_profiling.model.typeset_relations import (
- category_is_numeric,
- category_to_numeric,
numeric_is_category,
series_is_string,
string_is_bool,
+ string_is_category,
+ string_is_datetime,
+ string_is_numeric,
string_to_bool,
+ string_to_datetime,
+ string_to_numeric,
to_bool,
to_category,
)
@@ -49,19 +52,38 @@ def typeset_types(config: Settings) -> Set[visions.VisionsBaseType]:
"""Define types based on the config"""
class Unsupported(visions.Generic):
+ """Base type. All other types have relationship with this type."""
+
pass
class Numeric(visions.VisionsBaseType):
+ """Type for all numeric (float, int) columns.
+
+ Can be transformed from
+ - Unsupported
+ - Text
+
+ Examples
+ --------
+ >>> s = pd.Series([1, 2, 5, 3, 8, 9])
+ >>> s in Numeric
+ True
+
+ >>> s = pd.Series([.34, 2.9, 55, 3.14, 89, 91])
+ >>> s in Numeric
+ True
+ """
+
@staticmethod
def get_relations() -> Sequence[TypeRelation]:
return [
IdentityRelation(Unsupported),
InferenceRelation(
- Categorical,
- relationship=lambda x, y: partial(category_is_numeric, k=config)(
+ Text,
+ relationship=lambda x, y: partial(string_is_numeric, k=config)(
x, y
),
- transformer=category_to_numeric,
+ transformer=string_to_numeric,
),
]
@@ -72,11 +94,44 @@ def get_relations() -> Sequence[TypeRelation]:
def contains_op(series: pd.Series, state: dict) -> bool:
return pdt.is_numeric_dtype(series) and not pdt.is_bool_dtype(series)
+ class Text(visions.VisionsBaseType):
+ """Type for plaintext columns.
+ Like name, note, string identifier, residence etc.
+
+ Examples
+ --------
+ >>> s = pd.Series(["AX01", "BC32", "AC00"])
+ >>> s in Text
+ True
+
+ >>> s = pd.Series([1, 2, 3, 4])
+ >>> s in Text
+ False
+ """
+
+ @staticmethod
+ def get_relations() -> Sequence[TypeRelation]:
+ return [
+ IdentityRelation(Unsupported),
+ ]
+
+ @staticmethod
+ @multimethod
+ @series_not_empty
+ @series_handle_nulls
+ def contains_op(series: pd.Series, state: dict) -> bool:
+ return pdt.is_string_dtype(series) and series_is_string(series, state)
+
class DateTime(visions.VisionsBaseType):
@staticmethod
def get_relations() -> Sequence[TypeRelation]:
return [
IdentityRelation(Unsupported),
+ InferenceRelation(
+ Text,
+ relationship=lambda x, y: partial(string_is_datetime)(x, y),
+ transformer=string_to_datetime,
+ ),
]
@staticmethod
@@ -87,6 +142,30 @@ def contains_op(series: pd.Series, state: dict) -> bool:
return pdt.is_datetime64_any_dtype(series)
class Categorical(visions.VisionsBaseType):
+ """Type for categorical columns.
+ Categorical columns in pandas categorical format
+ and columns in string format with small count of unique values.
+
+ Can be transformed from:
+ - Unsupported
+ - Numeric
+ - Text
+
+ Examples
+ --------
+ >>> s = pd.Series(["male", "female", "female", "male"], dtype="category")
+ >>> s in Categorical
+ True
+
+ >>> s = pd.Series(["male", "female"])
+ >>> s in Categorical
+ False
+
+ >>> s = pd.Series(["male", "female", "female", "male"])
+ >>> s in Categorical
+ True
+ """
+
@staticmethod
def get_relations() -> Sequence[TypeRelation]:
return [
@@ -98,6 +177,13 @@ def get_relations() -> Sequence[TypeRelation]:
),
transformer=to_category,
),
+ InferenceRelation(
+ Text,
+ relationship=lambda x, y: partial(string_is_category, k=config)(
+ x, y
+ ),
+ transformer=to_category,
+ ),
]
@staticmethod
@@ -110,12 +196,11 @@ def contains_op(series: pd.Series, state: dict) -> bool:
)
if is_valid_dtype:
return True
- elif not pdt.is_object_dtype(series):
- return pandas_has_string_dtype_flag and pdt.is_string_dtype(series)
-
- return series_is_string(series, state)
+ return False
class Boolean(visions.VisionsBaseType):
+ """Type for boolean columns."""
+
@staticmethod
def get_relations() -> Sequence[TypeRelation]:
# Numeric [0, 1] goes via Categorical with distinct_count_without_nan <= 2
@@ -124,7 +209,7 @@ def get_relations() -> Sequence[TypeRelation]:
return [
IdentityRelation(Unsupported),
InferenceRelation(
- Categorical,
+ Text,
relationship=lambda x, y: partial(string_is_bool, k=mapping)(x, y),
transformer=lambda s, st: to_bool(
partial(string_to_bool, k=mapping)(s, st)
@@ -148,7 +233,7 @@ def contains_op(series: pd.Series, state: dict) -> bool:
class URL(visions.VisionsBaseType):
@staticmethod
def get_relations() -> Sequence[TypeRelation]:
- return [IdentityRelation(Categorical)]
+ return [IdentityRelation(Text)]
@staticmethod
@multimethod
@@ -164,7 +249,7 @@ def contains_op(series: pd.Series, state: dict) -> bool:
class Path(visions.VisionsBaseType):
@staticmethod
def get_relations() -> Sequence[TypeRelation]:
- return [IdentityRelation(Categorical)]
+ return [IdentityRelation(Text)]
@staticmethod
@multimethod
@@ -223,7 +308,7 @@ def is_timedependent(series: pd.Series) -> bool:
is_numeric = pdt.is_numeric_dtype(series) and not pdt.is_bool_dtype(series)
return is_numeric and is_timedependent(series)
- types = {Unsupported, Boolean, Numeric, Categorical, DateTime}
+ types = {Unsupported, Boolean, Numeric, Text, Categorical, DateTime}
if config.vars.path.active:
types.add(Path)
if config.vars.file.active:
diff --git a/src/ydata_profiling/model/typeset_relations.py b/src/ydata_profiling/model/typeset_relations.py
index baa047305..0a8bd6d4a 100644
--- a/src/ydata_profiling/model/typeset_relations.py
+++ b/src/ydata_profiling/model/typeset_relations.py
@@ -1,5 +1,5 @@
import functools
-from typing import Callable
+from typing import Callable, Dict
import numpy as np
import pandas as pd
@@ -24,7 +24,7 @@ def inner(series: pd.Series, *args, **kwargs) -> bool:
return inner
-def string_is_bool(series: pd.Series, state: dict, k: Settings) -> bool:
+def string_is_bool(series: pd.Series, state: dict, k: Dict[str, bool]) -> bool:
@series_handle_nulls
@try_func
def tester(s: pd.Series, state: dict) -> bool:
@@ -36,7 +36,7 @@ def tester(s: pd.Series, state: dict) -> bool:
return tester(series, state)
-def string_to_bool(series: pd.Series, state: dict, k: Settings) -> pd.Series:
+def string_to_bool(series: pd.Series, state: dict, k: Dict[str, bool]) -> pd.Series:
return series.str.lower().map(k)
@@ -66,7 +66,32 @@ def series_is_string(series: pd.Series, state: dict) -> bool:
@series_handle_nulls
-def category_is_numeric(series: pd.Series, state: dict, k: Settings) -> bool:
+def string_is_category(series: pd.Series, state: dict, k: Settings) -> bool:
+ """A string series is a category if all of the following conditions are met:
+ - it has at least one and at most `categorical_threshold` distinct values
+ - (distinct values / count of all values) is less than `percentage_cat_threshold`
+ - it is not a bool"""
+ n_unique = series.nunique()
+ unique_threshold = k.vars.text.percentage_cat_threshold
+ threshold = k.vars.text.categorical_threshold
+ return (
+ 1 <= n_unique <= threshold
+ and n_unique / series.size < unique_threshold
+ and not string_is_bool(series, state, k.vars.bool.mappings)
+ )
+
+
+@series_handle_nulls
+def string_is_datetime(series: pd.Series, state: dict) -> bool:
+    """Check whether the string series can be interpreted as datetimes.
+
+    True when the cast to ``datetime64`` succeeds and at least one value is
+    a valid date; any failure of the cast means the series is not a datetime.
+    """
+    try:
+        return not series.astype("datetime64").isna().all()
+    except Exception:
+        # Best-effort probe: a failed conversion simply means "not a datetime".
+        # Unlike a bare ``except``, KeyboardInterrupt/SystemExit still propagate.
+        return False
+
+
+@series_handle_nulls
+def string_is_numeric(series: pd.Series, state: dict, k: Settings) -> bool:
if pdt.is_bool_dtype(series) or object_is_bool(series, state):
return False
@@ -81,7 +106,11 @@ def category_is_numeric(series: pd.Series, state: dict, k: Settings) -> bool:
return not numeric_is_category(series, state, k)
-def category_to_numeric(series: pd.Series, state: dict) -> pd.Series:
+def string_to_datetime(series: pd.Series, state: dict) -> pd.Series:
+ return series.astype("datetime64")
+
+
+def string_to_numeric(series: pd.Series, state: dict) -> pd.Series:
return pd.to_numeric(series, errors="coerce")
diff --git a/src/ydata_profiling/report/structure/variables/__init__.py b/src/ydata_profiling/report/structure/variables/__init__.py
index 0c513b15d..64f1d6d54 100644
--- a/src/ydata_profiling/report/structure/variables/__init__.py
+++ b/src/ydata_profiling/report/structure/variables/__init__.py
@@ -11,6 +11,7 @@
from ydata_profiling.report.structure.variables.render_image import render_image
from ydata_profiling.report.structure.variables.render_path import render_path
from ydata_profiling.report.structure.variables.render_real import render_real
+from ydata_profiling.report.structure.variables.render_text import render_text
from ydata_profiling.report.structure.variables.render_timeseries import (
render_timeseries,
)
@@ -28,6 +29,7 @@
"render_image",
"render_path",
"render_real",
+ "render_text",
"render_timeseries",
"render_url",
]
diff --git a/src/ydata_profiling/report/structure/variables/render_text.py b/src/ydata_profiling/report/structure/variables/render_text.py
new file mode 100644
index 000000000..227ffa9f6
--- /dev/null
+++ b/src/ydata_profiling/report/structure/variables/render_text.py
@@ -0,0 +1,184 @@
+from typing import Any, Dict, List
+
+from ydata_profiling.config import Settings
+from ydata_profiling.report.formatters import fmt, fmt_bytesize, fmt_percent
+from ydata_profiling.report.presentation.core import (
+ Container,
+ FrequencyTable,
+ Image,
+ Table,
+)
+from ydata_profiling.report.presentation.core.variable_info import VariableInfo
+from ydata_profiling.report.structure.variables.render_categorical import (
+ _get_n,
+ freq_table,
+ render_categorical_frequency,
+ render_categorical_length,
+ render_categorical_unicode,
+)
+from ydata_profiling.report.structure.variables.render_common import render_common
+from ydata_profiling.visualisation.plot import plot_word_cloud
+
+
+def render_text(config: Settings, summary: Dict[str, Any]) -> Dict[str, Any]:
+ varid = summary["varid"]
+ words = config.vars.text.words
+ characters = config.vars.text.characters
+ length = config.vars.text.length
+
+ template_variables = render_common(config, summary)
+
+ top_items: List[Any] = []
+ var_info = VariableInfo(
+ anchor_id=varid,
+ var_name=summary["varname"],
+ var_type=summary["type"],
+ alerts=summary["alerts"],
+ description=summary["description"],
+ style=config.html.style,
+ )
+ top_items.append(var_info)
+
+ table = Table(
+ [
+ {
+ "name": "Distinct",
+ "value": fmt(summary["n_distinct"]),
+ "alert": "n_distinct" in summary["alert_fields"],
+ },
+ {
+ "name": "Distinct (%)",
+ "value": fmt_percent(summary["p_distinct"]),
+ "alert": "p_distinct" in summary["alert_fields"],
+ },
+ {
+ "name": "Missing",
+ "value": fmt(summary["n_missing"]),
+ "alert": "n_missing" in summary["alert_fields"],
+ },
+ {
+ "name": "Missing (%)",
+ "value": fmt_percent(summary["p_missing"]),
+ "alert": "p_missing" in summary["alert_fields"],
+ },
+ {
+ "name": "Memory size",
+ "value": fmt_bytesize(summary["memory_size"]),
+ "alert": False,
+ },
+ ],
+ style=config.html.style,
+ )
+ top_items.append(table)
+
+ if words and "word_counts" in summary:
+ mini_wordcloud = Image(
+ plot_word_cloud(config, summary["word_counts"]),
+ image_format=config.plot.image_format,
+ alt="Mini wordcloud",
+ )
+ top_items.append(mini_wordcloud)
+ template_variables["top"] = Container(top_items, sequence_type="grid")
+
+ # ============================================================================================
+
+ bottom_items = []
+ overview_items = []
+ # length isn't being computed for text in spark
+ if length and "max_length" in summary:
+ length_table, length_histo = render_categorical_length(config, summary, varid)
+ overview_items.append(length_table)
+
+ # characters isn't being computed for text in spark
+ unitab = None
+ if characters and "category_alias_counts" in summary:
+ overview_table_char, unitab = render_categorical_unicode(config, summary, varid)
+ overview_items.append(overview_table_char)
+
+ unique_stats = render_categorical_frequency(config, summary, varid)
+ overview_items.append(unique_stats)
+
+ if not config.vars.text.redact:
+ rows = ("1st row", "2nd row", "3rd row", "4th row", "5th row")
+
+ if isinstance(summary["first_rows"], list):
+ sample = Table(
+ [
+ {
+ "name": name,
+ "value": fmt(value),
+ "alert": False,
+ }
+ for name, *value in zip(rows, *summary["first_rows"])
+ ],
+ name="Sample",
+ style=config.html.style,
+ )
+ else:
+ sample = Table(
+ [
+ {
+ "name": name,
+ "value": fmt(value),
+ "alert": False,
+ }
+ for name, value in zip(rows, summary["first_rows"])
+ ],
+ name="Sample",
+ style=config.html.style,
+ )
+ overview_items.append(sample)
+ overview = Container(
+ overview_items,
+ name="Overview",
+ anchor_id=f"{varid}overview",
+ sequence_type="batch_grid",
+ batch_size=len(overview_items),
+ titles=False,
+ )
+ bottom_items.append(overview)
+
+ if words and "word_counts" in summary:
+ woc = freq_table(
+ freqtable=summary["word_counts"],
+ n=_get_n(summary["word_counts"]),
+ max_number_to_print=10,
+ )
+
+ fqwo = FrequencyTable(
+ woc,
+ name="Common words",
+ anchor_id=f"{varid}cwo",
+ redact=config.vars.text.redact,
+ )
+
+ image = Image(
+ plot_word_cloud(config, summary["word_counts"]),
+ image_format=config.plot.image_format,
+ alt="Wordcloud",
+ )
+
+ bottom_items.append(
+ Container(
+ [fqwo, image],
+ name="Words",
+ anchor_id=f"{varid}word",
+ sequence_type="grid",
+ )
+ )
+
+ if unitab is not None:
+ bottom_items.append(
+ Container(
+ [unitab],
+ name="Characters",
+ anchor_id=f"{varid}characters",
+ sequence_type="grid",
+ )
+ )
+
+ template_variables["bottom"] = Container(
+ bottom_items, sequence_type="tabs", anchor_id=f"{varid}bottom"
+ )
+
+ return template_variables
diff --git a/src/ydata_profiling/visualisation/plot.py b/src/ydata_profiling/visualisation/plot.py
index fd8764b38..ad2778450 100644
--- a/src/ydata_profiling/visualisation/plot.py
+++ b/src/ydata_profiling/visualisation/plot.py
@@ -13,6 +13,7 @@
from matplotlib.ticker import FuncFormatter
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from typeguard import typechecked
+from wordcloud import WordCloud
from ydata_profiling.config import Settings
from ydata_profiling.utils.common import convert_timestamp_to_datetime
@@ -24,6 +25,20 @@ def format_fn(tick_val: int, tick_pos: Any) -> str:
return convert_timestamp_to_datetime(tick_val).strftime("%Y-%m-%d %H:%M:%S")
+def _plot_word_cloud(
+ series: pd.Series,
+ figsize: tuple = (6, 4),
+) -> plt.Figure:
+ word_dict = series.to_dict()
+ wordcloud = WordCloud(
+ background_color="white", random_state=123, width=300, height=200, scale=2
+ ).generate_from_frequencies(word_dict)
+ plt.figure(figsize=figsize)
+ plot = plt.imshow(wordcloud, interpolation="bilinear")
+ plt.axis("off")
+ return plot
+
+
def _plot_histogram(
config: Settings,
series: np.ndarray,
@@ -99,6 +114,12 @@ def _plot_histogram(
return plot
+@manage_matplotlib_context()
+def plot_word_cloud(config: Settings, word_counts: pd.Series) -> str:
+ _plot_word_cloud(series=word_counts)
+ return plot_360_n0sc0pe(config)
+
+
@manage_matplotlib_context()
def histogram(
config: Settings,
diff --git a/tests/issues/test_issue397.py b/tests/issues/test_issue397.py
index 672378573..ad952e849 100644
--- a/tests/issues/test_issue397.py
+++ b/tests/issues/test_issue397.py
@@ -27,7 +27,7 @@ def test_issue397():
description = report.description_set
- assert description.table["types"] == {"Categorical": 1, "Numeric": 4}
+ assert description.table["types"] == {"Text": 1, "Numeric": 4}
assert description.variables["float-inf"]["p_infinite"] == 0.5
assert description.variables["float-inf"]["n_infinite"] == 2
diff --git a/tests/issues/test_issue72.py b/tests/issues/test_issue72.py
index ae515b1d4..5c8bd19d3 100644
--- a/tests/issues/test_issue72.py
+++ b/tests/issues/test_issue72.py
@@ -32,7 +32,7 @@ def test_issue72_equal():
# 3 == 3, so categorical
assert report.get_description().variables["A"]["type"] == "Categorical"
# Strings are always categorical
- assert report.get_description().variables["B"]["type"] == "Categorical"
+ assert report.get_description().variables["B"]["type"] == "Text"
def test_issue72_lower():
@@ -44,4 +44,4 @@ def test_issue72_lower():
# 3 < 10, so categorical
assert report.get_description().variables["A"]["type"] == "Categorical"
# Strings are always categorical
- assert report.get_description().variables["B"]["type"] == "Categorical"
+ assert report.get_description().variables["B"]["type"] == "Text"
diff --git a/tests/unit/test_example.py b/tests/unit/test_example.py
index 6b196b78a..0797e9f93 100644
--- a/tests/unit/test_example.py
+++ b/tests/unit/test_example.py
@@ -47,4 +47,4 @@ def test_example(get_data_file, test_output_dir):
profile.to_file(output_file)
assert (test_output_dir / "profile.html").exists(), "Output file does not exist"
assert type(profile.get_description()) == BaseDescription, "Unexpected result"
- assert "14" in profile.to_html()
+ assert "9" in profile.to_html()
diff --git a/tests/unit/test_typeset_custom.py b/tests/unit/test_typeset_custom.py
index 86274a17f..2685d3237 100644
--- a/tests/unit/test_typeset_custom.py
+++ b/tests/unit/test_typeset_custom.py
@@ -1,5 +1,4 @@
import datetime
-from dataclasses import dataclass
import numpy as np
import pandas as pd
@@ -50,7 +49,7 @@ def get_profiling_series():
15.9,
13.5,
],
- "cat": [
+ "str": [
"a",
"long text value",
"Élysée",
@@ -61,8 +60,18 @@ def get_profiling_series():
"c",
"c",
],
+ "str_cat": pd.Series(
+ ["male", "male", None, "female", "female", "male", "male"]
+ ),
+ "str_num": ["1", "10", "3.14", "566"],
+ "str_date": ["2000/01/01", "2001/07/24", "2011/12/24", "1980/03/10"],
+ "str_date2": ["2000-01-01", "2001-07-24", "2011-12-24", "1980-03-10"],
"s1": np.ones(9),
"s2": ["some constant text $ % value {obj} " for _ in range(1, 10)],
+ "cat": pd.Series(
+ ["male", "male", None, "female", "female", "male", "male"],
+ dtype="category",
+ ),
"somedate": [
datetime.date(2011, 7, 4),
datetime.datetime(2022, 1, 1, 13, 57),
@@ -137,6 +146,7 @@ def get_profiling_series():
type_map = {str(k): k for k in my_typeset.types}
Numeric = type_map["Numeric"]
+Text = type_map["Text"]
Categorical = type_map["Categorical"]
Boolean = type_map["Boolean"]
DateTime = type_map["DateTime"]
@@ -147,23 +157,12 @@ def get_profiling_series():
typeset2 = ProfilingTypeSet(config2)
type_map2 = {str(k): k for k in typeset2.types}
Numeric2 = type_map2["Numeric"]
+Text2 = type_map2["Text"]
Categorical2 = type_map2["Categorical"]
+DateTime2 = type_map2["DateTime"]
Boolean2 = type_map2["Boolean"]
-@dataclass
-class DataTest:
- def __init__(self, name, contains_type, infer_type, cast_result=None):
- self.name = name
- self.contains_type = contains_type
- self.infer_type = infer_type
- self.cast_result = cast_result
-
-
-cases = [
- DataTest("x", Numeric, Numeric),
-]
-
contains_map = {
Numeric: {
"x",
@@ -177,19 +176,24 @@ def __init__(self, name, contains_type, infer_type, cast_result=None):
"inf_only",
"nullable_int",
},
- Categorical: {
- "id",
- "cat",
- "s2",
- "date_str",
+ Text: {
+ "str",
+ "str_cat",
+ "str_num",
+ "str_date",
+ "str_date2",
"str_yes_no",
"str_yes_no_mixed",
"str_yes_no_nan",
"str_true_false",
"str_true_false_none",
"str_true_false_nan",
+ "id",
"catnum",
+ "date_str",
+ "s2",
},
+ Categorical: {"cat"},
Boolean: {
"bool_tf",
"bool_tf_with_nan",
@@ -227,10 +231,15 @@ def test_contains(name, series, contains_type, member):
"integers_nan": Numeric,
"bool_01": Numeric,
"bool_01_with_nan": Numeric,
- "id": Categorical,
- "cat": Categorical,
+ "id": Text,
+ "str_cat": Categorical,
+ "str_num": Numeric,
+ "str_date": DateTime,
+ "str_date2": DateTime,
"s2": Categorical,
- "date_str": Categorical,
+ "date_str": DateTime,
+ "str": Text,
+ "cat": Categorical,
"bool_tf": Boolean,
"bool_tf_with_nan": Boolean,
"booleans_type": Boolean,
@@ -285,23 +294,28 @@ def test_inference(name, series, inference_type, typeset, difference):
"nullable_int",
},
),
- (
- Numeric2,
- Categorical2,
- {"catnum"},
- ),
(
Boolean2,
- Categorical2,
+ Text2,
{
- "str_true_false",
"str_yes_no",
"str_yes_no_mixed",
"str_yes_no_nan",
+ "str_true_false",
"str_true_false_nan",
"str_true_false_none",
},
),
+ (
+ Categorical2,
+ Text2,
+ {
+ "str_cat",
+ "s2",
+ },
+ ),
+ (Numeric2, Text2, {"str_num", "catnum"}),
+ (DateTime2, Text2, {"str_date", "str_date2", "date_str"}),
]
diff --git a/tests/unit/test_typeset_default.py b/tests/unit/test_typeset_default.py
index 11b42266b..84a7df643 100644
--- a/tests/unit/test_typeset_default.py
+++ b/tests/unit/test_typeset_default.py
@@ -29,6 +29,7 @@
type_map = {str(k): k for k in my_typeset_default.types}
Numeric = type_map["Numeric"]
+Text = type_map["Text"]
Categorical = type_map["Categorical"]
Boolean = type_map["Boolean"]
DateTime = type_map["DateTime"]
@@ -61,12 +62,7 @@
"complex_series_float",
"complex_series_py_float",
},
- Categorical: {
- "categorical_float_series",
- "categorical_int_series",
- "categorical_string_series",
- "categorical_char",
- "ordinal",
+ Text: {
"timestamp_string_series",
"string_with_sep_num_nan",
"string_series",
@@ -97,6 +93,13 @@
"py_datetime_str",
"string_dtype_series",
},
+ Categorical: {
+ "categorical_float_series",
+ "categorical_int_series",
+ "categorical_string_series",
+ "categorical_char",
+ "ordinal",
+ },
Boolean: {
"bool_series",
"bool_series2",
@@ -177,7 +180,7 @@ def test_contains(name, series, contains_type, member):
inference_map = {
"int_series": Numeric,
- "categorical_int_series": Numeric,
+ "categorical_int_series": Categorical,
"int_nan_series": Numeric,
"Int64_int_series": Numeric,
"Int64_int_nan_series": Numeric,
@@ -193,26 +196,26 @@ def test_contains(name, series, contains_type, member):
"float_series5": Numeric,
"float_series6": Numeric,
"complex_series_float": Numeric,
- "categorical_float_series": Numeric,
+ "categorical_float_series": Categorical,
"float_with_inf": Numeric,
"inf_series": Numeric,
"nan_series": Unsupported,
"nan_series_2": Unsupported,
- "string_series": Categorical,
+ "string_series": Text,
"categorical_string_series": Categorical,
- "timestamp_string_series": Categorical,
- "string_with_sep_num_nan": Categorical, # TODO: Introduce thousands separator
- "string_unicode_series": Categorical,
- "string_np_unicode_series": Categorical,
+ "timestamp_string_series": DateTime,
+ "string_with_sep_num_nan": Text, # TODO: Introduce thousands separator
+ "string_unicode_series": Text,
+ "string_np_unicode_series": Text,
"string_num_nan": Numeric,
"string_num": Numeric,
"string_flt_nan": Numeric,
"string_flt": Numeric,
- "string_str_nan": Categorical,
+ "string_str_nan": Text,
"string_bool_nan": Boolean,
"int_str_range": Numeric,
- "string_date": Categorical,
- "str_url": Categorical,
+ "string_date": DateTime,
+ "str_url": Text,
"bool_series": Boolean,
"bool_nan_series": Boolean,
"nullable_bool_series": Boolean,
@@ -234,9 +237,9 @@ def test_contains(name, series, contains_type, member):
"geometry_series": Unsupported,
"path_series_linux": Unsupported,
"path_series_linux_missing": Unsupported,
- "path_series_linux_str": Categorical,
+ "path_series_linux_str": Text,
"path_series_windows": Unsupported,
- "path_series_windows_str": Categorical,
+ "path_series_windows_str": Text,
"url_series": Unsupported,
"url_nan_series": Unsupported,
"url_none_series": Unsupported,
@@ -255,16 +258,16 @@ def test_contains(name, series, contains_type, member):
"empty_int64": Unsupported,
"empty_object": Unsupported,
"ip": Unsupported,
- "ip_str": Categorical,
+ "ip_str": Text,
"ip_missing": Unsupported,
"date_series_nat": DateTime,
"date": Unsupported,
"time": Unsupported,
"categorical_char": Categorical,
"ordinal": Categorical,
- "str_complex": Categorical,
+ "str_complex": Text,
"uuid_series": Unsupported,
- "uuid_series_str": Categorical,
+ "uuid_series_str": Text,
"uuid_series_missing": Unsupported,
"ip_mixed_v4andv6": Unsupported,
"file_test_py": Unsupported,
@@ -275,17 +278,17 @@ def test_contains(name, series, contains_type, member):
"str_int_leading_zeros": Numeric,
"str_float_non_leading_zeros": Numeric,
"str_int_zeros": Numeric,
- "email_address_str": Categorical,
- "str_complex_nan": Categorical,
+ "email_address_str": Text,
+ "str_complex_nan": Text,
"email_address": Unsupported,
"email_address_missing": Unsupported,
"all_null_nat": Unsupported,
- "all_null_empty_str": Categorical,
- "py_datetime_str": Categorical,
+ "all_null_empty_str": Text,
+ "py_datetime_str": DateTime,
"all_null_none": Unsupported,
"complex_series_py_float": Numeric,
"all_null_nan": Unsupported,
- "string_dtype_series": Categorical,
+ "string_dtype_series": Text,
}
@@ -311,7 +314,7 @@ def test_inference(name, series, inference_type, typeset, difference):
(Categorical, Numeric, {"mixed"}),
(
Numeric,
- Categorical,
+ Text,
{
"string_flt",
"string_num_nan",
@@ -331,12 +334,19 @@ def test_inference(name, series, inference_type, typeset, difference):
),
(
Boolean,
- Categorical,
+ Text,
{
"string_bool_nan",
"nullable_bool_series",
},
),
+ (
+ DateTime,
+ Text,
+ {"py_datetime_str", "timestamp_string_series", "string_date"},
+ ),
+ (Categorical, Text, {"categorical_string_series"}),
+ (Categorical, Numeric, {"categorical_float_series"}),
]