From 8d97234a765a846eecde1273c23e8bf32ed7ecf9 Mon Sep 17 00:00:00 2001
From: Jan Cap <71695857+vorel99@users.noreply.github.com>
Date: Wed, 3 May 2023 14:42:40 +0200
Subject: [PATCH] feat: add string type for text variables (#1282)

* chore(actions): fix docs publishing ci

* update type inference

-  added new data type String
  - added describe function for string
  - added render function for string

* add describe string for spark

- same as category

* add word cloud to requirements

* update tests

- replace Category with String, where needed

* change type hint

- update type hint at string_to_bool function

* change word cloud size to same ratio as fig size

* format string render

- update string render to same format as other renders

* update describe_string_spark

* change import order

* update 'String' type name to 'Text'

- 'Text' title is more accurate

* resolve pre-commit hooks
---
 requirements.txt                              |   3 +-
 src/ydata_profiling/config.py                 |  14 +-
 src/ydata_profiling/expectations_report.py    |   1 +
 src/ydata_profiling/model/handler.py          |   1 +
 src/ydata_profiling/model/pandas/__init__.py  |   2 +
 .../model/pandas/describe_text_pandas.py      |  64 ++++++
 .../model/spark/describe_text_spark.py        |  27 +++
 src/ydata_profiling/model/summarizer.py       |   4 +
 .../model/summary_algorithms.py               |   7 +
 src/ydata_profiling/model/typeset.py          | 111 +++++++++--
 .../model/typeset_relations.py                |  39 +++-
 .../report/structure/variables/__init__.py    |   2 +
 .../report/structure/variables/render_text.py | 184 ++++++++++++++++++
 src/ydata_profiling/visualisation/plot.py     |  21 ++
 tests/issues/test_issue397.py                 |   2 +-
 tests/issues/test_issue72.py                  |   4 +-
 tests/unit/test_example.py                    |   2 +-
 tests/unit/test_typeset_custom.py             |  74 ++++---
 tests/unit/test_typeset_default.py            |  66 ++++---
 19 files changed, 546 insertions(+), 82 deletions(-)
 create mode 100644 src/ydata_profiling/model/pandas/describe_text_pandas.py
 create mode 100644 src/ydata_profiling/model/spark/describe_text_spark.py
 create mode 100644 src/ydata_profiling/report/structure/variables/render_text.py

diff --git a/requirements.txt b/requirements.txt
index 28ba2bc04..a3c139bfd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -22,4 +22,5 @@ statsmodels>=0.13.2, <0.14
 # type checking
 typeguard>=2.13.2, <2.14
 imagehash==4.3.1
-dacite>=1.8
\ No newline at end of file
+wordcloud>=1.9.1
+dacite>=1.8
diff --git a/src/ydata_profiling/config.py b/src/ydata_profiling/config.py
index 0b7e4f8d6..a82e2dd3e 100644
--- a/src/ydata_profiling/config.py
+++ b/src/ydata_profiling/config.py
@@ -45,6 +45,17 @@ class NumVars(BaseModel):
     chi_squared_threshold: float = 0.999
 
 
+class TextVars(BaseModel):
+    length: bool = True
+    words: bool = True
+    characters: bool = True
+    redact: bool = False
+    # if text has more than this many distinct values, it's not categorical
+    categorical_threshold: int = 50
+    # if the fraction of distinct values is at or above this, it's not categorical
+    percentage_cat_threshold: float = 0.5
+
+
 class CatVars(BaseModel):
     length: bool = True
     characters: bool = True
@@ -106,6 +117,7 @@ class TimeseriesVars(BaseModel):
 
 class Univariate(BaseModel):
     num: NumVars = NumVars()
+    text: TextVars = TextVars()
     cat: CatVars = CatVars()
     image: ImageVars = ImageVars()
     bool: BoolVars = BoolVars()
@@ -395,7 +407,7 @@ class Config:
         "sensitive": {
             "samples": None,
             "duplicates": None,
-            "vars": {"cat": {"redact": True}},
+            "vars": {"cat": {"redact": True}, "text": {"redact": True}},
         },
         "dark_mode": {
             "html": {
diff --git a/src/ydata_profiling/expectations_report.py b/src/ydata_profiling/expectations_report.py
index c8583fbe2..7979e510b 100644
--- a/src/ydata_profiling/expectations_report.py
+++ b/src/ydata_profiling/expectations_report.py
@@ -15,6 +15,7 @@ class ExpectationHandler(Handler):
     def __init__(self, typeset: VisionsTypeset, *args, **kwargs):
         mapping = {
             "Unsupported": [expectation_algorithms.generic_expectations],
+            "Text": [expectation_algorithms.categorical_expectations],
             "Categorical": [expectation_algorithms.categorical_expectations],
             "Boolean": [expectation_algorithms.categorical_expectations],
             "Numeric": [expectation_algorithms.numeric_expectations],
diff --git a/src/ydata_profiling/model/handler.py b/src/ydata_profiling/model/handler.py
index 9d12dd705..5db948fd6 100644
--- a/src/ydata_profiling/model/handler.py
+++ b/src/ydata_profiling/model/handler.py
@@ -69,6 +69,7 @@ def get_render_map() -> Dict[str, Callable]:
         "Boolean": render_algorithms.render_boolean,
         "Numeric": render_algorithms.render_real,
         "Complex": render_algorithms.render_complex,
+        "Text": render_algorithms.render_text,
         "DateTime": render_algorithms.render_date,
         "Categorical": render_algorithms.render_categorical,
         "URL": render_algorithms.render_url,
diff --git a/src/ydata_profiling/model/pandas/__init__.py b/src/ydata_profiling/model/pandas/__init__.py
index 290fddace..b895f46e2 100644
--- a/src/ydata_profiling/model/pandas/__init__.py
+++ b/src/ydata_profiling/model/pandas/__init__.py
@@ -11,6 +11,7 @@
     describe_numeric_pandas,
     describe_path_pandas,
     describe_supported_pandas,
+    describe_text_pandas,
     describe_timeseries_pandas,
     describe_url_pandas,
     duplicates_pandas,
@@ -33,6 +34,7 @@
     "describe_numeric_pandas",
     "describe_path_pandas",
     "describe_supported_pandas",
+    "describe_text_pandas",
     "describe_timeseries_pandas",
     "describe_url_pandas",
     "duplicates_pandas",
diff --git a/src/ydata_profiling/model/pandas/describe_text_pandas.py b/src/ydata_profiling/model/pandas/describe_text_pandas.py
new file mode 100644
index 000000000..2701b9760
--- /dev/null
+++ b/src/ydata_profiling/model/pandas/describe_text_pandas.py
@@ -0,0 +1,64 @@
+from typing import Tuple
+
+import pandas as pd
+
+from ydata_profiling.config import Settings
+from ydata_profiling.model.pandas.describe_categorical_pandas import (
+    length_summary_vc,
+    unicode_summary_vc,
+    word_summary_vc,
+)
+from ydata_profiling.model.summary_algorithms import (
+    describe_text_1d,
+    histogram_compute,
+    series_handle_nulls,
+    series_hashable,
+)
+
+
+@describe_text_1d.register
+@series_hashable
+@series_handle_nulls
+def pandas_describe_text_1d(
+    config: Settings,
+    series: pd.Series,
+    summary: dict,
+) -> Tuple[Settings, pd.Series, dict]:
+    """Describe string series.
+
+    Args:
+        config: report Settings object
+        series: The Series to describe.
+        summary: The dict containing the series description so far.
+
+    Returns:
+        The config, the series cast to ``str``, and the updated summary dict.
+    """
+
+    series = series.astype(str)
+
+    # Only run if at least 1 non-missing value
+    value_counts = summary["value_counts_without_nan"]
+    value_counts.index = value_counts.index.astype(str)
+
+    summary.update({"first_rows": series.head(5)})
+
+    if config.vars.text.length:
+        summary.update(length_summary_vc(value_counts))
+        summary.update(
+            histogram_compute(
+                config,
+                summary["length_histogram"].index.values,
+                len(summary["length_histogram"]),
+                name="histogram_length",
+                weights=summary["length_histogram"].values,
+            )
+        )
+
+    if config.vars.text.characters:
+        summary.update(unicode_summary_vc(value_counts))
+
+    if config.vars.text.words:
+        summary.update(word_summary_vc(value_counts, config.vars.cat.stop_words))
+
+    return config, series, summary
diff --git a/src/ydata_profiling/model/spark/describe_text_spark.py b/src/ydata_profiling/model/spark/describe_text_spark.py
new file mode 100644
index 000000000..b5e27f615
--- /dev/null
+++ b/src/ydata_profiling/model/spark/describe_text_spark.py
@@ -0,0 +1,27 @@
+from typing import Tuple
+
+from pyspark.sql import DataFrame
+
+from ydata_profiling.config import Settings
+from ydata_profiling.model.summary_algorithms import describe_text_1d
+
+
+@describe_text_1d.register
+def describe_text_1d_spark(
+    config: Settings, df: DataFrame, summary: dict
+) -> Tuple[Settings, DataFrame, dict]:
+    """Describe a text variable held in a Spark DataFrame.
+
+    Args:
+        df: The Spark DataFrame to describe.
+        summary: The dict containing the series description so far.
+
+    Returns:
+        The config, the unchanged DataFrame, and the updated summary dict.
+    """
+
+    redact = config.vars.text.redact
+    if not redact:
+        summary["first_rows"] = df.limit(5).toPandas().squeeze("columns")
+
+    return config, df, summary
diff --git a/src/ydata_profiling/model/summarizer.py b/src/ydata_profiling/model/summarizer.py
index c112d14ed..f7b7d1b91 100644
--- a/src/ydata_profiling/model/summarizer.py
+++ b/src/ydata_profiling/model/summarizer.py
@@ -19,6 +19,7 @@
     describe_numeric_1d,
     describe_path_1d,
     describe_supported,
+    describe_text_1d,
     describe_timeseries_1d,
     describe_url_1d,
 )
@@ -58,6 +59,9 @@ def __init__(self, typeset: VisionsTypeset, *args, **kwargs):
             "DateTime": [
                 describe_date_1d,
             ],
+            "Text": [
+                describe_text_1d,
+            ],
             "Categorical": [
                 describe_categorical_1d,
             ],
diff --git a/src/ydata_profiling/model/summary_algorithms.py b/src/ydata_profiling/model/summary_algorithms.py
index fbb944681..ad64f8aae 100644
--- a/src/ydata_profiling/model/summary_algorithms.py
+++ b/src/ydata_profiling/model/summary_algorithms.py
@@ -126,6 +126,13 @@ def describe_numeric_1d(
     raise NotImplementedError()
 
 
+@multimethod
+def describe_text_1d(
+    config: Settings, series: Any, summary: dict
+) -> Tuple[Settings, Any, dict, Any]:
+    raise NotImplementedError()
+
+
 @multimethod
 def describe_date_1d(
     config: Settings, series: Any, summary: dict
diff --git a/src/ydata_profiling/model/typeset.py b/src/ydata_profiling/model/typeset.py
index 489c1d5e5..ee1c42e78 100644
--- a/src/ydata_profiling/model/typeset.py
+++ b/src/ydata_profiling/model/typeset.py
@@ -14,12 +14,15 @@
 
 from ydata_profiling.config import Settings
 from ydata_profiling.model.typeset_relations import (
-    category_is_numeric,
-    category_to_numeric,
     numeric_is_category,
     series_is_string,
     string_is_bool,
+    string_is_category,
+    string_is_datetime,
+    string_is_numeric,
     string_to_bool,
+    string_to_datetime,
+    string_to_numeric,
     to_bool,
     to_category,
 )
@@ -49,19 +52,38 @@ def typeset_types(config: Settings) -> Set[visions.VisionsBaseType]:
     """Define types based on the config"""
 
     class Unsupported(visions.Generic):
+        """Base type. All other types have relationship with this type."""
+
         pass
 
     class Numeric(visions.VisionsBaseType):
+        """Type for all numeric (float, int) columns.
+
+        Can be transformed from
+        - Unsupported
+        - Text
+
+        Examples
+        --------
+        >>> s = pd.Series([1, 2, 5, 3, 8, 9])
+        >>> s in Numeric
+        True
+
+        >>> s = pd.Series([.34, 2.9, 55, 3.14, 89, 91])
+        >>> s in Numeric
+        True
+        """
+
         @staticmethod
         def get_relations() -> Sequence[TypeRelation]:
             return [
                 IdentityRelation(Unsupported),
                 InferenceRelation(
-                    Categorical,
-                    relationship=lambda x, y: partial(category_is_numeric, k=config)(
+                    Text,
+                    relationship=lambda x, y: partial(string_is_numeric, k=config)(
                         x, y
                     ),
-                    transformer=category_to_numeric,
+                    transformer=string_to_numeric,
                 ),
             ]
 
@@ -72,11 +94,44 @@ def get_relations() -> Sequence[TypeRelation]:
         def contains_op(series: pd.Series, state: dict) -> bool:
             return pdt.is_numeric_dtype(series) and not pdt.is_bool_dtype(series)
 
+    class Text(visions.VisionsBaseType):
+        """Type for plaintext columns.
+        Like name, note, string identifier, residence etc.
+
+        Examples
+        --------
+        >>> s = pd.Series(["AX01", "BC32", "AC00"])
+        >>> s in Text
+        True
+
+        >>> s = pd.Series([1, 2, 3, 4])
+        >>> s in Text
+        False
+        """
+
+        @staticmethod
+        def get_relations() -> Sequence[TypeRelation]:
+            return [
+                IdentityRelation(Unsupported),
+            ]
+
+        @staticmethod
+        @multimethod
+        @series_not_empty
+        @series_handle_nulls
+        def contains_op(series: pd.Series, state: dict) -> bool:
+            return pdt.is_string_dtype(series) and series_is_string(series, state)
+
     class DateTime(visions.VisionsBaseType):
         @staticmethod
         def get_relations() -> Sequence[TypeRelation]:
             return [
                 IdentityRelation(Unsupported),
+                InferenceRelation(
+                    Text,
+                    relationship=lambda x, y: partial(string_is_datetime)(x, y),
+                    transformer=string_to_datetime,
+                ),
             ]
 
         @staticmethod
@@ -87,6 +142,30 @@ def contains_op(series: pd.Series, state: dict) -> bool:
             return pdt.is_datetime64_any_dtype(series)
 
     class Categorical(visions.VisionsBaseType):
+        """Type for categorical columns.
+        Categorical columns in pandas categorical format
+        and columns in string format with small count of unique values.
+
+        Can be transformed from:
+            - Unsupported
+            - Numeric
+            - Text
+
+        Examples
+        --------
+        >>> s = pd.Series(["male", "female", "female", "male"], dtype="category")
+        >>> s in Categorical
+        True
+
+        >>> s = pd.Series(["male", "female"])
+        >>> s in Categorical
+        False
+
+        >>> s = pd.Series(["male", "female", "female", "male"])
+        >>> s in Categorical
+        True
+        """
+
         @staticmethod
         def get_relations() -> Sequence[TypeRelation]:
             return [
@@ -98,6 +177,13 @@ def get_relations() -> Sequence[TypeRelation]:
                     ),
                     transformer=to_category,
                 ),
+                InferenceRelation(
+                    Text,
+                    relationship=lambda x, y: partial(string_is_category, k=config)(
+                        x, y
+                    ),
+                    transformer=to_category,
+                ),
             ]
 
         @staticmethod
@@ -110,12 +196,11 @@ def contains_op(series: pd.Series, state: dict) -> bool:
             )
             if is_valid_dtype:
                 return True
-            elif not pdt.is_object_dtype(series):
-                return pandas_has_string_dtype_flag and pdt.is_string_dtype(series)
-
-            return series_is_string(series, state)
+            return False
 
     class Boolean(visions.VisionsBaseType):
+        """Type for boolean columns."""
+
         @staticmethod
         def get_relations() -> Sequence[TypeRelation]:
             # Numeric [0, 1] goes via Categorical with distinct_count_without_nan <= 2
@@ -124,7 +209,7 @@ def get_relations() -> Sequence[TypeRelation]:
             return [
                 IdentityRelation(Unsupported),
                 InferenceRelation(
-                    Categorical,
+                    Text,
                     relationship=lambda x, y: partial(string_is_bool, k=mapping)(x, y),
                     transformer=lambda s, st: to_bool(
                         partial(string_to_bool, k=mapping)(s, st)
@@ -148,7 +233,7 @@ def contains_op(series: pd.Series, state: dict) -> bool:
     class URL(visions.VisionsBaseType):
         @staticmethod
         def get_relations() -> Sequence[TypeRelation]:
-            return [IdentityRelation(Categorical)]
+            return [IdentityRelation(Text)]
 
         @staticmethod
         @multimethod
@@ -164,7 +249,7 @@ def contains_op(series: pd.Series, state: dict) -> bool:
     class Path(visions.VisionsBaseType):
         @staticmethod
         def get_relations() -> Sequence[TypeRelation]:
-            return [IdentityRelation(Categorical)]
+            return [IdentityRelation(Text)]
 
         @staticmethod
         @multimethod
@@ -223,7 +308,7 @@ def is_timedependent(series: pd.Series) -> bool:
             is_numeric = pdt.is_numeric_dtype(series) and not pdt.is_bool_dtype(series)
             return is_numeric and is_timedependent(series)
 
-    types = {Unsupported, Boolean, Numeric, Categorical, DateTime}
+    types = {Unsupported, Boolean, Numeric, Text, Categorical, DateTime}
     if config.vars.path.active:
         types.add(Path)
         if config.vars.file.active:
diff --git a/src/ydata_profiling/model/typeset_relations.py b/src/ydata_profiling/model/typeset_relations.py
index baa047305..0a8bd6d4a 100644
--- a/src/ydata_profiling/model/typeset_relations.py
+++ b/src/ydata_profiling/model/typeset_relations.py
@@ -1,5 +1,5 @@
 import functools
-from typing import Callable
+from typing import Callable, Dict
 
 import numpy as np
 import pandas as pd
@@ -24,7 +24,7 @@ def inner(series: pd.Series, *args, **kwargs) -> bool:
     return inner
 
 
-def string_is_bool(series: pd.Series, state: dict, k: Settings) -> bool:
+def string_is_bool(series: pd.Series, state: dict, k: Dict[str, bool]) -> bool:
     @series_handle_nulls
     @try_func
     def tester(s: pd.Series, state: dict) -> bool:
@@ -36,7 +36,7 @@ def tester(s: pd.Series, state: dict) -> bool:
     return tester(series, state)
 
 
-def string_to_bool(series: pd.Series, state: dict, k: Settings) -> pd.Series:
+def string_to_bool(series: pd.Series, state: dict, k: Dict[str, bool]) -> pd.Series:
     return series.str.lower().map(k)
 
 
@@ -66,7 +66,32 @@ def series_is_string(series: pd.Series, state: dict) -> bool:
 
 
 @series_handle_nulls
-def category_is_numeric(series: pd.Series, state: dict, k: Settings) -> bool:
+def string_is_category(series: pd.Series, state: dict, k: Settings) -> bool:
+    """String is category, if following conditions are met
+    - has at least one and at most ``categorical_threshold`` distinct values
+    - distinct values / all values is below ``percentage_cat_threshold``
+    - is not bool"""
+    n_unique = series.nunique()
+    unique_threshold = k.vars.text.percentage_cat_threshold
+    threshold = k.vars.text.categorical_threshold
+    return (
+        1 <= n_unique <= threshold
+        and n_unique / series.size < unique_threshold
+        and not string_is_bool(series, state, k.vars.bool.mappings)
+    )
+
+
+@series_handle_nulls
+def string_is_datetime(series: pd.Series, state: dict) -> bool:
+    """If we can transform data to datetime and at least one is valid date."""
+    try:
+        return not series.astype("datetime64").isna().all()
+    except:  # noqa: E722
+        return False
+
+
+@series_handle_nulls
+def string_is_numeric(series: pd.Series, state: dict, k: Settings) -> bool:
     if pdt.is_bool_dtype(series) or object_is_bool(series, state):
         return False
 
@@ -81,7 +106,11 @@ def category_is_numeric(series: pd.Series, state: dict, k: Settings) -> bool:
     return not numeric_is_category(series, state, k)
 
 
-def category_to_numeric(series: pd.Series, state: dict) -> pd.Series:
+def string_to_datetime(series: pd.Series, state: dict) -> pd.Series:
+    return series.astype("datetime64")
+
+
+def string_to_numeric(series: pd.Series, state: dict) -> pd.Series:
     return pd.to_numeric(series, errors="coerce")
 
 
diff --git a/src/ydata_profiling/report/structure/variables/__init__.py b/src/ydata_profiling/report/structure/variables/__init__.py
index 0c513b15d..64f1d6d54 100644
--- a/src/ydata_profiling/report/structure/variables/__init__.py
+++ b/src/ydata_profiling/report/structure/variables/__init__.py
@@ -11,6 +11,7 @@
 from ydata_profiling.report.structure.variables.render_image import render_image
 from ydata_profiling.report.structure.variables.render_path import render_path
 from ydata_profiling.report.structure.variables.render_real import render_real
+from ydata_profiling.report.structure.variables.render_text import render_text
 from ydata_profiling.report.structure.variables.render_timeseries import (
     render_timeseries,
 )
@@ -28,6 +29,7 @@
     "render_image",
     "render_path",
     "render_real",
+    "render_text",
     "render_timeseries",
     "render_url",
 ]
diff --git a/src/ydata_profiling/report/structure/variables/render_text.py b/src/ydata_profiling/report/structure/variables/render_text.py
new file mode 100644
index 000000000..227ffa9f6
--- /dev/null
+++ b/src/ydata_profiling/report/structure/variables/render_text.py
@@ -0,0 +1,184 @@
+from typing import Any, Dict, List
+
+from ydata_profiling.config import Settings
+from ydata_profiling.report.formatters import fmt, fmt_bytesize, fmt_percent
+from ydata_profiling.report.presentation.core import (
+    Container,
+    FrequencyTable,
+    Image,
+    Table,
+)
+from ydata_profiling.report.presentation.core.variable_info import VariableInfo
+from ydata_profiling.report.structure.variables.render_categorical import (
+    _get_n,
+    freq_table,
+    render_categorical_frequency,
+    render_categorical_length,
+    render_categorical_unicode,
+)
+from ydata_profiling.report.structure.variables.render_common import render_common
+from ydata_profiling.visualisation.plot import plot_word_cloud
+
+
+def render_text(config: Settings, summary: Dict[str, Any]) -> Dict[str, Any]:
+    varid = summary["varid"]
+    words = config.vars.text.words
+    characters = config.vars.text.characters
+    length = config.vars.text.length
+
+    template_variables = render_common(config, summary)
+
+    top_items: List[Any] = []
+    var_info = VariableInfo(
+        anchor_id=varid,
+        var_name=summary["varname"],
+        var_type=summary["type"],
+        alerts=summary["alerts"],
+        description=summary["description"],
+        style=config.html.style,
+    )
+    top_items.append(var_info)
+
+    table = Table(
+        [
+            {
+                "name": "Distinct",
+                "value": fmt(summary["n_distinct"]),
+                "alert": "n_distinct" in summary["alert_fields"],
+            },
+            {
+                "name": "Distinct (%)",
+                "value": fmt_percent(summary["p_distinct"]),
+                "alert": "p_distinct" in summary["alert_fields"],
+            },
+            {
+                "name": "Missing",
+                "value": fmt(summary["n_missing"]),
+                "alert": "n_missing" in summary["alert_fields"],
+            },
+            {
+                "name": "Missing (%)",
+                "value": fmt_percent(summary["p_missing"]),
+                "alert": "p_missing" in summary["alert_fields"],
+            },
+            {
+                "name": "Memory size",
+                "value": fmt_bytesize(summary["memory_size"]),
+                "alert": False,
+            },
+        ],
+        style=config.html.style,
+    )
+    top_items.append(table)
+
+    if words and "word_counts" in summary:
+        mini_wordcloud = Image(
+            plot_word_cloud(config, summary["word_counts"]),
+            image_format=config.plot.image_format,
+            alt="Mini wordcloud",
+        )
+        top_items.append(mini_wordcloud)
+    template_variables["top"] = Container(top_items, sequence_type="grid")
+
+    # ============================================================================================
+
+    bottom_items = []
+    overview_items = []
+    # length isn't being computed for categorical in spark
+    if length and "max_length" in summary:
+        length_table, length_histo = render_categorical_length(config, summary, varid)
+        overview_items.append(length_table)
+
+    # characters isn't being computed for categorical in spark
+    unitab = None
+    if characters and "category_alias_counts" in summary:
+        overview_table_char, unitab = render_categorical_unicode(config, summary, varid)
+        overview_items.append(overview_table_char)
+
+    unique_stats = render_categorical_frequency(config, summary, varid)
+    overview_items.append(unique_stats)
+
+    if not config.vars.text.redact:
+        rows = ("1st row", "2nd row", "3rd row", "4th row", "5th row")
+
+        if isinstance(summary["first_rows"], list):
+            sample = Table(
+                [
+                    {
+                        "name": name,
+                        "value": fmt(value),
+                        "alert": False,
+                    }
+                    for name, *value in zip(rows, *summary["first_rows"])
+                ],
+                name="Sample",
+                style=config.html.style,
+            )
+        else:
+            sample = Table(
+                [
+                    {
+                        "name": name,
+                        "value": fmt(value),
+                        "alert": False,
+                    }
+                    for name, value in zip(rows, summary["first_rows"])
+                ],
+                name="Sample",
+                style=config.html.style,
+            )
+        overview_items.append(sample)
+    overview = Container(
+        overview_items,
+        name="Overview",
+        anchor_id=f"{varid}overview",
+        sequence_type="batch_grid",
+        batch_size=len(overview_items),
+        titles=False,
+    )
+    bottom_items.append(overview)
+
+    if words and "word_counts" in summary:
+        woc = freq_table(
+            freqtable=summary["word_counts"],
+            n=_get_n(summary["word_counts"]),
+            max_number_to_print=10,
+        )
+
+        fqwo = FrequencyTable(
+            woc,
+            name="Common words",
+            anchor_id=f"{varid}cwo",
+            redact=config.vars.text.redact,
+        )
+
+        image = Image(
+            plot_word_cloud(config, summary["word_counts"]),
+            image_format=config.plot.image_format,
+            alt="Wordcloud",
+        )
+
+        bottom_items.append(
+            Container(
+                [fqwo, image],
+                name="Words",
+                anchor_id=f"{varid}word",
+                sequence_type="grid",
+            )
+        )
+
+    if unitab is not None:
+        bottom_items.append(
+            Container(
+                [unitab],
+                name="Characters",
+                anchor_id=f"{varid}characters",
+                sequence_type="grid",
+            )
+        )
+
+    template_variables["bottom"] = Container(
+        bottom_items, sequence_type="tabs", anchor_id=f"{varid}bottom"
+    )
+
+    return template_variables
diff --git a/src/ydata_profiling/visualisation/plot.py b/src/ydata_profiling/visualisation/plot.py
index fd8764b38..ad2778450 100644
--- a/src/ydata_profiling/visualisation/plot.py
+++ b/src/ydata_profiling/visualisation/plot.py
@@ -13,6 +13,7 @@
 from matplotlib.ticker import FuncFormatter
 from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
 from typeguard import typechecked
+from wordcloud import WordCloud
 
 from ydata_profiling.config import Settings
 from ydata_profiling.utils.common import convert_timestamp_to_datetime
@@ -24,6 +25,20 @@ def format_fn(tick_val: int, tick_pos: Any) -> str:
     return convert_timestamp_to_datetime(tick_val).strftime("%Y-%m-%d %H:%M:%S")
 
 
+def _plot_word_cloud(
+    series: pd.Series,
+    figsize: tuple = (6, 4),
+) -> plt.Figure:
+    word_dict = series.to_dict()
+    wordcloud = WordCloud(
+        background_color="white", random_state=123, width=300, height=200, scale=2
+    ).generate_from_frequencies(word_dict)
+    plt.figure(figsize=figsize)
+    plot = plt.imshow(wordcloud, interpolation="bilinear")
+    plt.axis("off")
+    return plot
+
+
 def _plot_histogram(
     config: Settings,
     series: np.ndarray,
@@ -99,6 +114,12 @@ def _plot_histogram(
     return plot
 
 
+@manage_matplotlib_context()
+def plot_word_cloud(config: Settings, word_counts: pd.Series) -> str:
+    _plot_word_cloud(series=word_counts)
+    return plot_360_n0sc0pe(config)
+
+
 @manage_matplotlib_context()
 def histogram(
     config: Settings,
diff --git a/tests/issues/test_issue397.py b/tests/issues/test_issue397.py
index 672378573..ad952e849 100644
--- a/tests/issues/test_issue397.py
+++ b/tests/issues/test_issue397.py
@@ -27,7 +27,7 @@ def test_issue397():
 
     description = report.description_set
 
-    assert description.table["types"] == {"Categorical": 1, "Numeric": 4}
+    assert description.table["types"] == {"Text": 1, "Numeric": 4}
 
     assert description.variables["float-inf"]["p_infinite"] == 0.5
     assert description.variables["float-inf"]["n_infinite"] == 2
diff --git a/tests/issues/test_issue72.py b/tests/issues/test_issue72.py
index ae515b1d4..5c8bd19d3 100644
--- a/tests/issues/test_issue72.py
+++ b/tests/issues/test_issue72.py
@@ -32,7 +32,7 @@ def test_issue72_equal():
     # 3 == 3, so categorical
     assert report.get_description().variables["A"]["type"] == "Categorical"
     # Strings are always categorical
-    assert report.get_description().variables["B"]["type"] == "Categorical"
+    assert report.get_description().variables["B"]["type"] == "Text"
 
 
 def test_issue72_lower():
@@ -44,4 +44,4 @@ def test_issue72_lower():
     # 3 < 10, so categorical
     assert report.get_description().variables["A"]["type"] == "Categorical"
     # Strings are always categorical
-    assert report.get_description().variables["B"]["type"] == "Categorical"
+    assert report.get_description().variables["B"]["type"] == "Text"
diff --git a/tests/unit/test_example.py b/tests/unit/test_example.py
index 6b196b78a..0797e9f93 100644
--- a/tests/unit/test_example.py
+++ b/tests/unit/test_example.py
@@ -47,4 +47,4 @@ def test_example(get_data_file, test_output_dir):
     profile.to_file(output_file)
     assert (test_output_dir / "profile.html").exists(), "Output file does not exist"
     assert type(profile.get_description()) == BaseDescription, "Unexpected result"
-    assert "<span class=badge>14</span>" in profile.to_html()
+    assert "<span class=badge>9</span>" in profile.to_html()
diff --git a/tests/unit/test_typeset_custom.py b/tests/unit/test_typeset_custom.py
index 86274a17f..2685d3237 100644
--- a/tests/unit/test_typeset_custom.py
+++ b/tests/unit/test_typeset_custom.py
@@ -1,5 +1,4 @@
 import datetime
-from dataclasses import dataclass
 
 import numpy as np
 import pandas as pd
@@ -50,7 +49,7 @@ def get_profiling_series():
             15.9,
             13.5,
         ],
-        "cat": [
+        "str": [
             "a",
             "long text value",
             "Élysée",
@@ -61,8 +60,18 @@ def get_profiling_series():
             "c",
             "c",
         ],
+        "str_cat": pd.Series(
+            ["male", "male", None, "female", "female", "male", "male"]
+        ),
+        "str_num": ["1", "10", "3.14", "566"],
+        "str_date": ["2000/01/01", "2001/07/24", "2011/12/24", "1980/03/10"],
+        "str_date2": ["2000-01-01", "2001-07-24", "2011-12-24", "1980-03-10"],
         "s1": np.ones(9),
         "s2": ["some constant text $ % value {obj} " for _ in range(1, 10)],
+        "cat": pd.Series(
+            ["male", "male", None, "female", "female", "male", "male"],
+            dtype="category",
+        ),
         "somedate": [
             datetime.date(2011, 7, 4),
             datetime.datetime(2022, 1, 1, 13, 57),
@@ -137,6 +146,7 @@ def get_profiling_series():
 
 type_map = {str(k): k for k in my_typeset.types}
 Numeric = type_map["Numeric"]
+Text = type_map["Text"]
 Categorical = type_map["Categorical"]
 Boolean = type_map["Boolean"]
 DateTime = type_map["DateTime"]
@@ -147,23 +157,12 @@ def get_profiling_series():
 typeset2 = ProfilingTypeSet(config2)
 type_map2 = {str(k): k for k in typeset2.types}
 Numeric2 = type_map2["Numeric"]
+Text2 = type_map2["Text"]
 Categorical2 = type_map2["Categorical"]
+DateTime2 = type_map2["DateTime"]
 Boolean2 = type_map2["Boolean"]
 
 
-@dataclass
-class DataTest:
-    def __init__(self, name, contains_type, infer_type, cast_result=None):
-        self.name = name
-        self.contains_type = contains_type
-        self.infer_type = infer_type
-        self.cast_result = cast_result
-
-
-cases = [
-    DataTest("x", Numeric, Numeric),
-]
-
 contains_map = {
     Numeric: {
         "x",
@@ -177,19 +176,24 @@ def __init__(self, name, contains_type, infer_type, cast_result=None):
         "inf_only",
         "nullable_int",
     },
-    Categorical: {
-        "id",
-        "cat",
-        "s2",
-        "date_str",
+    Text: {
+        "str",
+        "str_cat",
+        "str_num",
+        "str_date",
+        "str_date2",
         "str_yes_no",
         "str_yes_no_mixed",
         "str_yes_no_nan",
         "str_true_false",
         "str_true_false_none",
         "str_true_false_nan",
+        "id",
         "catnum",
+        "date_str",
+        "s2",
     },
+    Categorical: {"cat"},
     Boolean: {
         "bool_tf",
         "bool_tf_with_nan",
@@ -227,10 +231,15 @@ def test_contains(name, series, contains_type, member):
     "integers_nan": Numeric,
     "bool_01": Numeric,
     "bool_01_with_nan": Numeric,
-    "id": Categorical,
-    "cat": Categorical,
+    "id": Text,
+    "str_cat": Categorical,
+    "str_num": Numeric,
+    "str_date": DateTime,
+    "str_date2": DateTime,
     "s2": Categorical,
-    "date_str": Categorical,
+    "date_str": DateTime,
+    "str": Text,
+    "cat": Categorical,
     "bool_tf": Boolean,
     "bool_tf_with_nan": Boolean,
     "booleans_type": Boolean,
@@ -285,23 +294,28 @@ def test_inference(name, series, inference_type, typeset, difference):
             "nullable_int",
         },
     ),
-    (
-        Numeric2,
-        Categorical2,
-        {"catnum"},
-    ),
     (
         Boolean2,
-        Categorical2,
+        Text2,
         {
-            "str_true_false",
             "str_yes_no",
             "str_yes_no_mixed",
             "str_yes_no_nan",
+            "str_true_false",
             "str_true_false_nan",
             "str_true_false_none",
         },
     ),
+    (
+        Categorical2,
+        Text2,
+        {
+            "str_cat",
+            "s2",
+        },
+    ),
+    (Numeric2, Text2, {"str_num", "catnum"}),
+    (DateTime2, Text2, {"str_date", "str_date2", "date_str"}),
 ]
 
 
diff --git a/tests/unit/test_typeset_default.py b/tests/unit/test_typeset_default.py
index 11b42266b..84a7df643 100644
--- a/tests/unit/test_typeset_default.py
+++ b/tests/unit/test_typeset_default.py
@@ -29,6 +29,7 @@
 
 type_map = {str(k): k for k in my_typeset_default.types}
 Numeric = type_map["Numeric"]
+Text = type_map["Text"]
 Categorical = type_map["Categorical"]
 Boolean = type_map["Boolean"]
 DateTime = type_map["DateTime"]
@@ -61,12 +62,7 @@
         "complex_series_float",
         "complex_series_py_float",
     },
-    Categorical: {
-        "categorical_float_series",
-        "categorical_int_series",
-        "categorical_string_series",
-        "categorical_char",
-        "ordinal",
+    Text: {
         "timestamp_string_series",
         "string_with_sep_num_nan",
         "string_series",
@@ -97,6 +93,13 @@
         "py_datetime_str",
         "string_dtype_series",
     },
+    Categorical: {
+        "categorical_float_series",
+        "categorical_int_series",
+        "categorical_string_series",
+        "categorical_char",
+        "ordinal",
+    },
     Boolean: {
         "bool_series",
         "bool_series2",
@@ -177,7 +180,7 @@ def test_contains(name, series, contains_type, member):
 
 inference_map = {
     "int_series": Numeric,
-    "categorical_int_series": Numeric,
+    "categorical_int_series": Categorical,
     "int_nan_series": Numeric,
     "Int64_int_series": Numeric,
     "Int64_int_nan_series": Numeric,
@@ -193,26 +196,26 @@ def test_contains(name, series, contains_type, member):
     "float_series5": Numeric,
     "float_series6": Numeric,
     "complex_series_float": Numeric,
-    "categorical_float_series": Numeric,
+    "categorical_float_series": Categorical,
     "float_with_inf": Numeric,
     "inf_series": Numeric,
     "nan_series": Unsupported,
     "nan_series_2": Unsupported,
-    "string_series": Categorical,
+    "string_series": Text,
     "categorical_string_series": Categorical,
-    "timestamp_string_series": Categorical,
-    "string_with_sep_num_nan": Categorical,  # TODO: Introduce thousands separator
-    "string_unicode_series": Categorical,
-    "string_np_unicode_series": Categorical,
+    "timestamp_string_series": DateTime,
+    "string_with_sep_num_nan": Text,  # TODO: Introduce thousands separator
+    "string_unicode_series": Text,
+    "string_np_unicode_series": Text,
     "string_num_nan": Numeric,
     "string_num": Numeric,
     "string_flt_nan": Numeric,
     "string_flt": Numeric,
-    "string_str_nan": Categorical,
+    "string_str_nan": Text,
     "string_bool_nan": Boolean,
     "int_str_range": Numeric,
-    "string_date": Categorical,
-    "str_url": Categorical,
+    "string_date": DateTime,
+    "str_url": Text,
     "bool_series": Boolean,
     "bool_nan_series": Boolean,
     "nullable_bool_series": Boolean,
@@ -234,9 +237,9 @@ def test_contains(name, series, contains_type, member):
     "geometry_series": Unsupported,
     "path_series_linux": Unsupported,
     "path_series_linux_missing": Unsupported,
-    "path_series_linux_str": Categorical,
+    "path_series_linux_str": Text,
     "path_series_windows": Unsupported,
-    "path_series_windows_str": Categorical,
+    "path_series_windows_str": Text,
     "url_series": Unsupported,
     "url_nan_series": Unsupported,
     "url_none_series": Unsupported,
@@ -255,16 +258,16 @@ def test_contains(name, series, contains_type, member):
     "empty_int64": Unsupported,
     "empty_object": Unsupported,
     "ip": Unsupported,
-    "ip_str": Categorical,
+    "ip_str": Text,
     "ip_missing": Unsupported,
     "date_series_nat": DateTime,
     "date": Unsupported,
     "time": Unsupported,
     "categorical_char": Categorical,
     "ordinal": Categorical,
-    "str_complex": Categorical,
+    "str_complex": Text,
     "uuid_series": Unsupported,
-    "uuid_series_str": Categorical,
+    "uuid_series_str": Text,
     "uuid_series_missing": Unsupported,
     "ip_mixed_v4andv6": Unsupported,
     "file_test_py": Unsupported,
@@ -275,17 +278,17 @@ def test_contains(name, series, contains_type, member):
     "str_int_leading_zeros": Numeric,
     "str_float_non_leading_zeros": Numeric,
     "str_int_zeros": Numeric,
-    "email_address_str": Categorical,
-    "str_complex_nan": Categorical,
+    "email_address_str": Text,
+    "str_complex_nan": Text,
     "email_address": Unsupported,
     "email_address_missing": Unsupported,
     "all_null_nat": Unsupported,
-    "all_null_empty_str": Categorical,
-    "py_datetime_str": Categorical,
+    "all_null_empty_str": Text,
+    "py_datetime_str": DateTime,
     "all_null_none": Unsupported,
     "complex_series_py_float": Numeric,
     "all_null_nan": Unsupported,
-    "string_dtype_series": Categorical,
+    "string_dtype_series": Text,
 }
 
 
@@ -311,7 +314,7 @@ def test_inference(name, series, inference_type, typeset, difference):
     (Categorical, Numeric, {"mixed"}),
     (
         Numeric,
-        Categorical,
+        Text,
         {
             "string_flt",
             "string_num_nan",
@@ -331,12 +334,19 @@ def test_inference(name, series, inference_type, typeset, difference):
     ),
     (
         Boolean,
-        Categorical,
+        Text,
         {
             "string_bool_nan",
             "nullable_bool_series",
         },
     ),
+    (
+        DateTime,
+        Text,
+        {"py_datetime_str", "timestamp_string_series", "string_date"},
+    ),
+    (Categorical, Text, {"categorical_string_series"}),
+    (Categorical, Numeric, {"categorical_float_series"}),
 ]