From 8d97234a765a846eecde1273c23e8bf32ed7ecf9 Mon Sep 17 00:00:00 2001 From: Jan Cap <71695857+vorel99@users.noreply.github.com> Date: Wed, 3 May 2023 14:42:40 +0200 Subject: [PATCH] feat: add string type for text variables (#1282) * chore(actions): fix docs publishing ci * update type inferring - added new data type String - added describe function for string - added render function for string * add describe string for spark - same as category * add word cloud to requirements * update tests -replace Category with String, where needed * change type hint - update type hint at string_to_bool function * change word cloud size to same ratio as fig size * format string render - update string render to same format as other renders * update describe_string_spark * change import order * update 'String' type name to 'Text' - 'Text' title is more accurate * resolve pre-commit hooks --- requirements.txt | 3 +- src/ydata_profiling/config.py | 14 +- src/ydata_profiling/expectations_report.py | 1 + src/ydata_profiling/model/handler.py | 1 + src/ydata_profiling/model/pandas/__init__.py | 2 + .../model/pandas/describe_text_pandas.py | 64 ++++++ .../model/spark/describe_text_spark.py | 27 +++ src/ydata_profiling/model/summarizer.py | 4 + .../model/summary_algorithms.py | 7 + src/ydata_profiling/model/typeset.py | 111 +++++++++-- .../model/typeset_relations.py | 39 +++- .../report/structure/variables/__init__.py | 2 + .../report/structure/variables/render_text.py | 184 ++++++++++++++++++ src/ydata_profiling/visualisation/plot.py | 21 ++ tests/issues/test_issue397.py | 2 +- tests/issues/test_issue72.py | 4 +- tests/unit/test_example.py | 2 +- tests/unit/test_typeset_custom.py | 74 ++++--- tests/unit/test_typeset_default.py | 66 ++++--- 19 files changed, 546 insertions(+), 82 deletions(-) create mode 100644 src/ydata_profiling/model/pandas/describe_text_pandas.py create mode 100644 src/ydata_profiling/model/spark/describe_text_spark.py create mode 100644 
src/ydata_profiling/report/structure/variables/render_text.py diff --git a/requirements.txt b/requirements.txt index 28ba2bc04..a3c139bfd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,4 +22,5 @@ statsmodels>=0.13.2, <0.14 # type checking typeguard>=2.13.2, <2.14 imagehash==4.3.1 -dacite>=1.8 \ No newline at end of file +wordcloud>=1.9.1 +dacite>=1.8 diff --git a/src/ydata_profiling/config.py b/src/ydata_profiling/config.py index 0b7e4f8d6..a82e2dd3e 100644 --- a/src/ydata_profiling/config.py +++ b/src/ydata_profiling/config.py @@ -45,6 +45,17 @@ class NumVars(BaseModel): chi_squared_threshold: float = 0.999 +class TextVars(BaseModel): + length: bool = True + words: bool = True + characters: bool = True + redact: bool = False + # if text has more than threshold categories, its not category + categorical_threshold: int = 50 + # if text has more than threshold % distinct values, its not category + percentage_cat_threshold: float = 0.5 + + class CatVars(BaseModel): length: bool = True characters: bool = True @@ -106,6 +117,7 @@ class TimeseriesVars(BaseModel): class Univariate(BaseModel): num: NumVars = NumVars() + text: TextVars = TextVars() cat: CatVars = CatVars() image: ImageVars = ImageVars() bool: BoolVars = BoolVars() @@ -395,7 +407,7 @@ class Config: "sensitive": { "samples": None, "duplicates": None, - "vars": {"cat": {"redact": True}}, + "vars": {"cat": {"redact": True}, "text": {"redact": True}}, }, "dark_mode": { "html": { diff --git a/src/ydata_profiling/expectations_report.py b/src/ydata_profiling/expectations_report.py index c8583fbe2..7979e510b 100644 --- a/src/ydata_profiling/expectations_report.py +++ b/src/ydata_profiling/expectations_report.py @@ -15,6 +15,7 @@ class ExpectationHandler(Handler): def __init__(self, typeset: VisionsTypeset, *args, **kwargs): mapping = { "Unsupported": [expectation_algorithms.generic_expectations], + "Text": [expectation_algorithms.categorical_expectations], "Categorical": 
[expectation_algorithms.categorical_expectations], "Boolean": [expectation_algorithms.categorical_expectations], "Numeric": [expectation_algorithms.numeric_expectations], diff --git a/src/ydata_profiling/model/handler.py b/src/ydata_profiling/model/handler.py index 9d12dd705..5db948fd6 100644 --- a/src/ydata_profiling/model/handler.py +++ b/src/ydata_profiling/model/handler.py @@ -69,6 +69,7 @@ def get_render_map() -> Dict[str, Callable]: "Boolean": render_algorithms.render_boolean, "Numeric": render_algorithms.render_real, "Complex": render_algorithms.render_complex, + "Text": render_algorithms.render_text, "DateTime": render_algorithms.render_date, "Categorical": render_algorithms.render_categorical, "URL": render_algorithms.render_url, diff --git a/src/ydata_profiling/model/pandas/__init__.py b/src/ydata_profiling/model/pandas/__init__.py index 290fddace..b895f46e2 100644 --- a/src/ydata_profiling/model/pandas/__init__.py +++ b/src/ydata_profiling/model/pandas/__init__.py @@ -11,6 +11,7 @@ describe_numeric_pandas, describe_path_pandas, describe_supported_pandas, + describe_text_pandas, describe_timeseries_pandas, describe_url_pandas, duplicates_pandas, @@ -33,6 +34,7 @@ "describe_numeric_pandas", "describe_path_pandas", "describe_supported_pandas", + "describe_text_pandas", "describe_timeseries_pandas", "describe_url_pandas", "duplicates_pandas", diff --git a/src/ydata_profiling/model/pandas/describe_text_pandas.py b/src/ydata_profiling/model/pandas/describe_text_pandas.py new file mode 100644 index 000000000..2701b9760 --- /dev/null +++ b/src/ydata_profiling/model/pandas/describe_text_pandas.py @@ -0,0 +1,64 @@ +from typing import Tuple + +import pandas as pd + +from ydata_profiling.config import Settings +from ydata_profiling.model.pandas.describe_categorical_pandas import ( + length_summary_vc, + unicode_summary_vc, + word_summary_vc, +) +from ydata_profiling.model.summary_algorithms import ( + describe_text_1d, + histogram_compute, + series_handle_nulls, + 
series_hashable, +) + + +@describe_text_1d.register +@series_hashable +@series_handle_nulls +def pandas_describe_text_1d( + config: Settings, + series: pd.Series, + summary: dict, +) -> Tuple[Settings, pd.Series, dict]: + """Describe string series. + + Args: + config: report Settings object + series: The Series to describe. + summary: The dict containing the series description so far. + + Returns: + A dict containing calculated series description values. + """ + + series = series.astype(str) + + # Only run if at least 1 non-missing value + value_counts = summary["value_counts_without_nan"] + value_counts.index = value_counts.index.astype(str) + + summary.update({"first_rows": series.head(5)}) + + if config.vars.text.length: + summary.update(length_summary_vc(value_counts)) + summary.update( + histogram_compute( + config, + summary["length_histogram"].index.values, + len(summary["length_histogram"]), + name="histogram_length", + weights=summary["length_histogram"].values, + ) + ) + + if config.vars.text.characters: + summary.update(unicode_summary_vc(value_counts)) + + if config.vars.text.words: + summary.update(word_summary_vc(value_counts, config.vars.cat.stop_words)) + + return config, series, summary diff --git a/src/ydata_profiling/model/spark/describe_text_spark.py b/src/ydata_profiling/model/spark/describe_text_spark.py new file mode 100644 index 000000000..b5e27f615 --- /dev/null +++ b/src/ydata_profiling/model/spark/describe_text_spark.py @@ -0,0 +1,27 @@ +from typing import Tuple + +from pyspark.sql import DataFrame + +from ydata_profiling.config import Settings +from ydata_profiling.model.summary_algorithms import describe_text_1d + + +@describe_text_1d.register +def describe_text_1d_spark( + config: Settings, df: DataFrame, summary: dict +) -> Tuple[Settings, DataFrame, dict]: + """Describe a categorical series. + + Args: + series: The Series to describe. + summary: The dict containing the series description so far. 
+ + Returns: + A dict containing calculated series description values. + """ + + redact = config.vars.text.redact + if not redact: + summary["first_rows"] = df.limit(5).toPandas().squeeze("columns") + + return config, df, summary diff --git a/src/ydata_profiling/model/summarizer.py b/src/ydata_profiling/model/summarizer.py index c112d14ed..f7b7d1b91 100644 --- a/src/ydata_profiling/model/summarizer.py +++ b/src/ydata_profiling/model/summarizer.py @@ -19,6 +19,7 @@ describe_numeric_1d, describe_path_1d, describe_supported, + describe_text_1d, describe_timeseries_1d, describe_url_1d, ) @@ -58,6 +59,9 @@ def __init__(self, typeset: VisionsTypeset, *args, **kwargs): "DateTime": [ describe_date_1d, ], + "Text": [ + describe_text_1d, + ], "Categorical": [ describe_categorical_1d, ], diff --git a/src/ydata_profiling/model/summary_algorithms.py b/src/ydata_profiling/model/summary_algorithms.py index fbb944681..ad64f8aae 100644 --- a/src/ydata_profiling/model/summary_algorithms.py +++ b/src/ydata_profiling/model/summary_algorithms.py @@ -126,6 +126,13 @@ def describe_numeric_1d( raise NotImplementedError() +@multimethod +def describe_text_1d( + config: Settings, series: Any, summary: dict +) -> Tuple[Settings, Any, dict, Any]: + raise NotImplementedError() + + @multimethod def describe_date_1d( config: Settings, series: Any, summary: dict diff --git a/src/ydata_profiling/model/typeset.py b/src/ydata_profiling/model/typeset.py index 489c1d5e5..ee1c42e78 100644 --- a/src/ydata_profiling/model/typeset.py +++ b/src/ydata_profiling/model/typeset.py @@ -14,12 +14,15 @@ from ydata_profiling.config import Settings from ydata_profiling.model.typeset_relations import ( - category_is_numeric, - category_to_numeric, numeric_is_category, series_is_string, string_is_bool, + string_is_category, + string_is_datetime, + string_is_numeric, string_to_bool, + string_to_datetime, + string_to_numeric, to_bool, to_category, ) @@ -49,19 +52,38 @@ def typeset_types(config: Settings) -> 
Set[visions.VisionsBaseType]: """Define types based on the config""" class Unsupported(visions.Generic): + """Base type. All other types have relationship with this type.""" + pass class Numeric(visions.VisionsBaseType): + """Type for all numeric (float, int) columns. + + Can be transformed from + - Unsupported + - String + + Examples + -------- + >>> s = pd.Series([1, 2, 5, 3, 8, 9]) + >>> s in Numeric + True + + >>> s = pd.Series([.34, 2.9, 55, 3.14, 89, 91]) + >>> s in Numeric + True + """ + @staticmethod def get_relations() -> Sequence[TypeRelation]: return [ IdentityRelation(Unsupported), InferenceRelation( - Categorical, - relationship=lambda x, y: partial(category_is_numeric, k=config)( + Text, + relationship=lambda x, y: partial(string_is_numeric, k=config)( x, y ), - transformer=category_to_numeric, + transformer=string_to_numeric, ), ] @@ -72,11 +94,44 @@ def get_relations() -> Sequence[TypeRelation]: def contains_op(series: pd.Series, state: dict) -> bool: return pdt.is_numeric_dtype(series) and not pdt.is_bool_dtype(series) + class Text(visions.VisionsBaseType): + """Type for plaintext columns. + Like name, note, string identifier, residence etc. 
+ + Examples + -------- + >>> s = pd.Series(["AX01", "BC32", "AC00"]) + >>> s in Text + True + + >>> s = pd.Series([1, 2, 3, 4]) + >>> s in Text + False + """ + + @staticmethod + def get_relations() -> Sequence[TypeRelation]: + return [ + IdentityRelation(Unsupported), + ] + + @staticmethod + @multimethod + @series_not_empty + @series_handle_nulls + def contains_op(series: pd.Series, state: dict) -> bool: + return pdt.is_string_dtype(series) and series_is_string(series, state) + class DateTime(visions.VisionsBaseType): @staticmethod def get_relations() -> Sequence[TypeRelation]: return [ IdentityRelation(Unsupported), + InferenceRelation( + Text, + relationship=lambda x, y: partial(string_is_datetime)(x, y), + transformer=string_to_datetime, + ), ] @staticmethod @@ -87,6 +142,30 @@ def contains_op(series: pd.Series, state: dict) -> bool: return pdt.is_datetime64_any_dtype(series) class Categorical(visions.VisionsBaseType): + """Type for categorical columns. + Categorical columns in pandas categorical format + and columns in string format with small count of unique values. 
+ + Can be transformed from: + - Unsupported + - Numeric + - String + + Examples + -------- + >>> s = pd.Series(["male", "female", "female", "male"], dtype="category") + >>> s in Categorical + True + + >>> s = pd.Series(["male", "female"]) + >>> s in Categorical + False + + >>> s = pd.Series(["male", "female", "female", "male"]) + >>> s in Categorical + True + """ + @staticmethod def get_relations() -> Sequence[TypeRelation]: return [ @@ -98,6 +177,13 @@ def get_relations() -> Sequence[TypeRelation]: ), transformer=to_category, ), + InferenceRelation( + Text, + relationship=lambda x, y: partial(string_is_category, k=config)( + x, y + ), + transformer=to_category, + ), ] @staticmethod @@ -110,12 +196,11 @@ def contains_op(series: pd.Series, state: dict) -> bool: ) if is_valid_dtype: return True - elif not pdt.is_object_dtype(series): - return pandas_has_string_dtype_flag and pdt.is_string_dtype(series) - - return series_is_string(series, state) + return False class Boolean(visions.VisionsBaseType): + """Type for boolean columns.""" + @staticmethod def get_relations() -> Sequence[TypeRelation]: # Numeric [0, 1] goes via Categorical with distinct_count_without_nan <= 2 @@ -124,7 +209,7 @@ def get_relations() -> Sequence[TypeRelation]: return [ IdentityRelation(Unsupported), InferenceRelation( - Categorical, + Text, relationship=lambda x, y: partial(string_is_bool, k=mapping)(x, y), transformer=lambda s, st: to_bool( partial(string_to_bool, k=mapping)(s, st) @@ -148,7 +233,7 @@ def contains_op(series: pd.Series, state: dict) -> bool: class URL(visions.VisionsBaseType): @staticmethod def get_relations() -> Sequence[TypeRelation]: - return [IdentityRelation(Categorical)] + return [IdentityRelation(Text)] @staticmethod @multimethod @@ -164,7 +249,7 @@ def contains_op(series: pd.Series, state: dict) -> bool: class Path(visions.VisionsBaseType): @staticmethod def get_relations() -> Sequence[TypeRelation]: - return [IdentityRelation(Categorical)] + return 
[IdentityRelation(Text)] @staticmethod @multimethod @@ -223,7 +308,7 @@ def is_timedependent(series: pd.Series) -> bool: is_numeric = pdt.is_numeric_dtype(series) and not pdt.is_bool_dtype(series) return is_numeric and is_timedependent(series) - types = {Unsupported, Boolean, Numeric, Categorical, DateTime} + types = {Unsupported, Boolean, Numeric, Text, Categorical, DateTime} if config.vars.path.active: types.add(Path) if config.vars.file.active: diff --git a/src/ydata_profiling/model/typeset_relations.py b/src/ydata_profiling/model/typeset_relations.py index baa047305..0a8bd6d4a 100644 --- a/src/ydata_profiling/model/typeset_relations.py +++ b/src/ydata_profiling/model/typeset_relations.py @@ -1,5 +1,5 @@ import functools -from typing import Callable +from typing import Callable, Dict import numpy as np import pandas as pd @@ -24,7 +24,7 @@ def inner(series: pd.Series, *args, **kwargs) -> bool: return inner -def string_is_bool(series: pd.Series, state: dict, k: Settings) -> bool: +def string_is_bool(series: pd.Series, state: dict, k: Dict[str, bool]) -> bool: @series_handle_nulls @try_func def tester(s: pd.Series, state: dict) -> bool: @@ -36,7 +36,7 @@ def tester(s: pd.Series, state: dict) -> bool: return tester(series, state) -def string_to_bool(series: pd.Series, state: dict, k: Settings) -> pd.Series: +def string_to_bool(series: pd.Series, state: dict, k: Dict[str, bool]) -> pd.Series: return series.str.lower().map(k) @@ -66,7 +66,32 @@ def series_is_string(series: pd.Series, state: dict) -> bool: @series_handle_nulls -def category_is_numeric(series: pd.Series, state: dict, k: Settings) -> bool: +def string_is_category(series: pd.Series, state: dict, k: Settings) -> bool: + """String is category, if following conditions are met + - has at least one and less or equal distinct values as threshold + - (distinct values / count of all values) is less than threshold + - is not bool""" + n_unique = series.nunique() + unique_threshold = 
k.vars.text.percentage_cat_threshold + threshold = k.vars.text.categorical_threshold + return ( + 1 <= n_unique <= threshold + and n_unique / series.size < unique_threshold + and not string_is_bool(series, state, k.vars.bool.mappings) + ) + + +@series_handle_nulls +def string_is_datetime(series: pd.Series, state: dict) -> bool: + """If we can transform data to datetime and at least one is valid date.""" + try: + return not series.astype("datetime64").isna().all() + except: # noqa: E722 + return False + + +@series_handle_nulls +def string_is_numeric(series: pd.Series, state: dict, k: Settings) -> bool: if pdt.is_bool_dtype(series) or object_is_bool(series, state): return False @@ -81,7 +106,11 @@ def category_is_numeric(series: pd.Series, state: dict, k: Settings) -> bool: return not numeric_is_category(series, state, k) -def category_to_numeric(series: pd.Series, state: dict) -> pd.Series: +def string_to_datetime(series: pd.Series, state: dict) -> pd.Series: + return series.astype("datetime64") + + +def string_to_numeric(series: pd.Series, state: dict) -> pd.Series: return pd.to_numeric(series, errors="coerce") diff --git a/src/ydata_profiling/report/structure/variables/__init__.py b/src/ydata_profiling/report/structure/variables/__init__.py index 0c513b15d..64f1d6d54 100644 --- a/src/ydata_profiling/report/structure/variables/__init__.py +++ b/src/ydata_profiling/report/structure/variables/__init__.py @@ -11,6 +11,7 @@ from ydata_profiling.report.structure.variables.render_image import render_image from ydata_profiling.report.structure.variables.render_path import render_path from ydata_profiling.report.structure.variables.render_real import render_real +from ydata_profiling.report.structure.variables.render_text import render_text from ydata_profiling.report.structure.variables.render_timeseries import ( render_timeseries, ) @@ -28,6 +29,7 @@ "render_image", "render_path", "render_real", + "render_text", "render_timeseries", "render_url", ] diff --git 
a/src/ydata_profiling/report/structure/variables/render_text.py b/src/ydata_profiling/report/structure/variables/render_text.py new file mode 100644 index 000000000..227ffa9f6 --- /dev/null +++ b/src/ydata_profiling/report/structure/variables/render_text.py @@ -0,0 +1,184 @@ +from typing import Any, Dict, List + +from ydata_profiling.config import Settings +from ydata_profiling.report.formatters import fmt, fmt_bytesize, fmt_percent +from ydata_profiling.report.presentation.core import ( + Container, + FrequencyTable, + Image, + Table, +) +from ydata_profiling.report.presentation.core.variable_info import VariableInfo +from ydata_profiling.report.structure.variables.render_categorical import ( + _get_n, + freq_table, + render_categorical_frequency, + render_categorical_length, + render_categorical_unicode, +) +from ydata_profiling.report.structure.variables.render_common import render_common +from ydata_profiling.visualisation.plot import plot_word_cloud + + +def render_text(config: Settings, summary: Dict[str, Any]) -> Dict[str, Any]: + varid = summary["varid"] + words = config.vars.text.words + characters = config.vars.text.characters + length = config.vars.text.length + + template_variables = render_common(config, summary) + + top_items: List[Any] = [] + var_info = VariableInfo( + anchor_id=varid, + var_name=summary["varname"], + var_type=summary["type"], + alerts=summary["alerts"], + description=summary["description"], + style=config.html.style, + ) + top_items.append(var_info) + + table = Table( + [ + { + "name": "Distinct", + "value": fmt(summary["n_distinct"]), + "alert": "n_distinct" in summary["alert_fields"], + }, + { + "name": "Distinct (%)", + "value": fmt_percent(summary["p_distinct"]), + "alert": "p_distinct" in summary["alert_fields"], + }, + { + "name": "Missing", + "value": fmt(summary["n_missing"]), + "alert": "n_missing" in summary["alert_fields"], + }, + { + "name": "Missing (%)", + "value": fmt_percent(summary["p_missing"]), + "alert": 
"p_missing" in summary["alert_fields"], + }, + { + "name": "Memory size", + "value": fmt_bytesize(summary["memory_size"]), + "alert": False, + }, + ], + style=config.html.style, + ) + top_items.append(table) + + if words and "word_counts" in summary: + mini_wordcloud = Image( + plot_word_cloud(config, summary["word_counts"]), + image_format=config.plot.image_format, + alt="Mini wordcloud", + ) + top_items.append(mini_wordcloud) + template_variables["top"] = Container(top_items, sequence_type="grid") + + # ============================================================================================ + + bottom_items = [] + overview_items = [] + # length isn't being computed for categorical in spark + if length and "max_length" in summary: + length_table, length_histo = render_categorical_length(config, summary, varid) + overview_items.append(length_table) + + # characters isn't being computed for categorical in spark + unitab = None + if characters and "category_alias_counts" in summary: + overview_table_char, unitab = render_categorical_unicode(config, summary, varid) + overview_items.append(overview_table_char) + + unique_stats = render_categorical_frequency(config, summary, varid) + overview_items.append(unique_stats) + + if not config.vars.text.redact: + rows = ("1st row", "2nd row", "3rd row", "4th row", "5th row") + + if isinstance(summary["first_rows"], list): + sample = Table( + [ + { + "name": name, + "value": fmt(value), + "alert": False, + } + for name, *value in zip(rows, *summary["first_rows"]) + ], + name="Sample", + style=config.html.style, + ) + else: + sample = Table( + [ + { + "name": name, + "value": fmt(value), + "alert": False, + } + for name, value in zip(rows, summary["first_rows"]) + ], + name="Sample", + style=config.html.style, + ) + overview_items.append(sample) + overview = Container( + overview_items, + name="Overview", + anchor_id=f"{varid}overview", + sequence_type="batch_grid", + batch_size=len(overview_items), + titles=False, + ) + 
bottom_items.append(overview) + + if words and "word_counts" in summary: + woc = freq_table( + freqtable=summary["word_counts"], + n=_get_n(summary["word_counts"]), + max_number_to_print=10, + ) + + fqwo = FrequencyTable( + woc, + name="Common words", + anchor_id=f"{varid}cwo", + redact=config.vars.text.redact, + ) + + image = Image( + plot_word_cloud(config, summary["word_counts"]), + image_format=config.plot.image_format, + alt="Wordcloud", + ) + + bottom_items.append( + Container( + [fqwo, image], + name="Words", + anchor_id=f"{varid}word", + sequence_type="grid", + ) + ) + + if unitab is not None: + bottom_items.append( + Container( + [unitab], + name="Characters", + anchor_id=f"{varid}characters", + sequence_type="grid", + ) + ) + + template_variables["bottom"] = Container( + bottom_items, sequence_type="tabs", anchor_id=f"{varid}bottom" + ) + + return template_variables diff --git a/src/ydata_profiling/visualisation/plot.py b/src/ydata_profiling/visualisation/plot.py index fd8764b38..ad2778450 100644 --- a/src/ydata_profiling/visualisation/plot.py +++ b/src/ydata_profiling/visualisation/plot.py @@ -13,6 +13,7 @@ from matplotlib.ticker import FuncFormatter from statsmodels.graphics.tsaplots import plot_acf, plot_pacf from typeguard import typechecked +from wordcloud import WordCloud from ydata_profiling.config import Settings from ydata_profiling.utils.common import convert_timestamp_to_datetime @@ -24,6 +25,20 @@ def format_fn(tick_val: int, tick_pos: Any) -> str: return convert_timestamp_to_datetime(tick_val).strftime("%Y-%m-%d %H:%M:%S") +def _plot_word_cloud( + series: pd.Series, + figsize: tuple = (6, 4), +) -> plt.Figure: + word_dict = series.to_dict() + wordcloud = WordCloud( + background_color="white", random_state=123, width=300, height=200, scale=2 + ).generate_from_frequencies(word_dict) + plt.figure(figsize=figsize) + plot = plt.imshow(wordcloud, interpolation="bilinear") + plt.axis("off") + return plot + + def _plot_histogram( config: Settings, 
series: np.ndarray, @@ -99,6 +114,12 @@ def _plot_histogram( return plot +@manage_matplotlib_context() +def plot_word_cloud(config: Settings, word_counts: pd.Series) -> str: + _plot_word_cloud(series=word_counts) + return plot_360_n0sc0pe(config) + + @manage_matplotlib_context() def histogram( config: Settings, diff --git a/tests/issues/test_issue397.py b/tests/issues/test_issue397.py index 672378573..ad952e849 100644 --- a/tests/issues/test_issue397.py +++ b/tests/issues/test_issue397.py @@ -27,7 +27,7 @@ def test_issue397(): description = report.description_set - assert description.table["types"] == {"Categorical": 1, "Numeric": 4} + assert description.table["types"] == {"Text": 1, "Numeric": 4} assert description.variables["float-inf"]["p_infinite"] == 0.5 assert description.variables["float-inf"]["n_infinite"] == 2 diff --git a/tests/issues/test_issue72.py b/tests/issues/test_issue72.py index ae515b1d4..5c8bd19d3 100644 --- a/tests/issues/test_issue72.py +++ b/tests/issues/test_issue72.py @@ -32,7 +32,7 @@ def test_issue72_equal(): # 3 == 3, so categorical assert report.get_description().variables["A"]["type"] == "Categorical" # Strings are always categorical - assert report.get_description().variables["B"]["type"] == "Categorical" + assert report.get_description().variables["B"]["type"] == "Text" def test_issue72_lower(): @@ -44,4 +44,4 @@ def test_issue72_lower(): # 3 < 10, so categorical assert report.get_description().variables["A"]["type"] == "Categorical" # Strings are always categorical - assert report.get_description().variables["B"]["type"] == "Categorical" + assert report.get_description().variables["B"]["type"] == "Text" diff --git a/tests/unit/test_example.py b/tests/unit/test_example.py index 6b196b78a..0797e9f93 100644 --- a/tests/unit/test_example.py +++ b/tests/unit/test_example.py @@ -47,4 +47,4 @@ def test_example(get_data_file, test_output_dir): profile.to_file(output_file) assert (test_output_dir / "profile.html").exists(), "Output file does 
not exist" assert type(profile.get_description()) == BaseDescription, "Unexpected result" - assert "14" in profile.to_html() + assert "9" in profile.to_html() diff --git a/tests/unit/test_typeset_custom.py b/tests/unit/test_typeset_custom.py index 86274a17f..2685d3237 100644 --- a/tests/unit/test_typeset_custom.py +++ b/tests/unit/test_typeset_custom.py @@ -1,5 +1,4 @@ import datetime -from dataclasses import dataclass import numpy as np import pandas as pd @@ -50,7 +49,7 @@ def get_profiling_series(): 15.9, 13.5, ], - "cat": [ + "str": [ "a", "long text value", "Élysée", @@ -61,8 +60,18 @@ def get_profiling_series(): "c", "c", ], + "str_cat": pd.Series( + ["male", "male", None, "female", "female", "male", "male"] + ), + "str_num": ["1", "10", "3.14", "566"], + "str_date": ["2000/01/01", "2001/07/24", "2011/12/24", "1980/03/10"], + "str_date2": ["2000-01-01", "2001-07-24", "2011-12-24", "1980-03-10"], "s1": np.ones(9), "s2": ["some constant text $ % value {obj} " for _ in range(1, 10)], + "cat": pd.Series( + ["male", "male", None, "female", "female", "male", "male"], + dtype="category", + ), "somedate": [ datetime.date(2011, 7, 4), datetime.datetime(2022, 1, 1, 13, 57), @@ -137,6 +146,7 @@ def get_profiling_series(): type_map = {str(k): k for k in my_typeset.types} Numeric = type_map["Numeric"] +Text = type_map["Text"] Categorical = type_map["Categorical"] Boolean = type_map["Boolean"] DateTime = type_map["DateTime"] @@ -147,23 +157,12 @@ def get_profiling_series(): typeset2 = ProfilingTypeSet(config2) type_map2 = {str(k): k for k in typeset2.types} Numeric2 = type_map2["Numeric"] +Text2 = type_map2["Text"] Categorical2 = type_map2["Categorical"] +DateTime2 = type_map2["DateTime"] Boolean2 = type_map2["Boolean"] -@dataclass -class DataTest: - def __init__(self, name, contains_type, infer_type, cast_result=None): - self.name = name - self.contains_type = contains_type - self.infer_type = infer_type - self.cast_result = cast_result - - -cases = [ - DataTest("x", 
Numeric, Numeric), -] - contains_map = { Numeric: { "x", @@ -177,19 +176,24 @@ def __init__(self, name, contains_type, infer_type, cast_result=None): "inf_only", "nullable_int", }, - Categorical: { - "id", - "cat", - "s2", - "date_str", + Text: { + "str", + "str_cat", + "str_num", + "str_date", + "str_date2", "str_yes_no", "str_yes_no_mixed", "str_yes_no_nan", "str_true_false", "str_true_false_none", "str_true_false_nan", + "id", "catnum", + "date_str", + "s2", }, + Categorical: {"cat"}, Boolean: { "bool_tf", "bool_tf_with_nan", @@ -227,10 +231,15 @@ def test_contains(name, series, contains_type, member): "integers_nan": Numeric, "bool_01": Numeric, "bool_01_with_nan": Numeric, - "id": Categorical, - "cat": Categorical, + "id": Text, + "str_cat": Categorical, + "str_num": Numeric, + "str_date": DateTime, + "str_date2": DateTime, "s2": Categorical, - "date_str": Categorical, + "date_str": DateTime, + "str": Text, + "cat": Categorical, "bool_tf": Boolean, "bool_tf_with_nan": Boolean, "booleans_type": Boolean, @@ -285,23 +294,28 @@ def test_inference(name, series, inference_type, typeset, difference): "nullable_int", }, ), - ( - Numeric2, - Categorical2, - {"catnum"}, - ), ( Boolean2, - Categorical2, + Text2, { - "str_true_false", "str_yes_no", "str_yes_no_mixed", "str_yes_no_nan", + "str_true_false", "str_true_false_nan", "str_true_false_none", }, ), + ( + Categorical2, + Text2, + { + "str_cat", + "s2", + }, + ), + (Numeric2, Text2, {"str_num", "catnum"}), + (DateTime2, Text2, {"str_date", "str_date2", "date_str"}), ] diff --git a/tests/unit/test_typeset_default.py b/tests/unit/test_typeset_default.py index 11b42266b..84a7df643 100644 --- a/tests/unit/test_typeset_default.py +++ b/tests/unit/test_typeset_default.py @@ -29,6 +29,7 @@ type_map = {str(k): k for k in my_typeset_default.types} Numeric = type_map["Numeric"] +Text = type_map["Text"] Categorical = type_map["Categorical"] Boolean = type_map["Boolean"] DateTime = type_map["DateTime"] @@ -61,12 +62,7 @@ 
"complex_series_float", "complex_series_py_float", }, - Categorical: { - "categorical_float_series", - "categorical_int_series", - "categorical_string_series", - "categorical_char", - "ordinal", + Text: { "timestamp_string_series", "string_with_sep_num_nan", "string_series", @@ -97,6 +93,13 @@ "py_datetime_str", "string_dtype_series", }, + Categorical: { + "categorical_float_series", + "categorical_int_series", + "categorical_string_series", + "categorical_char", + "ordinal", + }, Boolean: { "bool_series", "bool_series2", @@ -177,7 +180,7 @@ def test_contains(name, series, contains_type, member): inference_map = { "int_series": Numeric, - "categorical_int_series": Numeric, + "categorical_int_series": Categorical, "int_nan_series": Numeric, "Int64_int_series": Numeric, "Int64_int_nan_series": Numeric, @@ -193,26 +196,26 @@ def test_contains(name, series, contains_type, member): "float_series5": Numeric, "float_series6": Numeric, "complex_series_float": Numeric, - "categorical_float_series": Numeric, + "categorical_float_series": Categorical, "float_with_inf": Numeric, "inf_series": Numeric, "nan_series": Unsupported, "nan_series_2": Unsupported, - "string_series": Categorical, + "string_series": Text, "categorical_string_series": Categorical, - "timestamp_string_series": Categorical, - "string_with_sep_num_nan": Categorical, # TODO: Introduce thousands separator - "string_unicode_series": Categorical, - "string_np_unicode_series": Categorical, + "timestamp_string_series": DateTime, + "string_with_sep_num_nan": Text, # TODO: Introduce thousands separator + "string_unicode_series": Text, + "string_np_unicode_series": Text, "string_num_nan": Numeric, "string_num": Numeric, "string_flt_nan": Numeric, "string_flt": Numeric, - "string_str_nan": Categorical, + "string_str_nan": Text, "string_bool_nan": Boolean, "int_str_range": Numeric, - "string_date": Categorical, - "str_url": Categorical, + "string_date": DateTime, + "str_url": Text, "bool_series": Boolean, 
"bool_nan_series": Boolean, "nullable_bool_series": Boolean, @@ -234,9 +237,9 @@ def test_contains(name, series, contains_type, member): "geometry_series": Unsupported, "path_series_linux": Unsupported, "path_series_linux_missing": Unsupported, - "path_series_linux_str": Categorical, + "path_series_linux_str": Text, "path_series_windows": Unsupported, - "path_series_windows_str": Categorical, + "path_series_windows_str": Text, "url_series": Unsupported, "url_nan_series": Unsupported, "url_none_series": Unsupported, @@ -255,16 +258,16 @@ def test_contains(name, series, contains_type, member): "empty_int64": Unsupported, "empty_object": Unsupported, "ip": Unsupported, - "ip_str": Categorical, + "ip_str": Text, "ip_missing": Unsupported, "date_series_nat": DateTime, "date": Unsupported, "time": Unsupported, "categorical_char": Categorical, "ordinal": Categorical, - "str_complex": Categorical, + "str_complex": Text, "uuid_series": Unsupported, - "uuid_series_str": Categorical, + "uuid_series_str": Text, "uuid_series_missing": Unsupported, "ip_mixed_v4andv6": Unsupported, "file_test_py": Unsupported, @@ -275,17 +278,17 @@ def test_contains(name, series, contains_type, member): "str_int_leading_zeros": Numeric, "str_float_non_leading_zeros": Numeric, "str_int_zeros": Numeric, - "email_address_str": Categorical, - "str_complex_nan": Categorical, + "email_address_str": Text, + "str_complex_nan": Text, "email_address": Unsupported, "email_address_missing": Unsupported, "all_null_nat": Unsupported, - "all_null_empty_str": Categorical, - "py_datetime_str": Categorical, + "all_null_empty_str": Text, + "py_datetime_str": DateTime, "all_null_none": Unsupported, "complex_series_py_float": Numeric, "all_null_nan": Unsupported, - "string_dtype_series": Categorical, + "string_dtype_series": Text, } @@ -311,7 +314,7 @@ def test_inference(name, series, inference_type, typeset, difference): (Categorical, Numeric, {"mixed"}), ( Numeric, - Categorical, + Text, { "string_flt", 
"string_num_nan", @@ -331,12 +334,19 @@ def test_inference(name, series, inference_type, typeset, difference): ), ( Boolean, - Categorical, + Text, { "string_bool_nan", "nullable_bool_series", }, ), + ( + DateTime, + Text, + {"py_datetime_str", "timestamp_string_series", "string_date"}, + ), + (Categorical, Text, {"categorical_string_series"}), + (Categorical, Numeric, {"categorical_float_series"}), ]