From f3bc959a5a9bf8c1b95db7ec767063fc4d5904d0 Mon Sep 17 00:00:00 2001
From: Alex Barros
Date: Fri, 6 Dec 2024 12:30:47 -0300
Subject: [PATCH] fix: type schema not checking for empty columns (#1679)

* fix: type schema not checking for empty columns

* fix: remove alerts unused parameters

* fix: indicate user defined type on empty columns

* fix(linting): code formatting

---------

Co-authored-by: Azory YData Bot
---
Review notes (free text between the "---" separator and the first
"diff --git" line is not part of the commit message or of the diff):

- This copy of the patch had lost its line breaks. The line structure below
  was rebuilt from the hunk headers' line counts; the indentation of context
  lines is reconstructed, so verify it against the target tree before
  applying.
- _is_cast_type_defined(): the `and` chain is now wrapped in bool() so the
  helper returns a real bool, as its annotation says. The bare chain can
  return `typeset.type_schema` itself when that value is falsy.
  NOTE(review): its second parameter is named `series` but the only caller
  shown passes `series.name`; the name is left unchanged here.
- render_generic(): the key is now read with summary.get("cast_type"), so a
  summary without that key falls back to "Unsupported" instead of raising
  KeyError. In this patch only pandas_describe_1d is shown setting the key.
- Both edits keep every hunk's line counts, so the hunk headers and the
  diffstat are unchanged. The post-image blob ids on the "index" lines of
  summary_pandas.py and render_generic.py no longer match the edited content.
- NOTE(review): the From: and Co-authored-by: lines carry no e-mail address
  in this copy.

 src/ydata_profiling/model/alerts.py           |  4 ++--
 .../model/pandas/summary_pandas.py            | 24 ++++++++++++++-----
 .../structure/variables/render_generic.py     |  2 +-
 tests/unit/test_typeset_default.py            | 11 +++++++++++
 4 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/src/ydata_profiling/model/alerts.py b/src/ydata_profiling/model/alerts.py
index 3c2c4fee9..09ae6c0a3 100644
--- a/src/ydata_profiling/model/alerts.py
+++ b/src/ydata_profiling/model/alerts.py
@@ -634,7 +634,7 @@ def supported_alerts(summary: dict) -> List[Alert]:
     return alerts
 
 
-def unsupported_alerts(summary: Dict[str, Any]) -> List[Alert]:
+def unsupported_alerts() -> List[Alert]:
     alerts: List[Alert] = [
         UnsupportedAlert(),
         RejectedAlert(),
@@ -657,7 +657,7 @@ def check_variable_alerts(config: Settings, col: str, description: dict) -> List
     alerts += generic_alerts(description)
 
     if description["type"] == "Unsupported":
-        alerts += unsupported_alerts(description)
+        alerts += unsupported_alerts()
     else:
         alerts += supported_alerts(description)
 
diff --git a/src/ydata_profiling/model/pandas/summary_pandas.py b/src/ydata_profiling/model/pandas/summary_pandas.py
index 5d15b2d3c..68e019451 100644
--- a/src/ydata_profiling/model/pandas/summary_pandas.py
+++ b/src/ydata_profiling/model/pandas/summary_pandas.py
@@ -16,6 +16,14 @@
 from ydata_profiling.utils.dataframe import sort_column_names
 
 
+def _is_cast_type_defined(typeset: VisionsTypeset, series: str) -> bool:
+    return bool(
+        isinstance(typeset, ProfilingTypeSet)
+        and typeset.type_schema
+        and series in typeset.type_schema
+    )
+
+
 @describe_1d.register
 def pandas_describe_1d(
     config: Settings,
@@ -38,11 +46,10 @@ def pandas_describe_1d(
     # Make sure pd.NA is not in the series
     series = series.fillna(np.nan)
 
-    if (
-        isinstance(typeset, ProfilingTypeSet)
-        and typeset.type_schema
-        and series.name in typeset.type_schema
-    ):
+    has_cast_type = _is_cast_type_defined(typeset, series.name)
+    cast_type = str(typeset.type_schema[series.name]) if has_cast_type else None
+
+    if has_cast_type and not series.isna().all():
         vtype = typeset.type_schema[series.name]
 
     elif config.infer_dtypes:
@@ -55,7 +62,12 @@ def pandas_describe_1d(
         vtype = typeset.detect_type(series)
 
     typeset.type_schema[series.name] = vtype
-    return summarizer.summarize(config, series, dtype=vtype)
+    summary = summarizer.summarize(config, series, dtype=vtype)
+    # Cast type is only used on unsupported columns rendering pipeline
+    # to indicate the correct variable type when inference is not possible
+    summary["cast_type"] = cast_type
+
+    return summary
 
 
 @get_series_descriptions.register
diff --git a/src/ydata_profiling/report/structure/variables/render_generic.py b/src/ydata_profiling/report/structure/variables/render_generic.py
index 0b2e00efb..0a8ce1e55 100644
--- a/src/ydata_profiling/report/structure/variables/render_generic.py
+++ b/src/ydata_profiling/report/structure/variables/render_generic.py
@@ -12,7 +12,7 @@ def render_generic(config: Settings, summary: dict) -> dict:
     info = VariableInfo(
         anchor_id=summary["varid"],
         alerts=summary["alerts"],
-        var_type="Unsupported",
+        var_type=summary.get("cast_type") or "Unsupported",
         var_name=summary["varname"],
         description=summary["description"],
         style=config.html.style,
diff --git a/tests/unit/test_typeset_default.py b/tests/unit/test_typeset_default.py
index 8d58aeb03..d93d61cb0 100644
--- a/tests/unit/test_typeset_default.py
+++ b/tests/unit/test_typeset_default.py
@@ -475,3 +475,14 @@ def test_type_schema(dataframe: pd.DataFrame, column: str, type_schema: dict):
     assert prof.typeset.type_schema[column] == prof.typeset._get_type(
         type_schema[column]
     )
+
+
+def test_type_schema_with_null_column():
+    df = pd.DataFrame({"null_col": [None] * 100})
+    prof = ProfileReport(df, type_schema={"null_col": "datetime"})
+    description = prof.description_set
+    assert description.variables["null_col"]["type"] == "Unsupported"
+
+    prof = ProfileReport(df, type_schema={"null_col": "numeric"})
+    description = prof.description_set
+    assert description.variables["null_col"]["type"] == "Unsupported"