diff --git a/splink/profile_data.py b/splink/profile_data.py
index 62b73546d6..bcf341777d 100644
--- a/splink/profile_data.py
+++ b/splink/profile_data.py
@@ -1,9 +1,12 @@
+import logging
 import re
 from copy import deepcopy
 
 from .charts import altair_or_json, load_chart_definition
 from .misc import ensure_is_list
 
+logger = logging.getLogger(__name__)
+
 
 def _group_name(cols_or_expr):
     cols_or_expr = re.sub(r"[^0-9a-zA-Z_]", " ", cols_or_expr)
@@ -270,21 +273,35 @@ def profile_columns(linker, column_expressions=None, top_n=10, bottom_n=10):
         percentile_rows = [
             p for p in percentile_rows_all if p["group_name"] == _group_name(expression)
         ]
-        percentile_rows = _add_100_percentile_to_df_percentiles(percentile_rows)
-        top_n_rows = [
-            p for p in top_n_rows_all if p["group_name"] == _group_name(expression)
-        ]
-        bottom_n_rows = [
-            p for p in bottom_n_rows_all if p["group_name"] == _group_name(expression)
-        ]
-        # remove concat blank from expression title
-        expression = expression.replace(", ' '", "")
-        inner_chart = _get_inner_chart_spec_freq(
-            percentile_rows, top_n_rows, bottom_n_rows, expression
-        )
-        inner_charts.append(inner_chart)
-    outer_spec = deepcopy(_outer_chart_spec_freq)
-
-    outer_spec["vconcat"] = inner_charts
-
-    return altair_or_json(outer_spec)
+        if not percentile_rows:
+            # No percentile rows for this expression: skip its chart and warn.
+            logger.warning(
+                "Warning: No charts produced for %s "
+                "as the column only contains null values.",
+                expression,
+            )
+        else:
+            percentile_rows = _add_100_percentile_to_df_percentiles(percentile_rows)
+            top_n_rows = [
+                p for p in top_n_rows_all if p["group_name"] == _group_name(expression)
+            ]
+            bottom_n_rows = [
+                p
+                for p in bottom_n_rows_all
+                if p["group_name"] == _group_name(expression)
+            ]
+            # remove concat blank from expression title
+            expression = expression.replace(", ' '", "")
+            inner_chart = _get_inner_chart_spec_freq(
+                percentile_rows, top_n_rows, bottom_n_rows, expression
+            )
+            inner_charts.append(inner_chart)
+
+    if not inner_charts:
+        # Every expression was skipped, so there is no chart to return.
+        return None
+
+    outer_spec = deepcopy(_outer_chart_spec_freq)
+    outer_spec["vconcat"] = inner_charts
+
+    return altair_or_json(outer_spec)
diff --git a/tests/test_profile_data.py b/tests/test_profile_data.py
index 98b0725ff0..dcc2123c99 100644
--- a/tests/test_profile_data.py
+++ b/tests/test_profile_data.py
@@ -176,3 +176,23 @@ def test_profile_using_spark(df_spark):
     )
 
     assert len(generate_raw_profile_dataset([["first_name", "blank"]], linker)) == 0
+
+
+def test_profile_null_columns(caplog):
+
+    df = pd.DataFrame(
+        [
+            {"unique_id": 1, "test_1": 1, "test_2": None},
+        ]
+    )
+
+    linker = DuckDBLinker(df)
+
+    linker.profile_columns(["test_1", "test_2"])
+
+    captured_logs = caplog.text
+
+    assert (
+        "Warning: No charts produced for test_2 as the column only "
+        "contains null values."
+    ) in captured_logs