Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

856 profile null column #1339

Merged
merged 16 commits into from
Nov 8, 2023
53 changes: 35 additions & 18 deletions splink/profile_data.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import logging
import re
from copy import deepcopy

from .charts import altair_or_json, load_chart_definition
from .misc import ensure_is_list

logger = logging.getLogger(__name__)


def _group_name(cols_or_expr):
cols_or_expr = re.sub(r"[^0-9a-zA-Z_]", " ", cols_or_expr)
Expand Down Expand Up @@ -270,21 +273,35 @@ def profile_columns(linker, column_expressions=None, top_n=10, bottom_n=10):
percentile_rows = [
p for p in percentile_rows_all if p["group_name"] == _group_name(expression)
]
percentile_rows = _add_100_percentile_to_df_percentiles(percentile_rows)
top_n_rows = [
p for p in top_n_rows_all if p["group_name"] == _group_name(expression)
]
bottom_n_rows = [
p for p in bottom_n_rows_all if p["group_name"] == _group_name(expression)
]
# remove concat blank from expression title
expression = expression.replace(", ' '", "")
inner_chart = _get_inner_chart_spec_freq(
percentile_rows, top_n_rows, bottom_n_rows, expression
)
inner_charts.append(inner_chart)
outer_spec = deepcopy(_outer_chart_spec_freq)

outer_spec["vconcat"] = inner_charts

return altair_or_json(outer_spec)
if percentile_rows == []:
ThomasHepworth marked this conversation as resolved.
Show resolved Hide resolved
logger.warning(
"Warning: No charts produced for "
f"{expression}"
" as the column only contains null values."
)
else:
percentile_rows = _add_100_percentile_to_df_percentiles(percentile_rows)
top_n_rows = [
p for p in top_n_rows_all if p["group_name"] == _group_name(expression)
]
bottom_n_rows = [
p
for p in bottom_n_rows_all
if p["group_name"] == _group_name(expression)
]
# remove concat blank from expression title
expression = expression.replace(", ' '", "")
inner_chart = _get_inner_chart_spec_freq(
percentile_rows, top_n_rows, bottom_n_rows, expression
)
inner_charts.append(inner_chart)

if inner_charts != []:

outer_spec = deepcopy(_outer_chart_spec_freq)
outer_spec["vconcat"] = inner_charts

return altair_or_json(outer_spec)

else:
return None
20 changes: 20 additions & 0 deletions tests/test_profile_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,3 +176,23 @@ def test_profile_using_spark(df_spark):
)

assert len(generate_raw_profile_dataset([["first_name", "blank"]], linker)) == 0


def test_profile_null_columns(caplog):
    """Check that profiling an all-null column logs a warning."""
    # Single-row input: ``test_1`` holds a value, ``test_2`` is entirely null.
    records = [{"unique_id": 1, "test_1": 1, "test_2": None}]
    linker = DuckDBLinker(pd.DataFrame(records))

    linker.profile_columns(["test_1", "test_2"])

    # The warning must name the all-null column.
    expected_warning = (
        "Warning: No charts produced for test_2 as the column only "
        "contains null values."
    )
    assert expected_warning in caplog.text
Loading