Skip to content

Commit

Permalink
Merge pull request #1339 from moj-analytical-services/856_profile_null_column
Browse files Browse the repository at this point in the history

856 profile null column
  • Loading branch information
sama-ds authored Nov 8, 2023
2 parents d673eaa + 1145289 commit 4b3d365
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 18 deletions.
53 changes: 35 additions & 18 deletions splink/profile_data.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import logging
import re
from copy import deepcopy

from .charts import altair_or_json, load_chart_definition
from .misc import ensure_is_list

logger = logging.getLogger(__name__)


def _group_name(cols_or_expr):
cols_or_expr = re.sub(r"[^0-9a-zA-Z_]", " ", cols_or_expr)
Expand Down Expand Up @@ -270,21 +273,35 @@ def profile_columns(linker, column_expressions=None, top_n=10, bottom_n=10):
percentile_rows = [
p for p in percentile_rows_all if p["group_name"] == _group_name(expression)
]
percentile_rows = _add_100_percentile_to_df_percentiles(percentile_rows)
top_n_rows = [
p for p in top_n_rows_all if p["group_name"] == _group_name(expression)
]
bottom_n_rows = [
p for p in bottom_n_rows_all if p["group_name"] == _group_name(expression)
]
# remove concat blank from expression title
expression = expression.replace(", ' '", "")
inner_chart = _get_inner_chart_spec_freq(
percentile_rows, top_n_rows, bottom_n_rows, expression
)
inner_charts.append(inner_chart)
outer_spec = deepcopy(_outer_chart_spec_freq)

outer_spec["vconcat"] = inner_charts

return altair_or_json(outer_spec)
if percentile_rows == []:
logger.warning(
"Warning: No charts produced for "
f"{expression}"
" as the column only contains null values."
)
else:
percentile_rows = _add_100_percentile_to_df_percentiles(percentile_rows)
top_n_rows = [
p for p in top_n_rows_all if p["group_name"] == _group_name(expression)
]
bottom_n_rows = [
p
for p in bottom_n_rows_all
if p["group_name"] == _group_name(expression)
]
# remove concat blank from expression title
expression = expression.replace(", ' '", "")
inner_chart = _get_inner_chart_spec_freq(
percentile_rows, top_n_rows, bottom_n_rows, expression
)
inner_charts.append(inner_chart)

if inner_charts != []:

outer_spec = deepcopy(_outer_chart_spec_freq)
outer_spec["vconcat"] = inner_charts

return altair_or_json(outer_spec)

else:
return None
20 changes: 20 additions & 0 deletions tests/test_profile_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,3 +176,23 @@ def test_profile_using_spark(df_spark):
)

assert len(generate_raw_profile_dataset([["first_name", "blank"]], linker)) == 0


def test_profile_null_columns(caplog):
    """Check that profiling an all-null column logs a "no charts" warning.

    The input frame has a single row in which ``test_1`` holds a value and
    ``test_2`` is ``None``. After profiling both columns, the captured log
    text must contain the warning naming ``test_2``.
    """
    records = [{"unique_id": 1, "test_1": 1, "test_2": None}]
    linker = DuckDBLinker(pd.DataFrame(records))

    linker.profile_columns(["test_1", "test_2"])

    # Message is matched as a substring of everything caplog captured.
    expected_warning = (
        "Warning: No charts produced for test_2 as the column only "
        "contains null values."
    )
    assert expected_warning in caplog.text

0 comments on commit 4b3d365

Please sign in to comment.