Skip to content

Commit

Permalink
Merge pull request #1611 from moj-analytical-services/missingness-same-columns
Browse files Browse the repository at this point in the history

Check input frames have same columns - missingness
  • Loading branch information
ThomasHepworth authored Nov 8, 2023
2 parents 882c396 + 70cc6d2 commit 68aee62
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 12 deletions.
41 changes: 31 additions & 10 deletions splink/linker.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,18 +250,38 @@ def __init__(
self.debug_mode = False

@property
def _get_input_columns(
def _input_columns(
self,
as_list=True,
):
"""Retrieve the column names from the input dataset(s)"""
df_obj: SplinkDataFrame = next(iter(self._input_tables_dict.values()))

column_names = (
[col.name() for col in df_obj.columns] if as_list else df_obj.columns
)
input_dfs = self._input_tables_dict.values()

# get a list of the column names for each input frame
# sort it for consistent ordering, and give each frame's
# columns as a tuple so we can hash it
column_names_by_input_df = [
tuple(sorted([col.name() for col in input_df.columns]))
for input_df in input_dfs
]
# check that the set of input columns is the same for each frame,
# fail if the sets are different
if len(set(column_names_by_input_df)) > 1:
common_cols = set.intersection(
*(set(col_names) for col_names in column_names_by_input_df)
)
problem_names = {
col
for frame_col_names in column_names_by_input_df
for col in frame_col_names
if col not in common_cols
}
raise SplinkException(
"All linker input frames must have the same set of columns. "
"The following columns were not found in all input frames: "
+ ", ".join(problem_names)
)

return column_names
return next(iter(input_dfs)).columns

@property
def _cache_uid(self):
Expand Down Expand Up @@ -3044,8 +3064,9 @@ def missingness_chart(self, input_dataset: str = None):
Args:
input_dataset (str, optional): Name of one of the input tables in the
database. If provided, missingness will be computed for this table alone.
Defaults to None.
database. If provided, missingness will be computed for
this table alone.
Defaults to None.
Examples:
```py
Expand Down
2 changes: 1 addition & 1 deletion splink/missingness.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,13 @@ def missingness_sqls(columns, input_tablename):


def missingness_data(linker, input_tablename):
columns = linker._input_columns
if input_tablename is None:
splink_dataframe = linker._initialise_df_concat(materialise=True)
else:
splink_dataframe = linker._table_to_splink_dataframe(
input_tablename, input_tablename
)
columns = splink_dataframe.columns

sqls = missingness_sqls(columns, splink_dataframe.physical_name)

Expand Down
2 changes: 1 addition & 1 deletion splink/profile_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ def profile_columns(linker, column_expressions=None, top_n=10, bottom_n=10):
"""

if not column_expressions:
column_expressions = linker._get_input_columns
column_expressions = [col.name() for col in linker._input_columns]

df_concat = linker._initialise_df_concat()

Expand Down
35 changes: 35 additions & 0 deletions tests/test_missingness.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import pandas as pd
from pytest import raises

from splink.exceptions import SplinkException
from tests.decorator import mark_with_dialects_excluding


@mark_with_dialects_excluding()
def test_missingness_chart(dialect, test_helpers):
    """Smoke test: the missingness chart builds for a single input frame.

    Loads the demo CSV through the dialect-specific helper, builds a
    dedupe-only linker on it and calls ``missingness_chart``. The test
    passes as long as no exception is raised.
    """
    helper = test_helpers[dialect]
    settings = {"link_type": "dedupe_only"}

    input_frame = helper.load_frame_from_csv(
        "./tests/datasets/fake_1000_from_splink_demos.csv"
    )

    linker = helper.Linker(input_frame, settings, **helper.extra_linker_args())
    linker.missingness_chart()


@mark_with_dialects_excluding()
def test_missingness_chart_mismatched_columns(dialect, test_helpers):
    """Check that ``missingness_chart`` rejects inputs with differing columns.

    Both frames come from the same demo CSV, but the right-hand frame has
    its ``surname`` column renamed to ``SURNAME`` so that the two inputs no
    longer share the same set of column names. Calling ``missingness_chart``
    on a link-only linker built from them is expected to raise
    ``SplinkException``.
    """
    helper = test_helpers[dialect]
    csv_path = "./tests/datasets/fake_1000_from_splink_demos.csv"

    left_frame = helper.load_frame_from_csv(csv_path)

    # Read the right-hand frame with pandas first so one column can be
    # renamed before it is converted into the dialect's own frame type.
    renamed = pd.read_csv(csv_path).rename(columns={"surname": "SURNAME"})
    right_frame = helper.convert_frame(renamed)

    linker = helper.Linker(
        [left_frame, right_frame],
        {"link_type": "link_only"},
        **helper.extra_linker_args(),
    )
    with raises(SplinkException):
        linker.missingness_chart()

0 comments on commit 68aee62

Please sign in to comment.