Skip to content

Commit

Permalink
Merge pull request #773 from moj-analytical-services/roc_from_labels_…
Browse files Browse the repository at this point in the history
…column

[FEAT] ROC/Precision recall/truth table from label column name
  • Loading branch information
RobinL authored Sep 15, 2022
2 parents fb98c96 + 135e1e5 commit b613a08
Show file tree
Hide file tree
Showing 8 changed files with 420 additions and 128 deletions.
9 changes: 6 additions & 3 deletions docs/linker.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,18 @@ tags:
- match_weights_chart
- missingness_chart
- parameter_estimate_comparisons_chart
- precision_recall_chart_from_labels
- precision_recall_chart_from_labels_column
- precision_recall_chart_from_labels_table
- predict
- prediction_errors_from_label_column
- prediction_errors_from_labels_table
- profile_columns
- roc_chart_from_labels
- roc_table_from_labels
- roc_chart_from_labels_column
- roc_chart_from_labels_table
- save_settings_to_json
- train_m_from_pairwise_labels
- truth_space_table_from_labels_column
- truth_space_table_from_labels_table
- unlinkables_chart
- waterfall_chart
rendering:
Expand Down
9 changes: 6 additions & 3 deletions docs/linkerqa.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,14 @@ tags:
- match_weight_histogram
- match_weights_chart
- parameter_estimate_comparisons_chart
- precision_recall_chart_from_labels
- precision_recall_chart_from_labels_column
- precision_recall_chart_from_labels_table
- prediction_errors_from_label_column
- prediction_errors_from_labels_table
- roc_chart_from_labels
- roc_table_from_labels
- roc_chart_from_labels_column
- roc_chart_from_labels_table
- truth_space_table_from_labels_column
- truth_space_table_from_labels_table
- unlinkables_chart
- waterfall_chart
rendering:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "splink"
version = "3.2.1"
version = "3.3.0.dev01"
description = "Fast probabilistic data linkage at scale"
authors = ["Robin Linacre <[email protected]>", "Sam Lindsay", "Theodore Manassis", "Tom Hepworth"]
license = "MIT"
Expand Down
4 changes: 1 addition & 3 deletions splink/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1 @@
import pkg_resources

__version__ = pkg_resources.require("splink")[0].version
__version__ = "3.3.0.dev01"
79 changes: 61 additions & 18 deletions splink/accuracy.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from copy import copy
from copy import deepcopy


from .block_from_labels import block_from_labels
from .comparison_vector_values import compute_comparison_vector_values_sql
Expand Down Expand Up @@ -184,7 +185,7 @@ def truth_space_table_from_labels_with_predictions_sqls(
return sqls


def roc_table(
def truth_space_table_from_labels_table(
linker, labels_tablename, threshold_actual=0.5, match_weight_round_to_nearest=None
):

Expand Down Expand Up @@ -213,6 +214,41 @@ def roc_table(
return df_truth_space_table


def truth_space_table_from_labels_column(
    linker, label_colname, threshold_actual=0.5, match_weight_round_to_nearest=None
):
    """Compute the truth space table using a label column in the input data.

    Predictions are obtained from ``_predict_from_label_column_sql``. Each
    predicted pair is then scored against the label column, and that result
    is queued as ``__splink__labels_with_predictions`` ahead of the truth
    space SQL steps.

    Args:
        linker: Linker whose SQL pipeline is used to build the table.
        label_colname: Name of the label column. It is compared through its
            ``_l`` and ``_r`` variants in the predictions table.
        threshold_actual: Forwarded unchanged to
            ``truth_space_table_from_labels_with_predictions_sqls``.
        match_weight_round_to_nearest: Forwarded unchanged to
            ``truth_space_table_from_labels_with_predictions_sqls``.

    Returns:
        The result of ``linker._execute_sql_pipeline()`` after all steps
        have been queued.
    """
    # Read the rule count from the caller's linker *before* predicting.
    # NOTE(review): this relies on the prediction helper adding exactly one
    # extra blocking rule (on the label column) after the existing ones, so
    # that the rule count equals the new rule's match key - confirm against
    # _predict_from_label_column_sql.
    blocking_rules = linker._settings_obj._blocking_rules_to_generate_predictions
    label_rule_match_key = len(blocking_rules)

    predictions = _predict_from_label_column_sql(linker, label_colname)

    # clerical_match_score is the label equality cast to float;
    # found_by_blocking_rules is false only for pairs whose match key is the
    # one computed above.
    labels_with_predictions_sql = f"""
select
cast(({label_colname}_l = {label_colname}_r) as float) as clerical_match_score,
not (cast(match_key as int) = {label_rule_match_key})
as found_by_blocking_rules,
*
from {predictions.physical_name}
"""
    linker._enqueue_sql(
        labels_with_predictions_sql, "__splink__labels_with_predictions"
    )

    # In the generated SQL, c_P and c_N are clerical positive and negative,
    # respectively
    truth_space_steps = truth_space_table_from_labels_with_predictions_sqls(
        threshold_actual, match_weight_round_to_nearest
    )
    for step in truth_space_steps:
        linker._enqueue_sql(step["sql"], step["output_table_name"])

    return linker._execute_sql_pipeline()


def predictions_from_sample_of_pairwise_labels_sql(linker, labels_tablename):
sqls = block_from_labels(linker, labels_tablename)

Expand Down Expand Up @@ -280,20 +316,12 @@ def prediction_errors_from_labels_table(
return linker._execute_sql_pipeline()


# from splink.linker import Linker
def _predict_from_label_column_sql(linker, label_colname):


def prediction_errors_from_label_column(
linker,
label_colname,
include_false_positives=True,
include_false_negatives=True,
threshold=0.5,
):
# In the case of labels, we use them to block
# In the case we have a label column, we want to apply the model's blocking rules
# but add in blocking on the label colname

linker = deepcopy(linker)
settings = linker._settings_obj
brs = settings._blocking_rules_to_generate_predictions

Expand All @@ -304,19 +332,38 @@ def prediction_errors_from_label_column(
# Need the label colname to be in additional columns to retain

add_cols = settings._additional_columns_to_retain_list
add_columns_to_restore = copy(add_cols)

if label_colname not in add_cols:
settings._additional_columns_to_retain_list.append(label_colname)

# Now we want to create predictions
df_predict = linker.predict()

return df_predict


def prediction_errors_from_label_column(
linker,
label_colname,
include_false_positives=True,
include_false_negatives=True,
threshold=0.5,
):

df_predict = _predict_from_label_column_sql(
linker,
label_colname,
)

# Clerical match score is 1 where the label_colname is equal else zero

# _predict_from_label_column_sql will add a match key for matching on labels
new_matchkey = len(linker._settings_obj._blocking_rules_to_generate_predictions)

sql = f"""
select
cast(({label_colname}_l = {label_colname}_r) as float) as clerical_match_score,
not (cast(match_key as int) = {label_blocking_rule.match_key})
not (cast(match_key as int) = {new_matchkey})
as found_by_blocking_rules,
*
from {df_predict.physical_name}
Expand Down Expand Up @@ -358,8 +405,4 @@ def prediction_errors_from_label_column(

predictions = linker._execute_sql_pipeline()

# Remove the blocking rule we added and restore original add cols to ret
brs.pop()
settings._additional_columns_to_retain_list = add_columns_to_restore

return predictions
Loading

0 comments on commit b613a08

Please sign in to comment.