Merge pull request #773 from moj-analytical-services/roc_from_labels_column

[FEAT] ROC/Precision recall/truth table from label column name
moj-analytical-services · Sep 15, 2022 · b613a08 · b613a08
2 parents fb98c96 + 135e1e5
commit b613a08
Show file tree

Hide file tree

Showing 8 changed files with 420 additions and 128 deletions.
diff --git a/docs/linker.md b/docs/linker.md
@@ -31,15 +31,18 @@ tags:
         - match_weights_chart
         - missingness_chart
         - parameter_estimate_comparisons_chart
-        - precision_recall_chart_from_labels
+        - precision_recall_chart_from_labels_column
+        - precision_recall_chart_from_labels_table
         - predict
         - prediction_errors_from_label_column
         - prediction_errors_from_labels_table
         - profile_columns
-        - roc_chart_from_labels
-        - roc_table_from_labels
+        - roc_chart_from_labels_column
+        - roc_chart_from_labels_table
         - save_settings_to_json
         - train_m_from_pairwise_labels
+        - truth_space_table_from_labels_column
+        - truth_space_table_from_labels_table
         - unlinkables_chart
         - waterfall_chart
     rendering:

diff --git a/docs/linkerqa.md b/docs/linkerqa.md
@@ -15,11 +15,14 @@ tags:
         - match_weight_histogram
         - match_weights_chart
         - parameter_estimate_comparisons_chart
-        - precision_recall_chart_from_labels
+        - precision_recall_chart_from_labels_column
+        - precision_recall_chart_from_labels_table
         - prediction_errors_from_label_column
         - prediction_errors_from_labels_table
-        - roc_chart_from_labels
-        - roc_table_from_labels
+        - roc_chart_from_labels_column
+        - roc_chart_from_labels_table
+        - truth_space_table_from_labels_column
+        - truth_space_table_from_labels_table
         - unlinkables_chart
         - waterfall_chart
     rendering:

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "splink"
-version = "3.2.1"
+version = "3.3.0.dev01"
 description = "Fast probabilistic data linkage at scale"
 authors = ["Robin Linacre <[email protected]>", "Sam Lindsay", "Theodore Manassis", "Tom Hepworth"]
 license = "MIT"

diff --git a/splink/__init__.py b/splink/__init__.py
@@ -1,3 +1 @@
-import pkg_resources
-
-__version__ = pkg_resources.require("splink")[0].version
+__version__ = "3.3.0.dev01"
diff --git a/splink/accuracy.py b/splink/accuracy.py
@@ -1,4 +1,5 @@
-from copy import copy
+from copy import deepcopy
+
 
 from .block_from_labels import block_from_labels
 from .comparison_vector_values import compute_comparison_vector_values_sql
@@ -184,7 +185,7 @@ def truth_space_table_from_labels_with_predictions_sqls(
     return sqls
 
 
-def roc_table(
+def truth_space_table_from_labels_table(
     linker, labels_tablename, threshold_actual=0.5, match_weight_round_to_nearest=None
 ):
 
@@ -213,6 +214,41 @@ def roc_table(
     return df_truth_space_table
 
 
+def truth_space_table_from_labels_column(
+    linker, label_colname, threshold_actual=0.5, match_weight_round_to_nearest=None
+):
+
+    new_matchkey = len(linker._settings_obj._blocking_rules_to_generate_predictions)
+
+    df_predict = _predict_from_label_column_sql(
+        linker,
+        label_colname,
+    )
+
+    sql = f"""
+    select
+    cast(({label_colname}_l = {label_colname}_r) as float) as clerical_match_score,
+    not (cast(match_key as int) = {new_matchkey})
+        as found_by_blocking_rules,
+    *
+    from {df_predict.physical_name}
+    """
+
+    linker._enqueue_sql(sql, "__splink__labels_with_predictions")
+
+    # c_P and c_N are clerical positive and negative, respectively
+    sqls = truth_space_table_from_labels_with_predictions_sqls(
+        threshold_actual, match_weight_round_to_nearest
+    )
+
+    for sql in sqls:
+        linker._enqueue_sql(sql["sql"], sql["output_table_name"])
+
+    df_truth_space_table = linker._execute_sql_pipeline()
+
+    return df_truth_space_table
+
+
 def predictions_from_sample_of_pairwise_labels_sql(linker, labels_tablename):
     sqls = block_from_labels(linker, labels_tablename)
 
@@ -280,20 +316,12 @@ def prediction_errors_from_labels_table(
     return linker._execute_sql_pipeline()
 
 
-# from splink.linker import Linker
+def _predict_from_label_column_sql(linker, label_colname):
 
-
-def prediction_errors_from_label_column(
-    linker,
-    label_colname,
-    include_false_positives=True,
-    include_false_negatives=True,
-    threshold=0.5,
-):
     # In the case of labels, we use them to block
     # In the case we have a label column, we want to apply the model's blocking rules
     # but add in blocking on the label colname
-
+    linker = deepcopy(linker)
     settings = linker._settings_obj
     brs = settings._blocking_rules_to_generate_predictions
 
@@ -304,19 +332,38 @@ def prediction_errors_from_label_column(
     # Need the label colname to be in additional columns to retain
 
     add_cols = settings._additional_columns_to_retain_list
-    add_columns_to_restore = copy(add_cols)
+
     if label_colname not in add_cols:
         settings._additional_columns_to_retain_list.append(label_colname)
 
     # Now we want to create predictions
     df_predict = linker.predict()
 
+    return df_predict
+
+
+def prediction_errors_from_label_column(
+    linker,
+    label_colname,
+    include_false_positives=True,
+    include_false_negatives=True,
+    threshold=0.5,
+):
+
+    df_predict = _predict_from_label_column_sql(
+        linker,
+        label_colname,
+    )
+
     # Clerical match score is 1 where the label_colname is equal else zero
 
+    # _predict_from_label_column_sql will add a match key for matching on labels
+    new_matchkey = len(linker._settings_obj._blocking_rules_to_generate_predictions)
+
     sql = f"""
     select
     cast(({label_colname}_l = {label_colname}_r) as float) as clerical_match_score,
-    not (cast(match_key as int) = {label_blocking_rule.match_key})
+    not (cast(match_key as int) = {new_matchkey})
         as found_by_blocking_rules,
     *
     from {df_predict.physical_name}
@@ -358,8 +405,4 @@ def prediction_errors_from_label_column(
 
     predictions = linker._execute_sql_pipeline()
 
-    # Remove the blocking rule we added and restore original add cols to ret
-    brs.pop()
-    settings._additional_columns_to_retain_list = add_columns_to_restore
-
     return predictions