feat(linking): implement possible match short-term (#111)
## Description
* Implement Possible Match using the short-term approach: apply a
user-configurable range to the Belongingness Ratio threshold.
* To support Possible Match, update the Link API response as follows:

**Example:** `belongingness_ratio=[0.75, 0.9]`

**No Match**
```
{
    "patient_reference_id": "5a12bf00-16f8-4c03-b980-81d3f5ff6db6",
    "person_reference_id": "b104942e-6327-4643-801a-5fb22cb621c4", # New Person ID
    "results": []
}
```

**Possible Match**
```
{
    "patient_reference_id": "c5fcfc39-a915-4b65-afc7-b681c892652e",
    "person_reference_id": null,
    "results": [
        {
            "person_reference_id": "bf27f9f2-2882-4e6f-9db9-6a916679cff7",
            "belongingness_ratio": 0.8
        }
    ]
}
```

**Match** `include_multiple_matches=false`
```
{
    "patient_reference_id": "2eb14d86-217f-45c3-a528-d038e7eadce7",
    "person_reference_id": "6abb63cb-6c31-4142-bdbf-8b2956dab1c1",
    "results": [
        {
            "person_reference_id": "6abb63cb-6c31-4142-bdbf-8b2956dab1c1",
            "belongingness_ratio": 0.98
        }
    ]
}
```

**Match** `include_multiple_matches=true`
```
{
    "patient_reference_id": "2eb14d86-217f-45c3-a528-d038e7eadce7",
    "person_reference_id": "6abb63cb-6c31-4142-bdbf-8b2956dab1c1", # Person ID with highest Belongingness Ratio
    "results": [
        {
            "person_reference_id": "6abb63cb-6c31-4142-bdbf-8b2956dab1c1",
            "belongingness_ratio": 0.98
        },
        {
            "person_reference_id": "8acf6104-d633-4516-8c7f-2c6e88a8ceae",
            "belongingness_ratio": 0.8
        }
    ]
}
```

## Related Issues
#91 

## Additional Notes
* Rename `cluster_ratio` -> `belongingness_ratio` for clarity
* Move `belongingness_ratio` definition from `AlgorithmPass` to
`Algorithm` level (this was the DIBBs team's original intent)
* Add user-configurable `Algorithm.include_multiple_matches` boolean
parameter to return multiple matches
* Given `belongingness_ratio=[X, Y]`, if
`Algorithm.include_multiple_matches=true`, return all `results` where
`belongingness_ratio >= X`. Assign `person_reference_id` of the Person with
the highest Belongingness Ratio.
* Add test coverage for
    * Possible Match result (Basic, Enhanced Algorithm)
    * New Link API response (`LinkResponse`, `LinkFhirResponse`)
    * Including multiple matches or not (`Algorithm.include_multiple_matches`)
* Add new test patient bundle assets for the above coverage
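The decision rule described in the notes above can be sketched in plain Python. This is an illustrative reduction of the logic, not the actual `recordlinker` implementation; `categorize` and its arguments are hypothetical names:

```python
def categorize(scores, lower, upper, include_multiple_matches=True):
    """Classify scored Person clusters given belongingness_ratio=[lower, upper].

    `scores` maps a person id to its Belongingness Ratio; clusters scoring
    below `lower` are assumed to have been discarded during the linkage passes.
    """
    ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    if not ranked:
        # No match: the caller creates a new Person cluster
        return "no_match", None, []
    if ranked[0][1] >= upper:
        # Match: return every remaining candidate, or only the
        # highest-scoring one when multiple matches are excluded
        results = ranked if include_multiple_matches else ranked[:1]
        return "match", ranked[0][0], results
    # Possible match: no Person assigned; candidates are returned for review
    return "possible_match", None, ranked


# Mirrors the Possible Match example above, with belongingness_ratio=[0.75, 0.9]
print(categorize({"bf27f9f2": 0.8}, 0.75, 0.9))
# → ('possible_match', None, [('bf27f9f2', 0.8)])
```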

---------

Co-authored-by: Eric Buckley <[email protected]>
Co-authored-by: cbrinson-rise8 <[email protected]>
3 people authored Nov 18, 2024
1 parent 379cd47 commit 1775e79
Showing 20 changed files with 1,191 additions and 291 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/check_smoke_tests.yml
@@ -62,7 +62,7 @@ jobs:
-H "Content-Type: application/json")
echo "Response: $RESPONSE_1"
echo "$RESPONSE_1" | jq -e '.is_match == false'
echo "$RESPONSE_1" | jq -e '.prediction == "no_match"'
PERSON_REFERENCE_ID=$(echo "$RESPONSE_1" | jq -r '.person_reference_id')
@@ -71,7 +71,7 @@
-H "Content-Type: application/json")
echo "Response: $RESPONSE_2"
echo "$RESPONSE_2" | jq -e '.is_match == true'
echo "$RESPONSE_2" | jq -e '.prediction == "match"'
echo "$RESPONSE_2" | jq -e --arg id "$PERSON_REFERENCE_ID" '.person_reference_id == $id'
#enhanced tests
@@ -80,7 +80,7 @@
-H "Content-Type: application/json")
echo "Response: $RESPONSE_3"
echo "$RESPONSE_3" | jq -e '.is_match == false'
echo "$RESPONSE_3" | jq -e '.prediction == "no_match"'
PERSON_REFERENCE_ID=$(echo "$RESPONSE_3" | jq -r '.person_reference_id')
@@ -89,7 +89,7 @@
-H "Content-Type: application/json")
echo "Response: $RESPONSE_4"
echo "$RESPONSE_4" | jq -e '.is_match == true'
echo "$RESPONSE_4" | jq -e '.prediction == "match"'
echo "$RESPONSE_4" | jq -e --arg id "$PERSON_REFERENCE_ID" '.person_reference_id == $id'
#invalid tests
8 changes: 4 additions & 4 deletions src/recordlinker/assets/initial_algorithms.json
@@ -3,6 +3,8 @@
"label": "dibbs-basic",
"description": "The DIBBs Default Algorithm. Based on field experimentation and statistical analysis, this deterministic two-pass algorithm combines geographical and personal information to maximize linkage quality while minimizing false positives",
"is_default": true,
"include_multiple_matches": true,
"belongingness_ratio": [0.75, 0.9],
"passes": [
{
"blocking_keys": [
@@ -15,7 +17,6 @@
"LAST_NAME": "func:recordlinker.linking.matchers.feature_match_exact"
},
"rule": "func:recordlinker.linking.matchers.eval_perfect_match",
"cluster_ratio": 0.9,
"kwargs": {
"thresholds": {
"FIRST_NAME": 0.9,
@@ -39,7 +40,6 @@
"BIRTHDATE": "func:recordlinker.linking.matchers.feature_match_exact"
},
"rule": "func:recordlinker.linking.matchers.eval_perfect_match",
"cluster_ratio": 0.9,
"kwargs": {
"thresholds": {
"FIRST_NAME": 0.9,
@@ -57,6 +57,8 @@
"label": "dibbs-enhanced",
"description": "The DIBBs Log-Odds Algorithm. This optional algorithm uses statistical correction to adjust the links between incoming records and previously processed patients (it does so by taking advantage of the fact that some fields are more informative than others—e.g., two records matching on MRN is stronger evidence that they should be linked than if the records matched on zip code). It can be used if additional granularity in matching links is desired. However, while the DIBBs Log-Odds Algorithm can create higher-quality links, it is dependent on statistical updating and pre-calculated population analysis, which requires some work on the part of the user. For those cases where additional precision or stronger matching criteria are required, the Log-Odds algorithm is detailed below.",
"is_default": false,
"include_multiple_matches": true,
"belongingness_ratio": [0.75, 0.9],
"passes": [
{
"blocking_keys": [
@@ -69,7 +71,6 @@
"LAST_NAME": "func:recordlinker.linking.matchers.feature_match_log_odds_fuzzy_compare"
},
"rule": "func:recordlinker.linking.matchers.eval_log_odds_cutoff",
"cluster_ratio": 0.9,
"kwargs": {
"similarity_measure": "JaroWinkler",
"thresholds": {
@@ -106,7 +107,6 @@
"BIRTHDATE": "func:recordlinker.linking.matchers.feature_match_log_odds_fuzzy_compare"
},
"rule": "func:recordlinker.linking.matchers.eval_log_odds_cutoff",
"cluster_ratio": 0.9,
"kwargs": {
"similarity_measure": "JaroWinkler",
"thresholds": {
11 changes: 5 additions & 6 deletions src/recordlinker/database/mpi_service.py
@@ -82,8 +82,6 @@ def insert_patient(
:returns: The inserted Patient record
"""
# create a new Person record if one isn't provided
person = person or models.Person()

patient = models.Patient(person=person, record=record, external_patient_id=external_patient_id)

@@ -127,12 +125,13 @@ def bulk_insert_patients(
if not records:
return []

person = person or models.Person()
session.add(person)
session.flush()
if person:
session.add(person)
session.flush()

pat_data = [
{
"person_id": person.id,
"person_id": person and person.id,
"_data": record.to_json(prune_empty=True),
"external_patient_id": record.external_id,
"external_person_id": external_person_id,
38 changes: 30 additions & 8 deletions src/recordlinker/linking/link.py
@@ -6,8 +6,8 @@
"""

import collections
import dataclasses
import typing
import uuid

from sqlalchemy import orm

@@ -29,6 +29,12 @@
TRACER = MockTracer()


@dataclasses.dataclass
class LinkResult:
person: models.Person
belongingness_ratio: float


def compare(
record: schemas.PIIRecord, patient: models.Patient, algorithm_pass: models.AlgorithmPass
) -> bool:
@@ -59,7 +65,7 @@ def link_record_against_mpi(
session: orm.Session,
algorithm: models.Algorithm,
external_person_id: typing.Optional[str] = None,
) -> tuple[bool, uuid.UUID, uuid.UUID]:
) -> tuple[models.Patient, models.Person | None, list[LinkResult]]:
"""
Runs record linkage on a single incoming record (extracted from a FHIR
bundle) using an existing database as an MPI. Uses a flexible algorithm
@@ -82,10 +88,10 @@
# Membership scores need to persist across linkage passes so that we can
# find the highest scoring match across all passes
scores: dict[models.Person, float] = collections.defaultdict(float)
# the minimum ratio of matches needed to be considered a cluster member
belongingness_ratio_lower_bound, belongingness_ratio_upper_bound = algorithm.belongingness_ratio
for algorithm_pass in algorithm.passes:
with TRACER.start_as_current_span("link.pass"):
# the minimum ratio of matches needed to be considered a cluster member
cluster_ratio = algorithm_pass.cluster_ratio
# initialize a dictionary to hold the clusters of patients for each person
clusters: dict[models.Person, list[models.Patient]] = collections.defaultdict(list)
# block on the pii_record and the algorithm's blocking criteria, then
@@ -108,17 +114,33 @@
if compare(record, patient, algorithm_pass):
matched_count += 1
# calculate the match ratio for this person cluster
match_ratio = matched_count / len(patients)
if match_ratio >= cluster_ratio:
belongingness_ratio = matched_count / len(patients)
if belongingness_ratio >= belongingness_ratio_lower_bound:
# The match ratio is larger than the minimum cluster threshold,
# optionally update the max score for this person
scores[person] = max(scores[person], match_ratio)
scores[person] = max(scores[person], belongingness_ratio)

matched_person: typing.Optional[models.Person] = None
if scores:
# Find the person with the highest matching score
matched_person, _ = max(scores.items(), key=lambda i: i[1])

sorted_scores: list[LinkResult] = [LinkResult(k, v) for k, v in sorted(scores.items(), reverse=True, key=lambda item: item[1])]
if not scores:
# No match
matched_person = models.Person() # Create new Person Cluster
results = []
elif sorted_scores[0].belongingness_ratio >= belongingness_ratio_upper_bound:
# Match (1 or many)
matched_person = sorted_scores[0].person
results = [x for x in sorted_scores if x.belongingness_ratio >= belongingness_ratio_upper_bound] # Multiple matches
if not algorithm.include_multiple_matches:
results = [results[0]] # 1 Match (highest Belongingness Ratio)
else:
# Possible match
matched_person = None
results = sorted_scores

with TRACER.start_as_current_span("insert"):
patient = mpi_service.insert_patient(
session,
Expand All @@ -130,4 +152,4 @@ def link_record_against_mpi(
)

    # return the created Patient, the matched Person (or None), and the results
return (bool(matched_person), patient.person.reference_id, patient.reference_id)
return (patient, patient.person, results)
18 changes: 17 additions & 1 deletion src/recordlinker/models/algorithm.py
@@ -23,10 +23,27 @@ class Algorithm(Base):
is_default: orm.Mapped[bool] = orm.mapped_column(default=False, index=True)
label: orm.Mapped[str] = orm.mapped_column(sqltypes.String(255), unique=True)
description: orm.Mapped[str] = orm.mapped_column(sqltypes.Text(), nullable=True)
include_multiple_matches: orm.Mapped[bool] = orm.mapped_column(sqltypes.Boolean, default=True)
belongingness_ratio_lower_bound: orm.Mapped[float] = orm.mapped_column(sqltypes.Float, default=1.0)
belongingness_ratio_upper_bound: orm.Mapped[float] = orm.mapped_column(sqltypes.Float, default=1.0)
passes: orm.Mapped[list["AlgorithmPass"]] = orm.relationship(
back_populates="algorithm", cascade="all, delete-orphan"
)

@property
def belongingness_ratio(self) -> tuple[float, float]:
"""
        Get the Belongingness Ratio threshold range for this algorithm.
"""
return (self.belongingness_ratio_lower_bound, self.belongingness_ratio_upper_bound)

@belongingness_ratio.setter # type: ignore
def belongingness_ratio(self, value: tuple[float, float]):
"""
        Set the Belongingness Ratio threshold range for this algorithm.
"""
self.belongingness_ratio_lower_bound, self.belongingness_ratio_upper_bound = value

@classmethod
def from_dict(cls, **data: dict) -> "Algorithm":
"""
@@ -82,7 +99,6 @@ class AlgorithmPass(Base):
blocking_keys: orm.Mapped[list[str]] = orm.mapped_column(sqltypes.JSON)
_evaluators: orm.Mapped[dict[str, str]] = orm.mapped_column("evaluators", sqltypes.JSON)
_rule: orm.Mapped[str] = orm.mapped_column("rule", sqltypes.String(255))
cluster_ratio: orm.Mapped[float] = orm.mapped_column(sqltypes.Float)
kwargs: orm.Mapped[dict] = orm.mapped_column(sqltypes.JSON, default=dict)

@property
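The paired-column accessor added to `Algorithm` above can be exercised on its own. The sketch below strips the SQLAlchemy mapping and keeps only the tuple-valued property pattern; it is illustrative, not the mapped model:

```python
class Algorithm:
    """Minimal stand-in for the ORM model: two scalar bounds exposed
    as one (lower, upper) Belongingness Ratio pair."""

    def __init__(self, lower: float = 1.0, upper: float = 1.0):
        self.belongingness_ratio_lower_bound = lower
        self.belongingness_ratio_upper_bound = upper

    @property
    def belongingness_ratio(self) -> tuple[float, float]:
        # Read both bounds back as a single tuple
        return (
            self.belongingness_ratio_lower_bound,
            self.belongingness_ratio_upper_bound,
        )

    @belongingness_ratio.setter
    def belongingness_ratio(self, value: tuple[float, float]):
        # Unpack a (lower, upper) pair into the two stored bounds
        self.belongingness_ratio_lower_bound, self.belongingness_ratio_upper_bound = value


algo = Algorithm()
algo.belongingness_ratio = (0.75, 0.9)
print(algo.belongingness_ratio)  # → (0.75, 0.9)
```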
2 changes: 1 addition & 1 deletion src/recordlinker/models/mpi.py
@@ -39,7 +39,7 @@ class Patient(Base):
__tablename__ = "mpi_patient"

id: orm.Mapped[int] = orm.mapped_column(get_bigint_pk(), autoincrement=True, primary_key=True)
person_id: orm.Mapped[int] = orm.mapped_column(schema.ForeignKey(f"{Person.__tablename__}.id"))
person_id: orm.Mapped[int] = orm.mapped_column(schema.ForeignKey(f"{Person.__tablename__}.id"), nullable=True)
person: orm.Mapped["Person"] = orm.relationship(back_populates="patients")
# NOTE: We're using a protected attribute here to store the data string, as we
# want getter/setter access to the data dictionary to trigger updating the
67 changes: 36 additions & 31 deletions src/recordlinker/routes/link_router.py
@@ -46,16 +46,16 @@ async def link_piirecord(
# link the record
try:
# Make a copy of record_to_link so we don't modify the original
(found_match, new_person_id, patient_reference_id) = link.link_record_against_mpi(
(patient, person, results) = link.link_record_against_mpi(
record=input.record,
session=db_session,
algorithm=algorithm,
external_person_id=input.external_person_id,
)
return schemas.LinkResponse(
is_match=found_match,
patient_reference_id=patient_reference_id,
person_reference_id=new_person_id,
patient_reference_id=patient.reference_id,
person_reference_id=(person and person.reference_id),
results=[schemas.LinkResult(**r.__dict__) for r in results]
)

except ValueError:
@@ -86,12 +86,10 @@ async def link_dibbs(
algorithm = algorithm_service.default_algorithm(db_session)

if not algorithm:
response.status_code = fastapi.status.HTTP_422_UNPROCESSABLE_ENTITY
return schemas.LinkFhirResponse(
found_match=False,
updated_bundle=input_bundle,
message="Error: No algorithm found",
)
raise fastapi.HTTPException(
status_code=fastapi.status.HTTP_422_UNPROCESSABLE_ENTITY,
detail="Error: Invalid algorithm specified"
)

# Now extract the patient record we want to link
try:
@@ -101,36 +99,40 @@
if entry.get("resource", {}).get("resourceType", "") == "Patient"
][0]
except IndexError:
response.status_code = fastapi.status.HTTP_400_BAD_REQUEST
return schemas.LinkFhirResponse(
found_match=False,
updated_bundle=input_bundle,
message="Supplied bundle contains no Patient resource to link on.",
)
raise fastapi.HTTPException(
status_code=fastapi.status.HTTP_400_BAD_REQUEST,
detail="Supplied bundle contains no Patient resource to link on."
)


# convert record to PII
pii_record: schemas.PIIRecord = fhir.fhir_record_to_pii_record(record_to_link)

# Now link the record
try:
(found_match, new_person_id, _) = link.link_record_against_mpi(
(patient, person, results) = link.link_record_against_mpi(
record=pii_record,
session=db_session,
algorithm=algorithm,
external_person_id=external_id,
)
updated_bundle = fhir.add_person_resource(
str(new_person_id), pii_record.external_id, input_bundle
updated_bundle: dict | None = None
if person:
updated_bundle = fhir.add_person_resource(
str(person.reference_id), pii_record.external_id, input_bundle
)
return schemas.LinkFhirResponse(
patient_reference_id=patient.reference_id,
person_reference_id=(person and person.reference_id),
results=[schemas.LinkResult(**r.__dict__) for r in results],
updated_bundle=updated_bundle
)
return schemas.LinkFhirResponse(found_match=found_match, updated_bundle=updated_bundle)

except ValueError as err:
response.status_code = fastapi.status.HTTP_400_BAD_REQUEST
return schemas.LinkFhirResponse(
found_match=False,
updated_bundle=input_bundle,
message=f"Could not connect to database: {err}",
)
raise fastapi.HTTPException(
status_code=fastapi.status.HTTP_400_BAD_REQUEST,
detail=f"Could not connect to database: {err}"
)


@router.post("/fhir", summary="Link FHIR")
@@ -177,18 +179,21 @@ async def link_fhir(
# link the record
try:
# Make a copy of pii_record so we don't modify the original
(found_match, new_person_id, patient_reference_id) = link.link_record_against_mpi(
(patient, person, results) = link.link_record_against_mpi(
record=pii_record,
session=db_session,
algorithm=algorithm,
external_person_id=external_id,
)
return schemas.LinkResponse(
is_match=found_match,
patient_reference_id=patient_reference_id,
person_reference_id=new_person_id,
patient_reference_id=patient.reference_id,
person_reference_id=(person and person.reference_id),
results=[schemas.LinkResult(**r.__dict__) for r in results]
)

except ValueError:
response.status_code = fastapi.status.HTTP_400_BAD_REQUEST
raise fastapi.HTTPException(status_code=400, detail="Error: Bad request")
raise fastapi.HTTPException(
status_code=400,
detail="Error: Bad request"
)
2 changes: 2 additions & 0 deletions src/recordlinker/schemas/__init__.py
@@ -5,6 +5,7 @@
from .link import LinkFhirResponse
from .link import LinkInput
from .link import LinkResponse
from .link import LinkResult
from .mpi import PatientPersonRef
from .mpi import PatientRef
from .mpi import PersonRef
@@ -23,6 +24,7 @@
"PIIRecord",
"LinkInput",
"LinkResponse",
"LinkResult",
"LinkFhirInput",
"LinkFhirResponse",
"PersonRef",