
feat(linking): implement possible match short-term #111

Merged
merged 37 commits into from
Nov 18, 2024
37 commits
5312f84
feat(linking): implement possible match short-term
alhayward Oct 30, 2024
fedfd59
feat(tests): update test cases
alhayward Oct 31, 2024
fc63a4b
feat(algo): move belongingness_ratio to Algorithm level
alhayward Nov 1, 2024
15a9051
fix(tests): update test cases
alhayward Nov 1, 2024
2d7f725
feat(api): update response
alhayward Nov 1, 2024
cef9c99
fix(tests): update tests
alhayward Nov 4, 2024
01e44d4
feat(api): update response
alhayward Nov 4, 2024
a0b9165
feat(api): update response and tests
alhayward Nov 7, 2024
5f3f151
docs(schema): add results desc
alhayward Nov 7, 2024
67dfcd5
fix(schema): update LinkResult
alhayward Nov 7, 2024
35fec06
fix(api): update LinkResult and tests
alhayward Nov 7, 2024
cad7f7e
fix(tests): update test cases
alhayward Nov 7, 2024
af62ff5
test(api): add tests for new api response
alhayward Nov 14, 2024
1c86fa2
test(link): add tests for enhanced possible match
alhayward Nov 14, 2024
4d3d53d
test(link): add tests for basic possible match
alhayward Nov 14, 2024
6e88900
test(link): add tests for include_multiple_matches=true
alhayward Nov 14, 2024
55f1d6f
fix(link): fix indexing bug
alhayward Nov 14, 2024
063c98a
test(link): add tests for include_multiple_matches=false
alhayward Nov 14, 2024
f4a7dd4
test(assets): add test cases
alhayward Nov 14, 2024
ec6a3c8
fix(merge): resolve merge conflicts
alhayward Nov 14, 2024
a09ce86
fix(tests): resolve linting
alhayward Nov 14, 2024
64ee123
fix(tests): resolve type check
alhayward Nov 14, 2024
a92c3b7
fix(schemas): rm property decorator
alhayward Nov 14, 2024
7173be7
fix(tests): update smoke tests
alhayward Nov 14, 2024
ad95927
Update .github/workflows/check_smoke_tests.yml
alhayward Nov 14, 2024
f516116
Update src/recordlinker/schemas/link.py
alhayward Nov 14, 2024
421e2e2
Update src/recordlinker/schemas/link.py
alhayward Nov 14, 2024
0f0dd00
fix(mpi): support null Person
alhayward Nov 14, 2024
1d25bcd
fix(schema): stronger typing for belongingness_ratio
alhayward Nov 14, 2024
25652a6
revert change for now
alhayward Nov 14, 2024
c90c163
Update .github/workflows/check_smoke_tests.yml
alhayward Nov 15, 2024
32157ae
Update .github/workflows/check_smoke_tests.yml
alhayward Nov 15, 2024
8376247
Update .github/workflows/check_smoke_tests.yml
alhayward Nov 15, 2024
ca94c09
fix(tests): resolve typechecks and update docs
alhayward Nov 15, 2024
9d2ff8d
test(schema): add tests for validate belongingness_ratio
alhayward Nov 15, 2024
0f32a9f
fix(style): ruff check fix
alhayward Nov 15, 2024
68c93ec
fix(tests): rm belongingness_ratio as AlgorithmPass arg
alhayward Nov 15, 2024
8 changes: 4 additions & 4 deletions .github/workflows/check_smoke_tests.yml
@@ -62,7 +62,7 @@ jobs:
-H "Content-Type: application/json")

echo "Response: $RESPONSE_1"
echo "$RESPONSE_1" | jq -e '.is_match == false'
echo "$RESPONSE_1" | jq -e '.prediction == "no_match"'

PERSON_REFERENCE_ID=$(echo "$RESPONSE_1" | jq -r '.person_reference_id')

@@ -71,7 +71,7 @@ jobs:
-H "Content-Type: application/json")

echo "Response: $RESPONSE_2"
echo "$RESPONSE_2" | jq -e '.is_match == true'
echo "$RESPONSE_2" | jq -e '.prediction == "match"'
echo "$RESPONSE_2" | jq -e --arg id "$PERSON_REFERENCE_ID" '.person_reference_id == $id'

#enhanced tests
@@ -80,7 +80,7 @@ jobs:
-H "Content-Type: application/json")

echo "Response: $RESPONSE_3"
echo "$RESPONSE_3" | jq -e '.is_match == false'
echo "$RESPONSE_3" | jq -e '.prediction == "no_match"'

PERSON_REFERENCE_ID=$(echo "$RESPONSE_3" | jq -r '.person_reference_id')

@@ -89,7 +89,7 @@ jobs:
-H "Content-Type: application/json")

echo "Response: $RESPONSE_4"
echo "$RESPONSE_4" | jq -e '.is_match == true'
echo "$RESPONSE_4" | jq -e '.prediction == "match"'
echo "$RESPONSE_4" | jq -e --arg id "$PERSON_REFERENCE_ID" '.person_reference_id == $id'

#invalid tests
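A note on the jq comparisons in this workflow: string literals inside a jq program must be double-quoted. A bare word such as `match` is parsed as a jq filter/function name, not a string, so `'.prediction == match'` fails to compile and `jq -e` exits non-zero regardless of the payload. A minimal sketch with a hypothetical response payload:

```shell
# Hypothetical response payload, for illustration only.
RESPONSE='{"prediction":"match","person_reference_id":"abc-123"}'

# Correct: the string literal is quoted inside the jq program,
# and `jq -e` exits 0 because the comparison is true.
echo "$RESPONSE" | jq -e '.prediction == "match"'

# Broken: unquoted `match` is treated as a jq filter name, so the
# program fails to compile and the step fails even for a true match.
echo "$RESPONSE" | jq -e '.prediction == match' || echo "jq program did not compile"
```

This is why the `"match"` / `"no_match"` literals in the workflow need quotes inside the single-quoted jq program.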
8 changes: 4 additions & 4 deletions src/recordlinker/assets/initial_algorithms.json
@@ -3,6 +3,8 @@
"label": "dibbs-basic",
"description": "The DIBBs Default Algorithm. Based on field experimentation and statistical analysis, this deterministic two-pass algorithm combines geographical and personal information to maximize linkage quality while minimizing false positives",
"is_default": true,
"include_multiple_matches": true,
"belongingness_ratio": [0.75, 0.9],
"passes": [
{
"blocking_keys": [
@@ -15,7 +17,6 @@
"LAST_NAME": "func:recordlinker.linking.matchers.feature_match_exact"
},
"rule": "func:recordlinker.linking.matchers.eval_perfect_match",
"cluster_ratio": 0.9,
"kwargs": {
"thresholds": {
"FIRST_NAME": 0.9,
@@ -39,7 +40,6 @@
"BIRTHDATE": "func:recordlinker.linking.matchers.feature_match_exact"
},
"rule": "func:recordlinker.linking.matchers.eval_perfect_match",
"cluster_ratio": 0.9,
"kwargs": {
"thresholds": {
"FIRST_NAME": 0.9,
@@ -57,6 +57,8 @@
"label": "dibbs-enhanced",
"description": "The DIBBs Log-Odds Algorithm. This optional algorithm uses statistical correction to adjust the links between incoming records and previously processed patients (it does so by taking advantage of the fact that some fields are more informative than others—e.g., two records matching on MRN is stronger evidence that they should be linked than if the records matched on zip code). It can be used if additional granularity in matching links is desired. However, while the DIBBs Log-Odds Algorithm can create higher-quality links, it is dependent on statistical updating and pre-calculated population analysis, which requires some work on the part of the user. For those cases where additional precision or stronger matching criteria are required, the Log-Odds algorithm is detailed below.",
"is_default": false,
"include_multiple_matches": true,
"belongingness_ratio": [0.75, 0.9],
"passes": [
{
"blocking_keys": [
@@ -69,7 +71,6 @@
"LAST_NAME": "func:recordlinker.linking.matchers.feature_match_log_odds_fuzzy_compare"
},
"rule": "func:recordlinker.linking.matchers.eval_log_odds_cutoff",
"cluster_ratio": 0.9,
"kwargs": {
"similarity_measure": "JaroWinkler",
"thresholds": {
@@ -106,7 +107,6 @@
"BIRTHDATE": "func:recordlinker.linking.matchers.feature_match_log_odds_fuzzy_compare"
},
"rule": "func:recordlinker.linking.matchers.eval_log_odds_cutoff",
"cluster_ratio": 0.9,
"kwargs": {
"similarity_measure": "JaroWinkler",
"thresholds": {
2 changes: 0 additions & 2 deletions src/recordlinker/database/mpi_service.py
@@ -82,8 +82,6 @@ def insert_patient(

:returns: The inserted Patient record
"""
# create a new Person record if one isn't provided
person = person or models.Person()

patient = models.Patient(person=person, record=record, external_patient_id=external_patient_id)

31 changes: 23 additions & 8 deletions src/recordlinker/linking/link.py
@@ -7,7 +7,6 @@

import collections
import typing
import uuid

from sqlalchemy import orm

@@ -59,7 +58,7 @@ def link_record_against_mpi(
session: orm.Session,
algorithm: models.Algorithm,
external_person_id: typing.Optional[str] = None,
) -> tuple[bool, uuid.UUID, uuid.UUID]:
) -> tuple[models.Patient, models.Person | None, list[dict]]:
"""
Runs record linkage on a single incoming record (extracted from a FHIR
bundle) using an existing database as an MPI. Uses a flexible algorithm
@@ -82,10 +81,10 @@
# Membership scores need to persist across linkage passes so that we can
# find the highest scoring match across all passes
scores: dict[models.Person, float] = collections.defaultdict(float)
# the minimum ratio of matches needed to be considered a cluster member
belongingness_ratio_lower_bound, belongingness_ratio_upper_bound = algorithm.belongingness_ratio
for algorithm_pass in algorithm.passes:
with TRACER.start_as_current_span("link.pass"):
# the minimum ratio of matches needed to be considered a cluster member
cluster_ratio = algorithm_pass.cluster_ratio
# initialize a dictionary to hold the clusters of patients for each person
clusters: dict[models.Person, list[models.Patient]] = collections.defaultdict(list)
# block on the pii_record and the algorithm's blocking criteria, then
@@ -108,17 +107,33 @@
if compare(record, patient, algorithm_pass):
matched_count += 1
# calculate the match ratio for this person cluster
match_ratio = matched_count / len(patients)
if match_ratio >= cluster_ratio:
belongingness_ratio = matched_count / len(patients)
if belongingness_ratio >= belongingness_ratio_lower_bound:
# The match ratio is larger than the minimum cluster threshold,
# optionally update the max score for this person
scores[person] = max(scores[person], match_ratio)
scores[person] = max(scores[person], belongingness_ratio)

matched_person: typing.Optional[models.Person] = None
if scores:
# Find the person with the highest matching score
matched_person, _ = max(scores.items(), key=lambda i: i[1])

sorted_scores = [{"person": k, "belongingness_ratio": v} for k, v in sorted(scores.items(), reverse=True, key=lambda item: item[1])]
if not scores:
# No match
matched_person = models.Person() # Create new Person Cluster
results = []
elif float(sorted_scores[0]["belongingness_ratio"]) >= belongingness_ratio_upper_bound:
# Match (1 or many)
matched_person = sorted_scores[0]["person"]
results = [x for x in sorted_scores if float(x["belongingness_ratio"]) >= belongingness_ratio_upper_bound] # Multiple matches
if not algorithm.include_multiple_matches:
results = [results[0]] # 1 Match (highest Belongingness Ratio)
else:
# Possible match
matched_person = None
results = sorted_scores

with TRACER.start_as_current_span("insert"):
patient = mpi_service.insert_patient(
session,
Expand All @@ -130,4 +145,4 @@ def link_record_against_mpi(
)

# return a tuple indicating whether a match was found and the person ID
return (bool(matched_person), patient.person.reference_id, patient.reference_id)
return (patient, patient.person, results)
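The new return logic above partitions candidate Persons by where their best belongingness ratio falls relative to the algorithm's lower and upper bounds. A standalone sketch of that three-way decision, using plain dicts in place of ORM models (function and key names are illustrative, not the library's API):

```python
def classify(scores, lower_bound, upper_bound, include_multiple_matches=True):
    """Return (prediction, results) from a {person: belongingness_ratio} map.

    Sketch of the possible-match logic: ratios below the lower bound are
    dropped, ratios at/above the upper bound are matches, and anything in
    between is a possible match.
    """
    sorted_scores = [
        {"person": p, "belongingness_ratio": r}
        for p, r in sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
        if r >= lower_bound
    ]
    if not sorted_scores:
        # No cluster cleared the lower bound: a new Person is created upstream
        return "no_match", []
    if sorted_scores[0]["belongingness_ratio"] >= upper_bound:
        matches = [x for x in sorted_scores
                   if x["belongingness_ratio"] >= upper_bound]
        if not include_multiple_matches:
            matches = matches[:1]  # keep only the highest-scoring match
        return "match", matches
    # Cleared the lower bound but not the upper: human review territory
    return "possible_match", sorted_scores
```

With the default `belongingness_ratio` of `[0.75, 0.9]` from `initial_algorithms.json`, a cluster scoring 0.8 would come back as a possible match rather than a match.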
18 changes: 17 additions & 1 deletion src/recordlinker/models/algorithm.py
@@ -23,10 +23,27 @@ class Algorithm(Base):
is_default: orm.Mapped[bool] = orm.mapped_column(default=False, index=True)
label: orm.Mapped[str] = orm.mapped_column(sqltypes.String(255), unique=True)
description: orm.Mapped[str] = orm.mapped_column(sqltypes.Text(), nullable=True)
include_multiple_matches: orm.Mapped[bool] = orm.mapped_column(sqltypes.Boolean, default=True)
belongingness_ratio_lower_bound: orm.Mapped[float] = orm.mapped_column(sqltypes.Float, default=1.0)
belongingness_ratio_upper_bound: orm.Mapped[float] = orm.mapped_column(sqltypes.Float, default=1.0)
passes: orm.Mapped[list["AlgorithmPass"]] = orm.relationship(
back_populates="algorithm", cascade="all, delete-orphan"
)

@property
def belongingness_ratio(self) -> tuple[float, float]:
"""
Get the Belongingness Ratio Threshold Range for this algorithm.
"""
return (self.belongingness_ratio_lower_bound, self.belongingness_ratio_upper_bound)

@belongingness_ratio.setter # type: ignore
def belongingness_ratio(self, value: tuple[float, float]):
"""
Set the Belongingness Ratio for this algorithm.
"""
self.belongingness_ratio_lower_bound, self.belongingness_ratio_upper_bound = value

@classmethod
def from_dict(cls, **data: dict) -> "Algorithm":
"""
@@ -82,7 +99,6 @@ class AlgorithmPass(Base):
blocking_keys: orm.Mapped[list[str]] = orm.mapped_column(sqltypes.JSON)
_evaluators: orm.Mapped[dict[str, str]] = orm.mapped_column("evaluators", sqltypes.JSON)
_rule: orm.Mapped[str] = orm.mapped_column("rule", sqltypes.String(255))
cluster_ratio: orm.Mapped[float] = orm.mapped_column(sqltypes.Float)
kwargs: orm.Mapped[dict] = orm.mapped_column(sqltypes.JSON, default=dict)

@property
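The property/setter pair added to `Algorithm` stores the two bounds as separate mapped columns while exposing them as a single tuple. A minimal plain-Python stand-in for that pattern, without the SQLAlchemy wiring (the class name is reused here for illustration only):

```python
class Algorithm:
    """Plain-Python stand-in for the ORM model's paired-bound pattern."""

    def __init__(self):
        # Mirrors the mapped columns' default=1.0
        self.belongingness_ratio_lower_bound = 1.0
        self.belongingness_ratio_upper_bound = 1.0

    @property
    def belongingness_ratio(self):
        # Read both columns back as one (lower, upper) tuple
        return (self.belongingness_ratio_lower_bound,
                self.belongingness_ratio_upper_bound)

    @belongingness_ratio.setter
    def belongingness_ratio(self, value):
        # Unpack a (lower, upper) tuple into the two columns
        (self.belongingness_ratio_lower_bound,
         self.belongingness_ratio_upper_bound) = value


algo = Algorithm()
algo.belongingness_ratio = (0.75, 0.9)
```

Callers get tuple-style access while the database schema keeps two scalar columns, which is why `cluster_ratio` could move off `AlgorithmPass` without a composite column type.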
2 changes: 1 addition & 1 deletion src/recordlinker/models/mpi.py
@@ -39,7 +39,7 @@ class Patient(Base):
__tablename__ = "mpi_patient"

id: orm.Mapped[int] = orm.mapped_column(get_bigint_pk(), autoincrement=True, primary_key=True)
person_id: orm.Mapped[int] = orm.mapped_column(schema.ForeignKey(f"{Person.__tablename__}.id"))
person_id: orm.Mapped[int] = orm.mapped_column(schema.ForeignKey(f"{Person.__tablename__}.id"), nullable=True)
person: orm.Mapped["Person"] = orm.relationship(back_populates="patients")
# NOTE: We're using a protected attribute here to store the data string, as we
# want getter/setter access to the data dictionary to trigger updating the
67 changes: 36 additions & 31 deletions src/recordlinker/routes/link_router.py
@@ -46,16 +46,16 @@
# link the record
try:
# Make a copy of record_to_link so we don't modify the original
(found_match, new_person_id, patient_reference_id) = link.link_record_against_mpi(
(patient, person, results) = link.link_record_against_mpi(
record=input.record,
session=db_session,
algorithm=algorithm,
external_person_id=input.external_person_id,
)
return schemas.LinkResponse(
is_match=found_match,
patient_reference_id=patient_reference_id,
person_reference_id=new_person_id,
patient_reference_id=patient.reference_id,
person_reference_id=(person and person.reference_id),
results=results
)

except ValueError:
@@ -86,12 +86,10 @@
algorithm = algorithm_service.default_algorithm(db_session)

if not algorithm:
response.status_code = fastapi.status.HTTP_422_UNPROCESSABLE_ENTITY
return schemas.LinkFhirResponse(
found_match=False,
updated_bundle=input_bundle,
message="Error: No algorithm found",
)
raise fastapi.HTTPException(
status_code=fastapi.status.HTTP_422_UNPROCESSABLE_ENTITY,
detail="Error: Invalid algorithm specified"
)

# Now extract the patient record we want to link
try:
@@ -101,36 +99,40 @@
if entry.get("resource", {}).get("resourceType", "") == "Patient"
][0]
except IndexError:
response.status_code = fastapi.status.HTTP_400_BAD_REQUEST
return schemas.LinkFhirResponse(
found_match=False,
updated_bundle=input_bundle,
message="Supplied bundle contains no Patient resource to link on.",
)
raise fastapi.HTTPException(
status_code=fastapi.status.HTTP_400_BAD_REQUEST,
detail="Supplied bundle contains no Patient resource to link on."
)


# convert record to PII
pii_record: schemas.PIIRecord = fhir.fhir_record_to_pii_record(record_to_link)

# Now link the record
try:
(found_match, new_person_id, _) = link.link_record_against_mpi(
(patient, person, results) = link.link_record_against_mpi(
record=pii_record,
session=db_session,
algorithm=algorithm,
external_person_id=external_id,
)
updated_bundle = fhir.add_person_resource(
str(new_person_id), pii_record.external_id, input_bundle
updated_bundle: dict | None = None
if person:
updated_bundle = fhir.add_person_resource(
str(person.reference_id), pii_record.external_id, input_bundle
)
return schemas.LinkFhirResponse(
patient_reference_id=patient.reference_id,
person_reference_id=(person and person.reference_id),
results=results,
updated_bundle=updated_bundle
)
return schemas.LinkFhirResponse(found_match=found_match, updated_bundle=updated_bundle)

except ValueError as err:
response.status_code = fastapi.status.HTTP_400_BAD_REQUEST
return schemas.LinkFhirResponse(
found_match=False,
updated_bundle=input_bundle,
message=f"Could not connect to database: {err}",
)
raise fastapi.HTTPException(
status_code=fastapi.status.HTTP_400_BAD_REQUEST,
detail=f"Could not connect to database: {err}"
)


@router.post("/fhir", summary="Link FHIR")
@@ -177,18 +179,21 @@
# link the record
try:
# Make a copy of pii_record so we don't modify the original
(found_match, new_person_id, patient_reference_id) = link.link_record_against_mpi(
(patient, person, results) = link.link_record_against_mpi(
record=pii_record,
session=db_session,
algorithm=algorithm,
external_person_id=external_id,
)
return schemas.LinkResponse(
is_match=found_match,
patient_reference_id=patient_reference_id,
person_reference_id=new_person_id,
patient_reference_id=patient.reference_id,
person_reference_id=(person and person.reference_id),
results=results
)

except ValueError:
response.status_code = fastapi.status.HTTP_400_BAD_REQUEST
raise fastapi.HTTPException(status_code=400, detail="Error: Bad request")
raise fastapi.HTTPException(
status_code=400,
detail="Error: Bad request"
)
20 changes: 18 additions & 2 deletions src/recordlinker/schemas/algorithm.py
@@ -9,6 +9,7 @@
import typing

import pydantic
from typing_extensions import Annotated

from recordlinker.linking import matchers
from recordlinker.models.mpi import BlockingKey
@@ -26,7 +27,6 @@
blocking_keys: list[str]
evaluators: dict[str, str]
rule: str
cluster_ratio: float
kwargs: dict[str, typing.Any] = {}

@pydantic.field_validator("blocking_keys", mode="before")
@@ -44,7 +44,7 @@
@pydantic.field_validator("evaluators", mode="before")
def validate_evaluators(cls, value):
"""
Validated the evaluators into a list of feature comparison functions.
Validate the evaluators into a list of feature comparison functions.
"""
for k, v in value.items():
try:
@@ -78,8 +78,24 @@
label: str = pydantic.Field(pattern=r"^[a-z0-9]+(?:-[a-z0-9]+)*$")
description: typing.Optional[str] = None
is_default: bool = False
include_multiple_matches: bool = True
belongingness_ratio: tuple[Annotated[float, pydantic.Field(ge=0, le=1)], Annotated[float, pydantic.Field(ge=0, le=1)]]
passes: typing.Sequence[AlgorithmPass]

@pydantic.field_validator("belongingness_ratio", mode="before")
def validate_belongingness_ratio(cls, value):
"""
Validate the Belongingness Ratio Threshold Range.
"""
lower_bound, upper_bound = value
if lower_bound < 0 or lower_bound > 1:
raise ValueError(f"Invalid lower bound: {lower_bound}")
if upper_bound < 0 or upper_bound > 1:
raise ValueError(f"Invalid upper bound: {upper_bound}")
if lower_bound > upper_bound:
raise ValueError(f"Invalid range. Lower bound must be less than upper bound: {value}")
return (lower_bound, upper_bound)


class AlgorithmSummary(Algorithm):
"""
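The `validate_belongingness_ratio` checks added to the schema can be exercised on their own. A plain-function sketch of the same validation, without the pydantic wiring, to show the accepted and rejected shapes:

```python
def validate_belongingness_ratio(value):
    """Validate a (lower, upper) Belongingness Ratio range within [0, 1]."""
    lower_bound, upper_bound = value
    if lower_bound < 0 or lower_bound > 1:
        raise ValueError(f"Invalid lower bound: {lower_bound}")
    if upper_bound < 0 or upper_bound > 1:
        raise ValueError(f"Invalid upper bound: {upper_bound}")
    if lower_bound > upper_bound:
        raise ValueError(
            f"Invalid range. Lower bound must be less than upper bound: {value}"
        )
    return (lower_bound, upper_bound)
```

So `(0.75, 0.9)` passes through unchanged, while an inverted range like `(0.9, 0.75)` or an out-of-range bound raises `ValueError` before the Algorithm is persisted.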