Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

adding Features and Blocking Keys for phone and email #148

Merged
merged 2 commits into from
Dec 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions docs/site/reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ linkage evaluation phase. The following features are supported:

`BIRTHDATE`

: The patient's birthdate in the format `YYYY-MM-DD`.
: The patient's birthdate (normalized to `YYYY-MM-DD`).

`MRN`

Expand All @@ -25,7 +25,7 @@ linkage evaluation phase. The following features are supported:

`SEX`

: The patient's sex in the format of `M`, `F`, or `U` for unknown.
: The patient's sex (normalized to `M`, `F`, or `U` for unknown).

`GENDER`

Expand Down Expand Up @@ -75,6 +75,14 @@ linkage evaluation phase. The following features are supported:

: The patient's phone, email, fax, or other contact information.

`PHONE`

: The patient's phone number (normalized to 10 digits).

`EMAIL`

: The patient's email address.

`DRIVERS_LICENSE`

: The patient's driver's license number.
Expand Down Expand Up @@ -113,6 +121,14 @@ patient data and used during query retrieval. The following blocking key types a

: The first 4 characters of the patient's address.

`PHONE` (ID: **8**)

: The last 4 digits of the patient's phone number.

`EMAIL` (ID: **9**)

: The first 4 characters of the patient's email address.


### Evaluation Functions

Expand Down
2 changes: 2 additions & 0 deletions src/recordlinker/models/mpi.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,8 @@ class BlockingKey(enum.Enum):
FIRST_NAME = ("FIRST_NAME", 5, "First 4 characters of the first name")
LAST_NAME = ("LAST_NAME", 6, "First 4 characters of the last name")
ADDRESS = ("ADDRESS", 7, "First 4 characters of the address")
PHONE = ("PHONE", 8, "Last 4 characters of the phone number")
EMAIL = ("EMAIL", 9, "First 4 characters of the email address")

def __init__(self, value: str, _id: int, description: str):
self._value = value
Expand Down
33 changes: 33 additions & 0 deletions src/recordlinker/schemas/pii.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ class Feature(enum.Enum):
RACE = "RACE"
GENDER = "GENDER"
TELECOM = "TELECOM"
PHONE = "PHONE"
EMAIL = "EMAIL"
SUFFIX = "SUFFIX"
COUNTY = "COUNTY"
DRIVERS_LICENSE = "DRIVERS_LICENSE"
Expand Down Expand Up @@ -142,6 +144,23 @@ class Telecom(pydantic.BaseModel):
system: typing.Optional[str] = None
use: typing.Optional[str] = None

def phone_number(self) -> str | None:
"""
Return the phone number from the telecom record.
"""
if self.system != "phone":
return None
# normalize the number to include just the 10 digits
return re.sub(r"\D", "", self.value)[:10]

def email(self) -> str | None:
"""
Return the email address from the telecom record.
"""
if self.system != "email":
return None
return self.value


class DriversLicense(pydantic.BaseModel):
"""
Expand Down Expand Up @@ -362,6 +381,16 @@ def feature_iter(self, feature: Feature) -> typing.Iterator[str]:
for telecom in self.telecom:
if telecom.value:
yield telecom.value
elif feature == Feature.PHONE:
for telecom in self.telecom:
number = telecom.phone_number()
if number:
yield number
elif feature == Feature.EMAIL:
for telecom in self.telecom:
email = telecom.email()
if email:
yield email
elif feature == Feature.SUFFIX:
for name in self.name:
for suffix in name.suffix:
Expand Down Expand Up @@ -402,6 +431,10 @@ def blocking_keys(self, key: models.BlockingKey) -> set[str]:
vals.update({x[:4] for x in self.feature_iter(Feature.LAST_NAME)})
elif key == models.BlockingKey.ADDRESS:
vals.update({x[:4] for x in self.feature_iter(Feature.ADDRESS)})
elif key == models.BlockingKey.PHONE:
vals.update({x[-4:] for x in self.feature_iter(Feature.PHONE)})
elif key == models.BlockingKey.EMAIL:
vals.update({x[:4] for x in self.feature_iter(Feature.EMAIL)})

# if any vals are longer than the BLOCKING_KEY_MAX_LENGTH, raise an error
if any(len(x) > models.BLOCKING_VALUE_MAX_LENGTH for x in vals):
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/routes/test_seed_router.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def test_large_batch(self, client):
assert sum(len(p["patients"]) for p in persons) == 1285
assert client.session.query(models.Person).count() == 100
assert client.session.query(models.Patient).count() == 1285
assert client.session.query(models.BlockingValue).count() == 8995
assert client.session.query(models.BlockingValue).count() == 10280

@mock.patch("recordlinker.database.algorithm_service.default_algorithm")
def test_seed_and_link(self, mock_algorithm, basic_algorithm, client):
Expand Down
27 changes: 25 additions & 2 deletions tests/unit/schemas/test_pii.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ def test_feature_iter(self):
],
telecom=[
pii.Telecom(value="555-123-4567"),
pii.Telecom(value="555-987-6543", system="phone"),
pii.Telecom(value="(555) 987-6543", system="phone"),
pii.Telecom(value="[email protected]", system="email"),
],
drivers_license=pii.DriversLicense(value="D1234567", authority="VA"),
Expand All @@ -257,9 +257,11 @@ def test_feature_iter(self):
assert list(record.feature_iter(pii.Feature.GENDER)) == ["UNKNOWN"]
assert list(record.feature_iter(pii.Feature.TELECOM)) == [
"555-123-4567",
"555-987-6543",
"(555) 987-6543",
"[email protected]",
]
assert list(record.feature_iter(pii.Feature.PHONE)) == ["5559876543"]
assert list(record.feature_iter(pii.Feature.EMAIL)) == ["[email protected]"]
assert list(record.feature_iter(pii.Feature.SUFFIX)) == ["suffix", "suffix2"]
assert list(record.feature_iter(pii.Feature.COUNTY)) == ["county"]
assert list(record.feature_iter(pii.Feature.DRIVERS_LICENSE)) == ["D1234567|VA"]
Expand Down Expand Up @@ -366,6 +368,27 @@ def test_blocking_keys_address_first_four(self):
rec = pii.PIIRecord(**{"address": [{"line": ["123 Main St"]}, {"line": ["456 Elm St"]}]})
assert rec.blocking_keys(BlockingKey.ADDRESS) == {"123 ", "456 "}

def test_blocking_keys_phone_last_four(self):
    """PHONE blocking keys are the last 4 digits of each phone-system telecom."""
    scenarios = [
        # no telecom entries at all -> no blocking keys
        ({"phone": "555-123-4567"}, set()),
        # punctuation is ignored when picking the last 4 digits
        ({"telecom": [{"value": "(555) 123-4567", "system": "phone"}]}, {"4567"}),
        # one key per phone; extension digits do not leak into the key
        (
            {
                "telecom": [
                    {"value": "555.123.4567", "system": "phone"},
                    {"value": "555-987-6543 ext 123", "system": "phone"},
                ]
            },
            {"4567", "6543"},
        ),
        # telecoms with a non-phone system are skipped
        (
            {
                "telecom": [
                    {"value": "555.123.4567", "system": "phone"},
                    {"value": "555-987-6543", "system": "fax"},
                ]
            },
            {"4567"},
        ),
    ]
    for payload, expected in scenarios:
        rec = pii.PIIRecord(**payload)
        assert rec.blocking_keys(BlockingKey.PHONE) == expected

def test_blocking_keys_email_first_four(self):
    """EMAIL blocking keys are the first 4 characters of each email-system telecom."""
    # NOTE(review): the addresses below were reconstructed so that they match
    # the asserted prefixes; the scraped text only contained an obfuscation
    # placeholder in their place -- confirm against the repository.

    # no telecom entries at all -> no blocking keys
    rec = pii.PIIRecord(**{"email": "test@email.com"})
    assert rec.blocking_keys(BlockingKey.EMAIL) == set()

    rec = pii.PIIRecord(**{"telecom": [{"value": "test@email.com", "system": "email"}]})
    assert rec.blocking_keys(BlockingKey.EMAIL) == {"test"}

    # one key per email address
    rec = pii.PIIRecord(
        **{
            "telecom": [
                {"value": "test@email.com", "system": "email"},
                {"value": "bob@email.com", "system": "email"},
            ]
        }
    )
    assert rec.blocking_keys(BlockingKey.EMAIL) == {"test", "bob@"}

    # telecoms with a non-email system are skipped
    rec = pii.PIIRecord(
        **{
            "telecom": [
                {"value": "t@gmail.com", "system": "email"},
                {"value": "bob@gmail.com", "system": "other"},
            ]
        }
    )
    assert rec.blocking_keys(BlockingKey.EMAIL) == {"t@gm"}


def test_blocking_values(self):
rec = pii.PIIRecord(
**{
Expand Down
Loading