Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Get patterns from regex match in ITIN recognizer #959

Open
wants to merge 15 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 17 additions & 2 deletions presidio-analyzer/presidio_analyzer/pattern_recognizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,17 +187,20 @@ def __analyze_patterns(

for match in matches:
start, end = match.span()

pattern_from_match = self.get_pattern_from_match(pattern, match)

current_match = text[start:end]

# Skip empty results
if current_match == "":
continue

score = pattern.score
score = pattern_from_match.score

validation_result = self.validate_result(current_match)
description = self.build_regex_explanation(
self.name, pattern.name, pattern.regex, score, validation_result
self.name, pattern_from_match.name, pattern.regex, score, validation_result
)
pattern_result = RecognizerResult(
entity_type=self.supported_entities[0],
Expand Down Expand Up @@ -230,6 +233,18 @@ def __analyze_patterns(
results = EntityRecognizer.remove_duplicates(results)
return results

def get_pattern_from_match(
    self, pattern: Pattern, match: re.Match
) -> Pattern:
    """
    Return a new Pattern based on the matched regex info e.g., the named groups in the regex.

    This base implementation ignores ``match`` and returns ``pattern``
    unchanged.

    :param pattern: the pattern initially found.
    :param match: regex match
    :return: A new pattern based on the regex match info.
    """
    return pattern

def to_dict(self) -> Dict:
"""Serialize instance into a dictionary."""
return_dict = super().to_dict()
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from typing import Optional, List

from presidio_analyzer import Pattern, PatternRecognizer

from regex import Match

class UsItinRecognizer(PatternRecognizer):
"""
Expand All @@ -14,19 +14,9 @@ class UsItinRecognizer(PatternRecognizer):
"""

PATTERNS = [
Pattern(
"Itin (very weak)",
r"\b9\d{2}[- ](5\d|6[0-5]|7\d|8[0-8]|9([0-2]|[4-9]))\d{4}\b|\b9\d{2}(5\d|6[0-5]|7\d|8[0-8]|9([0-2]|[4-9]))[- ]\d{4}\b", # noqa: E501
0.05,
),
Pattern(
"Itin (weak)",
r"\b9\d{2}(5\d|6[0-5]|7\d|8[0-8]|9([0-2]|[4-9]))\d{4}\b", # noqa: E501
0.3,
),
Pattern(
"Itin (medium)",
r"\b9\d{2}[- ](5\d|6[0-5]|7\d|8[0-8]|9([0-2]|[4-9]))[- ]\d{4}\b", # noqa: E501
r"\b9\d{2}(?P<firstSeparator>[- ]?)(5\d|6[0-5]|7\d|8[0-8]|9([0-2]|[4-9]))(?P<secondSeparator>[- ]?)\d{4}\b", # noqa: E501
0.5,
),
]
Expand All @@ -48,3 +38,25 @@ def __init__(
context=context,
supported_language=supported_language,
)

def get_pattern_from_match(
    self, pattern: Pattern, match: Match
) -> Pattern:
    """
    Grade an ITIN match by the separators captured in its named groups.

    The regex is expected to capture its two optional separators in the
    named groups ``firstSeparator`` and ``secondSeparator``. Both present
    keeps the original pattern, none present yields "Itin (weak)", and
    exactly one present yields "Itin (very weak)".

    :param pattern: the pattern that produced ``match``.
    :param match: regex match produced by ``pattern``.
    :return: ``pattern`` itself when both separators are present or when
        the regex does not define the separator groups; otherwise a new
        Pattern with the same regex and the downgraded name and score.
    """
    groups = match.groupdict()

    # A regex without the separator groups cannot be graded. Keep the
    # pattern as-is instead of raising IndexError from match.group().
    if "firstSeparator" not in groups or "secondSeparator" not in groups:
        return pattern

    first_separator = groups["firstSeparator"]
    second_separator = groups["secondSeparator"]

    if first_separator and second_separator:
        return pattern

    if not first_separator and not second_separator:
        return Pattern(
            "Itin (weak)",
            pattern.regex,
            0.3,
        )

    # Exactly one separator is present.
    return Pattern(
        "Itin (very weak)",
        pattern.regex,
        0.05,
    )
8 changes: 7 additions & 1 deletion presidio-analyzer/tests/test_us_itin_recognizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,17 @@ def entities():
"text, expected_len, expected_positions, expected_score_ranges",
[
# fmt: off
("911-701234 91170-1234", 2, ((0, 10), (11, 21),), ((0.0, 0.3), (0.0, 0.3),),),
("911-701234 91170-1234", 2,
((0, 10), (11, 21),),
((0.0, 0.3), (0.0, 0.3),),),
("911 701234 91170 1234", 2,
((0, 10), (11, 21),),
((0.0, 0.3), (0.0, 0.3),),),
("911701234", 1, ((0, 9),), ((0.3, 0.4),),),
("911-70-1234", 1, ((0, 11),), ((0.5, 0.6),),),
("911-53-1234", 1, ((0, 11),), ((0.5, 0.6),),),
("911-64-1234", 1, ((0, 11),), ((0.5, 0.6),),),
("911 63 7534", 1, ((0, 11),), ((0.5, 0.6),),),
("911-89-1234", 0, (), (),),
("my tax id 911-89-1234", 0, (), (),),
# fmt: on
Expand Down