Skip to content

Commit

Permalink
Use symmetric aligned levenshtein to avoid double work
Browse files Browse the repository at this point in the history
  • Loading branch information
jbothma committed Sep 19, 2024
1 parent 22f2077 commit 4544687
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 11 deletions.
Binary file modified nomenklatura/data/regression-v3.pkl
Binary file not shown.
34 changes: 32 additions & 2 deletions nomenklatura/matching/compare/names.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,37 @@ def aligned_levenshtein(qfp: str, rfp: str) -> float:
return levenshtein_similarity(qaligned, raligned)


def name_fingerprint_levenshtein(query: E, result: E) -> float:
def symmetric_aligned_levenshtein(qfp: str, rfp: str) -> float:
    """Score two fingerprinted names by aligning their tokens pairwise.

    Both names are split into words of at least two characters. Every query
    token is scored against every result token, and tokens are then paired
    greedily from the highest-scoring pair downwards. The paired tokens are
    concatenated per side and compared as a whole; that similarity is scaled
    by the larger of the two coverage fractions (share of tokens on a side
    that found a partner), so a name that is fully contained in the other
    is not penalised for the other side's extra tokens.

    Args:
        qfp: fingerprinted name of the query entity.
        rfp: fingerprinted name of the result entity.

    Returns:
        The scaled similarity, or 0.0 when either name yields no tokens.
    """
    qtokens = name_words(qfp, min_length=2)
    rtokens = name_words(rfp, min_length=2)
    qlen = len(qtokens)
    rlen = len(rtokens)
    # With no tokens on one side there is nothing to align, and the coverage
    # fractions below would divide by zero.
    if qlen == 0 or rlen == 0:
        return 0.0
    # Also offer the ASCII-cleaned variants of the result tokens as candidates.
    for part in name_words(clean_name_ascii(rfp), min_length=2):
        if part not in rtokens:
            rtokens.append(part)

    scores: Dict[Tuple[str, str], float] = {}
    # compute all pairwise scores for name parts:
    for q, r in product(set(qtokens), set(rtokens)):
        scores[(q, r)] = levenshtein_similarity(q, r)
    aligned: List[Tuple[str, str, float]] = []
    # find the best pairing for each name part by score:
    for (q, r), score in sorted(scores.items(), key=lambda i: i[1], reverse=True):
        # one name part can only be used once, but can show up multiple times:
        while q in qtokens and r in rtokens:
            qtokens.remove(q)
            rtokens.remove(r)
            aligned.append((q, r, score))

    # Whatever is left in qtokens / rtokens found no partner.
    qfactor = (qlen - len(qtokens)) / qlen
    # NOTE(review): rlen was taken before the ASCII variants were appended, so
    # unmatched variants make rfactor understate coverage (it can even go
    # negative); max() then falls back to qfactor. Confirm this is intended.
    rfactor = (rlen - len(rtokens)) / rlen
    qaligned = "".join(p[0] for p in aligned)
    raligned = "".join(p[1] for p in aligned)
    score = levenshtein_similarity(qaligned, raligned)
    return score * max(qfactor, rfactor)


def name_fingerprint_levenshtein(query: E, result: E, lev=aligned_levenshtein) -> float:
"""Two non-person entities have similar fingerprinted names. This includes
simplifying entity type names (e.g. "Limited" -> "Ltd") and uses the
Damerau-Levenshtein string distance algorithm."""
Expand All @@ -112,7 +142,7 @@ def name_fingerprint_levenshtein(query: E, result: E) -> float:
continue
score = levenshtein_similarity(qfp.replace(" ", ""), rfp.replace(" ", ""))
max_score = max(max_score, score)
score = aligned_levenshtein(qfp, rfp)
score = lev(qfp, rfp)
max_score = max(max_score, score)
return max_score

Expand Down
15 changes: 7 additions & 8 deletions nomenklatura/matching/regression_v3/names.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from nomenklatura.matching.regression_v3.util import tokenize_pair, compare_levenshtein
from nomenklatura.matching.compare.util import is_disjoint, has_overlap, extract_numbers
from nomenklatura.matching.compare.names import aligned_levenshtein
from nomenklatura.matching.compare.names import aligned_levenshtein, name_fingerprint_levenshtein, symmetric_aligned_levenshtein
from nomenklatura.matching.util import has_schema, props_pair, type_pair
from nomenklatura.matching.util import max_in_sets
from nomenklatura.util import fingerprint_name
Expand All @@ -31,15 +31,12 @@ def normalize_names(raws: Iterable[str]) -> Set[str]:
def name_levenshtein(left: E, right: E) -> float:
"""Consider the edit distance (as a fraction of name length) between the two most
similar names linked to both entities."""
lv, rv = type_pair(left, right, registry.name)
lvn, rvn = normalize_names(lv), normalize_names(rv)
if has_schema(left, right, "Person"):
lv, rv = type_pair(left, right, registry.name)
lvn, rvn = normalize_names(lv), normalize_names(rv)
return max_in_sets(lvn, rvn, compare_levenshtein)
else:
return max(
max_in_sets(lv, rv, aligned_levenshtein),
max_in_sets(rv, lv, aligned_levenshtein),
)
return name_fingerprint_levenshtein(left, right, symmetric_aligned_levenshtein)


def first_name_match(left: E, right: E) -> float:
Expand Down Expand Up @@ -98,7 +95,9 @@ def name_numbers(left: E, right: E) -> float:


def name_similarity(left: E, right: E) -> float:
"""Compute the similarity between the names of two entities."""
"""Compute the similarity between the names of two entities, picking the max from
a full string match, token overlap-based score, and levenshtein distance-based
score."""
return max(
[
name_match(left, right),
Expand Down
41 changes: 40 additions & 1 deletion tests/matching/test_names.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from nomenklatura.matching.compare.names import name_literal_match
from nomenklatura.matching.compare.names import aligned_levenshtein, name_literal_match, symmetric_aligned_levenshtein
from nomenklatura.matching.compare.names import last_name_mismatch
from nomenklatura.matching.compare.names import name_fingerprint_levenshtein
from nomenklatura.matching.compare.names import person_name_jaro_winkler
Expand Down Expand Up @@ -320,6 +320,45 @@ def test_name_fingerprint_levenshtein():
assert name_fingerprint_levenshtein(query, result) > 0.5


def test_symmetric_aligned_levenshtein():
    """The score must not depend on argument order, for several name pairs."""
    # Joint Stock Company Zlatoustovsky Machine Building Plant (JSC Zlatmash)
    base = "jsc zlatoustovsky machine building plant"
    misspelled = "jsc zlatostofsky machine building plant"

    def score_both_ways(left, right):
        # Compare in both directions and insist on an identical result.
        forward = symmetric_aligned_levenshtein(left, right)
        backward = symmetric_aligned_levenshtein(right, left)
        assert forward == backward, (forward, backward)
        return forward

    # Extra trailing tokens on one side only.
    result = score_both_ways(base, base + " jsc zlatmash")
    assert result == 1, result

    # One misspelled token.
    result = score_both_ways(base, misspelled)
    assert 0.94 < result < 0.95, result

    # Misspelled token plus extra trailing tokens.
    result = score_both_ways(base, misspelled + " jsc zlatmash")
    assert 0.94 < result < 0.95, result

    # Last token replaced by an unrelated word.
    result = score_both_ways(base, "jsc zlatoustovsky machine building zavod")
    assert result == 0, result

    # One name is a strict subset of the other's tokens.
    result = score_both_ways(base, "zlatoustovsky machine building")
    assert result == 1, result



def test_org_name_partial_match():
query = e("Company", name="CRYSTALORD LIMITED")
result = e("Company", name="CRYSTALORD LTD")
Expand Down

0 comments on commit 4544687

Please sign in to comment.