diff --git a/nomenklatura/data/regression-v3.pkl b/nomenklatura/data/regression-v3.pkl
index 808f3ac..39078a8 100644
Binary files a/nomenklatura/data/regression-v3.pkl and b/nomenklatura/data/regression-v3.pkl differ
diff --git a/nomenklatura/matching/compare/names.py b/nomenklatura/matching/compare/names.py
index 34286ac..3ce44bb 100644
--- a/nomenklatura/matching/compare/names.py
+++ b/nomenklatura/matching/compare/names.py
@@ -95,7 +95,44 @@ def aligned_levenshtein(qfp: str, rfp: str) -> float:
     return levenshtein_similarity(qaligned, raligned)
 
 
-def name_fingerprint_levenshtein(query: E, result: E) -> float:
+def symmetric_aligned_levenshtein(qfp: str, rfp: str) -> float:
+    """Score two fingerprinted names by greedily pairing up their most similar
+    name parts and comparing the two aligned strings. The result is scaled by
+    the larger share of name parts that could be paired on either side. Names
+    without any name part of at least two characters score 0.0."""
+    qtokens = name_words(qfp, min_length=2)
+    rtokens = name_words(rfp, min_length=2)
+    qlen = len(qtokens)
+    rlen = len(rtokens)
+    if qlen == 0 or rlen == 0:
+        # nothing to align, and the coverage factors below would divide by zero:
+        return 0.0
+    for part in name_words(clean_name_ascii(rfp), min_length=2):
+        if part not in rtokens:
+            rtokens.append(part)
+
+    scores: Dict[Tuple[str, str], float] = {}
+    # compute all pairwise scores for name parts:
+    for q, r in product(set(qtokens), set(rtokens)):
+        scores[(q, r)] = levenshtein_similarity(q, r)
+    aligned: List[Tuple[str, str, float]] = []
+    # find the best pairing for each name part by score:
+    for (q, r), score in sorted(scores.items(), key=lambda i: i[1], reverse=True):
+        # one name part can only be used once, but can show up multiple times:
+        while q in qtokens and r in rtokens:
+            qtokens.remove(q)
+            rtokens.remove(r)
+            aligned.append((q, r, score))
+
+    qfactor = (qlen - len(qtokens)) / qlen
+    rfactor = (rlen - len(rtokens)) / rlen
+    qaligned = "".join(p[0] for p in aligned)
+    raligned = "".join(p[1] for p in aligned)
+    score = levenshtein_similarity(qaligned, raligned)
+    return score * max(qfactor, rfactor)
+
+
+def name_fingerprint_levenshtein(query: E, result: E, lev=aligned_levenshtein) -> float:
     """Two non-person entities have similar fingerprinted names. This includes
     simplifying entity type names (e.g. "Limited" -> "Ltd") and uses the
     Damerau-Levensthein string distance algorithm."""
@@ -112,7 +149,7 @@ def name_fingerprint_levenshtein(query: E, result: E) -> float:
             continue
         score = levenshtein_similarity(qfp.replace(" ", ""), rfp.replace(" ", ""))
         max_score = max(max_score, score)
-        score = aligned_levenshtein(qfp, rfp)
+        score = lev(qfp, rfp)
         max_score = max(max_score, score)
     return max_score
 
diff --git a/nomenklatura/matching/regression_v3/names.py b/nomenklatura/matching/regression_v3/names.py
index 006cbdc..d2b42e0 100644
--- a/nomenklatura/matching/regression_v3/names.py
+++ b/nomenklatura/matching/regression_v3/names.py
@@ -6,7 +6,7 @@
 
 from nomenklatura.matching.regression_v3.util import tokenize_pair, compare_levenshtein
 from nomenklatura.matching.compare.util import is_disjoint, has_overlap, extract_numbers
-from nomenklatura.matching.compare.names import aligned_levenshtein
+from nomenklatura.matching.compare.names import aligned_levenshtein, name_fingerprint_levenshtein, symmetric_aligned_levenshtein
 from nomenklatura.matching.util import has_schema, props_pair, type_pair
 from nomenklatura.matching.util import max_in_sets
 from nomenklatura.util import fingerprint_name
@@ -31,15 +31,12 @@ def normalize_names(raws: Iterable[str]) -> Set[str]:
 def name_levenshtein(left: E, right: E) -> float:
     """Consider the edit distance (as a fraction of name length) between the two most
     similar names linked to both entities."""
-    lv, rv = type_pair(left, right, registry.name)
-    lvn, rvn = normalize_names(lv), normalize_names(rv)
     if has_schema(left, right, "Person"):
+        lv, rv = type_pair(left, right, registry.name)
+        lvn, rvn = normalize_names(lv), normalize_names(rv)
         return max_in_sets(lvn, rvn, compare_levenshtein)
     else:
-        return max(
-            max_in_sets(lv, rv, aligned_levenshtein),
-            max_in_sets(rv, lv, aligned_levenshtein),
-        )
+        return name_fingerprint_levenshtein(left, right, symmetric_aligned_levenshtein)
 
 
 def first_name_match(left: E, right: E) -> float:
@@ -98,7 +95,9 @@ def name_numbers(left: E, right: E) -> float:
 
 
 def name_similarity(left: E, right: E) -> float:
-    """Compute the similarity between the names of two entities."""
+    """Compute the similarity between the names of two entities, picking the max from
+    a full string match, token overlap-based score, and levenshtein distance-based
+    score."""
     return max(
         [
             name_match(left, right),
diff --git a/tests/matching/test_names.py b/tests/matching/test_names.py
index 35ceb56..2663f41 100644
--- a/tests/matching/test_names.py
+++ b/tests/matching/test_names.py
@@ -1,4 +1,4 @@
-from nomenklatura.matching.compare.names import name_literal_match
+from nomenklatura.matching.compare.names import aligned_levenshtein, name_literal_match, symmetric_aligned_levenshtein
 from nomenklatura.matching.compare.names import last_name_mismatch
 from nomenklatura.matching.compare.names import name_fingerprint_levenshtein
 from nomenklatura.matching.compare.names import person_name_jaro_winkler
@@ -320,6 +320,47 @@ def test_name_fingerprint_levenshtein():
     assert name_fingerprint_levenshtein(query, result) > 0.5
 
 
+def test_symmetric_aligned_levenshtein():
+    # Joint Stock Company Zlatoustovsky Machine Building Plant (JSC Zlatmash)
+    name1 = "jsc zlatoustovsky machine building plant"
+    name2 = name1 + " jsc zlatmash"
+    score1 = symmetric_aligned_levenshtein(name1, name2)
+    score2 = symmetric_aligned_levenshtein(name2, name1)
+    assert score1 == score2, (score1, score2)
+    assert score1 == 1, score1
+
+    name1 = "jsc zlatoustovsky machine building plant"
+    name2 = "jsc zlatostofsky machine building plant"
+    score1 = symmetric_aligned_levenshtein(name1, name2)
+    score2 = symmetric_aligned_levenshtein(name2, name1)
+    assert score1 == score2, (score1, score2)
+    assert 0.94 < score1 < 0.95, score1
+
+    name2 = name2 + " jsc zlatmash"
+    score1 = symmetric_aligned_levenshtein(name1, name2)
+    score2 = symmetric_aligned_levenshtein(name2, name1)
+    assert score1 == score2, (score1, score2)
+    assert 0.94 < score1 < 0.95, score1
+
+    name1 = "jsc zlatoustovsky machine building plant"
+    name2 = "jsc zlatoustovsky machine building zavod"
+    score1 = symmetric_aligned_levenshtein(name1, name2)
+    score2 = symmetric_aligned_levenshtein(name2, name1)
+    assert score1 == score2, (score1, score2)
+    assert score1 == 0, score1
+
+    name1 = "jsc zlatoustovsky machine building plant"
+    name2 = "zlatoustovsky machine building"
+    score1 = symmetric_aligned_levenshtein(name1, name2)
+    score2 = symmetric_aligned_levenshtein(name2, name1)
+    assert score1 == score2, (score1, score2)
+    assert score1 == 1, score1
+
+    # names without usable name parts cannot be aligned and score zero:
+    assert symmetric_aligned_levenshtein("", name1) == 0
+    assert symmetric_aligned_levenshtein(name1, "") == 0
+
+
 def test_org_name_partial_match():
     query = e("Company", name="CRYSTALORD LIMITED")
     result = e("Company", name="CRYSTALORD LTD")