Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reg v3 #174

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion nomenklatura/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,11 @@

from nomenklatura.cache import Cache
from nomenklatura.index import Index, INDEX_TYPES
from nomenklatura.matching import train_v2_matcher, train_v1_matcher
from nomenklatura.matching import (
train_v3_matcher,
train_v2_matcher,
train_v1_matcher,
)
from nomenklatura.store import load_entity_file_store
from nomenklatura.resolver import Resolver
from nomenklatura.dataset import Dataset, DefaultDataset
Expand Down Expand Up @@ -193,6 +197,13 @@ def train_v2_matcher_(pairs_file: Path) -> None:
train_v2_matcher(pairs_file)


@cli.command("train-v3-matcher", help="Train a matching model from judgement pairs")
@click.argument("pairs_file", type=InPath)
@click.option("-s", "--splits", type=int, default=1)
def train_v3_matcher_(pairs_file: Path, splits: int = 1) -> None:
    """CLI entry point: train the v3 regression matcher from a file of
    judgement pairs.

    NOTE(review): assumes ``splits`` is a fold/split count forwarded to
    the trainer — confirm against ``train_v3_matcher``'s signature.
    """
    train_v3_matcher(pairs_file, splits)


@cli.command("match", help="Generate matches from an enrichment source")
@click.argument("config", type=InPath)
@click.argument("entities", type=InPath)
Expand Down
Binary file added nomenklatura/data/regression-v3.pkl
Binary file not shown.
5 changes: 5 additions & 0 deletions nomenklatura/matching/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from nomenklatura.matching.regression_v1.train import train_matcher as train_v1_matcher
from nomenklatura.matching.regression_v2.model import RegressionV2
from nomenklatura.matching.regression_v2.train import train_matcher as train_v2_matcher
from nomenklatura.matching.regression_v3.model import RegressionV3
from nomenklatura.matching.regression_v3.train import train_matcher as train_v3_matcher
from nomenklatura.matching.name_based import NameMatcher, NameQualifiedMatcher
from nomenklatura.matching.logic import LogicV1
from nomenklatura.matching.types import ScoringAlgorithm
Expand All @@ -13,6 +15,7 @@
NameQualifiedMatcher,
RegressionV1,
RegressionV2,
RegressionV3,
]

DefaultAlgorithm = RegressionV2
Expand All @@ -31,6 +34,8 @@ def get_algorithm(name: str) -> Optional[Type[ScoringAlgorithm]]:
"train_v1_matcher",
"RegressionV2",
"train_v2_matcher",
"RegressionV3",
"train_v3_matcher",
"DefaultAlgorithm",
"ScoringAlgorithm",
"NameMatcher",
Expand Down
13 changes: 12 additions & 1 deletion nomenklatura/matching/compare/countries.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,18 @@
from followthemoney.types import registry

from nomenklatura.matching.util import type_pair
from nomenklatura.matching.compare.util import is_disjoint
from nomenklatura.matching.compare.util import is_disjoint, has_overlap


def country_match(query: E, result: E) -> float:
    """Both entities are linked to the same country."""
    query_countries, result_countries = type_pair(query, result, registry.country)
    # no signal when either side carries no country information:
    if not query_countries or not result_countries:
        return 0.0
    if has_overlap(query_countries, result_countries):
        return 1.0
    if is_disjoint(query_countries, result_countries):
        return -1.0
    return 0.0


def country_mismatch(query: E, result: E) -> float:
Expand Down
54 changes: 54 additions & 0 deletions nomenklatura/matching/compare/dates.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
from typing import Iterable, Set
from prefixdate import Precision
from followthemoney.proxy import E
from rigour.text.distance import dam_levenshtein
from itertools import product

from nomenklatura.matching.compare.util import has_overlap, is_disjoint
from nomenklatura.matching.util import props_pair


MAX_YEARS = 2


def _dates_precision(values: Iterable[str], precision: Precision) -> Set[str]:
dates = set()
for value in values:
Expand Down Expand Up @@ -70,3 +75,52 @@ def dob_year_disjoint(query: E, result: E) -> float:
if is_disjoint(query_years, result_years):
return 1.0
return 0.0


def dob_similarity(query: E, result: E) -> float:
    """
    Provide a similarity score for the birth dates of two entities taking
    date precision into account.

    1.0: precise dates match
    0.75: years match
    0.5: dates within 1 edit from each other
    0.25: years within 2 years from each other
    -0.2: imprecise dates are disjoint
    -0.3: precise dates are disjoint
    """
    q_dates, r_dates = props_pair(query, result, ["birthDate"])

    # either side lacks a birth date entirely:
    if not q_dates or not r_dates:
        return 0.0

    # day-precision dates first, checked from strongest to weakest signal:
    r_days = _dates_precision(r_dates, Precision.DAY)
    q_days = _dates_precision(q_dates, Precision.DAY)
    if has_overlap(q_days, r_days):
        # exact match on precise dates
        return 1.0

    # a single edit apart suggests a clerical error, not a different person:
    if any(dam_levenshtein(qd, rd) <= 1 for qd, rd in product(q_days, r_days)):
        return 0.5

    # both sides have precise dates, but none of them agree:
    if is_disjoint(q_days, r_days):
        return -0.3

    # fall back to year precision:
    q_years = _dates_precision(q_dates, Precision.YEAR)
    r_years = _dates_precision(r_dates, Precision.YEAR)
    if has_overlap(q_years, r_years):
        return 0.75

    # years that are near-misses still count for a little:
    if any(abs(int(qy) - int(ry)) <= MAX_YEARS for qy, ry in product(q_years, r_years)):
        return 0.25

    # dates exist on both sides but nothing above matched:
    return -0.2
52 changes: 28 additions & 24 deletions nomenklatura/matching/compare/names.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,33 @@ def person_name_jaro_winkler(query: E, result: E) -> float:
return score


def aligned_levenshtein(qfp: str, rfp: str) -> float:
    """Greedily align the name parts of two fingerprinted names by pairwise
    Levenshtein similarity, then score the concatenation of the aligned
    parts. Returns 0.0 if any query name part cannot be paired."""
    qtokens = name_words(qfp, min_length=2)
    rtokens = name_words(rfp, min_length=2)
    # also admit ASCII-transliterated parts of the result name:
    for extra in name_words(clean_name_ascii(rfp), min_length=2):
        if extra not in rtokens:
            rtokens.append(extra)

    # pairwise similarity for every distinct (query, result) part pair:
    scores: Dict[Tuple[str, str], float] = {
        pair: levenshtein_similarity(pair[0], pair[1])
        for pair in product(set(qtokens), set(rtokens))
    }
    aligned: List[Tuple[str, str, float]] = []
    # consume the best-scoring pairings first; a part may occur several
    # times, so keep pairing while both token lists still hold a copy:
    for (qt, rt), score in sorted(scores.items(), key=lambda i: i[1], reverse=True):
        while qt in qtokens and rt in rtokens:
            qtokens.remove(qt)
            rtokens.remove(rt)
            aligned.append((qt, rt, score))
    # every query name part must have found a counterpart:
    if qtokens:
        return 0.0
    qjoined = "".join(pair[0] for pair in aligned)
    rjoined = "".join(pair[1] for pair in aligned)
    return levenshtein_similarity(qjoined, rjoined)


def name_fingerprint_levenshtein(query: E, result: E) -> float:
"""Two non-person entities have similar fingerprinted names. This includes
simplifying entity type names (e.g. "Limited" -> "Ltd") and uses the
Expand All @@ -85,30 +112,7 @@ def name_fingerprint_levenshtein(query: E, result: E) -> float:
continue
score = levenshtein_similarity(qfp.replace(" ", ""), rfp.replace(" ", ""))
max_score = max(max_score, score)
qtokens = name_words(qfp, min_length=2)
rtokens = name_words(rfp, min_length=2)
for part in name_words(clean_name_ascii(rfp), min_length=2):
if part not in rtokens:
rtokens.append(part)

scores: Dict[Tuple[str, str], float] = {}
# compute all pairwise scores for name parts:
for q, r in product(set(qtokens), set(rtokens)):
scores[(q, r)] = levenshtein_similarity(q, r)
aligned: List[Tuple[str, str, float]] = []
# find the best pairing for each name part by score:
for (q, r), score in sorted(scores.items(), key=lambda i: i[1], reverse=True):
# one name part can only be used once, but can show up multiple times:
while q in qtokens and r in rtokens:
qtokens.remove(q)
rtokens.remove(r)
aligned.append((q, r, score))
# assume there should be at least a candidate for each query name part:
if len(qtokens):
continue
qaligned = "".join(p[0] for p in aligned)
raligned = "".join(p[1] for p in aligned)
score = levenshtein_similarity(qaligned, raligned)
score = aligned_levenshtein(qfp, rfp)
max_score = max(max_score, score)
return max_score

Expand Down
15 changes: 10 additions & 5 deletions nomenklatura/matching/pairs.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,23 +8,27 @@


class JudgedPair(object):
"""A pair of two entities which have been judged to be the same
(or not) by a user."""
"""
A pair of two entities which have been judged to be the same
(or not) by a user.
"""

__slots__ = ("left", "right", "judgement")
__slots__ = ("left", "right", "judgement", "group")

def __init__(
self, left: EntityProxy, right: EntityProxy, judgement: Judgement
self, left: EntityProxy, right: EntityProxy, judgement: Judgement, group: int
) -> None:
self.left = left
self.right = right
self.judgement = judgement
self.group = group

def to_dict(self) -> Dict[str, Any]:
return {
"left": self.left.to_dict(),
"right": self.right.to_dict(),
"judgement": self.judgement.value,
"group": self.group,
}


Expand All @@ -38,4 +42,5 @@ def read_pairs(pairs_file: PathLike) -> Generator[JudgedPair, None, None]:
judgement = Judgement(data["judgement"])
if judgement not in (Judgement.POSITIVE, Judgement.NEGATIVE):
continue
yield JudgedPair(left_entity, right_entity, judgement)
group = data.get("group", None)
yield JudgedPair(left_entity, right_entity, judgement, group)
Empty file.
63 changes: 63 additions & 0 deletions nomenklatura/matching/regression_v3/misc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from followthemoney.proxy import E
from followthemoney.types import registry

from nomenklatura.matching.regression_v1.util import tokenize_pair, compare_levenshtein
from nomenklatura.matching.compare.util import has_overlap, extract_numbers
from nomenklatura.matching.util import props_pair, type_pair
from nomenklatura.matching.util import max_in_sets, has_schema
from nomenklatura.util import normalize_name


def birth_place(query: E, result: E) -> float:
    """Same place of birth."""
    qtok, rtok = tokenize_pair(props_pair(query, result, ["birthPlace"]))
    # share of common tokens, with the denominator floored at 2 so a
    # single shared token never yields a full score:
    shorter = min(len(qtok), len(rtok))
    overlap = len(qtok.intersection(rtok))
    return float(overlap) / float(max(2.0, shorter))


def address_match(query: E, result: E) -> float:
    """Text similarity between addresses."""
    qv, rv = type_pair(query, result, registry.address)
    # normalize both sides before computing edit distances:
    qnorm = [normalize_name(addr) for addr in qv]
    rnorm = [normalize_name(addr) for addr in rv]
    return max_in_sets(qnorm, rnorm, compare_levenshtein)


def address_numbers(query: E, result: E) -> float:
    """Find if addresses contain numbers, score if the numbers are different."""
    # fix: docstring said "names" — this feature compares address values,
    # and these docstrings surface as feature descriptions to users.
    lv, rv = type_pair(query, result, registry.address)
    lvn = extract_numbers(lv)
    rvn = extract_numbers(rv)
    common = len(lvn.intersection(rvn))
    # NOTE(review): the difference is one-sided — numbers only present in
    # the query penalize, result-only numbers do not; confirm intended.
    disjoint = len(lvn.difference(rvn))
    return common - disjoint


def phone_match(query: E, result: E) -> float:
    """Matching phone numbers between the two entities."""
    qv, rv = type_pair(query, result, registry.phone)
    if has_overlap(qv, rv):
        return 1.0
    return 0.0


def email_match(query: E, result: E) -> float:
    """Matching email addresses between the two entities."""
    qv, rv = type_pair(query, result, registry.email)
    if has_overlap(qv, rv):
        return 1.0
    return 0.0


def identifier_match(query: E, result: E) -> float:
    """Matching identifiers (e.g. passports, national ID cards, registration or
    tax numbers) between the two entities."""
    # organization registration numbers are handled by org_identifier_match:
    if has_schema(query, result, "Organization"):
        return 0.0
    qv, rv = type_pair(query, result, registry.identifier)
    if has_overlap(qv, rv):
        return 1.0
    return 0.0


def org_identifier_match(query: E, result: E) -> float:
    """Matching identifiers (e.g. registration or tax numbers) between two
    organizations or companies."""
    # only applies when both entities are organizations:
    if not has_schema(query, result, "Organization"):
        return 0.0
    qv, rv = type_pair(query, result, registry.identifier)
    if has_overlap(qv, rv):
        return 1.0
    return 0.0
Loading
Loading