Skip to content

Commit

Permalink
Merge pull request #512 from adamdecaf/weighted-jaro-scoring
Browse files Browse the repository at this point in the history
cmd/server: weight term score by length similarity
  • Loading branch information
adamdecaf authored Nov 13, 2023
2 parents e3971be + 6adb3db commit 83c92c0
Show file tree
Hide file tree
Showing 7 changed files with 77 additions and 81 deletions.
2 changes: 1 addition & 1 deletion cmd/server/issue115_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ func TestIssue115__TopSDNs(t *testing.T) {
eql(t, "george bush jaroWinkler", score, 0.896)

score = jaroWinkler("g", "geoergebush")
eql(t, "g vs geoergebush", score, 0.697)
eql(t, "g vs geoergebush", score, 0.070)

pipe := noLogPipeliner
s := newSearcher(log.NewNopLogger(), pipe, 1)
Expand Down
6 changes: 3 additions & 3 deletions cmd/server/issue326_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,17 @@ func TestIssue326(t *testing.T) {

// Cuba
score := jaroWinkler(precompute("Huawei Cuba"), precompute("Huawei"))
assert.Equal(t, 0.8055555555555556, score)
assert.Equal(t, 0.7444444444444445, score)

// India
score = jaroWinkler(india, precompute("Huawei"))
assert.Equal(t, 0.4846031746031746, score)
score = jaroWinkler(india, precompute("Huawei Technologies"))
assert.Equal(t, 0.6903174603174603, score)
assert.Equal(t, 0.6084415584415584, score)

// Investment
score = jaroWinkler(investment, precompute("Huawei"))
assert.Equal(t, 0.3788888888888889, score)
score = jaroWinkler(investment, precompute("Huawei Technologies"))
assert.Equal(t, 0.7377777777777779, score)
assert.Equal(t, 0.5419191919191919, score)
}
26 changes: 19 additions & 7 deletions cmd/server/search.go
Original file line number Diff line number Diff line change
Expand Up @@ -660,25 +660,27 @@ var (
)

func jaroWinklerWithFavoritism(s1, s2 string, favoritism float64) float64 {
maxMatch := func(word string, s1Idx int, parts []string) float64 {
if len(parts) == 0 {
return 0.0
maxMatch := func(word string, s1Idx int, parts []string) (float64, string) {
if word == "" || len(parts) == 0 {
return 0.0, ""
}

// We're only looking for the highest match close
start := s1Idx - adjacentSimilarityPositions
end := s1Idx + adjacentSimilarityPositions

var max float64
var maxTerm string
for i := start; i < end; i++ {
if i >= 0 && len(parts) > i {
score := smetrics.JaroWinkler(word, parts[i], boostThreshold, prefixSize)
if score > max {
max = score
maxTerm = parts[i]
}
}
}
return max
return max, maxTerm
}

s1Parts, s2Parts := strings.Fields(s1), strings.Fields(s2)
Expand All @@ -688,16 +690,26 @@ func jaroWinklerWithFavoritism(s1, s2 string, favoritism float64) float64 {

var scores []float64
for i := range s1Parts {
max := maxMatch(s1Parts[i], i, s2Parts)
max, term := maxMatch(s1Parts[i], i, s2Parts)
if max >= 1.0 {
// Perfect match
max += favoritism
scores = append(scores, max)
} else {
// Apply an additional weight based on similarity of term lengths,
// so terms which are closer in length match higher.
s1 := float64(len(s1Parts[i]))
t := float64(len(term)) - 1
weight := math.Min(math.Abs(s1/t), 1.0)

scores = append(scores, max*weight)
}
scores = append(scores, max)
}

// average the highest N scores where N is the words in our query (s2).
// Only truncate scores if there are enough words (aka more than First/Last).
sort.Float64s(scores)
if len(s1Parts) > len(s2Parts) && len(s2Parts) > 2 {
if len(s1Parts) > len(s2Parts) && len(s2Parts) > 5 {
scores = scores[len(s1Parts)-len(s2Parts):]
}

Expand Down
2 changes: 1 addition & 1 deletion cmd/server/search_eu_csl_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ func TestSearch__EU_CSL(t *testing.T) {
w.Flush()

require.Equal(t, http.StatusOK, w.Code)
require.Contains(t, w.Body.String(), `"match":0.7388888`)
require.Contains(t, w.Body.String(), `"match":0.65555`)

var wrapper struct {
EUConsolidatedSanctionsList []csl.EUCSLRecord `json:"euConsolidatedSanctionsList"`
Expand Down
43 changes: 11 additions & 32 deletions cmd/server/search_handlers_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,8 @@ func TestSearch__Address(t *testing.T) {
router.ServeHTTP(w, req)
w.Flush()

if w.Code != http.StatusOK {
t.Errorf("bogus status code: %d", w.Code)
}

if v := w.Body.String(); !strings.Contains(v, `"match":1`) {
t.Fatalf("%#v", v)
}
require.Equal(t, http.StatusOK, w.Code)
require.Contains(t, w.Body.String(), `"match":0.88194`)

var wrapper struct {
Addresses []*ofac.Address `json:"addresses"`
Expand Down Expand Up @@ -265,25 +260,14 @@ func TestSearch__NameAndAltName(t *testing.T) {
}

// OFAC
if wrapper.SDNs[0].EntityID != "2681" {
t.Errorf("%#v", wrapper.SDNs[0])
}
if wrapper.AltNames[0].EntityID != "4691" {
t.Errorf("%#v", wrapper.AltNames[0].EntityID)
}
if wrapper.Addresses[0].EntityID != "735" {
t.Errorf("%#v", wrapper.Addresses[0].EntityID)
}
if wrapper.SectoralSanctions[0].EntityID != "18782" {
t.Errorf("%#v", wrapper.SectoralSanctions[0].EntityID)
}
require.Equal(t, "2681", wrapper.SDNs[0].EntityID)
require.Equal(t, "4691", wrapper.AltNames[0].EntityID)
require.Equal(t, "735", wrapper.Addresses[0].EntityID)
require.Equal(t, "18782", wrapper.SectoralSanctions[0].EntityID)

// BIS
if wrapper.DeniedPersons[0].StreetAddress != "P.O. BOX 28360" {
t.Errorf("%#v", wrapper.DeniedPersons[0].StreetAddress)
}
if wrapper.BISEntities[0].Name != "Mohammad Jan Khan Mangal" {
t.Errorf("%#v", wrapper.BISEntities[0])
}
require.Equal(t, "P.O. BOX 28360", wrapper.DeniedPersons[0].StreetAddress)
require.Equal(t, "Luqman Yasin Yunus Shgragi", wrapper.BISEntities[0].Name)
}

func TestSearch__Name(t *testing.T) {
Expand All @@ -304,13 +288,8 @@ func TestSearch__Name(t *testing.T) {
router.ServeHTTP(w, req)
w.Flush()

if w.Code != http.StatusOK {
t.Errorf("bogus status code: %d", w.Code)
}

if v := w.Body.String(); !strings.Contains(v, `"match":1`) {
t.Error(v)
}
require.Equal(t, http.StatusOK, w.Code)
require.Contains(t, w.Body.String(), `"match":0.89166`)

var wrapper struct {
// OFAC
Expand Down
75 changes: 41 additions & 34 deletions cmd/server/search_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -432,34 +432,42 @@ func TestJaroWinkler(t *testing.T) {
s1, s2 string
match float64
}{
// examples
{"wei, zhao", "wei, Zhao", 0.917},
{"WEI, Zhao", "WEI, Zhao", 1.0},
{"WEI Zhao", "WEI Zhao", 1.0},
{strings.ToLower("WEI Zhao"), precompute("WEI, Zhao"), 1.0},
// make sure jaroWinkler is commutative
{"jane doe", "jan lahore", 0.721},

// apply jaroWinkler in both directions
{"jane doe", "jan lahore", 0.621},
{"jan lahore", "jane doe", 0.776},

// real world case
{"john doe", "paul john", 0.764},
{"john doe", "john othername", 0.815},
{"john doe", "john othername", 0.618},

// close match
{"jane doe", "jane doe2", 0.971},

// real-ish world examples
{"kalamity linden", "kala limited", 0.771},
{"kala limited", "kalamity linden", 0.795},
{"kala limited", "kalamity linden", 0.602},

// examples used in demos / commonly
{"nicolas", "nicolas", 1.0},
{"nicolas moros maduro", "nicolas maduro", 0.91},
{"nicolas maduro", "nicolas moros maduro", 1.0},

// customer examples
{"ian mckinley", "tian xiang 7", 0.750},
{"bindaree food group pty ltd", "independent insurance group ltd", 0.812},
{"ian mckinley", "tian xiang 7", 0.75},
{"bindaree food group pty ltd", "independent insurance group ltd", 0.728},
{"p.c.c. (singapore) private limited", "culver max entertainment private limited", 0.753},
{"zincum llc", "easy verification inc.", 0.639},
{"transpetrochart co ltd", "jx metals trading co.", 0.725},
{"technolab", "moomoo technologies inc", 0.87222},
{"sewa security services", "sesa - safety & environmental services australia pty ltd", 0.740},
{"transpetrochart co ltd", "jx metals trading co.", 0.616},
{"technolab", "moomoo technologies inc", 0.714},
{"sewa security services", "sesa - safety & environmental services australia pty ltd", 0.673},
{"bueno", "20/f rykadan capital twr135 hoi bun rd, kwun tong 135 hoi bun rd., kwun tong", 0.0},

// example cases
{"nicolas maduro", "nicolás maduro", 0.961},
{"nicolas maduro", precompute("nicolás maduro"), 1.0},
Expand All @@ -471,36 +479,37 @@ func TestJaroWinkler(t *testing.T) {
{"nicolas maduro moros", "nicolás maduro", 0.884},
{"nicolas, maduro moros", "maduro", 0.720},
{"nicolas, maduro moros", "nicolas maduro", 0.902},
{"nicolas, maduro moros", "nicolás", 0.627},
{"nicolas, maduro moros", "nicolás", 0.554},
{"nicolas, maduro moros", "maduro", 0.720},
{"nicolas, maduro moros", "nicolás maduro", 0.877},
{"africada financial services bureau change", "skylight", 0.266},
{"africada financial services bureau change", "skylight financial inc", 0.72},
{"africada financial services bureau change", "skylight services inc", 0.806},
{"africada financial services bureau change", "skylight financial services", 0.887},
{"africada financial services bureau change", "skylight financial services inc", 0.79},
{"africada financial services bureau change", "skylight financial inc", 0.596},
{"africada financial services bureau change", "skylight services inc", 0.645},
{"africada financial services bureau change", "skylight financial services", 0.67},
{"africada financial services bureau change", "skylight financial services inc", 0.696},

// stopwords tests
{"the group for the preservation of the holy sites", "the bridgespan group", 1.00},
{precompute("the group for the preservation of the holy sites"), precompute("the bridgespan group"), 1.00},
{"group preservation holy sites", "bridgespan group", 0.689},
{"the group for the preservation of the holy sites", "the logan group", 1.00},
{precompute("the group for the preservation of the holy sites"), precompute("the logan group"), 1.00},
{"the group for the preservation of the holy sites", "the bridgespan group", 0.448},
{precompute("the group for the preservation of the holy sites"), precompute("the bridgespan group"), 0.448},
{"group preservation holy sites", "bridgespan group", 0.619},
{"the group for the preservation of the holy sites", "the logan group", 0.424},
{precompute("the group for the preservation of the holy sites"), precompute("the logan group"), 0.424},
{"group preservation holy sites", "logan group", 0.478},
{"the group for the preservation of the holy sites", "the anything group", 1.00},
{precompute("the group for the preservation of the holy sites"), precompute("the anything group"), 1.00},
{"group preservation holy sites", "anything group", 0.617},
{"the group for the preservation of the holy sites", "the hello world group", 0.922},
{precompute("the group for the preservation of the holy sites"), precompute("the hello world group"), 0.922},
{"group preservation holy sites", "hello world group", 0.687},
{"the group for the preservation of the holy sites", "the group", 0.431},
{precompute("the group for the preservation of the holy sites"), precompute("the group"), 0.431},
{"the group for the preservation of the holy sites", "the anything group", 0.437},
{precompute("the group for the preservation of the holy sites"), precompute("the anything group"), 0.437},
{"group preservation holy sites", "anything group", 0.585},
{"the group for the preservation of the holy sites", "the hello world group", 0.47},
{precompute("the group for the preservation of the holy sites"), precompute("the hello world group"), 0.47},
{"group preservation holy sites", "hello world group", 0.515},
{"the group for the preservation of the holy sites", "the group", 0.416},
{precompute("the group for the preservation of the holy sites"), precompute("the group"), 0.416},
{"group preservation holy sites", "group", 0.460},
{"the group for the preservation of the holy sites", "The flibbity jibbity flobbity jobbity grobbity zobbity group", 0.517},
{precompute("the group for the preservation of the holy sites"), precompute("the flibbity jibbity flobbity jobbity grobbity zobbity group"), 0.572},
{"group preservation holy sites", "flibbity jibbity flobbity jobbity grobbity zobbity group", 0.418},
{"the group for the preservation of the holy sites", "The flibbity jibbity flobbity jobbity grobbity zobbity group", 0.403},
{precompute("the group for the preservation of the holy sites"), precompute("the flibbity jibbity flobbity jobbity grobbity zobbity group"), 0.459},
{"group preservation holy sites", "flibbity jibbity flobbity jobbity grobbity zobbity group", 0.329},

// precompute
{"i c sogo kenkyusho", precompute("A.I.C. SOGO KENKYUSHO"), 0.667},
{"i c sogo kenkyusho", precompute("A.I.C. SOGO KENKYUSHO"), 0.5},
{precompute("A.I.C. SOGO KENKYUSHO"), "sogo kenkyusho", 0.667},
}
for i := range cases {
Expand Down Expand Up @@ -725,9 +734,7 @@ func TestSearch__TopSDNs(t *testing.T) {
if len(sdns) == 0 {
t.Fatal("empty SDNs")
}
if sdns[0].EntityID != "2676" {
t.Errorf("%#v", sdns[0].SDN)
}
require.Equal(t, "2681", sdns[0].EntityID)
}

func TestSearch__TopDPs(t *testing.T) {
Expand Down
4 changes: 1 addition & 3 deletions cmd/server/search_us_csl_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,9 +97,7 @@ func TestSearcher_TopSSIs_limit(t *testing.T) {
if len(ssis) != 2 {
t.Fatalf("Expected 2 results, found %d", len(ssis))
}
if ssis[0].Data.EntityID != "18736" {
t.Errorf("%#v", ssis[0].Data)
}
require.Equal(t, "18782", ssis[0].Data.EntityID)
}

func TestSearcher_TopSSIs_reportAltNameWeight(t *testing.T) {
Expand Down

0 comments on commit 83c92c0

Please sign in to comment.