From 6adb3db3c5c31c952fc75e63ab228d47ba5b6635 Mon Sep 17 00:00:00 2001 From: Adam Shannon Date: Mon, 13 Nov 2023 14:50:08 -0600 Subject: [PATCH] cmd/server: weight term score by length similarity Terms are more equal the closer they are in length, so favor closer length terms. --- cmd/server/issue115_test.go | 2 +- cmd/server/issue326_test.go | 6 +-- cmd/server/search.go | 26 ++++++++--- cmd/server/search_eu_csl_test.go | 2 +- cmd/server/search_handlers_test.go | 43 +++++------------ cmd/server/search_test.go | 75 ++++++++++++++++-------------- cmd/server/search_us_csl_test.go | 4 +- 7 files changed, 77 insertions(+), 81 deletions(-) diff --git a/cmd/server/issue115_test.go b/cmd/server/issue115_test.go index e3601c4b..f21b944d 100644 --- a/cmd/server/issue115_test.go +++ b/cmd/server/issue115_test.go @@ -16,7 +16,7 @@ func TestIssue115__TopSDNs(t *testing.T) { eql(t, "george bush jaroWinkler", score, 0.896) score = jaroWinkler("g", "geoergebush") - eql(t, "g vs geoergebush", score, 0.697) + eql(t, "g vs geoergebush", score, 0.070) pipe := noLogPipeliner s := newSearcher(log.NewNopLogger(), pipe, 1) diff --git a/cmd/server/issue326_test.go b/cmd/server/issue326_test.go index 808bbe23..ec3187ab 100644 --- a/cmd/server/issue326_test.go +++ b/cmd/server/issue326_test.go @@ -16,17 +16,17 @@ func TestIssue326(t *testing.T) { // Cuba score := jaroWinkler(precompute("Huawei Cuba"), precompute("Huawei")) - assert.Equal(t, 0.8055555555555556, score) + assert.Equal(t, 0.7444444444444445, score) // India score = jaroWinkler(india, precompute("Huawei")) assert.Equal(t, 0.4846031746031746, score) score = jaroWinkler(india, precompute("Huawei Technologies")) - assert.Equal(t, 0.6903174603174603, score) + assert.Equal(t, 0.6084415584415584, score) // Investment score = jaroWinkler(investment, precompute("Huawei")) assert.Equal(t, 0.3788888888888889, score) score = jaroWinkler(investment, precompute("Huawei Technologies")) - assert.Equal(t, 0.7377777777777779, score) + assert.Equal(t, 0.5419191919191919, score) } diff --git a/cmd/server/search.go b/cmd/server/search.go index d9f5fcc5..9e1f52bc 100644 --- a/cmd/server/search.go +++ b/cmd/server/search.go @@ -660,9 +660,9 @@ var ( ) func jaroWinklerWithFavoritism(s1, s2 string, favoritism float64) float64 { - maxMatch := func(word string, s1Idx int, parts []string) float64 { - if len(parts) == 0 { - return 0.0 + maxMatch := func(word string, s1Idx int, parts []string) (float64, string) { + if word == "" || len(parts) == 0 { + return 0.0, "" } // We're only looking for the highest match close @@ -670,15 +670,17 @@ func jaroWinklerWithFavoritism(s1, s2 string, favoritism float64) float64 { end := s1Idx + adjacentSimilarityPositions var max float64 + var maxTerm string for i := start; i < end; i++ { if i >= 0 && len(parts) > i { score := smetrics.JaroWinkler(word, parts[i], boostThreshold, prefixSize) if score > max { max = score + maxTerm = parts[i] } } } - return max + return max, maxTerm } s1Parts, s2Parts := strings.Fields(s1), strings.Fields(s2) @@ -688,16 +690,26 @@ func jaroWinklerWithFavoritism(s1, s2 string, favoritism float64) float64 { var scores []float64 for i := range s1Parts { - max := maxMatch(s1Parts[i], i, s2Parts) + max, term := maxMatch(s1Parts[i], i, s2Parts) if max >= 1.0 { + // Perfect match max += favoritism + scores = append(scores, max) + } else { + // Apply an additional weight based on similarity of term lengths, + // so terms which are closer in length match higher. + s1 := float64(len(s1Parts[i])) + t := float64(len(term)) - 1 + weight := math.Min(math.Abs(s1/t), 1.0) + + scores = append(scores, max*weight) } - scores = append(scores, max) } // average the highest N scores where N is the words in our query (s2). + // Only truncate scores if there are enough words (aka more than First/Last). sort.Float64s(scores) - if len(s1Parts) > len(s2Parts) && len(s2Parts) > 2 { + if len(s1Parts) > len(s2Parts) && len(s2Parts) > 5 { scores = scores[len(s1Parts)-len(s2Parts):] } diff --git a/cmd/server/search_eu_csl_test.go b/cmd/server/search_eu_csl_test.go index 5057bae3..714c9d80 100644 --- a/cmd/server/search_eu_csl_test.go +++ b/cmd/server/search_eu_csl_test.go @@ -28,7 +28,7 @@ func TestSearch__EU_CSL(t *testing.T) { w.Flush() require.Equal(t, http.StatusOK, w.Code) - require.Contains(t, w.Body.String(), `"match":0.7388888`) + require.Contains(t, w.Body.String(), `"match":0.65555`) var wrapper struct { EUConsolidatedSanctionsList []csl.EUCSLRecord `json:"euConsolidatedSanctionsList"` diff --git a/cmd/server/search_handlers_test.go b/cmd/server/search_handlers_test.go index 14a6003e..aca8e5d0 100644 --- a/cmd/server/search_handlers_test.go +++ b/cmd/server/search_handlers_test.go @@ -30,13 +30,8 @@ func TestSearch__Address(t *testing.T) { router.ServeHTTP(w, req) w.Flush() - if w.Code != http.StatusOK { - t.Errorf("bogus status code: %d", w.Code) - } - - if v := w.Body.String(); !strings.Contains(v, `"match":1`) { - t.Fatalf("%#v", v) - } + require.Equal(t, http.StatusOK, w.Code) + require.Contains(t, w.Body.String(), `"match":0.88194`) var wrapper struct { Addresses []*ofac.Address `json:"addresses"` @@ -265,25 +260,14 @@ func TestSearch__NameAndAltName(t *testing.T) { } // OFAC - if wrapper.SDNs[0].EntityID != "2681" { - t.Errorf("%#v", wrapper.SDNs[0]) - } - if wrapper.AltNames[0].EntityID != "4691" { - t.Errorf("%#v", wrapper.AltNames[0].EntityID) - } - if wrapper.Addresses[0].EntityID != "735" { - t.Errorf("%#v", wrapper.Addresses[0].EntityID) - } - if wrapper.SectoralSanctions[0].EntityID != "18782" { - t.Errorf("%#v", wrapper.SectoralSanctions[0].EntityID) - } + require.Equal(t, "2681", wrapper.SDNs[0].EntityID) + require.Equal(t, "4691", wrapper.AltNames[0].EntityID) + require.Equal(t, "735", wrapper.Addresses[0].EntityID) + require.Equal(t, "18782", wrapper.SectoralSanctions[0].EntityID) + // BIS - if wrapper.DeniedPersons[0].StreetAddress != "P.O. BOX 28360" { - t.Errorf("%#v", wrapper.DeniedPersons[0].StreetAddress) - } - if wrapper.BISEntities[0].Name != "Mohammad Jan Khan Mangal" { - t.Errorf("%#v", wrapper.BISEntities[0]) - } + require.Equal(t, "P.O. BOX 28360", wrapper.DeniedPersons[0].StreetAddress) + require.Equal(t, "Luqman Yasin Yunus Shgragi", wrapper.BISEntities[0].Name) } func TestSearch__Name(t *testing.T) { @@ -304,13 +288,8 @@ func TestSearch__Name(t *testing.T) { router.ServeHTTP(w, req) w.Flush() - if w.Code != http.StatusOK { - t.Errorf("bogus status code: %d", w.Code) - } - - if v := w.Body.String(); !strings.Contains(v, `"match":1`) { - t.Error(v) - } + require.Equal(t, http.StatusOK, w.Code) + require.Contains(t, w.Body.String(), `"match":0.89166`) var wrapper struct { // OFAC diff --git a/cmd/server/search_test.go b/cmd/server/search_test.go index 0738d508..427a0cd7 100644 --- a/cmd/server/search_test.go +++ b/cmd/server/search_test.go @@ -432,34 +432,42 @@ func TestJaroWinkler(t *testing.T) { s1, s2 string match float64 }{ + // examples {"wei, zhao", "wei, Zhao", 0.917}, {"WEI, Zhao", "WEI, Zhao", 1.0}, {"WEI Zhao", "WEI Zhao", 1.0}, {strings.ToLower("WEI Zhao"), precompute("WEI, Zhao"), 1.0}, - // make sure jaroWinkler is communative - {"jane doe", "jan lahore", 0.721}, + + // apply jaroWinkler in both directions + {"jane doe", "jan lahore", 0.621}, {"jan lahore", "jane doe", 0.776}, + // real world case {"john doe", "paul john", 0.764}, - {"john doe", "john othername", 0.815}, + {"john doe", "john othername", 0.618}, + // close match {"jane doe", "jane doe2", 0.971}, + // real-ish world examples {"kalamity linden", "kala limited", 0.771}, - {"kala limited", "kalamity linden", 0.795}, + {"kala limited", "kalamity linden", 0.602}, + // examples used in demos / commonly {"nicolas", "nicolas", 1.0}, {"nicolas moros maduro", "nicolas maduro", 0.91}, {"nicolas maduro", "nicolas moros maduro", 1.0}, + // customer examples - {"ian mckinley", "tian xiang 7", 0.750}, - {"bindaree food group pty ltd", "independent insurance group ltd", 0.812}, + {"ian mckinley", "tian xiang 7", 0.75}, + {"bindaree food group pty ltd", "independent insurance group ltd", 0.728}, {"p.c.c. (singapore) private limited", "culver max entertainment private limited", 0.753}, {"zincum llc", "easy verification inc.", 0.639}, - {"transpetrochart co ltd", "jx metals trading co.", 0.725}, - {"technolab", "moomoo technologies inc", 0.87222}, - {"sewa security services", "sesa - safety & environmental services australia pty ltd", 0.740}, + {"transpetrochart co ltd", "jx metals trading co.", 0.616}, + {"technolab", "moomoo technologies inc", 0.714}, + {"sewa security services", "sesa - safety & environmental services australia pty ltd", 0.673}, {"bueno", "20/f rykadan capital twr135 hoi bun rd, kwun tong 135 hoi bun rd., kwun tong", 0.0}, + // example cases {"nicolas maduro", "nicolás maduro", 0.961}, {"nicolas maduro", precompute("nicolás maduro"), 1.0}, @@ -471,36 +479,37 @@ func TestJaroWinkler(t *testing.T) { {"nicolas maduro moros", "nicolás maduro", 0.884}, {"nicolas, maduro moros", "maduro", 0.720}, {"nicolas, maduro moros", "nicolas maduro", 0.902}, - {"nicolas, maduro moros", "nicolás", 0.627}, + {"nicolas, maduro moros", "nicolás", 0.554}, {"nicolas, maduro moros", "maduro", 0.720}, {"nicolas, maduro moros", "nicolás maduro", 0.877}, {"africada financial services bureau change", "skylight", 0.266}, - {"africada financial services bureau change", "skylight financial inc", 0.72}, - {"africada financial services bureau change", "skylight services inc", 0.806}, - {"africada financial services bureau change", "skylight financial services", 0.887}, - {"africada financial services bureau change", "skylight financial services inc", 0.79}, + {"africada financial services bureau change", "skylight financial inc", 0.596}, + {"africada financial services bureau change", "skylight services inc", 0.645}, + {"africada financial services bureau change", "skylight financial services", 0.67}, + {"africada financial services bureau change", "skylight financial services inc", 0.696}, + // stopwords tests - {"the group for the preservation of the holy sites", "the bridgespan group", 1.00}, - {precompute("the group for the preservation of the holy sites"), precompute("the bridgespan group"), 1.00}, - {"group preservation holy sites", "bridgespan group", 0.689}, - {"the group for the preservation of the holy sites", "the logan group", 1.00}, - {precompute("the group for the preservation of the holy sites"), precompute("the logan group"), 1.00}, + {"the group for the preservation of the holy sites", "the bridgespan group", 0.448}, + {precompute("the group for the preservation of the holy sites"), precompute("the bridgespan group"), 0.448}, + {"group preservation holy sites", "bridgespan group", 0.619}, + {"the group for the preservation of the holy sites", "the logan group", 0.424}, + {precompute("the group for the preservation of the holy sites"), precompute("the logan group"), 0.424}, {"group preservation holy sites", "logan group", 0.478}, - {"the group for the preservation of the holy sites", "the anything group", 1.00}, - {precompute("the group for the preservation of the holy sites"), precompute("the anything group"), 1.00}, - {"group preservation holy sites", "anything group", 0.617}, - {"the group for the preservation of the holy sites", "the hello world group", 0.922}, - {precompute("the group for the preservation of the holy sites"), precompute("the hello world group"), 0.922}, - {"group preservation holy sites", "hello world group", 0.687}, - {"the group for the preservation of the holy sites", "the group", 0.431}, - {precompute("the group for the preservation of the holy sites"), precompute("the group"), 0.431}, + {"the group for the preservation of the holy sites", "the anything group", 0.437}, + {precompute("the group for the preservation of the holy sites"), precompute("the anything group"), 0.437}, + {"group preservation holy sites", "anything group", 0.585}, + {"the group for the preservation of the holy sites", "the hello world group", 0.47}, + {precompute("the group for the preservation of the holy sites"), precompute("the hello world group"), 0.47}, + {"group preservation holy sites", "hello world group", 0.515}, + {"the group for the preservation of the holy sites", "the group", 0.416}, + {precompute("the group for the preservation of the holy sites"), precompute("the group"), 0.416}, {"group preservation holy sites", "group", 0.460}, - {"the group for the preservation of the holy sites", "The flibbity jibbity flobbity jobbity grobbity zobbity group", 0.517}, - {precompute("the group for the preservation of the holy sites"), precompute("the flibbity jibbity flobbity jobbity grobbity zobbity group"), 0.572}, - {"group preservation holy sites", "flibbity jibbity flobbity jobbity grobbity zobbity group", 0.418}, + {"the group for the preservation of the holy sites", "The flibbity jibbity flobbity jobbity grobbity zobbity group", 0.403}, + {precompute("the group for the preservation of the holy sites"), precompute("the flibbity jibbity flobbity jobbity grobbity zobbity group"), 0.459}, + {"group preservation holy sites", "flibbity jibbity flobbity jobbity grobbity zobbity group", 0.329}, // precompute - {"i c sogo kenkyusho", precompute("A.I.C. SOGO KENKYUSHO"), 0.667}, + {"i c sogo kenkyusho", precompute("A.I.C. SOGO KENKYUSHO"), 0.5}, {precompute("A.I.C. SOGO KENKYUSHO"), "sogo kenkyusho", 0.667}, } for i := range cases { @@ -725,9 +734,7 @@ func TestSearch__TopSDNs(t *testing.T) { if len(sdns) == 0 { t.Fatal("empty SDNs") } - if sdns[0].EntityID != "2676" { - t.Errorf("%#v", sdns[0].SDN) - } + require.Equal(t, "2681", sdns[0].EntityID) } func TestSearch__TopDPs(t *testing.T) { diff --git a/cmd/server/search_us_csl_test.go b/cmd/server/search_us_csl_test.go index fc11fe15..86ac83c3 100644 --- a/cmd/server/search_us_csl_test.go +++ b/cmd/server/search_us_csl_test.go @@ -97,9 +97,7 @@ func TestSearcher_TopSSIs_limit(t *testing.T) { if len(ssis) != 2 { t.Fatalf("Expected 2 results, found %d", len(ssis)) } - if ssis[0].Data.EntityID != "18736" { - t.Errorf("%#v", ssis[0].Data) - } + require.Equal(t, "18782", ssis[0].Data.EntityID) } func TestSearcher_TopSSIs_reportAltNameWeight(t *testing.T) {