From f8382b038149d5cc7730ae4be57237fdc2f0f4cc Mon Sep 17 00:00:00 2001 From: Adam Shannon Date: Thu, 16 Nov 2023 16:08:44 -0600 Subject: [PATCH] search: apply more edge case logic to decrease bad scoring --- cmd/server/search.go | 25 ++++++++++++++++++++++- cmd/server/search_test.go | 35 ++++++++++++++++++++++++-------- cmd/server/search_us_csl_test.go | 2 +- 3 files changed, 51 insertions(+), 11 deletions(-) diff --git a/cmd/server/search.go b/cmd/server/search.go index 9e1f52bc..a2cad095 100644 --- a/cmd/server/search.go +++ b/cmd/server/search.go @@ -692,10 +692,33 @@ func jaroWinklerWithFavoritism(s1, s2 string, favoritism float64) float64 { for i := range s1Parts { max, term := maxMatch(s1Parts[i], i, s2Parts) if max >= 1.0 { - // Perfect match + // If the query is longer than our indexed term (and both are longer than most names) + // we want to reduce the maximum weight proportionally by the term difference, which + // forces more terms to match instead of one or two dominating the weight. + if (len(s2Parts) > len(s1Parts)) && (len(s1Parts) > 3 || len(s2Parts) > 3) { + max *= (float64(len(s1Parts)) / float64(len(s2Parts))) + goto add + } + // If the indexed term is really short cap the match at 90%. + // This still allows names to match highly with a couple different characters. + if len(s1Parts) < 2 && len(s2Parts) > 1 { + max *= 0.9 + goto add + } + // Otherwise, apply Perfect match favoritism max += favoritism + add: scores = append(scores, max) } else { + // If there are more terms in the user's query than what's indexed then + // adjust the max lower by the proportion of different terms. + // + // We do this to decrease the importance of a short (often common) term. + if len(s2Parts) > len(s1Parts) { + scores = append(scores, max*float64(len(s1Parts))/float64(len(s2Parts))) + continue + } + // Apply an additional weight based on similarity of term lengths, // so terms which are closer in length match higher. 
s1 := float64(len(s1Parts[i])) diff --git a/cmd/server/search_test.go b/cmd/server/search_test.go index 427a0cd7..412d3a08 100644 --- a/cmd/server/search_test.go +++ b/cmd/server/search_test.go @@ -459,18 +459,26 @@ func TestJaroWinkler(t *testing.T) { {"nicolas maduro", "nicolas moros maduro", 1.0}, // customer examples - {"ian mckinley", "tian xiang 7", 0.75}, - {"bindaree food group pty ltd", "independent insurance group ltd", 0.728}, - {"p.c.c. (singapore) private limited", "culver max entertainment private limited", 0.753}, - {"zincum llc", "easy verification inc.", 0.639}, - {"transpetrochart co ltd", "jx metals trading co.", 0.616}, - {"technolab", "moomoo technologies inc", 0.714}, - {"sewa security services", "sesa - safety & environmental services australia pty ltd", 0.673}, + {"ian", "ian mckinley", 0.9}, + {"iap", "ian mckinley", 0.411}, + {"ian mckinley", "ian", 0.819}, + {"ian mckinley", "iap", 0.654}, + {"ian mckinley", "tian xiang 7", 0.5}, + {"bindaree food group pty", precompute("independent insurance group ltd"), 0.659}, // precompute removes ltd + {"bindaree food group pty ltd", "independent insurance group ltd", 0.728}, // only matches higher from 'ltd' + {"p.c.c. 
(singapore) private limited", "culver max entertainment private limited", 0.602}, + {"zincum llc", "easy verification inc.", 0.426}, + {"transpetrochart co ltd", "jx metals trading co.", 0.544}, + {"technolab", "moomoo technologies inc", 0.291}, + {"sewa security services", "sesa - safety & environmental services australia pty ltd", 0.247}, {"bueno", "20/f rykadan capital twr135 hoi bun rd, kwun tong 135 hoi bun rd., kwun tong", 0.0}, // example cases {"nicolas maduro", "nicolás maduro", 0.961}, {"nicolas maduro", precompute("nicolás maduro"), 1.0}, + {"nic maduro", "nicolas maduro", 0.717}, + {"nick maduro", "nicolas maduro", 0.769}, + {"nicolas maduroo", "nicolas maduro", 0.986}, {"nicolas maduro", "nicolas maduro", 1.0}, {"maduro, nicolas", "maduro, nicolas", 1.0}, {"maduro moros, nicolas", "maduro moros, nicolas", 1.0}, @@ -492,21 +500,30 @@ func TestJaroWinkler(t *testing.T) { {"the group for the preservation of the holy sites", "the bridgespan group", 0.448}, {precompute("the group for the preservation of the holy sites"), precompute("the bridgespan group"), 0.448}, {"group preservation holy sites", "bridgespan group", 0.619}, + {"the group for the preservation of the holy sites", "the logan group", 0.424}, {precompute("the group for the preservation of the holy sites"), precompute("the logan group"), 0.424}, {"group preservation holy sites", "logan group", 0.478}, + {"the group for the preservation of the holy sites", "the anything group", 0.437}, {precompute("the group for the preservation of the holy sites"), precompute("the anything group"), 0.437}, {"group preservation holy sites", "anything group", 0.585}, + {"the group for the preservation of the holy sites", "the hello world group", 0.47}, {precompute("the group for the preservation of the holy sites"), precompute("the hello world group"), 0.47}, {"group preservation holy sites", "hello world group", 0.515}, + {"the group for the preservation of the holy sites", "the group", 0.416}, {precompute("the 
group for the preservation of the holy sites"), precompute("the group"), 0.416}, {"group preservation holy sites", "group", 0.460}, + {"the group for the preservation of the holy sites", "The flibbity jibbity flobbity jobbity grobbity zobbity group", 0.403}, - {precompute("the group for the preservation of the holy sites"), precompute("the flibbity jibbity flobbity jobbity grobbity zobbity group"), 0.459}, - {"group preservation holy sites", "flibbity jibbity flobbity jobbity grobbity zobbity group", 0.329}, + { + precompute("the group for the preservation of the holy sites"), + precompute("the flibbity jibbity flobbity jobbity grobbity zobbity group"), + 0.459, + }, + {"group preservation holy sites", "flibbity jibbity flobbity jobbity grobbity zobbity group", 0.239}, // precompute {"i c sogo kenkyusho", precompute("A.I.C. SOGO KENKYUSHO"), 0.5}, diff --git a/cmd/server/search_us_csl_test.go b/cmd/server/search_us_csl_test.go index 86ac83c3..4e7218d2 100644 --- a/cmd/server/search_us_csl_test.go +++ b/cmd/server/search_us_csl_test.go @@ -97,7 +97,7 @@ func TestSearcher_TopSSIs_limit(t *testing.T) { if len(ssis) != 2 { t.Fatalf("Expected 2 results, found %d", len(ssis)) } - require.Equal(t, "18782", ssis[0].Data.EntityID) + require.Equal(t, "18736", ssis[0].Data.EntityID) } func TestSearcher_TopSSIs_reportAltNameWeight(t *testing.T) {