Skip to content

Commit

Permalink
Merge pull request #520 from adamdecaf/more-search-edge-cases
Browse files Browse the repository at this point in the history
search: apply more edge case logic to decrease bad scoring
  • Loading branch information
adamdecaf authored Nov 16, 2023
2 parents edce202 + f8382b0 commit 00affd7
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 11 deletions.
25 changes: 24 additions & 1 deletion cmd/server/search.go
Original file line number Diff line number Diff line change
Expand Up @@ -692,10 +692,33 @@ func jaroWinklerWithFavoritism(s1, s2 string, favoritism float64) float64 {
for i := range s1Parts {
max, term := maxMatch(s1Parts[i], i, s2Parts)
if max >= 1.0 {
// Perfect match
// If the query is longer than our indexed term (and either has more than three parts, i.e. is longer than most names)
// we want to reduce the maximum weight proportionally by the term difference, which
// forces more terms to match instead of one or two dominating the weight.
if (len(s2Parts) > len(s1Parts)) && (len(s1Parts) > 3 || len(s2Parts) > 3) {
max *= (float64(len(s1Parts)) / float64(len(s2Parts)))
goto add
}
// If the indexed term is really short cap the match at 90%.
// This still allows names to match highly with a couple of different characters.
if len(s1Parts) < 2 && len(s2Parts) > 1 {
max *= 0.9
goto add
}
// Otherwise, apply Perfect match favoritism
max += favoritism
add:
scores = append(scores, max)
} else {
// If there are more terms in the user's query than what's indexed then
// adjust the max lower by the proportion of different terms.
//
// We do this to decrease the importance of a short (often common) term.
if len(s2Parts) > len(s1Parts) {
scores = append(scores, max*float64(len(s1Parts))/float64(len(s2Parts)))
continue
}

// Apply an additional weight based on similarity of term lengths,
// so terms which are closer in length match higher.
s1 := float64(len(s1Parts[i]))
Expand Down
35 changes: 26 additions & 9 deletions cmd/server/search_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -459,18 +459,26 @@ func TestJaroWinkler(t *testing.T) {
{"nicolas maduro", "nicolas moros maduro", 1.0},

// customer examples
{"ian mckinley", "tian xiang 7", 0.75},
{"bindaree food group pty ltd", "independent insurance group ltd", 0.728},
{"p.c.c. (singapore) private limited", "culver max entertainment private limited", 0.753},
{"zincum llc", "easy verification inc.", 0.639},
{"transpetrochart co ltd", "jx metals trading co.", 0.616},
{"technolab", "moomoo technologies inc", 0.714},
{"sewa security services", "sesa - safety & environmental services australia pty ltd", 0.673},
{"ian", "ian mckinley", 0.9},
{"iap", "ian mckinley", 0.411},
{"ian mckinley", "ian", 0.819},
{"ian mckinley", "iap", 0.654},
{"ian mckinley", "tian xiang 7", 0.5},
{"bindaree food group pty", precompute("independent insurance group ltd"), 0.659}, // precompute removes ltd
{"bindaree food group pty ltd", "independent insurance group ltd", 0.728}, // only matches higher from 'ltd'
{"p.c.c. (singapore) private limited", "culver max entertainment private limited", 0.602},
{"zincum llc", "easy verification inc.", 0.426},
{"transpetrochart co ltd", "jx metals trading co.", 0.544},
{"technolab", "moomoo technologies inc", 0.291},
{"sewa security services", "sesa - safety & environmental services australia pty ltd", 0.247},
{"bueno", "20/f rykadan capital twr135 hoi bun rd, kwun tong 135 hoi bun rd., kwun tong", 0.0},

// example cases
{"nicolas maduro", "nicolás maduro", 0.961},
{"nicolas maduro", precompute("nicolás maduro"), 1.0},
{"nic maduro", "nicolas maduro", 0.717},
{"nick maduro", "nicolas maduro", 0.769},
{"nicolas maduroo", "nicolas maduro", 0.986},
{"nicolas maduro", "nicolas maduro", 1.0},
{"maduro, nicolas", "maduro, nicolas", 1.0},
{"maduro moros, nicolas", "maduro moros, nicolas", 1.0},
Expand All @@ -492,21 +500,30 @@ func TestJaroWinkler(t *testing.T) {
{"the group for the preservation of the holy sites", "the bridgespan group", 0.448},
{precompute("the group for the preservation of the holy sites"), precompute("the bridgespan group"), 0.448},
{"group preservation holy sites", "bridgespan group", 0.619},

{"the group for the preservation of the holy sites", "the logan group", 0.424},
{precompute("the group for the preservation of the holy sites"), precompute("the logan group"), 0.424},
{"group preservation holy sites", "logan group", 0.478},

{"the group for the preservation of the holy sites", "the anything group", 0.437},
{precompute("the group for the preservation of the holy sites"), precompute("the anything group"), 0.437},
{"group preservation holy sites", "anything group", 0.585},

{"the group for the preservation of the holy sites", "the hello world group", 0.47},
{precompute("the group for the preservation of the holy sites"), precompute("the hello world group"), 0.47},
{"group preservation holy sites", "hello world group", 0.515},

{"the group for the preservation of the holy sites", "the group", 0.416},
{precompute("the group for the preservation of the holy sites"), precompute("the group"), 0.416},
{"group preservation holy sites", "group", 0.460},

{"the group for the preservation of the holy sites", "The flibbity jibbity flobbity jobbity grobbity zobbity group", 0.403},
{precompute("the group for the preservation of the holy sites"), precompute("the flibbity jibbity flobbity jobbity grobbity zobbity group"), 0.459},
{"group preservation holy sites", "flibbity jibbity flobbity jobbity grobbity zobbity group", 0.329},
{
precompute("the group for the preservation of the holy sites"),
precompute("the flibbity jibbity flobbity jobbity grobbity zobbity group"),
0.459,
},
{"group preservation holy sites", "flibbity jibbity flobbity jobbity grobbity zobbity group", 0.239},

// precompute
{"i c sogo kenkyusho", precompute("A.I.C. SOGO KENKYUSHO"), 0.5},
Expand Down
2 changes: 1 addition & 1 deletion cmd/server/search_us_csl_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ func TestSearcher_TopSSIs_limit(t *testing.T) {
if len(ssis) != 2 {
t.Fatalf("Expected 2 results, found %d", len(ssis))
}
require.Equal(t, "18782", ssis[0].Data.EntityID)
require.Equal(t, "18736", ssis[0].Data.EntityID)
}

func TestSearcher_TopSSIs_reportAltNameWeight(t *testing.T) {
Expand Down

0 comments on commit 00affd7

Please sign in to comment.