Skip to content

Commit

Permalink
Merge pull request #511 from adamdecaf/reduce-jaro-term-proximity
Browse files Browse the repository at this point in the history
Reduce Jaro term proximity
  • Loading branch information
adamdecaf authored Nov 13, 2023
2 parents 26e9851 + 489fc9e commit e3971be
Show file tree
Hide file tree
Showing 5 changed files with 43 additions and 21 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@ PONG
|-----|-----|-----|
| `DATA_REFRESH_INTERVAL` | Interval for data redownload and reparse. `off` disables this refreshing. | 12h |
| `INITIAL_DATA_DIRECTORY` | Directory filepath with initial files to use instead of downloading. Periodic downloads will replace the initial files. | Empty |
| `ADJACENT_SIMILARITY_POSITIONS` | How many nearby word positions to search for the highest similarity score. | 3 |
| `EXACT_MATCH_FAVORITISM` | Extra weighting assigned to exact matches. | 0.0 |
| `JARO_WINKLER_BOOST_THRESHOLD` | Jaro-Winkler boost threshold. | 0.7 |
| `JARO_WINKLER_PREFIX_SIZE` | Jaro-Winkler prefix size. | 4 |
Expand Down
10 changes: 5 additions & 5 deletions cmd/server/issue326_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,17 @@ func TestIssue326(t *testing.T) {

// Cuba
score := jaroWinkler(precompute("Huawei Cuba"), precompute("Huawei"))
assert.Equal(t, score, 0.8055555555555556)
assert.Equal(t, 0.8055555555555556, score)

// India
score = jaroWinkler(india, precompute("Huawei"))
assert.Equal(t, score, 0.5592063492063492)
assert.Equal(t, 0.4846031746031746, score)
score = jaroWinkler(india, precompute("Huawei Technologies"))
assert.Equal(t, score, 0.6903174603174603)
assert.Equal(t, 0.6903174603174603, score)

// Investment
score = jaroWinkler(investment, precompute("Huawei"))
assert.Equal(t, score, 0.3788888888888889)
assert.Equal(t, 0.3788888888888889, score)
score = jaroWinkler(investment, precompute("Huawei Technologies"))
assert.Equal(t, score, 0.7377777777777779)
assert.Equal(t, 0.7377777777777779, score)
}
26 changes: 19 additions & 7 deletions cmd/server/search.go
Original file line number Diff line number Diff line change
Expand Up @@ -648,21 +648,34 @@ func readInt(override string, value int) int {
// jaroWinkler runs the similarly named algorithm over the two input strings and averages their match percentages
// according to the second string (assumed to be the user's query)
//
// Each term is only compared against terms at nearby positions in the other string
// (see ADJACENT_SIMILARITY_POSITIONS), and the highest of those near-neighbor scores is kept for that term.
//
// This is a convenience wrapper: it calls jaroWinklerWithFavoritism with the package-level
// exactMatchFavoritism value and returns its result unchanged.
//
// For more details see https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
func jaroWinkler(s1, s2 string) float64 {
return jaroWinklerWithFavoritism(s1, s2, exactMatchFavoritism)
}

var (
// adjacentSimilarityPositions is the distance, in term positions, that jaroWinklerWithFavoritism
// searches around a term's own index when looking for that term's best match.
// It is read once from the ADJACENT_SIMILARITY_POSITIONS environment variable through readInt,
// with 3 passed as the fallback value.
// NOTE(review): the search loop uses an exclusive upper bound (i < index+N), so it appears to cover
// N positions before the index but only N-1 after it — confirm this asymmetry is intended.
adjacentSimilarityPositions = readInt(os.Getenv("ADJACENT_SIMILARITY_POSITIONS"), 3)
)

func jaroWinklerWithFavoritism(s1, s2 string, favoritism float64) float64 {
maxMatch := func(word string, parts []string) float64 {
maxMatch := func(word string, s1Idx int, parts []string) float64 {
if len(parts) == 0 {
return 0.0
}

max := smetrics.JaroWinkler(word, parts[0], boostThreshold, prefixSize)
for i := 1; i < len(parts); i++ {
if score := smetrics.JaroWinkler(word, parts[i], boostThreshold, prefixSize); score > max {
max = score
// We're only looking for the highest match close to this word's own position (s1Idx)
start := s1Idx - adjacentSimilarityPositions
end := s1Idx + adjacentSimilarityPositions

var max float64
for i := start; i < end; i++ {
if i >= 0 && len(parts) > i {
score := smetrics.JaroWinkler(word, parts[i], boostThreshold, prefixSize)
if score > max {
max = score
}
}
}
return max
Expand All @@ -675,7 +688,7 @@ func jaroWinklerWithFavoritism(s1, s2 string, favoritism float64) float64 {

var scores []float64
for i := range s1Parts {
max := maxMatch(s1Parts[i], s2Parts)
max := maxMatch(s1Parts[i], i, s2Parts)
if max >= 1.0 {
max += favoritism
}
Expand All @@ -692,7 +705,6 @@ func jaroWinklerWithFavoritism(s1, s2 string, favoritism float64) float64 {
for i := range scores {
sum += scores[i]
}

return math.Min(sum/float64(len(scores)), 1.00)
}

Expand Down
26 changes: 17 additions & 9 deletions cmd/server/search_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -451,6 +451,15 @@ func TestJaroWinkler(t *testing.T) {
{"nicolas", "nicolas", 1.0},
{"nicolas moros maduro", "nicolas maduro", 0.91},
{"nicolas maduro", "nicolas moros maduro", 1.0},
// customer examples
{"ian mckinley", "tian xiang 7", 0.750},
{"bindaree food group pty ltd", "independent insurance group ltd", 0.812},
{"p.c.c. (singapore) private limited", "culver max entertainment private limited", 0.753},
{"zincum llc", "easy verification inc.", 0.639},
{"transpetrochart co ltd", "jx metals trading co.", 0.725},
{"technolab", "moomoo technologies inc", 0.87222},
{"sewa security services", "sesa - safety & environmental services australia pty ltd", 0.740},
{"bueno", "20/f rykadan capital twr135 hoi bun rd, kwun tong 135 hoi bun rd., kwun tong", 0.0},
// example cases
{"nicolas maduro", "nicolás maduro", 0.961},
{"nicolas maduro", precompute("nicolás maduro"), 1.0},
Expand All @@ -465,12 +474,11 @@ func TestJaroWinkler(t *testing.T) {
{"nicolas, maduro moros", "nicolás", 0.627},
{"nicolas, maduro moros", "maduro", 0.720},
{"nicolas, maduro moros", "nicolás maduro", 0.877},
{"africada financial services bureau change", "skylight", 0.352},
{"africada financial services bureau change", "skylight", 0.266},
{"africada financial services bureau change", "skylight financial inc", 0.72},
{"africada financial services bureau change", "skylight services inc", 0.806},
{"africada financial services bureau change", "skylight financial services", 0.887},
{"africada financial services bureau change", "skylight financial services inc", 0.79},

// stopwords tests
{"the group for the preservation of the holy sites", "the bridgespan group", 1.00},
{precompute("the group for the preservation of the holy sites"), precompute("the bridgespan group"), 1.00},
Expand All @@ -481,15 +489,15 @@ func TestJaroWinkler(t *testing.T) {
{"the group for the preservation of the holy sites", "the anything group", 1.00},
{precompute("the group for the preservation of the holy sites"), precompute("the anything group"), 1.00},
{"group preservation holy sites", "anything group", 0.617},
{"the group for the preservation of the holy sites", "the hello world group", 1.00},
{precompute("the group for the preservation of the holy sites"), precompute("the hello world group"), 1.00},
{"the group for the preservation of the holy sites", "the hello world group", 0.922},
{precompute("the group for the preservation of the holy sites"), precompute("the hello world group"), 0.922},
{"group preservation holy sites", "hello world group", 0.687},
{"the group for the preservation of the holy sites", "the group", 0.67},
{precompute("the group for the preservation of the holy sites"), precompute("the group"), 0.67},
{"the group for the preservation of the holy sites", "the group", 0.431},
{precompute("the group for the preservation of the holy sites"), precompute("the group"), 0.431},
{"group preservation holy sites", "group", 0.460},
{"the group for the preservation of the holy sites", "The flibbity jibbity flobbity jobbity grobbity zobbity group", 0.699},
{precompute("the group for the preservation of the holy sites"), precompute("the flibbity jibbity flobbity jobbity grobbity zobbity group"), .783},
{"group preservation holy sites", "flibbity jibbity flobbity jobbity grobbity zobbity group", 0.590},
{"the group for the preservation of the holy sites", "The flibbity jibbity flobbity jobbity grobbity zobbity group", 0.517},
{precompute("the group for the preservation of the holy sites"), precompute("the flibbity jibbity flobbity jobbity grobbity zobbity group"), 0.572},
{"group preservation holy sites", "flibbity jibbity flobbity jobbity grobbity zobbity group", 0.418},

// precompute
{"i c sogo kenkyusho", precompute("A.I.C. SOGO KENKYUSHO"), 0.667},
Expand Down
1 change: 1 addition & 0 deletions docs/usage-configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ menubar: docs-menu
|-----|-----|-----|
| `DATA_REFRESH_INTERVAL` | Interval for data redownload and reparse. `off` disables this refreshing. | 12h |
| `INITIAL_DATA_DIRECTORY` | Directory filepath with initial files to use instead of downloading. Periodic downloads will replace the initial files. | Empty |
| `ADJACENT_SIMILARITY_POSITIONS` | How many nearby word positions to search for the highest similarity score. | 3 |
| `EXACT_MATCH_FAVORITISM` | Extra weighting assigned to exact matches. | 0.0 |
| `JARO_WINKLER_BOOST_THRESHOLD` | Jaro-Winkler boost threshold. | 0.7 |
| `JARO_WINKLER_PREFIX_SIZE` | Jaro-Winkler prefix size. | 4 |
Expand Down

0 comments on commit e3971be

Please sign in to comment.