Skip to content

Commit

Permalink
Merge pull request #524 from tomdaffurn/tom/fuzzy_match_experiment
Browse files Browse the repository at this point in the history
Experimental Improved Search Algorithm
  • Loading branch information
adamdecaf authored Dec 14, 2023
2 parents 193f8ec + ddb46e7 commit f476bbd
Show file tree
Hide file tree
Showing 9 changed files with 305 additions and 105 deletions.
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -184,14 +184,18 @@ You should get this response:
PONG
```

### Configuration settings
### Configuration settings

| Environmental Variable | Description | Default |
|-----|-----|-----|
| `DATA_REFRESH_INTERVAL` | Interval for data redownload and reparse. `off` disables this refreshing. | 12h |
| `INITIAL_DATA_DIRECTORY` | Directory filepath with initial files to use instead of downloading. Periodic downloads will replace the initial files. | Empty |
| `ADJACENT_SIMILARITY_POSITIONS` | How many nearby words to search for the highest similarity score. | 3 |
| `EXACT_MATCH_FAVORITISM` | Extra weighting assigned to exact matches. | 0.0 |
| `LENGTH_DIFFERENCE_CUTOFF_FACTOR` | Minimum ratio between the lengths of two matching tokens before their score is penalised. | 0.9 |
| `LENGTH_DIFFERENCE_PENALTY_WEIGHT` | Weight of penalty applied to scores when two matching tokens have different lengths. | 0.3 |
| `DIFFERENT_LETTER_PENALTY_WEIGHT` | Weight of penalty applied to scores when two matching tokens begin with different letters. | 0.9 |
| `UNMATCHED_INDEX_TOKEN_WEIGHT` | Weight of penalty applied to scores when part of the indexed name isn't matched. | 0.15 |
| `JARO_WINKLER_BOOST_THRESHOLD` | Jaro-Winkler boost threshold. | 0.7 |
| `JARO_WINKLER_PREFIX_SIZE` | Jaro-Winkler prefix size. | 4 |
| `WEBHOOK_BATCH_SIZE` | How many watches to read from database per batch of async searches. | 100 |
Expand Down
6 changes: 3 additions & 3 deletions cmd/server/issue115_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,13 @@ func TestIssue115__TopSDNs(t *testing.T) {
s.SDNs = precomputeSDNs([]*ofac.SDN{{EntityID: "2680", SDNName: "HABBASH, George", SDNType: "INDIVIDUAL"}}, nil, pipe)

out := s.TopSDNs(1, 0.00, "george bush", keeper)
eql(t, "issue115: top SDN 2680", out[0].match, 0.732)
eql(t, "issue115: top SDN 2680", out[0].match, 0.687)

// was 88.3% match
s.SDNs = precomputeSDNs([]*ofac.SDN{{EntityID: "9432", SDNName: "CHIWESHE, George", SDNType: "INDIVIDUAL"}}, nil, pipe)

out = s.TopSDNs(1, 0.00, "george bush", keeper)
eql(t, "issue115: top SDN 18996", out[0].match, 0.764)
eql(t, "issue115: top SDN 18996", out[0].match, 0.650)

// another example
s.SDNs = precomputeSDNs([]*ofac.SDN{{EntityID: "0", SDNName: "Bush, George W", SDNType: "INDIVIDUAL"}}, nil, pipe)
Expand All @@ -47,5 +47,5 @@ func TestIssue115__TopSDNs(t *testing.T) {
eql(t, "issue115: top SDN 0", out[0].match, 1.0)

out = s.TopSDNs(1, 0.00, "george bush", keeper)
eql(t, "issue115: top SDN 0", out[0].match, 0.667)
eql(t, "issue115: top SDN 0", out[0].match, 0.986)
}
76 changes: 76 additions & 0 deletions cmd/server/new_algorithm_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// Copyright 2022 The Moov Authors
// Use of this source code is governed by an Apache License
// license that can be found in the LICENSE file.

package main

import (
"strings"
"testing"
)

// TestBestPairsJaroWinkler__FalsePositives walks a table of indexed-name/query
// pairs for which the first score returned by compareAlgorithms is considered
// too high, and pins both that score and the second (new) score via eql.
func TestBestPairsJaroWinkler__FalsePositives(t *testing.T) {
	cases := []struct {
		indexedName string
		query       string
		wantOld     float64
		wantNew     float64
	}{
		// A query word should pair with at most one indexed word; names that
		// repeat themselves on the sanctioned list can otherwise skew results.
		// SDN Entity 40273, VLADIMIROV, Vladimir Vladimirovich
		{"vladimirov vladimir vladimirovich", "vladimir levenshtein", 0.961, 0.603},
		// SDN Entity 7788 "SHAQIRI, Shaqir"
		{"shaqiri shaqir", "zaid shakir", 0.908, 0.704},

		// A single-word sanctioned name shouldn't match every query that
		// merely contains that name part.
		// SDN Entity 15050 "HADI"
		{"hadi", "hadi alwai", 0.900, 0.615},

		// Name-part scores should be weighted by character length so that
		// small words don't carry unfair weight.
		// SDN Entity "LI, Shangfu"
		{"li shangfu", "li shanlan", 0.914, 0.867},

		// Words of very different lengths shouldn't match highly.
		{"browningweight", "brown", 0.871, 0.703},

		// Words that begin with different letters shouldn't match highly.
		{"dominguez", "jimenez", 0.690, 0.580},
	}

	for _, tc := range cases {
		oldScore, newScore := compareAlgorithms(tc.indexedName, tc.query)
		eql(t, "Score is too high", oldScore, tc.wantOld)
		eql(t, "New score is better", newScore, tc.wantNew)
	}
}

// TestBestPairsJaroWinkler__TruePositives walks a table of indexed-name/query
// pairs for which the first score returned by compareAlgorithms is considered
// too low, and pins both that score and the second (new) score via eql.
func TestBestPairsJaroWinkler__TruePositives(t *testing.T) {
	cases := []struct {
		indexedName string
		query       string
		wantOld     float64
		wantNew     float64
	}{
		// Unmatched indexed words used to carry a large weight, producing
		// false negatives when "middle names" were left out of the query.
		// Saddam Hussein
		{"saddam hussein al tikriti", "saddam hussien", 0.656, 0.924},
		// SDN Entity 7574 "VALENCIA TRUJILLO, Joaquin Mario"
		{"valencia trujillo joaquin mario", "valencia trujillo joaquin", 0.868, 0.973},
		// SDN Entity 9760 "LUKASHENKO, Alexander Grigoryevich"
		{"lukashenko alexander grigoryevich", "alexander lukashenko", 0.765, 0.942},

		// Small words used to carry too much weight, producing false negatives.
		// SDN Entity 4691 "A.I.C. SOGO KENKYUSHO"
		{"a i c sogo kenkyusho", "sogo kenkyusho", 0.400, 0.972},
	}

	for _, tc := range cases {
		oldScore, newScore := compareAlgorithms(tc.indexedName, tc.query)
		eql(t, "Score is too low", oldScore, tc.wantOld)
		eql(t, "New score is better", newScore, tc.wantNew)
	}
}

// compareAlgorithms scores the same indexedName/query pair two ways.
//
// The first return value is jaroWinkler applied to the two raw strings. The
// second is bestPairsJaroWinkler, which is handed the query split on
// whitespace together with the unsplit indexedName.
func compareAlgorithms(indexedName string, query string) (float64, float64) {
	queryTokens := strings.Fields(query)
	return jaroWinkler(indexedName, query), bestPairsJaroWinkler(queryTokens, indexedName)
}
Loading

0 comments on commit f476bbd

Please sign in to comment.