cmd/server: filter results based on phonetic similarity

This greatly reduces the number of jaro-winkler comparisons performed.

                                    │ before.txt   │           after2.txt            │
                                    │    sec/op    │    sec/op     vs base           │
JaroWinkler/bestPairsJaroWinkler-16   4.535µ ± ∞ ¹   3.161µ ± ∞ ¹  ~ (p=1.000 n=1) ²

                                    │ before.txt  │           after2.txt           │
                                    │    B/op     │    B/op      vs base           │
JaroWinkler/bestPairsJaroWinkler-16   862.0 ± ∞ ¹   359.0 ± ∞ ¹  ~ (p=1.000 n=1) ²

                                    │ before.txt  │           after2.txt           │
                                    │  allocs/op  │  allocs/op   vs base           │
JaroWinkler/bestPairsJaroWinkler-16   32.00 ± ∞ ¹   15.00 ± ∞ ¹  ~ (p=1.000 n=1) ²
moov-io · Oct 24, 2024 · 82d4656 · 82d4656
1 parent 6866dda
commit 82d4656
Show file tree

Hide file tree

Showing 11 changed files with 147 additions and 65 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,12 @@
+## v0.30.0 (Released 2024-10-24)
+
+ADDITIONS
+
+Watchman now filters out indexed records based on the first character's phonetic match. This is helpful to eliminate most
+low scoring results and reduces CPU usage.
+
+You can force scoring search terms against every indexed record by setting `DISABLE_PHONETIC_FILTERING=yes`.
+
 ## v0.29.2 (Released 2024-10-23)
 
 IMPROVEMENTS

diff --git a/README.md b/README.md
@@ -193,6 +193,7 @@ PONG
 | `SEARCH_MAX_WORKERS` | Maximum number of goroutines used for search. | 1024 |
 | `ADJACENT_SIMILARITY_POSITIONS` | How many nearby words to search for highest max similarly score. | 3 |
 | `EXACT_MATCH_FAVORITISM` | Extra weighting assigned to exact matches. | 0.0 |
+| `DISABLE_PHONETIC_FILTERING` | Force scoring search terms against every indexed record. | `false` |
 | `LENGTH_DIFFERENCE_CUTOFF_FACTOR` | Minimum ratio for the length of two matching tokens, before they score is penalised. | 0.9       |
 | `LENGTH_DIFFERENCE_PENALTY_WEIGHT` | Weight of penalty applied to scores when two matching tokens have different lengths. | 0.3    |
 | `DIFFERENT_LETTER_PENALTY_WEIGHT` | Weight of penalty applied to scores when two matching tokens begin with different letters. | 0.9   |

diff --git a/cmd/server/issue115_test.go b/cmd/server/issue115_test.go
@@ -35,7 +35,7 @@ func TestIssue115__TopSDNs(t *testing.T) {
 	s.SDNs = precomputeSDNs([]*ofac.SDN{{EntityID: "9432", SDNName: "CHIWESHE, George", SDNType: "INDIVIDUAL"}}, nil, pipe)
 
 	out = s.TopSDNs(1, 0.00, "george bush", keeper)
-	eql(t, "issue115: top SDN 18996", out[0].match, 0.650)
+	eql(t, "issue115: top SDN 18996", out[0].match, 0.686)
 
 	// another example
 	s.SDNs = precomputeSDNs([]*ofac.SDN{{EntityID: "0", SDNName: "Bush, George W", SDNType: "INDIVIDUAL"}}, nil, pipe)

diff --git a/cmd/server/new_algorithm_test.go b/cmd/server/new_algorithm_test.go
@@ -15,7 +15,7 @@ func TestBestPairsJaroWinkler__FalsePositives(t *testing.T) {
 	// 1. SDN Entity 40273, VLADIMIROV, Vladimir Vladimirovich
 	oldScore, newScore := compareAlgorithms("vladimirov vladimir vladimirovich", "vladimir levenshtein")
 	eql(t, "Score is too high", oldScore, 0.961)
-	eql(t, "New score is better", newScore, 0.603)
+	eql(t, "New score is better", newScore, 0.527)
 
 	// 2. SDN Entity 7788 "SHAQIRI, Shaqir"
 	oldScore, newScore = compareAlgorithms("shaqiri shaqir", "zaid shakir")
@@ -42,7 +42,7 @@ func TestBestPairsJaroWinkler__FalsePositives(t *testing.T) {
 	// Words that start with different letters shouldn't match very highly
 	oldScore, newScore = compareAlgorithms("dominguez", "jimenez")
 	eql(t, "Score is too high", oldScore, 0.690)
-	eql(t, "New score is better", newScore, 0.580)
+	eql(t, "New score is better", newScore, 0.0)
 }
 
 func TestBestPairsJaroWinkler__TruePositives(t *testing.T) {

diff --git a/cmd/server/phonetics.go b/cmd/server/phonetics.go
@@ -0,0 +1,39 @@
+package main
+
+import (
+	"unicode"
+)
+
+var soundexMap = map[rune]rune{ // upper-case ASCII letter -> representative letter of its phonetic class; all of A-Z have an entry
+	'A': 'A', 'E': 'A', 'I': 'A', 'O': 'A', 'U': 'A', 'Y': 'A', // vowels (Y is grouped with them)
+	'B': 'B', 'F': 'B', 'P': 'B', 'V': 'B', // labial sounds (same letters as Soundex group 1)
+	'C': 'C', 'G': 'C', 'J': 'C', 'K': 'C', 'Q': 'C', 'S': 'C', 'X': 'C', 'Z': 'C', // sibilants and velars (same letters as Soundex group 2)
+	'D': 'D', 'T': 'D', // dental sounds
+	'L': 'L',           // liquids
+	'M': 'M', 'N': 'M', // nasal sounds
+	'R': 'R',           // trills
+	'H': 'H', 'W': 'H', // breathy sounds
+}
+
+// getPhoneticClass returns the soundexMap class of the first rune of s (upper-cased), that rune itself when it has no mapping, or ' ' when s is empty.
+func getPhoneticClass(s string) rune {
+	if s == "" {
+		return ' '
+	}
+	// Every path through the loop body returns, so only the first rune of s is ever examined; range decodes it as UTF-8.
+	for _, r := range s {
+		firstLetter := unicode.ToUpper(r)
+		if phonetic, ok := soundexMap[firstLetter]; ok {
+			return phonetic
+		}
+		return firstLetter // runes outside A-Z (digits, punctuation, non-ASCII letters) act as their own class
+	}
+	return ' ' // not reached for non-empty s; keeps the compiler's missing-return check satisfied
+}
+
+func firstCharacterSoundexMatch(s1, s2 string) bool { // reports whether s1 and s2 start with runes in the same phonetic class
+	if s1 == "" || s2 == "" {
+		return false // an empty string never matches, not even another empty string
+	}
+	return getPhoneticClass(s1) == getPhoneticClass(s2)
+}
diff --git a/cmd/server/phonetics_test.go b/cmd/server/phonetics_test.go
@@ -0,0 +1,35 @@
+package main
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+func TestFirstCharacterSoundexMatch(t *testing.T) { // exercises first-character phonetic matching on matching and non-matching pairs
+	require.True(t, firstCharacterSoundexMatch("a", "A"))                 // comparison ignores case
+	require.True(t, firstCharacterSoundexMatch("Catherine", "Katherine")) // C and K share a class
+	require.True(t, firstCharacterSoundexMatch("Fone", "Phone"))          // F and P share a class
+	require.True(t, firstCharacterSoundexMatch("Vibe", "Bribe"))          // V and B share a class
+	require.True(t, firstCharacterSoundexMatch("mine", "nine"))           // M and N share a class
+
+	require.False(t, firstCharacterSoundexMatch("a", "")) // an empty argument on either side never matches
+	require.False(t, firstCharacterSoundexMatch("", "A"))
+	require.False(t, firstCharacterSoundexMatch("Dave", "Eve")) // D (dental) and E (vowel) are different classes
+}
+
+func TestDisablePhoneticFiltering(t *testing.T) { // checks that DISABLE_PHONETIC_FILTERING toggles the first-character filter used by bestPairsJaroWinkler
+	search := strings.Fields("ian mckinley")
+	indexed := "tian xiang 7" // none of these tokens starts with a rune in the same soundexMap class as 'i' or 'm'
+
+	t.Setenv("DISABLE_PHONETIC_FILTERING", "no") // filtering stays on; assumes strx.Yes reports false for "no" (TODO confirm)
+	score := bestPairsJaroWinkler(search, indexed)
+	require.InDelta(t, 0.00, score, 0.01) // expected to be ~0 because every token pair is skipped by the filter
+
+	// Turn the filter off so every search token is scored against every indexed token
+	t.Setenv("DISABLE_PHONETIC_FILTERING", "yes")
+
+	score = bestPairsJaroWinkler(search, indexed)
+	require.InDelta(t, 0.544, score, 0.01)
+}
diff --git a/cmd/server/search.go b/cmd/server/search.go
@@ -18,6 +18,7 @@ import (
 	"time"
 
 	"github.com/moov-io/base/log"
+	"github.com/moov-io/base/strx"
 	"github.com/moov-io/watchman/pkg/csl"
 	"github.com/moov-io/watchman/pkg/dpl"
 	"github.com/moov-io/watchman/pkg/ofac"
@@ -312,12 +313,21 @@ func bestPairsJaroWinkler(searchTokens []string, indexed string) float64 {
 	searchTokensLength := sumLength(searchTokens)
 	indexTokensLength := sumLength(indexedTokens)
 
+	disablePhoneticFiltering := strx.Yes(os.Getenv("DISABLE_PHONETIC_FILTERING"))
+
 	//Compare each search token to each indexed token. Sort the results in descending order
-	scores := make([]Score, 0, len(searchTokens)+len(indexedTokens))
+	scoresCapacity := (len(searchTokens) + len(indexedTokens))
+	if !disablePhoneticFiltering {
+		scoresCapacity /= 5 // reduce the capacity as many terms don't phonetically match
+	}
+	scores := make([]Score, 0, scoresCapacity)
 	for searchIdx, searchToken := range searchTokens {
 		for indexIdx, indexedToken := range indexedTokens {
-			score := customJaroWinkler(indexedToken, searchToken)
-			scores = append(scores, Score{score, searchIdx, indexIdx})
+			// Compare the first letters phonetically and only run jaro-winkler on those which are similar
+			if disablePhoneticFiltering || firstCharacterSoundexMatch(indexedToken, searchToken) {
+				score := customJaroWinkler(indexedToken, searchToken)
+				scores = append(scores, Score{score, searchIdx, indexIdx})
+			}
 		}
 	}
 	sort.Slice(scores[:], func(i, j int) bool {

diff --git a/cmd/server/search_handlers_bench_test.go b/cmd/server/search_handlers_bench_test.go
@@ -84,19 +84,3 @@ func BenchmarkJaroWinkler(b *testing.B) {
 		}
 	})
 }
-
-// goos: darwin
-// goarch: amd64
-// pkg: github.com/moov-io/watchman/cmd/server
-// cpu: Intel(R) Core(TM) i9-9880H CPU @ 2.30GHz
-// BenchmarkSearchHandler-16    	    2728	 131 213 518 ns/op	34812129 B/op	 1486792 allocs/op
-// PASS
-// ok  	github.com/moov-io/watchman/cmd/server	413.248s
-
-// goos: darwin
-// goarch: amd64
-// pkg: github.com/moov-io/watchman/cmd/server
-// cpu: Intel(R) Core(TM) i9-9880H CPU @ 2.30GHz
-// BenchmarkSearchHandler-16    	    2079	 174 594 246 ns/op	49797019 B/op	 1638732 allocs/op
-// PASS
-// ok  	github.com/moov-io/watchman/cmd/server	419.284s
diff --git a/cmd/server/search_handlers_test.go b/cmd/server/search_handlers_test.go
@@ -241,13 +241,14 @@ func TestSearch__NameAndAltName(t *testing.T) {
 
 	// OFAC
 	require.Equal(t, "2681", wrapper.SDNs[0].EntityID)
-	require.Equal(t, "4691", wrapper.AltNames[0].EntityID)
+	require.Equal(t, "HAWATMA, Nayif", wrapper.SDNs[0].SDNName)
+	require.Equal(t, "559", wrapper.AltNames[0].EntityID)
 	require.Equal(t, "735", wrapper.Addresses[0].EntityID)
-	require.Equal(t, "18782", wrapper.SectoralSanctions[0].EntityID)
+	require.Equal(t, "18736", wrapper.SectoralSanctions[0].EntityID)
 
 	// BIS
 	require.Equal(t, "P.O. BOX 28360", wrapper.DeniedPersons[0].StreetAddress)
-	require.Equal(t, "Luqman Yasin Yunus Shgragi", wrapper.BISEntities[0].Name)
+	require.Equal(t, "Mohammad Jan Khan Mangal", wrapper.BISEntities[0].Name)
 }
 
 func TestSearch__Name(t *testing.T) {
@@ -288,21 +289,13 @@ func TestSearch__Name(t *testing.T) {
 		t.Fatalf("SDNs=%d Alts=%d SSIs=%d DPs=%d ELs=%d",
 			len(wrapper.SDNs), len(wrapper.Alts), len(wrapper.SSIs), len(wrapper.DPs), len(wrapper.ELs))
 	}
-	if wrapper.SDNs[0].EntityID != "2676" {
-		t.Errorf("%#v", wrapper.SDNs[0])
-	}
-	if wrapper.Alts[0].EntityID != "4691" {
-		t.Errorf("%#v", wrapper.Alts[0])
-	}
-	if wrapper.SSIs[0].EntityID != "18782" {
-		t.Errorf("%#v", wrapper.SSIs[0])
-	}
-	if wrapper.DPs[0].Name != "AL NASER WINGS AIRLINES" {
-		t.Errorf("%#v", wrapper.DPs[0])
-	}
-	if wrapper.ELs[0].Name != "Luqman Yasin Yunus Shgragi" {
-		t.Errorf("%#v", wrapper.ELs[0])
-	}
+
+	require.Equal(t, "2676", wrapper.SDNs[0].EntityID)
+	require.Equal(t, "4691", wrapper.Alts[0].EntityID)
+
+	require.Equal(t, "18736", wrapper.SSIs[0].EntityID)
+	require.Equal(t, "AL NASER WINGS AIRLINES", wrapper.DPs[0].Name)
+	require.Equal(t, "Luqman Yasin Yunus Shgragi", wrapper.ELs[0].Name)
 }
 
 func TestSearch__AltName(t *testing.T) {

diff --git a/cmd/server/search_test.go b/cmd/server/search_test.go
@@ -441,12 +441,12 @@ func TestJaroWinkler(t *testing.T) {
 		{strings.ToLower("WEI Zhao"), precompute("WEI, Zhao"), 1.0},
 
 		// apply jaroWinkler in both directions
-		{"jane doe", "jan lahore", 0.596},
-		{"jan lahore", "jane doe", 0.596},
+		{"jane doe", "jan lahore", 0.439},
+		{"jan lahore", "jane doe", 0.549},
 
 		// real world case
-		{"john doe", "paul john", 0.533},
-		{"john doe", "john othername", 0.672},
+		{"john doe", "paul john", 0.624},
+		{"john doe", "john othername", 0.440},
 
 		// close match
 		{"jane doe", "jane doe2", 0.940},
@@ -465,12 +465,12 @@ func TestJaroWinkler(t *testing.T) {
 		{"iap", "ian mckinley", 0.352},
 		{"ian mckinley", "ian", 0.891},
 		{"ian mckinley", "iap", 0.733},
-		{"ian mckinley", "tian xiang 7", 0.526},
-		{"bindaree food group pty", precompute("independent insurance group ltd"), 0.576}, // precompute removes ltd
-		{"bindaree food group pty ltd", "independent insurance group ltd", 0.631},         // only matches higher from 'ltd'
-		{"p.c.c. (singapore) private limited", "culver max entertainment private limited", 0.658},
-		{"zincum llc", "easy verification inc.", 0.380},
-		{"transpetrochart co ltd", "jx metals trading co.", 0.496},
+		{"ian mckinley", "tian xiang 7", 0.000},
+		{"bindaree food group pty", precompute("independent insurance group ltd"), 0.269}, // precompute removes ltd
+		{"bindaree food group pty ltd", "independent insurance group ltd", 0.401},         // only matches higher from 'ltd'
+		{"p.c.c. (singapore) private limited", "culver max entertainment private limited", 0.514},
+		{"zincum llc", "easy verification inc.", 0.000},
+		{"transpetrochart co ltd", "jx metals trading co.", 0.431},
 		{"technolab", "moomoo technologies inc", 0.565},
 		{"sewa security services", "sesa - safety & environmental services australia pty ltd", 0.480},
 		{"bueno", "20/f rykadan capital twr135 hoi bun rd, kwun tong 135 hoi bun rd., kwun tong", 0.094},
@@ -494,7 +494,7 @@ func TestJaroWinkler(t *testing.T) {
 		{"nicolas, maduro moros", "nicolás maduro", 0.906},
 		{"africada financial services bureau change", "skylight", 0.441},
 		{"africada financial services bureau change", "skylight financial inc", 0.658},
-		{"africada financial services bureau change", "skylight services inc", 0.621},
+		{"africada financial services bureau change", "skylight services inc", 0.599},
 		{"africada financial services bureau change", "skylight financial services", 0.761},
 		{"africada financial services bureau change", "skylight financial services inc", 0.730},
 
@@ -503,29 +503,29 @@ func TestJaroWinkler(t *testing.T) {
 		{precompute("the group for the preservation of the holy sites"), precompute("the bridgespan group"), 0.682},
 		{"group preservation holy sites", "bridgespan group", 0.652},
 
-		{"the group for the preservation of the holy sites", "the logan group", 0.730},
-		{precompute("the group for the preservation of the holy sites"), precompute("the logan group"), 0.730},
-		{"group preservation holy sites", "logan group", 0.649},
+		{"the group for the preservation of the holy sites", "the logan group", 0.670},
+		{precompute("the group for the preservation of the holy sites"), precompute("the logan group"), 0.670},
+		{"group preservation holy sites", "logan group", 0.586},
 
-		{"the group for the preservation of the holy sites", "the anything group", 0.698},
-		{precompute("the group for the preservation of the holy sites"), precompute("the anything group"), 0.698},
-		{"group preservation holy sites", "anything group", 0.585},
+		{"the group for the preservation of the holy sites", "the anything group", 0.546},
+		{precompute("the group for the preservation of the holy sites"), precompute("the anything group"), 0.546},
+		{"group preservation holy sites", "anything group", 0.488},
 
-		{"the group for the preservation of the holy sites", "the hello world group", 0.706},
-		{precompute("the group for the preservation of the holy sites"), precompute("the hello world group"), 0.706},
-		{"group preservation holy sites", "hello world group", 0.560},
+		{"the group for the preservation of the holy sites", "the hello world group", 0.637},
+		{precompute("the group for the preservation of the holy sites"), precompute("the hello world group"), 0.637},
+		{"group preservation holy sites", "hello world group", 0.577},
 
 		{"the group for the preservation of the holy sites", "the group", 0.880},
 		{precompute("the group for the preservation of the holy sites"), precompute("the group"), 0.880},
 		{"group preservation holy sites", "group", 0.879},
 
-		{"the group for the preservation of the holy sites", "The flibbity jibbity flobbity jobbity grobbity zobbity group", 0.426},
+		{"the group for the preservation of the holy sites", "The flibbity jibbity flobbity jobbity grobbity zobbity group", 0.345},
 		{
 			precompute("the group for the preservation of the holy sites"),
 			precompute("the flibbity jibbity flobbity jobbity grobbity zobbity group"),
-			0.446,
+			0.366,
 		},
-		{"group preservation holy sites", "flibbity jibbity flobbity jobbity grobbity zobbity group", 0.334},
+		{"group preservation holy sites", "flibbity jibbity flobbity jobbity grobbity zobbity group", 0.263},
 
 		// precompute
 		{"i c sogo kenkyusho", precompute("A.I.C. SOGO KENKYUSHO"), 0.858},

diff --git a/docs/usage-configuration.md b/docs/usage-configuration.md
@@ -15,6 +15,11 @@ menubar: docs-menu
 | `SEARCH_MAX_WORKERS` | Maximum number of goroutines used for search. | 1024 |
 | `ADJACENT_SIMILARITY_POSITIONS` | How many nearby words to search for highest max similarly score. | 3 |
 | `EXACT_MATCH_FAVORITISM` | Extra weighting assigned to exact matches. | 0.0 |
+| `DISABLE_PHONETIC_FILTERING` | Force scoring search terms against every indexed record. | `false` |
+| `LENGTH_DIFFERENCE_CUTOFF_FACTOR` | Minimum ratio for the length of two matching tokens, before their score is penalised. | 0.9       |
+| `LENGTH_DIFFERENCE_PENALTY_WEIGHT` | Weight of penalty applied to scores when two matching tokens have different lengths. | 0.3    |
+| `DIFFERENT_LETTER_PENALTY_WEIGHT` | Weight of penalty applied to scores when two matching tokens begin with different letters. | 0.9   |
+| `UNMATCHED_INDEX_TOKEN_WEIGHT` | Weight of penalty applied to scores when part of the indexed name isn't matched. | 0.15    |
 | `JARO_WINKLER_BOOST_THRESHOLD` | Jaro-Winkler boost threshold. | 0.7 |
 | `JARO_WINKLER_PREFIX_SIZE` | Jaro-Winkler prefix size. | 4 |
 | `LOG_FORMAT` | Format for logging lines to be written as. | Options: `json`, `plain` - Default: `plain` |
@@ -34,9 +39,15 @@ menubar: docs-menu
 |-----|-----|-----|
 | `OFAC_DOWNLOAD_TEMPLATE` | HTTP address for downloading raw OFAC files. | `https://www.treasury.gov/ofac/downloads/%s` |
 | `DPL_DOWNLOAD_TEMPLATE` | HTTP address for downloading the DPL. | `https://www.bis.doc.gov/dpl/%s` |
-| `CSL_DOWNLOAD_TEMPLATE` | HTTP address for downloading the Consolidated Screening List (CSL), which is a collection of US government sanctions lists. | `https://api.trade.gov/consolidated_screening_list/%s` |
+| `EU_CSL_DOWNLOAD_URL` | Use an alternate URL for downloading EU Consolidated Screening List | Subresource of `webgate.ec.europa.eu` |
+| `WITH_EU_SCREENING_LIST` | Download and parse the EU Consolidated Screening List | Default: `true` |
+| `UK_CSL_DOWNLOAD_URL` | Use an alternate URL for downloading UK Consolidated Screening List | Subresource of `www.gov.uk` |
+| `UK_SANCTIONS_LIST_URL` | Use an alternate URL for downloading UK Sanctions List | Subresource of `www.gov.uk` |
+| `WITH_UK_SANCTIONS_LIST` | Download and parse the UK Sanctions List on startup. | Default: `false` |
+| `US_CSL_DOWNLOAD_URL` | Use an alternate URL for downloading US Consolidated Screening List | Subresource of `api.trade.gov` |
+| `CSL_DOWNLOAD_TEMPLATE` | Same as `US_CSL_DOWNLOAD_URL` | |
 | `KEEP_STOPWORDS` | Boolean to keep stopwords in names. | `false` |
-| `DEBUG_NAME_PIPELINE` | Boolean to pring debug messages for each name (SDN, SSI) processing step. | `false` |
+| `DEBUG_NAME_PIPELINE` | Boolean to print debug messages for each name (SDN, SSI) processing step. | `false` |
 
 ## Data persistence