forked from rebolsource/r3
-
-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
FEAT: improved the speed of the `soundex` code and fixed it to produce the same results as PHP.
- Loading branch information
Showing
3 changed files
with
91 additions
and
65 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,78 +1,81 @@ | ||
REBOL [ | ||
Title: "Soundex" | ||
Date: 16-Jul-2024 | ||
File: %soundex.r | ||
Author: "Allen Kamp, Oldes" | ||
Purpose: {Soundex Encoding returns similar codes for similar sounding words or names. eg Stephens, Stevens are both S315, Smith and Smythe are both S53. Useful for adding Sounds-like searching to databases} | ||
Comment: { | ||
This is the basic Soundex algorithm (There are a number of different | ||
one floating around) | ||
Title: "Soundex" | ||
Date: 16-Jul-2024 | ||
File: %soundex.reb | ||
Author: "Allen Kamp, Oldes" | ||
Purpose: {Soundex Encoding returns similar codes for similar sounding words or names. eg Stephens, Stevens are both S315, Smith and Smythe are both S53. Useful for adding Sounds-like searching to databases} | ||
Comment: { | ||
This is the basic Soundex algorithm: https://en.wikipedia.org/wiki/Soundex | ||
1. Remove vowels, H, W and Y | ||
2. Encode each char with its code value | ||
3. Remove adjacent duplicate numbers | ||
1. Remove vowels, H, W and Y | ||
2. Encode each char with its code value | ||
3. Remove adjacent duplicate numbers | ||
4. Return First letter, followed by the next 3 letter's code | ||
numbers, if they exist. | ||
4. Return First letter, followed by the next 3 letter's code | ||
numbers, if they exist. | ||
TODO: Other algorithms: Extended Soundex, Metaphone and the LC Cutter table | ||
} | ||
Language: "English" | ||
Email: %allenk--powerup--com--au | ||
library: [ | ||
level: 'intermediate | ||
platform: 'all | ||
type: 'tool | ||
domain: [DB text text-processing] | ||
tested-under: none | ||
support: none | ||
license: none | ||
see-also: none | ||
] | ||
Version: 1.1.0 | ||
Type: module | ||
Exports: [soundex] | ||
Needs: 3.0.0 | ||
History: [ | ||
17-Jul-1999 @Allen "Initial version" | ||
16-Jul-2024 @Oldes "Ported to Rebol3" | ||
TODO: Other algorithms: Extended Soundex, Metaphone and the LC Cutter table | ||
} | ||
Version: 2.0.0 | ||
Type: module | ||
Name: soundex | ||
Exports: [soundex] | ||
Needs: 3.0.0 | ||
History: [ | ||
17-Jul-1999 @Allen "Initial version" | ||
16-Jul-2024 @Oldes "Ported to Rebol3" | ||
|
||
] | ||
] | ||
] | ||
|
||
soundex: function/with [ | ||
{Returns the Census Soundex Code for the given string} | ||
string [any-string!] "String to Encode" | ||
{Returns the Census Soundex Code for the given string} | ||
string [any-string!] "String to Encode" | ||
][ | ||
code: make string! 4 | ||
prev: none | ||
|
||
code: make string! "" | ||
if empty? string [return "0000"] | ||
|
||
either all [string? string string <> ""] [ | ||
string: uppercase trim copy string | ||
|
||
foreach letter string [ | ||
parse to string! letter [soundex-match | soundex-no-match] | ||
if 4 = length? code [break] ;maximum length for code is 4 | ||
] | ||
] [ | ||
return string ; return unchanged | ||
] | ||
change code first string ; replace first number with first letter | ||
return code | ||
foreach letter string [ | ||
either val: mapping/:letter [ | ||
if val != prev [append code val prev: val] | ||
][ | ||
if find "aeiouhwy" letter [prev: #" "] | ||
if empty? code [append code #"0"] | ||
] | ||
if 4 = length? code [break] ;maximum length for code is 4 | ||
] | ||
change code uppercase first string | ||
pad/with code 4 #"0" | ||
code | ||
][ | ||
code: val: none | ||
; Create Rules | ||
set1: [[#"B" | #"F" | #"P" | #"V"](val: #"1")] | ||
set2: [[#"C" | #"G" | #"J" | #"K" | #"Q" | #"S" | #"X" | #"Z"](val: #"2")] | ||
set3: [[#"D" | #"T"](val: #"3")] | ||
set4: [[#"L"](val: "4")] | ||
set5: [[#"M" | #"N"] (val: #"5")] | ||
set6: [[#"R"](val: #"6")] | ||
; Append val to code if not a duplicate of previous code val | ||
soundex-match: [[set1 | set2 | set3 | set4 | set5 | set6 ] | ||
(if val <> back tail code [append code val]) ] | ||
code: val: prev: none | ||
mapping: make map! [ | ||
;Set1 | ||
#"B" #"1" | ||
#"F" #"1" | ||
#"P" #"1" | ||
#"V" #"1" | ||
;Set2 | ||
#"C" #"2" | ||
#"G" #"2" | ||
#"J" #"2" | ||
#"K" #"2" | ||
#"Q" #"2" | ||
#"S" #"2" | ||
#"X" #"2" | ||
#"Z" #"2" | ||
;Set3 | ||
#"D" #"3" | ||
#"T" #"3" | ||
;Set4 | ||
#"L" #"4" | ||
;Set5 | ||
#"M" #"5" | ||
#"N" #"5" | ||
;Set6 | ||
#"R" #"6" | ||
] | ||
] | ||
|
||
; If letter not a matched letter its val is 0, but we only care | ||
; about it if it is the first letter. | ||
soundex-no-match: [(if (length? code) = 0 [append code "0"])] | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
Rebol [ | ||
Title: "Test Soundex function" | ||
Date: 16-Jul-2024 | ||
Author: "Oldes" | ||
File: %test-soundex.r3 | ||
Version: 1.0.0 | ||
] | ||
;; Test driver: for every expected-code / name pair listed below, encode the name | ||
;; with `soundex` and print one row comparing the expected and the computed code. | ||
;; `tmp` is declared local to this block via `use` and holds the computed code. | ||
;; NOTE(review): `soundex` is neither defined nor imported in this script; | ||
;; presumably the soundex module is loaded before this runs - confirm. | ||
use [tmp][ | ||
tmp: none | ||
foreach [code name] [ | ||
"R163" "Robert" | ||
"R163" "Rupert" | ||
"R150" "Rubin" | ||
"A226" "Ashcraft" | ||
"A226" "Ashcroft" | ||
"T522" "Tymczak" ;; the chars 'z' and 'k' in the name are coded as 2 twice since a vowel lies in between them | ||
"P236" "Pfister" | ||
"H555" "Honeyman" | ||
][ | ||
;; Row values: expected code, name, computed code (also stored in `tmp`), | ||
;; and the result of the strict comparison `code == tmp`. | ||
;; NOTE(review): [5 9 5] is presumably the list of printf column widths - confirm. | ||
printf [5 9 5] reduce [code name tmp: soundex name code == tmp] | ||
] | ||
] |