From 997ae04bfc60db9da121c830abc43090ba12dbd0 Mon Sep 17 00:00:00 2001 From: 13xforever Date: Wed, 3 Mar 2021 02:03:09 +0500 Subject: [PATCH] Replace DiceCoefficient implementation Removes input string padding, as it introduces bias toward the first and last character matches. As a consequence, comparing with a one-letter string will now return a score of 0, because a one-letter string has zero bigrams. Fixes #8 --- .gitignore | 2 + .../DiceCoefficientExtensions.cs | 68 ++++++++++++++----- .../DuoVia.FuzzyStringsTests/FuzzyTests.cs | 22 +++++- 3 files changed, 75 insertions(+), 17 deletions(-) diff --git a/.gitignore b/.gitignore index 7fdd307..68ee45a 100644 --- a/.gitignore +++ b/.gitignore @@ -165,3 +165,5 @@ pip-log.txt /src/DuoVia.FuzzyStrings/.vs/DuoVia.FuzzyStrings/DesignTimeBuild /src/DuoVia.FuzzyStrings/.vs/DuoVia.FuzzyStrings/v16/TestStore/0/004.testlog /src/DuoVia.FuzzyStrings/.vs/DuoVia.FuzzyStrings/v16/TestStore/0/testlog.manifest + +.idea/ \ No newline at end of file diff --git a/src/DuoVia.FuzzyStrings/DuoVia.FuzzyStrings/DiceCoefficientExtensions.cs b/src/DuoVia.FuzzyStrings/DuoVia.FuzzyStrings/DiceCoefficientExtensions.cs index 4936882..e5ae2b6 100644 --- a/src/DuoVia.FuzzyStrings/DuoVia.FuzzyStrings/DiceCoefficientExtensions.cs +++ b/src/DuoVia.FuzzyStrings/DuoVia.FuzzyStrings/DiceCoefficientExtensions.cs @@ -20,9 +20,15 @@ public static class DiceCoefficientExtensions /// public static double DiceCoefficient(this string input, string comparedTo) { - var ngrams = input.ToBiGrams(); - var compareToNgrams = comparedTo.ToBiGrams(); - return ngrams.DiceCoefficient(compareToNgrams); + if (input == comparedTo) + return 1.0d; + + if (input.Length < 2 || comparedTo.Length < 2) + return 0.0d; + + var biGrams = input.ToBiGrams(false); + var compareToBiGrams = comparedTo.ToBiGrams(false); + return DiceCoefficient(biGrams, compareToBiGrams); } /// @@ -33,27 +39,56 @@ public static double DiceCoefficient(this string input, string comparedTo) /// public static double DiceCoefficient(this 
string[] nGrams, string[] compareToNGrams) { - int matches = nGrams.Intersect(compareToNGrams).Count(); - if (matches == 0) return 0.0d; + var nGramMap = new Dictionary(nGrams.Length); + var compareToNGramMap = new Dictionary(compareToNGrams.Length); + var nGramSet = new HashSet(); + var compareToNGramSet = new HashSet(); + foreach (var nGram in nGrams) + { + if (nGramSet.Add(nGram)) + nGramMap[nGram] = 1; + else + nGramMap[nGram]++; + } + foreach (var nGram in compareToNGrams) + { + if (compareToNGramSet.Add(nGram)) + compareToNGramMap[nGram] = 1; + else + compareToNGramMap[nGram]++; + } + nGramSet.IntersectWith(compareToNGramSet); + var matches = 0; + foreach (var nGram in nGramSet) + matches += Math.Min(nGramMap[nGram], compareToNGramMap[nGram]); + if (matches == 0) + return 0.0d; + double totalBigrams = nGrams.Length + compareToNGrams.Length; return (2 * matches) / totalBigrams; } - public static string[] ToBiGrams(this string input) + public static string[] ToBiGrams(this string input, bool usePadding = true) { - // nLength == 2 - // from Jackson, return %j ja ac ck ks so on n# - // from Main, return #m ma ai in n# - input = SinglePercent + input + SinglePound; + if (usePadding) + { + // nLength == 2 + // from Jackson, return %j ja ac ck ks so on n# + // from Main, return %m ma ai in n# + input = SinglePercent + input + SinglePound; + } + if (input.Length < 2) + return new string[0]; + return ToNGrams(input, 2); } public static string[] ToTriGrams(this string input) { // nLength == 3 - // from Jackson, return %%j %ja jac ack cks kso son on# n## - // from Main, return ##m #ma mai ain in# n## - input = DoublePercent + input + DoublePount; + // from Jackson, return &&j &ja jac ack cks kso son on# n## + // from Main, return &&m &ma mai ain in# n## + input = DoubleAmpersand + input + DoublePound; return ToNGrams(input, 3); } @@ -61,13 +96,14 @@ private static string[] ToNGrams(string input, int nLength) { int itemsCount = input.Length - 1; string[] ngrams = new 
string[input.Length - 1]; - for (int i = 0; i < itemsCount; i++) ngrams[i] = input.Substring(i, nLength); + for (int i = 0; i < itemsCount; i++) + ngrams[i] = input.Substring(i, nLength); return ngrams; } private const string SinglePercent = "%"; private const string SinglePound = "#"; - private const string DoublePercent = "&&"; - private const string DoublePount = "##"; + private const string DoubleAmpersand = "&&"; + private const string DoublePound = "##"; } } diff --git a/src/DuoVia.FuzzyStrings/DuoVia.FuzzyStringsTests/FuzzyTests.cs b/src/DuoVia.FuzzyStrings/DuoVia.FuzzyStringsTests/FuzzyTests.cs index 183d9fc..528168c 100644 --- a/src/DuoVia.FuzzyStrings/DuoVia.FuzzyStringsTests/FuzzyTests.cs +++ b/src/DuoVia.FuzzyStrings/DuoVia.FuzzyStringsTests/FuzzyTests.cs @@ -50,6 +50,7 @@ public void FuzzyMatchTests(string input, string match) } [Theory] + [InlineData("wwww", "w")] [InlineData("test", "w")] [InlineData("test", "W")] [InlineData("test", "w ")] @@ -78,11 +79,30 @@ public void FuzzyMatchTests(string input, string match) [InlineData("2130 South Fort Union Blvd.", "Rural Route 2 Box 29")] [InlineData("2130 South Fort Union Blvd.", "PO Box 3487")] [InlineData("2130 South Fort Union Blvd.", "3 Harvard Square")] + [InlineData("eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee", "ee")] + [InlineData("aaaaaaaaa", "aaaaaaaaa")] public void DiceCoefficientTests(string input, string match) { var result = input.DiceCoefficient(match); + var reversedResult = match.DiceCoefficient(input); + var inputBiGrams = input.ToBiGrams(false); + var matchBiGrams = match.ToBiGrams(false); + var biGramResult = inputBiGrams.DiceCoefficient(matchBiGrams); + var reversedBiGramResult = matchBiGrams.DiceCoefficient(inputBiGrams); + output.WriteLine($"DiceCoefficient of \"{match}\" against \"{input}\" was {result} (reversed was {reversedResult}), biGramResult was {biGramResult} (reversed was {reversedBiGramResult})."); + + Assert.True(Math.Abs(result - reversedResult) < double.Epsilon); + 
Assert.True(Math.Abs(biGramResult - reversedBiGramResult) < double.Epsilon); + Assert.True(Math.Abs(result - biGramResult) < double.Epsilon); Assert.True(result >= 0.0); - output.WriteLine($"DiceCoefficient of \"{match}\" against \"{input}\" was {result}."); + Assert.True(result <= 1.0); + Assert.True(biGramResult >= 0.0); + Assert.True(biGramResult <= 1.0); + if (input == match) + { + Assert.True(Math.Abs(result - 1.0) < double.Epsilon); + Assert.True(Math.Abs(biGramResult - 1.0) < double.Epsilon); + } } [Theory]