diff --git a/.gitignore b/.gitignore
index 7fdd307..66d840b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -161,7 +161,6 @@ pip-log.txt
# Mac crap
.DS_Store
-/src/DuoVia.FuzzyStrings/.vs/DuoVia.FuzzyStrings/v15/Server/sqlite3
-/src/DuoVia.FuzzyStrings/.vs/DuoVia.FuzzyStrings/DesignTimeBuild
-/src/DuoVia.FuzzyStrings/.vs/DuoVia.FuzzyStrings/v16/TestStore/0/004.testlog
-/src/DuoVia.FuzzyStrings/.vs/DuoVia.FuzzyStrings/v16/TestStore/0/testlog.manifest
+.vs/
+.vscode/
+.idea/
diff --git a/src/DuoVia.FuzzyStrings/DuoVia.FuzzyStrings/DiceCoefficientExtensions.cs b/src/DuoVia.FuzzyStrings/DuoVia.FuzzyStrings/DiceCoefficientExtensions.cs
index 4936882..33b98e3 100644
--- a/src/DuoVia.FuzzyStrings/DuoVia.FuzzyStrings/DiceCoefficientExtensions.cs
+++ b/src/DuoVia.FuzzyStrings/DuoVia.FuzzyStrings/DiceCoefficientExtensions.cs
@@ -20,9 +20,15 @@ public static class DiceCoefficientExtensions
///
public static double DiceCoefficient(this string input, string comparedTo)
{
- var ngrams = input.ToBiGrams();
- var compareToNgrams = comparedTo.ToBiGrams();
- return ngrams.DiceCoefficient(compareToNgrams);
+ // Scores the similarity of two strings from their unpadded bigrams:
+ // 1.0 for identical strings, 0.0 when no bigram can be shared.
+
+ // Identical strings (this also covers two nulls) are a perfect match.
+ if (input == comparedTo)
+     return 1.0d;
+
+ // A null string, or one shorter than two characters, has no unpadded
+ // bigrams; report "no similarity" instead of throwing on .Length.
+ if (input == null || comparedTo == null || input.Length < 2 || comparedTo.Length < 2)
+     return 0.0d;
+
+ var biGrams = input.ToBiGrams(false);
+ var compareToBiGrams = comparedTo.ToBiGrams(false);
+ return DiceCoefficient(biGrams, compareToBiGrams);
}
///
@@ -33,27 +39,57 @@ public static double DiceCoefficient(this string input, string comparedTo)
///
public static double DiceCoefficient(this string[] nGrams, string[] compareToNGrams)
{
- int matches = nGrams.Intersect(compareToNGrams).Count();
- if (matches == 0) return 0.0d;
+ // Computes 2 * |A ∩ B| / (|A| + |B|), where the intersection is a multiset
+ // intersection: an n-gram occurring x times in one array and y times in the
+ // other contributes min(x, y) matches.
+
+ // Count how often each n-gram occurs in the first array.
+ var nGramCounts = new Dictionary<string, int>(nGrams.Length);
+ foreach (var nGram in nGrams)
+ {
+     int count;
+     nGramCounts.TryGetValue(nGram, out count); // count stays 0 when the key is new
+     nGramCounts[nGram] = count + 1;
+ }
+
+ // Each n-gram of the second array uses up one remaining occurrence from the
+ // first, so the total equals the sum of min(x, y) over all n-grams.
+ var matches = 0;
+ foreach (var nGram in compareToNGrams)
+ {
+     int remaining;
+     if (nGramCounts.TryGetValue(nGram, out remaining) && remaining > 0)
+     {
+         nGramCounts[nGram] = remaining - 1;
+         matches++;
+     }
+ }
+
+ // Also keeps two empty arrays from evaluating 0 / 0.
+ if (matches == 0)
+     return 0.0d;
+
double totalBigrams = nGrams.Length + compareToNGrams.Length;
return (2 * matches) / totalBigrams;
}
- public static string[] ToBiGrams(this string input)
+ // Splits input into overlapping two-character substrings (bigrams).
+ // usePadding: when true (the default) input is wrapped as "%" + input + "#" first.
+ // Returns an empty array when the string to split is null or shorter than two characters.
+ public static string[] ToBiGrams(this string input, bool usePadding = true)
{
- // nLength == 2
- // from Jackson, return %j ja ac ck ks so on n#
- // from Main, return #m ma ai in n#
- input = SinglePercent + input + SinglePound;
+ if (usePadding)
+ {
+     // nLength == 2; the casing of input is preserved:
+     // from Jackson, return %J Ja ac ck ks so on n#
+     // from Main, return %M Ma ai in n#
+     input = SinglePercent + input + SinglePound;
+ }
+
+ // Unpadded input may be null or too short to hold a single bigram.
+ if (input == null || input.Length < 2)
+     return new string[0];
+
return ToNGrams(input, 2);
}
public static string[] ToTriGrams(this string input)
{
// nLength == 3
- // from Jackson, return %%j %ja jac ack cks kso son on# n##
- // from Main, return ##m #ma mai ain in# n##
- input = DoublePercent + input + DoublePount;
- return ToNGrams(input, 3);
+ // Pad with "&&" in front and "##" behind (e.g. Main becomes &&Main##),
+ // then hand the padded text to ToNGrams with an n-gram length of 3.
+ var padded = DoubleAmpersand + input + DoublePound;
+ return ToNGrams(padded, 3);
}
@@ -61,13 +97,14 @@ private static string[] ToNGrams(string input, int nLength)
{
- int itemsCount = input.Length - 1;
- string[] ngrams = new string[input.Length - 1];
- for (int i = 0; i < itemsCount; i++) ngrams[i] = input.Substring(i, nLength);
+ // Returns, in order, every contiguous substring of input that is exactly
+ // nLength characters long.
+ // A string of length L holds L - nLength + 1 such windows. The previous
+ // count of L - 1 was only right for nLength == 2 and made Substring read
+ // past the end of the string for trigrams.
+ int itemsCount = input.Length - nLength + 1;
+ if (itemsCount <= 0)
+     return new string[0]; // input is too short to hold one n-gram
+
+ string[] ngrams = new string[itemsCount];
+ for (int i = 0; i < itemsCount; i++)
+     ngrams[i] = input.Substring(i, nLength);
+
return ngrams;
}
private const string SinglePercent = "%";
private const string SinglePound = "#";
- private const string DoublePercent = "&&";
- private const string DoublePount = "##";
+ private const string DoubleAmpersand = "&&";
+ private const string DoublePound = "##";
}
}
diff --git a/src/DuoVia.FuzzyStrings/DuoVia.FuzzyStringsTests/FuzzyTests.cs b/src/DuoVia.FuzzyStrings/DuoVia.FuzzyStringsTests/FuzzyTests.cs
index 183d9fc..528168c 100644
--- a/src/DuoVia.FuzzyStrings/DuoVia.FuzzyStringsTests/FuzzyTests.cs
+++ b/src/DuoVia.FuzzyStrings/DuoVia.FuzzyStringsTests/FuzzyTests.cs
@@ -50,6 +50,7 @@ public void FuzzyMatchTests(string input, string match)
}
[Theory]
+ [InlineData("wwww", "w")]
[InlineData("test", "w")]
[InlineData("test", "W")]
[InlineData("test", "w ")]
@@ -78,11 +79,30 @@ public void FuzzyMatchTests(string input, string match)
[InlineData("2130 South Fort Union Blvd.", "Rural Route 2 Box 29")]
[InlineData("2130 South Fort Union Blvd.", "PO Box 3487")]
[InlineData("2130 South Fort Union Blvd.", "3 Harvard Square")]
+ [InlineData("eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee", "ee")]
+ [InlineData("aaaaaaaaa", "aaaaaaaaa")]
public void DiceCoefficientTests(string input, string match)
{
var result = input.DiceCoefficient(match);
+ var reversedResult = match.DiceCoefficient(input);
+ var inputBiGrams = input.ToBiGrams(false);
+ var matchBiGrams = match.ToBiGrams(false);
+ var biGramResult = inputBiGrams.DiceCoefficient(matchBiGrams);
+ var reversedBiGramResult = matchBiGrams.DiceCoefficient(inputBiGrams);
+ output.WriteLine($"DiceCoefficient of \"{match}\" against \"{input}\" was {result} (reversed was {reversedResult}), biGramResult was {biGramResult} (reversed was {reversedBiGramResult}).");
+
+ // double.Epsilon is the smallest positive double, so "difference < Epsilon"
+ // only passes on exact equality. Compare to a fixed number of decimal
+ // places instead, which states the intended tolerance explicitly.
+ const int precision = 12;
+
+ // The coefficient must be symmetric, and the string overload must agree
+ // with the bigram overload.
+ Assert.Equal(reversedResult, result, precision);
+ Assert.Equal(reversedBiGramResult, biGramResult, precision);
+ Assert.Equal(biGramResult, result, precision);
+
+ // Scores are bounded to [0, 1].
Assert.True(result >= 0.0);
- output.WriteLine($"DiceCoefficient of \"{match}\" against \"{input}\" was {result}.");
+ Assert.True(result <= 1.0);
+ Assert.True(biGramResult >= 0.0);
+ Assert.True(biGramResult <= 1.0);
+
+ // Identical strings must score 1.
+ if (input == match)
+ {
+     Assert.Equal(1.0, result, precision);
+     Assert.Equal(1.0, biGramResult, precision);
+ }
}
[Theory]