Skip to content

Commit

Permalink
Replace DiceCoefficient implementation
Browse files Browse the repository at this point in the history
Removes input string padding, as it introduces a bias toward matches on the first and last characters.
As a consequence, comparing with a one-letter string now returns a score of 0, because such a string has zero bigrams.

Fixes tylerjensen#8
  • Loading branch information
13xforever committed Mar 2, 2021
1 parent 6399781 commit 997ae04
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 17 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -165,3 +165,5 @@ pip-log.txt
/src/DuoVia.FuzzyStrings/.vs/DuoVia.FuzzyStrings/DesignTimeBuild
/src/DuoVia.FuzzyStrings/.vs/DuoVia.FuzzyStrings/v16/TestStore/0/004.testlog
/src/DuoVia.FuzzyStrings/.vs/DuoVia.FuzzyStrings/v16/TestStore/0/testlog.manifest

.idea/
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,15 @@ public static class DiceCoefficientExtensions
/// <returns></returns>
public static double DiceCoefficient(this string input, string comparedTo)
{
var ngrams = input.ToBiGrams();
var compareToNgrams = comparedTo.ToBiGrams();
return ngrams.DiceCoefficient(compareToNgrams);
if (input == comparedTo)
return 1.0d;

if (input.Length < 2 || comparedTo.Length < 2)
return 0.0d;

var biGrams = input.ToBiGrams(false);
var compareToBiGrams = comparedTo.ToBiGrams(false);
return DiceCoefficient(biGrams, compareToBiGrams);
}

/// <summary>
Expand All @@ -33,41 +39,71 @@ public static double DiceCoefficient(this string input, string comparedTo)
/// <returns></returns>
public static double DiceCoefficient(this string[] nGrams, string[] compareToNGrams)
{
int matches = nGrams.Intersect(compareToNGrams).Count();
if (matches == 0) return 0.0d;
var nGramMap = new Dictionary<string, int>(nGrams.Length);
var compareToNGramMap = new Dictionary<string, int>(compareToNGrams.Length);
var nGramSet = new HashSet<string>();
var compareToNGramSet = new HashSet<string>();
foreach (var nGram in nGrams)
{
if (nGramSet.Add(nGram))
nGramMap[nGram] = 1;
else
nGramMap[nGram]++;
}
foreach (var nGram in compareToNGrams)
{
if (compareToNGramSet.Add(nGram))
compareToNGramMap[nGram] = 1;
else
compareToNGramMap[nGram]++;
}
nGramSet.IntersectWith(compareToNGramSet);
var matches = 0;
foreach (var nGram in nGramSet)
matches += Math.Min(nGramMap[nGram], compareToNGramMap[nGram]);
if (matches == 0)
return 0.0d;

double totalBigrams = nGrams.Length + compareToNGrams.Length;
return (2 * matches) / totalBigrams;
}

public static string[] ToBiGrams(this string input)
public static string[] ToBiGrams(this string input, bool usePadding = true)
{
// nLength == 2
// from Jackson, return %j ja ac ck ks so on n#
// from Main, return #m ma ai in n#
input = SinglePercent + input + SinglePound;
if (usePadding)
{
// nLength == 2
// from Jackson, return %j ja ac ck ks so on n#
// from Main, return %m ma ai in n#
input = SinglePercent + input + SinglePound;
}
if (input.Length < 2)
return new string[0];

return ToNGrams(input, 2);
}

public static string[] ToTriGrams(this string input)
{
// nLength == 3
// from Jackson, return %%j %ja jac ack cks kso son on# n##
// from Main, return ##m #ma mai ain in# n##
input = DoublePercent + input + DoublePount;
// from Jackson, return &&j &ja jac ack cks kso son on# n##
// from Main, return &&m &ma mai ain in# n##
input = DoubleAmpersand + input + DoublePound;
return ToNGrams(input, 3);
}

/// <summary>
/// Splits <paramref name="input"/> into its overlapping substrings of length <paramref name="nLength"/>.
/// </summary>
/// <param name="input">Text to split; any padding must already have been applied by the caller.</param>
/// <param name="nLength">Number of characters in each n-gram (2 for bigrams, 3 for trigrams).</param>
/// <returns>
/// The n-grams in order of appearance, or an empty array when <paramref name="input"/>
/// is shorter than <paramref name="nLength"/>.
/// </returns>
private static string[] ToNGrams(string input, int nLength)
{
    // A string of length L contains L - n + 1 windows of width n. The former count of
    // L - 1 was only correct for n == 2 and made Substring run past the end of the
    // string (ArgumentOutOfRangeException) for trigrams.
    int itemsCount = input.Length - nLength + 1;
    if (itemsCount <= 0)
        return new string[0];

    string[] ngrams = new string[itemsCount];
    for (int i = 0; i < itemsCount; i++)
        ngrams[i] = input.Substring(i, nLength);
    return ngrams;
}

private const string SinglePercent = "%";
private const string SinglePound = "#";
private const string DoublePercent = "&&";
private const string DoublePount = "##";
private const string DoubleAmpersand = "&&";
private const string DoublePound = "##";
}
}
22 changes: 21 additions & 1 deletion src/DuoVia.FuzzyStrings/DuoVia.FuzzyStringsTests/FuzzyTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ public void FuzzyMatchTests(string input, string match)
}

[Theory]
[InlineData("wwww", "w")]
[InlineData("test", "w")]
[InlineData("test", "W")]
[InlineData("test", "w ")]
Expand Down Expand Up @@ -78,11 +79,30 @@ public void FuzzyMatchTests(string input, string match)
[InlineData("2130 South Fort Union Blvd.", "Rural Route 2 Box 29")]
[InlineData("2130 South Fort Union Blvd.", "PO Box 3487")]
[InlineData("2130 South Fort Union Blvd.", "3 Harvard Square")]
[InlineData("eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee", "ee")]
[InlineData("aaaaaaaaa", "aaaaaaaaa")]
public void DiceCoefficientTests(string input, string match)
{
var result = input.DiceCoefficient(match);
var reversedResult = match.DiceCoefficient(input);
var inputBiGrams = input.ToBiGrams(false);
var matchBiGrams = match.ToBiGrams(false);
var biGramResult = inputBiGrams.DiceCoefficient(matchBiGrams);
var reversedBiGramResult = matchBiGrams.DiceCoefficient(inputBiGrams);
output.WriteLine($"DiceCoefficient of \"{match}\" against \"{input}\" was {result} (reversed was {reversedResult}), biGramResult was {biGramResult} (reversed was {reversedBiGramResult}).");

Assert.True(Math.Abs(result - reversedResult) < double.Epsilon);
Assert.True(Math.Abs(biGramResult - reversedBiGramResult) < double.Epsilon);
Assert.True(Math.Abs(result - biGramResult) < double.Epsilon);
Assert.True(result >= 0.0);
output.WriteLine($"DiceCoefficient of \"{match}\" against \"{input}\" was {result}.");
Assert.True(result <= 1.0);
Assert.True(biGramResult >= 0.0);
Assert.True(biGramResult <= 1.0);
if (input == match)
{
Assert.True(Math.Abs(result - 1.0) < double.Epsilon);
Assert.True(Math.Abs(biGramResult - 1.0) < double.Epsilon);
}
}

[Theory]
Expand Down

0 comments on commit 997ae04

Please sign in to comment.