Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace DiceCoefficient implementation #9

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,6 @@ pip-log.txt

# Mac crap
.DS_Store
/src/DuoVia.FuzzyStrings/.vs/DuoVia.FuzzyStrings/v15/Server/sqlite3
/src/DuoVia.FuzzyStrings/.vs/DuoVia.FuzzyStrings/DesignTimeBuild
/src/DuoVia.FuzzyStrings/.vs/DuoVia.FuzzyStrings/v16/TestStore/0/004.testlog
/src/DuoVia.FuzzyStrings/.vs/DuoVia.FuzzyStrings/v16/TestStore/0/testlog.manifest
.vs/
.vscode/
.idea/
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,15 @@ public static class DiceCoefficientExtensions
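/// <returns>The Dice coefficient of the two strings' bigrams, from 0.0 (no bigram in common) to 1.0 (identical strings).</returns>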
public static double DiceCoefficient(this string input, string comparedTo)
{
    // Scores two strings by the Dice coefficient of their unpadded bigrams.
    // Equal strings (two nulls included) short-circuit to a perfect score, so
    // identical one-character or empty inputs still report 1.0.
    if (input == comparedTo)
    {
        return 1.0d;
    }

    // A single null cannot share any bigram with a real string; report "no
    // similarity" rather than failing on the Length access below.
    if (input == null || comparedTo == null)
    {
        return 0.0d;
    }

    // Strings shorter than two characters have no unpadded bigrams at all.
    if (input.Length < 2 || comparedTo.Length < 2)
    {
        return 0.0d;
    }

    var biGrams = input.ToBiGrams(false);
    var compareToBiGrams = comparedTo.ToBiGrams(false);
    return DiceCoefficient(biGrams, compareToBiGrams);
}

/// <summary>
Expand All @@ -33,41 +39,72 @@ public static double DiceCoefficient(this string input, string comparedTo)
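/// <returns>The Dice coefficient of the two n-gram arrays, from 0.0 (no n-gram in common) to 1.0 (same n-grams with the same counts).</returns>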
public static double DiceCoefficient(this string[] nGrams, string[] compareToNGrams)
{
    // Computes 2 * |A ∩ B| / (|A| + |B|), treating both arrays as multisets:
    // an n-gram that occurs k times on one side and m times on the other
    // contributes min(k, m) matches. Counting duplicates this way keeps the
    // score inside [0, 1] even for highly repetitive inputs.
    if (nGrams == null || compareToNGrams == null
        || nGrams.Length == 0 || compareToNGrams.Length == 0)
    {
        return 0.0d;
    }

    // Occurrence count of every n-gram on the left-hand side.
    var remainingByNGram = new Dictionary<string, int>(nGrams.Length);
    foreach (var nGram in nGrams)
    {
        int seen;
        remainingByNGram.TryGetValue(nGram, out seen);
        remainingByNGram[nGram] = seen + 1;
    }

    // Each right-hand n-gram consumes at most one still-unmatched left-hand
    // occurrence, which adds up to min(leftCount, rightCount) per distinct n-gram.
    var matches = 0;
    foreach (var nGram in compareToNGrams)
    {
        int remaining;
        if (remainingByNGram.TryGetValue(nGram, out remaining) && remaining > 0)
        {
            remainingByNGram[nGram] = remaining - 1;
            matches++;
        }
    }

    if (matches == 0)
    {
        return 0.0d;
    }

    // Declared as double so the division below is floating-point.
    double totalBigrams = nGrams.Length + compareToNGrams.Length;
    return (2 * matches) / totalBigrams;
}

/// <summary>
/// Splits a string into its overlapping two-character n-grams.
/// </summary>
/// <param name="input">The text to split; null is treated as an empty string.</param>
/// <param name="usePadding">
/// When true (the default), the input is wrapped in a leading <c>%</c> and a trailing
/// <c>#</c> before splitting so the first and last characters each get a bigram of their own.
/// </param>
/// <returns>The bigrams in order of appearance, or an empty array when fewer than two characters are available.</returns>
public static string[] ToBiGrams(this string input, bool usePadding = true)
{
    // Concatenation already treats null as empty on the padded path; do the
    // same explicitly so the unpadded path does not fail on Length.
    if (input == null)
    {
        input = string.Empty;
    }

    if (usePadding)
    {
        // nLength == 2
        // from Jackson, return %j ja ac ck ks so on n#
        // from Main, return %m ma ai in n#
        input = SinglePercent + input + SinglePound;
    }

    if (input.Length < 2)
    {
        return new string[0];
    }

    return ToNGrams(input, 2);
}

/// <summary>
/// Wraps the input in a leading <c>&amp;&amp;</c> and a trailing <c>##</c>, then hands the
/// padded text to <c>ToNGrams</c> with an n-gram length of 3.
/// </summary>
/// <param name="input">The text to split.</param>
/// <returns>The array produced by <c>ToNGrams</c> for the padded input.</returns>
public static string[] ToTriGrams(this string input)
{
    // nLength == 3
    // from Jackson, return &&j &ja jac ack cks kso son on# n##
    // from Main, return &&m &ma mai ain in# n##
    // Pad exactly once: padding twice would add marker-only trigrams at both ends.
    input = DoubleAmpersand + input + DoublePound;
    return ToNGrams(input, 3);
}

/// <summary>
/// Returns every overlapping substring of <paramref name="nLength"/> characters, in order.
/// </summary>
/// <param name="input">The (already padded, if desired) text to split.</param>
/// <param name="nLength">The size of each n-gram; must be at least 1.</param>
/// <returns>
/// An array of <c>input.Length - nLength + 1</c> n-grams, or an empty array when the
/// input is shorter than <paramref name="nLength"/>.
/// </returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="nLength"/> is less than 1.</exception>
private static string[] ToNGrams(string input, int nLength)
{
    if (nLength < 1)
    {
        throw new ArgumentOutOfRangeException(nameof(nLength));
    }

    // A string of length L contains L - n + 1 windows of width n. Using L - 1
    // is only right for n == 2; for larger n the final Substring call would
    // reach past the end of the string.
    int itemsCount = input.Length - nLength + 1;
    if (itemsCount <= 0)
    {
        return new string[0];
    }

    string[] ngrams = new string[itemsCount];
    for (int i = 0; i < itemsCount; i++)
    {
        ngrams[i] = input.Substring(i, nLength);
    }

    return ngrams;
}

// Leading and trailing markers that ToBiGrams wraps around its input when padding is used.
private const string SinglePercent = "%";
private const string SinglePound = "#";
// NOTE(review): DoublePercent holds "&&" (ampersands, not percent signs) and DoublePount
// looks like a misspelling of "DoublePound". Both carry the same values as the
// DoubleAmpersand / DoublePound constants below and appear to be the pre-rename
// names — confirm nothing still references them before removing.
private const string DoublePercent = "&&";
private const string DoublePount = "##";
// Leading and trailing markers that ToTriGrams wraps around its input.
private const string DoubleAmpersand = "&&";
private const string DoublePound = "##";
}
}
22 changes: 21 additions & 1 deletion src/DuoVia.FuzzyStrings/DuoVia.FuzzyStringsTests/FuzzyTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ public void FuzzyMatchTests(string input, string match)
}

[Theory]
[InlineData("wwww", "w")]
[InlineData("test", "w")]
[InlineData("test", "W")]
[InlineData("test", "w ")]
Expand Down Expand Up @@ -78,11 +79,30 @@ public void FuzzyMatchTests(string input, string match)
[InlineData("2130 South Fort Union Blvd.", "Rural Route 2 Box 29")]
[InlineData("2130 South Fort Union Blvd.", "PO Box 3487")]
[InlineData("2130 South Fort Union Blvd.", "3 Harvard Square")]
[InlineData("eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee", "ee")]
[InlineData("aaaaaaaaa", "aaaaaaaaa")]
public void DiceCoefficientTests(string input, string match)
{
    // Checks three properties for every input pair: the score is symmetric,
    // the string overload agrees with the unpadded bigram-array overload, and
    // the score stays inside [0, 1] (exactly 1 for identical inputs).
    //
    // double.Epsilon is the smallest positive double, so "difference < double.Epsilon"
    // only passes on exact equality; compare to a fixed number of decimal places instead.
    const int Precision = 12;

    var result = input.DiceCoefficient(match);
    var reversedResult = match.DiceCoefficient(input);
    var inputBiGrams = input.ToBiGrams(false);
    var matchBiGrams = match.ToBiGrams(false);
    var biGramResult = inputBiGrams.DiceCoefficient(matchBiGrams);
    var reversedBiGramResult = matchBiGrams.DiceCoefficient(inputBiGrams);
    output.WriteLine($"DiceCoefficient of \"{match}\" against \"{input}\" was {result} (reversed was {reversedResult}), biGramResult was {biGramResult} (reversed was {reversedBiGramResult}).");

    // Symmetry of both overloads, and agreement between them.
    Assert.Equal(result, reversedResult, Precision);
    Assert.Equal(biGramResult, reversedBiGramResult, Precision);
    Assert.Equal(biGramResult, result, Precision);

    // The coefficient is a ratio and must never leave [0, 1].
    Assert.InRange(result, 0.0, 1.0);
    Assert.InRange(biGramResult, 0.0, 1.0);

    if (input == match)
    {
        Assert.Equal(1.0, result, Precision);
        Assert.Equal(1.0, biGramResult, Precision);
    }
}

[Theory]
Expand Down