Skip to content

Commit

Permalink
feat: optimize find duplicates parallel processing (#42)
Browse files (browse the repository at this point in the history)
  • Loading branch information
gabynevada authored Feb 3, 2023
1 parent 8ef416a commit 6eaabb5
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 37 deletions.
1 change: 1 addition & 0 deletions MatchingEngine.CLI/MatchingEngine.CLI.csproj
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<Version>0.4.0</Version>
<OutputType>Exe</OutputType>
<TargetFramework>net7.0</TargetFramework>
<AssemblyName>matching</AssemblyName>
Expand Down
67 changes: 32 additions & 35 deletions MatchingEngine.Domain/Duplicate.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
using System.Collections.Concurrent;
using MatchingEngine.Domain.Helpers;
using MatchingEngine.Domain.Models;

namespace MatchingEngine.Domain;
Expand Down Expand Up @@ -37,55 +36,53 @@ private static Dictionary<char, int[]> GetCharactersStartAndEndIndex(ReadOnlySpa
return characterIndex;
}

public static ConcurrentBag<PotentialDuplicate> GetPotentialDuplicates(PatientRecord[] records1,
PatientRecord[] records2, double lowerScoreThreshold, double upperScoreThreshold)
public static ConcurrentBag<PotentialDuplicate> GetPotentialDuplicates(Memory<PatientRecord> records1,
Memory<PatientRecord> records2, double lowerScoreThreshold, double upperScoreThreshold)
{
var characterStartAndEndIndex = GetCharactersStartAndEndIndex(records2);
var potentialDuplicates = new ConcurrentBag<PotentialDuplicate>();
Parallel.For(0, records1.Length,
primaryRecordIndex =>
{
CompareRecords(potentialDuplicates, primaryRecordIndex, records1, records2, characterStartAndEndIndex,
lowerScoreThreshold, upperScoreThreshold);
});
return potentialDuplicates;
}

private static void CompareRecords(ConcurrentBag<PotentialDuplicate> potentialDuplicates, int primaryIndex,
ReadOnlySpan<PatientRecord> primaryRecords, PatientRecord[] recordsToCompare,
IReadOnlyDictionary<char, int[]> characterStartAndEndIndex, double lowerScoreThreshold,
double upperScoreThreshold)
{
var primaryRecord = primaryRecords[primaryIndex];
int[]? indices = null;
_ = primaryRecord.FirstName.Length > 0 &&
characterStartAndEndIndex.TryGetValue(primaryRecord.FirstName[0], out indices);

var start = indices != null ? indices[0] : 0;
var end = indices != null ? indices[1] : recordsToCompare.Length;
var records1CharacterStartAndEndIndex = GetCharactersStartAndEndIndex(records1.Span);
var records2CharacterStartAndEndIndex = GetCharactersStartAndEndIndex(records2.Span);

Parallel.ForEach(records1CharacterStartAndEndIndex, record1StartAndEnd =>
{
var records2StartAndEndFound =
records2CharacterStartAndEndIndex.TryGetValue(record1StartAndEnd.Key, out var records2StartAndEnd);

Parallel.For(start, end, list2Index =>
var records1ToCompare = records1.Slice(record1StartAndEnd.Value[0],
record1StartAndEnd.Value[1] - record1StartAndEnd.Value[0]);

var records2ToCompare = records2StartAndEndFound && records2StartAndEnd != null
? records2.Slice(records2StartAndEnd[0], records2StartAndEnd[1] - records2StartAndEnd[0])
: records2;

Parallel.For(0, records1ToCompare.Length, recordToCompareIndex =>
{
CompareRecordsInnerLoop(potentialDuplicates, primaryRecord, list2Index, recordsToCompare,
lowerScoreThreshold, upperScoreThreshold);
var primaryRecord = records1ToCompare.Span[recordToCompareIndex];
for (var i = 0; i < records2ToCompare.Length; i++)
{
var secondaryRecord = records2ToCompare.Span[i];
CompareRecords(potentialDuplicates, ref primaryRecord, ref secondaryRecord,
lowerScoreThreshold, upperScoreThreshold);
}
});
});

return potentialDuplicates;
}

private static void CompareRecordsInnerLoop(ConcurrentBag<PotentialDuplicate> potentialDuplicates,
PatientRecord primaryRecord, int recordToCompareIndex, ReadOnlySpan<PatientRecord> recordsToCompare,
private static void CompareRecords(ConcurrentBag<PotentialDuplicate> potentialDuplicates,
ref PatientRecord primaryRecord, ref PatientRecord secondaryRecord,
double lowerScoreThreshold, double upperScoreThreshold)
{
var tempRecord = recordsToCompare[recordToCompareIndex];
//check whether the first characters of both first names are equal
if (!StringHelpers.FirstCharactersAreEqual(primaryRecord.FirstName, tempRecord.FirstName) ||
primaryRecord.RecordId == tempRecord.RecordId) return;
if (primaryRecord.RecordId == secondaryRecord.RecordId) return;
//get the distance vector for the ith record of the first table and the jth record of the second table
var distanceVector = DistanceVector.CalculateDistance(primaryRecord, tempRecord);
var distanceVector = DistanceVector.CalculateDistance(ref primaryRecord, ref secondaryRecord);
var tempScore = Score.CalculateFinalScore(ref distanceVector);
if (tempScore >= lowerScoreThreshold && tempScore <= upperScoreThreshold)
{
potentialDuplicates.Add(
new PotentialDuplicate(primaryRecord, tempRecord, distanceVector, tempScore));
new PotentialDuplicate(primaryRecord, secondaryRecord, distanceVector, tempScore));
}
}
}
2 changes: 1 addition & 1 deletion MatchingEngine.Domain/Models/DistanceVector.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ public readonly record struct DistanceVector
public int CityDistance { get; private init; }
public int PhoneNumberDistance { get; private init; }

public static DistanceVector CalculateDistance(PatientRecord firstRecord, PatientRecord secondRecord)
public static DistanceVector CalculateDistance(ref PatientRecord firstRecord, ref PatientRecord secondRecord)
{
return new DistanceVector
{
Expand Down
2 changes: 1 addition & 1 deletion MatchingEngine.Domain/Models/PatientRecord.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
namespace MatchingEngine.Domain.Models;

public sealed record PatientRecord
public readonly record struct PatientRecord
(
string RecordId,
string FirstName,
Expand Down

0 comments on commit 6eaabb5

Please sign in to comment.