diff --git a/MatchingEngine.CLI/MatchingEngine.CLI.csproj b/MatchingEngine.CLI/MatchingEngine.CLI.csproj index d7a9247..80d0eb0 100644 --- a/MatchingEngine.CLI/MatchingEngine.CLI.csproj +++ b/MatchingEngine.CLI/MatchingEngine.CLI.csproj @@ -1,6 +1,7 @@ + 0.4.0 Exe net7.0 matching diff --git a/MatchingEngine.Domain/Duplicate.cs b/MatchingEngine.Domain/Duplicate.cs index e7437a3..4aab14f 100644 --- a/MatchingEngine.Domain/Duplicate.cs +++ b/MatchingEngine.Domain/Duplicate.cs @@ -1,5 +1,4 @@ using System.Collections.Concurrent; -using MatchingEngine.Domain.Helpers; using MatchingEngine.Domain.Models; namespace MatchingEngine.Domain; @@ -37,55 +36,53 @@ private static Dictionary GetCharactersStartAndEndIndex(ReadOnlySpa return characterIndex; } - public static ConcurrentBag GetPotentialDuplicates(PatientRecord[] records1, - PatientRecord[] records2, double lowerScoreThreshold, double upperScoreThreshold) + public static ConcurrentBag GetPotentialDuplicates(Memory records1, + Memory records2, double lowerScoreThreshold, double upperScoreThreshold) { - var characterStartAndEndIndex = GetCharactersStartAndEndIndex(records2); var potentialDuplicates = new ConcurrentBag(); - Parallel.For(0, records1.Length, - primaryRecordIndex => - { - CompareRecords(potentialDuplicates, primaryRecordIndex, records1, records2, characterStartAndEndIndex, - lowerScoreThreshold, upperScoreThreshold); - }); - return potentialDuplicates; - } - private static void CompareRecords(ConcurrentBag potentialDuplicates, int primaryIndex, - ReadOnlySpan primaryRecords, PatientRecord[] recordsToCompare, - IReadOnlyDictionary characterStartAndEndIndex, double lowerScoreThreshold, - double upperScoreThreshold) - { - var primaryRecord = primaryRecords[primaryIndex]; - int[]? indices = null; - _ = primaryRecord.FirstName.Length > 0 && - characterStartAndEndIndex.TryGetValue(primaryRecord.FirstName[0], out indices); - - var start = indices != null ? indices[0] : 0; - var end = indices != null ? indices[1] : recordsToCompare.Length; + var records1CharacterStartAndEndIndex = GetCharactersStartAndEndIndex(records1.Span); + var records2CharacterStartAndEndIndex = GetCharactersStartAndEndIndex(records2.Span); + + Parallel.ForEach(records1CharacterStartAndEndIndex, record1StartAndEnd => + { + var records2StartAndEndFound = + records2CharacterStartAndEndIndex.TryGetValue(record1StartAndEnd.Key, out var records2StartAndEnd); - Parallel.For(start, end, list2Index => + var records1ToCompare = records1.Slice(record1StartAndEnd.Value[0], + record1StartAndEnd.Value[1] - record1StartAndEnd.Value[0]); + + var records2ToCompare = records2StartAndEndFound && records2StartAndEnd != null + ? records2.Slice(records2StartAndEnd[0], records2StartAndEnd[1] - records2StartAndEnd[0]) + : records2; + + Parallel.For(0, records1ToCompare.Length, recordToCompareIndex => { - CompareRecordsInnerLoop(potentialDuplicates, primaryRecord, list2Index, recordsToCompare, - lowerScoreThreshold, upperScoreThreshold); + var primaryRecord = records1ToCompare.Span[recordToCompareIndex]; + for (var i = 0; i < records2ToCompare.Length; i++) + { + var secondaryRecord = records2ToCompare.Span[i]; + CompareRecords(potentialDuplicates, ref primaryRecord, ref secondaryRecord, + lowerScoreThreshold, upperScoreThreshold); + } }); + }); + + return potentialDuplicates; } - private static void CompareRecordsInnerLoop(ConcurrentBag potentialDuplicates, - PatientRecord primaryRecord, int recordToCompareIndex, ReadOnlySpan recordsToCompare, + private static void CompareRecords(ConcurrentBag potentialDuplicates, + ref PatientRecord primaryRecord, ref PatientRecord secondaryRecord, double lowerScoreThreshold, double upperScoreThreshold) { - var tempRecord = recordsToCompare[recordToCompareIndex]; - //check if the first character of the first name is equal - if (!StringHelpers.FirstCharactersAreEqual(primaryRecord.FirstName, tempRecord.FirstName) || - primaryRecord.RecordId == tempRecord.RecordId) return; + if (primaryRecord.RecordId == secondaryRecord.RecordId) return; //get the distance vector for the ith vector of the first table and the jth record of the second table - var distanceVector = DistanceVector.CalculateDistance(primaryRecord, tempRecord); + var distanceVector = DistanceVector.CalculateDistance(ref primaryRecord, ref secondaryRecord); var tempScore = Score.CalculateFinalScore(ref distanceVector); if (tempScore >= lowerScoreThreshold && tempScore <= upperScoreThreshold) { potentialDuplicates.Add( - new PotentialDuplicate(primaryRecord, tempRecord, distanceVector, tempScore)); + new PotentialDuplicate(primaryRecord, secondaryRecord, distanceVector, tempScore)); } } } \ No newline at end of file diff --git a/MatchingEngine.Domain/Models/DistanceVector.cs b/MatchingEngine.Domain/Models/DistanceVector.cs index 27a0c7b..55b46b5 100644 --- a/MatchingEngine.Domain/Models/DistanceVector.cs +++ b/MatchingEngine.Domain/Models/DistanceVector.cs @@ -10,7 +10,7 @@ public readonly record struct DistanceVector public int CityDistance { get; private init; } public int PhoneNumberDistance { get; private init; } - public static DistanceVector CalculateDistance(PatientRecord firstRecord, PatientRecord secondRecord) + public static DistanceVector CalculateDistance(ref PatientRecord firstRecord, ref PatientRecord secondRecord) { return new DistanceVector { diff --git a/MatchingEngine.Domain/Models/PatientRecord.cs b/MatchingEngine.Domain/Models/PatientRecord.cs index 3804297..f931c71 100644 --- a/MatchingEngine.Domain/Models/PatientRecord.cs +++ b/MatchingEngine.Domain/Models/PatientRecord.cs @@ -1,6 +1,6 @@ namespace MatchingEngine.Domain.Models; -public sealed record PatientRecord +public readonly record struct PatientRecord ( string RecordId, string FirstName,