From 2b64d03b44e05bcb4b18ef5c9f194e8a29999328 Mon Sep 17 00:00:00 2001 From: Elvis Nieves Date: Tue, 16 May 2023 09:42:39 -0400 Subject: [PATCH] feat: add initial matching service implementation (#63) --- src/Biomatch.CLI/Biomatch.CLI.csproj | 2 +- src/Biomatch.Domain/Biomatch.Domain.csproj | 4 +- .../Helpers/StringExtensions.cs | 3 +- src/Biomatch.Domain/Match.cs | 53 +++++++- src/Biomatch.Domain/Preprocess.cs | 56 +++++++- .../Services/MatchingService.cs | 76 +++++++++++ src/Biomatch.Domain/WordDictionary.cs | 35 +++++ .../Services/MatchingServiceTests.cs | 120 ++++++++++++++++++ 8 files changed, 340 insertions(+), 9 deletions(-) create mode 100644 src/Biomatch.Domain/Services/MatchingService.cs create mode 100644 tests/Biomatch.Domain.Tests.Unit/Services/MatchingServiceTests.cs diff --git a/src/Biomatch.CLI/Biomatch.CLI.csproj b/src/Biomatch.CLI/Biomatch.CLI.csproj index a979fd5..e6a2e4f 100644 --- a/src/Biomatch.CLI/Biomatch.CLI.csproj +++ b/src/Biomatch.CLI/Biomatch.CLI.csproj @@ -9,7 +9,7 @@ enable Linux true - true + Speed diff --git a/src/Biomatch.Domain/Biomatch.Domain.csproj b/src/Biomatch.Domain/Biomatch.Domain.csproj index c42b38c..2514e33 100644 --- a/src/Biomatch.Domain/Biomatch.Domain.csproj +++ b/src/Biomatch.Domain/Biomatch.Domain.csproj @@ -1,7 +1,7 @@ - net7.0;net8.0 + net8.0 enable enable true @@ -22,7 +22,7 @@ - + diff --git a/src/Biomatch.Domain/Helpers/StringExtensions.cs b/src/Biomatch.Domain/Helpers/StringExtensions.cs index c8432db..6e8e06e 100644 --- a/src/Biomatch.Domain/Helpers/StringExtensions.cs +++ b/src/Biomatch.Domain/Helpers/StringExtensions.cs @@ -1,3 +1,4 @@ +using System.Collections.Frozen; using System.Globalization; using System.Text; using Biomatch.Domain.Enums; @@ -84,7 +85,7 @@ public static StringBuilder NormalizeWord(this string word) return sb; } - public static IEnumerable RemoveWords(this IEnumerable words, HashSet wordsToRemove) + public static IEnumerable RemoveWords(this IEnumerable words, FrozenSet wordsToRemove) { foreach (var word in words) { diff --git a/src/Biomatch.Domain/Match.cs b/src/Biomatch.Domain/Match.cs index a473752..795d828 100644 --- a/src/Biomatch.Domain/Match.cs +++ b/src/Biomatch.Domain/Match.cs @@ -34,9 +34,9 @@ public static IEnumerable FindBestMatches(IEnumerable GetPotentialMatchesFromDifferentDataSet(Memory records1, - Memory records2, double lowerScoreThreshold, double upperScoreThreshold, - IProgress? matchProgressReport = null) + public static ConcurrentBag GetPotentialMatchesFromDifferentDataSet( + Memory records1, Memory records2, double lowerScoreThreshold, + double upperScoreThreshold, IProgress? matchProgressReport = null) { var potentialMatches = new ConcurrentBag(); @@ -77,6 +77,33 @@ public static ConcurrentBag GetPotentialMatchesFromDifferentData return potentialMatches; } + public static IEnumerable GetPotentialMatchesFromSameDataSet(PersonRecordForMatch record, + Span recordsToMatch, double lowerScoreThreshold, double upperScoreThreshold) + { + var potentialMatches = new List(); + var recordLetterIndexFromFirstCharacter = record.FirstName[0] - 'a'; + + var recordsToMatchCharacterStartAndEndIndex = GetCharactersStartAndEndIndex(recordsToMatch); + var recordsToMatchStartAndEnd = recordsToMatchCharacterStartAndEndIndex[recordLetterIndexFromFirstCharacter]; + + var recordsToCompare = recordsToMatchStartAndEnd.Item1 == -1 + ? recordsToMatch + : recordsToMatch.Slice(recordsToMatchStartAndEnd.Item1, + recordsToMatchStartAndEnd.Item2 - recordsToMatchStartAndEnd.Item1 + 1); + + for (var i = 0; i < recordsToCompare.Length; i++) + { + ref var secondaryRecord = ref recordsToCompare[i]; + if (record.RecordId == secondaryRecord.RecordId) continue; + var potentialMatch = CompareRecords(ref record, ref secondaryRecord, lowerScoreThreshold, + upperScoreThreshold); + if (potentialMatch != null) + potentialMatches.Add(potentialMatch.Value); + } + + return potentialMatches; + } + public static ConcurrentBag GetPotentialMatchesFromSameDataSet(Memory records1, Memory records2, double lowerScoreThreshold, double upperScoreThreshold, IProgress? matchProgressReport = null) @@ -142,6 +169,26 @@ private static void CompareRecords(ConcurrentBag potentialMatche } } + private static PotentialMatch? CompareRecords(ref PersonRecordForMatch primaryRecord, + ref PersonRecordForMatch secondaryRecord, double lowerScoreThreshold, double upperScoreThreshold) + { + //get the distance vector for the ith vector of the first table and the jth record of the second table + var distanceVector = DistanceVector.CalculateDistance(ref primaryRecord, ref secondaryRecord); + var tempScore = Score.CalculateFinalScore(ref distanceVector); + if (tempScore >= lowerScoreThreshold && tempScore <= upperScoreThreshold) + { + return new PotentialMatch + ( + primaryRecord, + secondaryRecord, + distanceVector, + tempScore + ); + } + + return null; + } + private static (int, int)[] GetCharactersStartAndEndIndex(ReadOnlySpan records) { var characterIndex = new (int, int)[26]; // 26 letters in the alphabet diff --git a/src/Biomatch.Domain/Preprocess.cs b/src/Biomatch.Domain/Preprocess.cs index 1efd583..c0f4c8e 100644 --- a/src/Biomatch.Domain/Preprocess.cs +++ b/src/Biomatch.Domain/Preprocess.cs @@ -1,4 +1,5 @@ using System.Collections.Concurrent; +using System.Collections.Frozen; using Biomatch.Domain.Enums; using Biomatch.Domain.Helpers; using Biomatch.Domain.Models; @@ -7,6 +8,57 @@ namespace Biomatch.Domain; public static class Preprocess { + public static PersonRecordForMatch PreprocessRecord(this IPersonRecord patientRecord, + FrozenSet prepositionsToRemove, FrozenSet suffixesToRemove, + WordDictionary? firstNamesDictionary = null, WordDictionary? middleNamesDictionary = null, + WordDictionary? lastNamesDictionary = null) + { + var normalizedFirstNames = patientRecord.FirstName + .NormalizeNames(NameType.Name) + .RemoveWords(prepositionsToRemove) + .RemoveWords(suffixesToRemove); + var normalizedMiddleNames = patientRecord.MiddleName + .NormalizeNames(NameType.Name) + .RemoveWords(prepositionsToRemove) + .RemoveWords(suffixesToRemove); + var normalizedLastNames = patientRecord.LastName + .NormalizeNames(NameType.LastName) + .RemoveWords(prepositionsToRemove); + var normalizedSecondLastNames = patientRecord.SecondLastName + .NormalizeNames(NameType.LastName) + .RemoveWords(prepositionsToRemove); + + var personName = OrganizeNames(normalizedFirstNames, normalizedMiddleNames.ToList(), + normalizedLastNames, normalizedSecondLastNames, lastNamesDictionary); + + var firstNames = personName.FirstName + .SanitizeName(NameType.Name, firstNamesDictionary); + + var middleNames = personName.MiddleName + .SanitizeName(NameType.Name, middleNamesDictionary); + + var lastNames = personName.LastName + .SanitizeName(NameType.LastName, lastNamesDictionary); + + var secondLastNames = personName.SecondLastName + .SanitizeName(NameType.LastName, lastNamesDictionary); + + return new PersonRecordForMatch + ( + patientRecord.RecordId, + string.Concat(firstNames), + string.Concat(middleNames), + string.Concat(lastNames), + string.Concat(secondLastNames), + patientRecord.BirthDate.SanitizeBirthDate(), + patientRecord.BirthDate.HasValue + ? patientRecord.BirthDate.Value.ToByteArray() + : Array.Empty(), + patientRecord.City.SanitizeWord().ToString(), + PhoneNumberHelpers.Parse(patientRecord.PhoneNumber) + ); + } + public static IEnumerable PreprocessData(this IEnumerable patientRecords, WordDictionary? firstNamesDictionary = null, WordDictionary? middleNamesDictionary = null, WordDictionary? lastNamesDictionary = null) @@ -24,11 +76,11 @@ private static IEnumerable SanitizeRecords(this IEnumerabl var prepositions = new HashSet { "el", "la", "los", "las", "de", "del", "en", "y", "a", "di", "da", "le", "san" - }; + }.ToFrozenSet(true); var suffixes = new HashSet { "lcdo", "lcda", "dr", "dra", "sor", "jr", "junior", "sr", "sra", "ii", "iii", "mr", "ms", "mrs" - }; + }.ToFrozenSet(true); var processedPatientRecords = new ConcurrentBag(); Parallel.For(0, patientRecordsList.Length, index => { diff --git a/src/Biomatch.Domain/Services/MatchingService.cs b/src/Biomatch.Domain/Services/MatchingService.cs new file mode 100644 index 0000000..abec5ad --- /dev/null +++ b/src/Biomatch.Domain/Services/MatchingService.cs @@ -0,0 +1,76 @@ +using System.Collections.Frozen; +using Biomatch.Domain.Models; + +namespace Biomatch.Domain.Services; + +public class MatchingService +{ + private readonly FrozenSet _prepositionsToRemove; + private readonly FrozenSet _suffixesToRemove; + private readonly WordDictionary? _firstNamesDictionary; + private readonly WordDictionary? _middleNamesDictionary; + private readonly WordDictionary? _lastNamesDictionary; + + private PersonRecordForMatch[] _preprocessedRecordsToMatch; + + public MatchingService(IEnumerable recordsToMatch) + { + _prepositionsToRemove = new HashSet + { + "el", "la", "los", "las", "de", "del", "en", "y", "a", "di", "da", "le", "san" + }.ToFrozenSet(true); + _suffixesToRemove = new HashSet + { + "lcdo", "lcda", "dr", "dra", "sor", "jr", "junior", "sr", "sra", "ii", "iii", "mr", "ms", "mrs" + }.ToFrozenSet(true); + _preprocessedRecordsToMatch = recordsToMatch.PreprocessData().ToArray(); + var firstNameFrequencyDictionary = _preprocessedRecordsToMatch + .GroupBy(e => e.FirstName) + .Where(e => e.Count() > 20 && e.Key.Length > 3) + .Select(e => new FrequencyDictionary + ( + e.Key, + e.Count() + )); + var middleNameFrequencyDictionary = _preprocessedRecordsToMatch + .GroupBy(e => e.MiddleName) + .Where(e => e.Count() > 20 && e.Key.Length > 3) + .Select(e => new FrequencyDictionary + ( + e.Key, + e.Count() + )); + var firstLastNameFrequencyDictionary = _preprocessedRecordsToMatch + .GroupBy(e => e.LastName) + .Where(e => e.Count() > 20 && e.Key.Length > 3) + .Select(e => new FrequencyDictionary + ( + e.Key, + e.Count() + )); + _firstNamesDictionary = WordDictionary.CreateWordDictionary(firstNameFrequencyDictionary); + _middleNamesDictionary = WordDictionary.CreateWordDictionary(middleNameFrequencyDictionary); + _lastNamesDictionary = WordDictionary.CreateWordDictionary(firstLastNameFrequencyDictionary); + } + + public IEnumerable FindPotentialMatches(IPersonRecord record, double matchScoreThreshold) + { + var preprocessedRecord = + record.PreprocessRecord(_prepositionsToRemove, _suffixesToRemove, _firstNamesDictionary, _middleNamesDictionary, + _lastNamesDictionary); + + return Match.GetPotentialMatchesFromSameDataSet(preprocessedRecord, _preprocessedRecordsToMatch, + matchScoreThreshold, 1.0); + } + + public void AddPersonToMatchData(IPersonRecord record) + { + var preprocessedRecord = + record.PreprocessRecord(_prepositionsToRemove, _suffixesToRemove, _firstNamesDictionary, _middleNamesDictionary, + _lastNamesDictionary); + + _preprocessedRecordsToMatch = _preprocessedRecordsToMatch.Append(preprocessedRecord) + .OrderBy(e => e.FirstName) + .ToArray(); + } +} diff --git a/src/Biomatch.Domain/WordDictionary.cs b/src/Biomatch.Domain/WordDictionary.cs index 4c2fd87..cbe5d57 100644 --- a/src/Biomatch.Domain/WordDictionary.cs +++ b/src/Biomatch.Domain/WordDictionary.cs @@ -1,3 +1,6 @@ +using System.Text; +using Biomatch.Domain.Models; + namespace Biomatch.Domain; public sealed class WordDictionary @@ -23,6 +26,38 @@ public WordDictionary(FileInfo dictionaryFilePath) } } + private WordDictionary(SymSpell symSpell) + { + _symSpell = symSpell; + } + + public static WordDictionary CreateWordDictionary(IEnumerable frequencyDictionary) + { + var content = new StringBuilder(); + + foreach (var wordItem in frequencyDictionary) + { + var line = $"{wordItem.Word}\t{wordItem.Frequency}"; + content.AppendLine(line); + } + + using var stream = new MemoryStream(Encoding.UTF8.GetBytes(content.ToString())); + //create object + const int initialCapacity = 82765; + const int maxEditDistanceDictionary = 2; //maximum edit distance per dictionary pre-calculation + var symSpell = new SymSpell(initialCapacity, maxEditDistanceDictionary); + + //load dictionary + const int termIndex = 0; //column of the term in the dictionary text file + const int countIndex = 1; //column of the term frequency in the dictionary text file + if (!symSpell.LoadDictionary(stream, termIndex, countIndex)) + { + throw new Exception("Fail to load dictionary"); + } + + return new WordDictionary(symSpell); + } + public List TrySpellCheck(string recordToSearch) { //lookup suggestions for single-word input strings diff --git a/tests/Biomatch.Domain.Tests.Unit/Services/MatchingServiceTests.cs b/tests/Biomatch.Domain.Tests.Unit/Services/MatchingServiceTests.cs new file mode 100644 index 0000000..5c862d7 --- /dev/null +++ b/tests/Biomatch.Domain.Tests.Unit/Services/MatchingServiceTests.cs @@ -0,0 +1,120 @@ +using Biomatch.Domain.Models; +using Biomatch.Domain.Services; +using FluentAssertions; + +namespace Biomatch.Domain.Tests.Unit.Services; + +public class MatchingServiceTests +{ + private readonly MatchingService _sut; + + public MatchingServiceTests() + { + var patientRecordsToMatch = new List + { + new PersonRecord + ( + "123456", + "Elvis", + "", + "Nieves", + "Miranda", + new DateOnly(1990, 02, 01), + "Aguada", + "7875982789" + ), + new PersonRecord + ( + "1234568", + "Elvis", + "Gabriel", + "Nieves", + "Miranda", + new DateOnly(1990, 02, 01), + "Aguada", + "7875982789" + ), + new PersonRecord + ( + "3688374", + "Juan", + "", + "Del Pueblo", + "", + new DateOnly(1990, 02, 01), + "Aguada", + "7875982789" + ), + new PersonRecord + ( + "3697831", + "Juan Del Puéblo", + "", + "", + "", + new DateOnly(1990, 02, 01), + "San Juan", + "7875982789" + ), + new PersonRecord + ( + "1238", + "Guillermo", + "", + "Perez", + "", + new DateOnly(1990, 01, 01), + "San Juan", + "" + ), + new PersonRecord + ( + "1230", + "Clara", + "", + "Pique", + "", + new DateOnly(1995, 01, 01), + "Adjuntas", + "" + ), + new PersonRecord + ( + "1276", + "Juan Del Puéblo", + "", + "", + "", + new DateOnly(1990, 01, 02), + "San Juan", + "7875982789" + ), + }; + _sut = new MatchingService(patientRecordsToMatch); + } + + [Fact] + public void FindPotentialMatches_ShouldReturnPotentialMatches_WhenRecordIsPassed() + { + // Arrange + var recordToMatch = new PersonRecord + ( + "1234", + "Elvis", + "", + "Nieves", + "Miranda", + new DateOnly(1990, 02, 01), + "Aguada", + "7875982789" + ); + + // Act + var possibleMatches = _sut.FindPotentialMatches(recordToMatch, 0.85) + .ToList(); + + // Assert + possibleMatches.Should().HaveCount(2); + possibleMatches[0].Score.Should().BeGreaterThan(0.85); + } +}