diff --git a/api_differences.md b/api_differences.md index b81c2d8e..a0907a19 100644 --- a/api_differences.md +++ b/api_differences.md @@ -13,14 +13,14 @@ This leads to different results depending on the version in use. `RapidFuzz` alw fallback implementation and the C++ based implementation to provide consistent matching results. ## partial_ratio implementation -`fuzzywuzzy` uses searches fo the optimal matching substring and then calculates the similarity using `ratio`. This substring is searches using either: +`fuzzywuzzy` searches for the optimal matching substring and then calculates the similarity using `ratio`. This substring is searched using either: 1) `difflib.SequenceMatcher.get_matching_blocks` (based on Ratcliff and Obershelp algorithm) 2) `Levenshtein.matching_blocks` (backtracks Levenshtein matrix) This implementation has a couple of issues: 1) in the pure Python implementation the automatic junk heuristic of difflib is not deactivated. This heuristic improves the performance for long strings, but can lead to completely incorrect results. -2) the accellerated version backtracks the Levenshtein matrix to find the same alignment found by the Python implementation. However the algorithm just uses +2) the accelerated version backtracks the Levenshtein matrix to find the same alignment found by the Python implementation. However the algorithm just uses one of multiple optimal alignment. There is no guarantee for this alignment to include the longest common substring. 3) the optimal substring is assumed to start at one of these `matching_blocks`. However this is not guaranteed. @@ -63,4 +63,4 @@ In `RapidFuzz` these functions are sometimes available under different names: - `extractOne` is available under the same name - `dedupe` is not available -In addition these functions do not preprocess strings by default. However preprocessing can be enabled using the `processor` argument. 
\ No newline at end of file +In addition these functions do not preprocess strings by default. However preprocessing can be enabled using the `processor` argument. diff --git a/src/rapidfuzz/CMakeLists.txt b/src/rapidfuzz/CMakeLists.txt index 6033b3cb..864fb992 100644 --- a/src/rapidfuzz/CMakeLists.txt +++ b/src/rapidfuzz/CMakeLists.txt @@ -88,7 +88,6 @@ if(RAPIDFUZZ_ARCH_X86) install(TARGETS fuzz_cpp_sse2 LIBRARY DESTINATION src/rapidfuzz) endif() - create_cython_target(process_cpp_impl) rf_add_library(process_cpp_impl ${process_cpp_impl}) target_compile_features(process_cpp_impl PUBLIC cxx_std_17) @@ -145,19 +144,20 @@ if(NOT Windows) HAVE_CXX_ATOMICS_UNSIGNED_WITH_LIB) if(NOT HAVE_CXX_ATOMICS_INT_WITH_LIB) message( - FATAL_ERROR "No native support for std::atomic, or libatomic not found!" - ) + FATAL_ERROR + "No native support for std::atomic, or libatomic not found!") elseif(NOT HAVE_CXX_ATOMICS_SIZE_T_WITH_LIB) message( - FATAL_ERROR "No native support for std::atomic, or libatomic not found!" - ) + FATAL_ERROR + "No native support for std::atomic, or libatomic not found!") elseif(NOT HAVE_CXX_ATOMICS_VOID_PTR_WITH_LIB) message( - FATAL_ERROR "No native support for std::atomic, or libatomic not found!" - ) + FATAL_ERROR + "No native support for std::atomic, or libatomic not found!") elseif(NOT HAVE_CXX_ATOMICS_UNSIGNED_WITH_LIB) message( - FATAL_ERROR "No native support for std::atomic, or libatomic not found!" + FATAL_ERROR + "No native support for std::atomic, or libatomic not found!" 
) else() message(STATUS "Linking with libatomic for atomics support") diff --git a/src/rapidfuzz/_common_py.py b/src/rapidfuzz/_common_py.py index 64d39cd3..164b5713 100644 --- a/src/rapidfuzz/_common_py.py +++ b/src/rapidfuzz/_common_py.py @@ -43,3 +43,31 @@ def conv_sequences(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> Sequence[H return s1, s2 return conv_sequence(s1), conv_sequence(s2) + + +def common_prefix(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> int: + prefix_len = 0 + for ch1, ch2 in zip(s1, s2): + if ch1 != ch2: + break + + prefix_len += 1 + + return prefix_len + + +def common_suffix(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> int: + suffix_len = 0 + for ch1, ch2 in zip(reversed(s1), reversed(s2)): + if ch1 != ch2: + break + + suffix_len += 1 + + return suffix_len + + +def common_affix(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> (int, int): + prefix_len = common_prefix(s1, s2) + suffix_len = common_suffix(s1[prefix_len:], s2[prefix_len:]) + return (prefix_len, suffix_len) diff --git a/src/rapidfuzz/distance/CMakeLists.txt b/src/rapidfuzz/distance/CMakeLists.txt index de2bc020..7deacb48 100644 --- a/src/rapidfuzz/distance/CMakeLists.txt +++ b/src/rapidfuzz/distance/CMakeLists.txt @@ -87,4 +87,3 @@ if(RAPIDFUZZ_ARCH_X86) target_link_libraries(metrics_cpp_sse2 PRIVATE rapidfuzz::rapidfuzz) install(TARGETS metrics_cpp_sse2 LIBRARY DESTINATION src/rapidfuzz/distance) endif() - diff --git a/src/rapidfuzz/distance/Indel_py.py b/src/rapidfuzz/distance/Indel_py.py index 6a1780fd..4eb1b673 100644 --- a/src/rapidfuzz/distance/Indel_py.py +++ b/src/rapidfuzz/distance/Indel_py.py @@ -9,6 +9,8 @@ from rapidfuzz._utils import is_none from rapidfuzz.distance._initialize_py import Editops, Opcodes from rapidfuzz.distance.LCSseq_py import _block_similarity as lcs_seq_block_similarity +from rapidfuzz.distance.LCSseq_py import editops as lcs_seq_editops +from rapidfuzz.distance.LCSseq_py import opcodes as lcs_seq_opcodes from 
rapidfuzz.distance.LCSseq_py import similarity as lcs_seq_similarity @@ -300,12 +302,7 @@ def editops( insert s1[4] s2[2] insert s1[6] s2[5] """ - if processor is not None: - s1 = processor(s1) - s2 = processor(s2) - - s1, s2 = conv_sequences(s1, s2) - raise NotImplementedError + return lcs_seq_editops(s1, s2, processor=processor) def opcodes( @@ -358,4 +355,4 @@ def opcodes( equal a[4:6] (cd) b[3:5] (cd) insert a[6:6] () b[5:6] (f) """ - return editops(s1, s2, processor=processor).as_opcodes() + return lcs_seq_opcodes(s1, s2, processor=processor) diff --git a/src/rapidfuzz/distance/LCSseq_py.py b/src/rapidfuzz/distance/LCSseq_py.py index e592a14d..92e61a26 100644 --- a/src/rapidfuzz/distance/LCSseq_py.py +++ b/src/rapidfuzz/distance/LCSseq_py.py @@ -5,9 +5,9 @@ from typing import Callable, Hashable, Sequence -from rapidfuzz._common_py import conv_sequences +from rapidfuzz._common_py import common_affix, conv_sequences from rapidfuzz._utils import is_none -from rapidfuzz.distance._initialize_py import Editops, Opcodes +from rapidfuzz.distance._initialize_py import Editop, Editops, Opcodes def similarity( @@ -254,6 +254,30 @@ def normalized_similarity( return norm_sim if (score_cutoff is None or norm_sim >= score_cutoff) else 0 +def _matrix(s1: Sequence[Hashable], s2: Sequence[Hashable]): + if not s1: + return (0, []) + + S = (1 << len(s1)) - 1 + block: dict[Hashable, int] = {} + block_get = block.get + x = 1 + for ch1 in s1: + block[ch1] = block_get(ch1, 0) | x + x <<= 1 + + matrix = [] + for ch2 in s2: + Matches = block_get(ch2, 0) + u = S & Matches + S = (S + u) | (S - u) + matrix.append(S) + + # calculate the equivalent of popcount(~S) in C. 
This breaks for len(s1) == 0 + sim = bin(S)[-len(s1) :].count("0") + return (sim, matrix) + + def editops( s1: Sequence[Hashable], s2: Sequence[Hashable], @@ -298,8 +322,56 @@ def editops( insert s1[4] s2[2] insert s1[6] s2[5] """ - _ = s1, s2, processor - raise NotImplementedError + if processor is not None: + s1 = processor(s1) + s2 = processor(s2) + + s1, s2 = conv_sequences(s1, s2) + prefix_len, suffix_len = common_affix(s1, s2) + s1 = s1[prefix_len : len(s1) - suffix_len] + s2 = s2[prefix_len : len(s2) - suffix_len] + sim, matrix = _matrix(s1, s2) + + editops = Editops([], 0, 0) + editops._src_len = len(s1) + prefix_len + suffix_len + editops._dest_len = len(s2) + prefix_len + suffix_len + + dist = len(s1) + len(s2) - 2 * sim + if dist == 0: + return editops + + editop_list = [None] * dist + col = len(s1) + row = len(s2) + while row != 0 and col != 0: + # deletion + if matrix[row - 1] & (1 << (col - 1)): + dist -= 1 + col -= 1 + editop_list[dist] = Editop("delete", col + prefix_len, row + prefix_len) + else: + row -= 1 + + # insertion + if row and not (matrix[row - 1] & (1 << (col - 1))): + dist -= 1 + editop_list[dist] = Editop("insert", col + prefix_len, row + prefix_len) + # match + else: + col -= 1 + + while col != 0: + dist -= 1 + col -= 1 + editop_list[dist] = Editop("delete", col + prefix_len, row + prefix_len) + + while row != 0: + dist -= 1 + row -= 1 + editop_list[dist] = Editop("insert", col + prefix_len, row + prefix_len) + + editops._editops = editop_list + return editops def opcodes( diff --git a/src/rapidfuzz/distance/_initialize_py.py b/src/rapidfuzz/distance/_initialize_py.py index 99158f95..2a1631a6 100644 --- a/src/rapidfuzz/distance/_initialize_py.py +++ b/src/rapidfuzz/distance/_initialize_py.py @@ -342,7 +342,7 @@ def as_list(self) -> list[Editop]: This is the equivalent of ``[x for x in editops]`` """ - return self._editops + return [tuple(op) for op in self._editops] def copy(self) -> Editops: """ @@ -472,15 +472,15 @@ def apply(self, 
source_string: str, destination_string: str) -> str: for op in self._editops: # matches between last and current editop - while src_pos < op.dest_pos: + while src_pos < op.src_pos: res_str += source_string[src_pos] src_pos += 1 if op.tag == "replace": - res_str += destination_string[src_pos] + res_str += destination_string[op.dest_pos] src_pos += 1 elif op.tag == "insert": - res_str += destination_string[src_pos] + res_str += destination_string[op.dest_pos] elif op.tag == "delete": src_pos += 1 @@ -618,7 +618,7 @@ def __iter__(self) -> Iterator[int | str]: def __repr__(self) -> str: return ( - f"Opcode(tag={self.tag}, src_start={self.src_start}, src_end={self.src_end}, " + f"Opcode(tag={self.tag!r}, src_start={self.src_start}, src_end={self.src_end}, " f"dest_start={self.dest_start}, dest_end={self.dest_end})" ) @@ -711,7 +711,7 @@ def as_list(self) -> list[Opcode]: This is the equivalent of ``[x for x in opcodes]`` """ - return self._opcodes[::] + return [tuple(op) for op in self._opcodes] def copy(self) -> Opcodes: """ diff --git a/src/rapidfuzz/fuzz_py.py b/src/rapidfuzz/fuzz_py.py index 335137a9..82796a40 100644 --- a/src/rapidfuzz/fuzz_py.py +++ b/src/rapidfuzz/fuzz_py.py @@ -3,7 +3,6 @@ from __future__ import annotations from math import ceil -import itertools from typing import Any, Callable, Hashable, Sequence from rapidfuzz._common_py import conv_sequences @@ -35,7 +34,7 @@ def _norm_distance(dist: int, lensum: int, score_cutoff: float) -> float: def _split_sequence(seq: Sequence[Hashable]) -> list[Sequence[Hashable]]: - if isinstance(seq, str) or isinstance(seq, bytes): + if isinstance(seq, (str, bytes)): return seq.split() splitted_seq = [[]] @@ -60,7 +59,7 @@ def _join_splitted_sequence(seq_list: list[Sequence[Hashable]]): joined = [] for seq in seq_list: joined += seq - joined += [ord(' ')] + joined += [ord(" ")] return joined[:-1] diff --git a/src/rapidfuzz/process_cpp.hpp b/src/rapidfuzz/process_cpp.hpp index 367facb2..214f4dab 100644 --- 
a/src/rapidfuzz/process_cpp.hpp +++ b/src/rapidfuzz/process_cpp.hpp @@ -1,8 +1,8 @@ #pragma once #include "cpp_common.hpp" #include "rapidfuzz.h" -#include "taskflow/taskflow.hpp" #include "taskflow/algorithm/for_each.hpp" +#include "taskflow/taskflow.hpp" #include #include #include diff --git a/tests/common.py b/tests/common.py index 0bcead83..7ef8b94c 100644 --- a/tests/common.py +++ b/tests/common.py @@ -40,7 +40,10 @@ def call_and_maybe_catch(call, *args, **kwargs): def compare_exceptions(e1, e2): - return type(e1) is type(e2) and str(e1) == str(e2) + try: + return str(e1) == str(e2) + except Exception: + return False def scorer_tester(scorer, s1, s2, **kwargs): @@ -156,6 +159,8 @@ class Scorer: similarity: Any normalized_distance: Any normalized_similarity: Any + editops: Any + opcodes: Any class GenericScorer: @@ -185,6 +190,28 @@ def validate_attrs(func1, func2): self.get_scorer_flags = get_scorer_flags + def _editops(self, s1, s2, **kwargs): + results = [call_and_maybe_catch(scorer.editops, s1, s2, **kwargs) for scorer in self.scorers] + + for result in results: + assert compare_exceptions(result, results[0]) + + if any(isinstance(result, Exception) for result in results): + raise results[0] + + return results[0] + + def _opcodes(self, s1, s2, **kwargs): + results = [call_and_maybe_catch(scorer.opcodes, s1, s2, **kwargs) for scorer in self.scorers] + + for result in results: + assert compare_exceptions(result, results[0]) + + if any(isinstance(result, Exception) for result in results): + raise results[0] + + return results[0] + def _distance(self, s1, s2, **kwargs): symmetric = self.get_scorer_flags(s1, s2, **kwargs)["symmetric"] tester = symmetric_scorer_tester if symmetric else scorer_tester @@ -303,3 +330,17 @@ def normalized_similarity(self, s1, s2, **kwargs): if "score_cutoff" not in kwargs: return norm_sim return self._normalized_similarity(s1, s2, **kwargs) + + def editops(self, s1, s2, **kwargs): + editops_ = self._editops(s1, s2, **kwargs) + 
opcodes_ = self._opcodes(s1, s2, **kwargs) + assert opcodes_.as_editops() == editops_ + assert opcodes_ == editops_.as_opcodes() + return editops_ + + def opcodes(self, s1, s2, **kwargs): + editops_ = self._editops(s1, s2, **kwargs) + opcodes_ = self._opcodes(s1, s2, **kwargs) + assert opcodes_.as_editops() == editops_ + assert opcodes_ == editops_.as_opcodes() + return opcodes_ diff --git a/tests/distance/common.py b/tests/distance/common.py index 89189025..ca2c3940 100644 --- a/tests/distance/common.py +++ b/tests/distance/common.py @@ -21,6 +21,8 @@ def create_generic_scorer(func_name, get_scorer_flags): similarity=getattr(metrics_py, func_name + "_similarity"), normalized_distance=getattr(metrics_py, func_name + "_normalized_distance"), normalized_similarity=getattr(metrics_py, func_name + "_normalized_similarity"), + editops=getattr(metrics_py, func_name + "_editops", None), + opcodes=getattr(metrics_py, func_name + "_opcodes", None), ) ] @@ -30,6 +32,8 @@ def create_generic_scorer(func_name, get_scorer_flags): similarity=getattr(mod, func_name + "_similarity"), normalized_distance=getattr(mod, func_name + "_normalized_distance"), normalized_similarity=getattr(mod, func_name + "_normalized_similarity"), + editops=getattr(metrics_cpp, func_name + "_editops", None), + opcodes=getattr(metrics_cpp, func_name + "_opcodes", None), ) for mod in cpp_scorer_modules ] diff --git a/tests/distance/test_Indel.py b/tests/distance/test_Indel.py index ca5567dd..4d9ec816 100644 --- a/tests/distance/test_Indel.py +++ b/tests/distance/test_Indel.py @@ -19,3 +19,37 @@ def test_issue_196(): assert Indel.distance("South Korea", "North Korea", score_cutoff=2) == 3 assert Indel.distance("South Korea", "North Korea", score_cutoff=1) == 2 assert Indel.distance("South Korea", "North Korea", score_cutoff=0) == 1 + + +def test_Editops(): + """ + basic test for Indel.editops + """ + assert Indel.editops("0", "").as_list() == [("delete", 0, 0)] + assert Indel.editops("", "0").as_list() == 
[("insert", 0, 0)] + + assert Indel.editops("00", "0").as_list() == [("delete", 1, 1)] + assert Indel.editops("0", "00").as_list() == [("insert", 1, 1)] + + assert Indel.editops("qabxcd", "abycdf").as_list() == [ + ("delete", 0, 0), + ("insert", 3, 2), + ("delete", 3, 3), + ("insert", 6, 5), + ] + assert Indel.editops("Lorem ipsum.", "XYZLorem ABC iPsum").as_list() == [ + ("insert", 0, 0), + ("insert", 0, 1), + ("insert", 0, 2), + ("insert", 6, 9), + ("insert", 6, 10), + ("insert", 6, 11), + ("insert", 6, 12), + ("insert", 7, 14), + ("delete", 7, 15), + ("delete", 11, 18), + ] + + ops = Indel.editops("aaabaaa", "abbaaabba") + assert ops.src_len == 7 + assert ops.dest_len == 9 diff --git a/tests/distance/test_LCSseq.py b/tests/distance/test_LCSseq.py index 1b745700..420048f6 100644 --- a/tests/distance/test_LCSseq.py +++ b/tests/distance/test_LCSseq.py @@ -7,3 +7,37 @@ def test_basic(): assert LCSseq.distance("", "") == 0 assert LCSseq.distance("test", "test") == 0 assert LCSseq.distance("aaaa", "bbbb") == 4 + + +def test_Editops(): + """ + basic test for LCSseq.editops + """ + assert LCSseq.editops("0", "").as_list() == [("delete", 0, 0)] + assert LCSseq.editops("", "0").as_list() == [("insert", 0, 0)] + + assert LCSseq.editops("00", "0").as_list() == [("delete", 1, 1)] + assert LCSseq.editops("0", "00").as_list() == [("insert", 1, 1)] + + assert LCSseq.editops("qabxcd", "abycdf").as_list() == [ + ("delete", 0, 0), + ("insert", 3, 2), + ("delete", 3, 3), + ("insert", 6, 5), + ] + assert LCSseq.editops("Lorem ipsum.", "XYZLorem ABC iPsum").as_list() == [ + ("insert", 0, 0), + ("insert", 0, 1), + ("insert", 0, 2), + ("insert", 6, 9), + ("insert", 6, 10), + ("insert", 6, 11), + ("insert", 6, 12), + ("insert", 7, 14), + ("delete", 7, 15), + ("delete", 11, 18), + ] + + ops = LCSseq.editops("aaabaaa", "abbaaabba") + assert ops.src_len == 7 + assert ops.dest_len == 9 diff --git a/tests/test_fuzz.py b/tests/test_fuzz.py index 12f8d2fe..8fd7a137 100644 --- 
a/tests/test_fuzz.py +++ b/tests/test_fuzz.py @@ -106,6 +106,7 @@ def QRatio(*args, **kwargs): fuzz.QRatio, ] + def test_no_processor(): assert fuzz.ratio("new york mets", "new york mets") == 100 assert fuzz.ratio("new york mets", "new YORK mets") != 100 diff --git a/tests/test_hypothesis.py b/tests/test_hypothesis.py index 9c357973..3ffd2337 100644 --- a/tests/test_hypothesis.py +++ b/tests/test_hypothesis.py @@ -239,7 +239,7 @@ def test_indel_editops(s1, s2): """ test Indel.editops with any sizes """ - ops = metrics_cpp.indel_editops(s1, s2) + ops = Indel.editops(s1, s2) assert ops.apply(s1, s2) == s2 @@ -249,7 +249,7 @@ def test_indel_editops_block(s1, s2): """ test Indel.editops for long strings """ - ops = metrics_cpp.indel_editops(s1, s2) + ops = Indel.editops(s1, s2) assert ops.apply(s1, s2) == s2 @@ -279,7 +279,7 @@ def test_indel_opcodes(s1, s2): """ test Indel.opcodes with any sizes """ - ops = metrics_cpp.indel_opcodes(s1, s2) + ops = Indel.opcodes(s1, s2) assert ops.apply(s1, s2) == s2 @@ -289,7 +289,7 @@ def test_indel_opcodes_block(s1, s2): """ test Indel.opcodes for long strings """ - ops = metrics_cpp.indel_opcodes(s1, s2) + ops = Indel.opcodes(s1, s2) assert ops.apply(s1, s2) == s2