Skip to content

Commit

Permalink
implement pure Python editops for Indel/LCSseq
Browse files Browse the repository at this point in the history
  • Loading branch information
maxbachmann committed Aug 16, 2023
1 parent 6b24ce8 commit e4bfaec
Show file tree
Hide file tree
Showing 15 changed files with 247 additions and 38 deletions.
6 changes: 3 additions & 3 deletions api_differences.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,14 @@ This leads to different results depending on the version in use. `RapidFuzz` alw
fallback implementation and the C++ based implementation to provide consistent matching results.

## partial_ratio implementation
`fuzzywuzzy` uses searches fo the optimal matching substring and then calculates the similarity using `ratio`. This substring is searches using either:
`fuzzywuzzy` searches for the optimal matching substring and then calculates the similarity using `ratio`. This substring is searched for using either:
1) `difflib.SequenceMatcher.get_matching_blocks` (based on Ratcliff and Obershelp algorithm)
2) `Levenshtein.matching_blocks` (backtracks Levenshtein matrix)

This implementation has a couple of issues:
1) in the pure Python implementation the automatic junk heuristic of difflib is not deactivated. This heuristic improves the performance for long strings,
but can lead to completely incorrect results.
2) the accellerated version backtracks the Levenshtein matrix to find the same alignment found by the Python implementation. However the algorithm just uses
2) the accelerated version backtracks the Levenshtein matrix to find the same alignment found by the Python implementation. However the algorithm just uses
one of multiple optimal alignments. There is no guarantee for this alignment to include the longest common substring.
3) the optimal substring is assumed to start at one of these `matching_blocks`. However this is not guaranteed.

Expand Down Expand Up @@ -63,4 +63,4 @@ In `RapidFuzz` these functions are sometimes available under different names:
- `extractOne` is available under the same name
- `dedupe` is not available

In addition these functions do not preprocess strings by default. However preprocessing can be enabled using the `processor` argument.
In addition these functions do not preprocess strings by default. However preprocessing can be enabled using the `processor` argument.
16 changes: 8 additions & 8 deletions src/rapidfuzz/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,6 @@ if(RAPIDFUZZ_ARCH_X86)
install(TARGETS fuzz_cpp_sse2 LIBRARY DESTINATION src/rapidfuzz)
endif()


create_cython_target(process_cpp_impl)
rf_add_library(process_cpp_impl ${process_cpp_impl})
target_compile_features(process_cpp_impl PUBLIC cxx_std_17)
Expand Down Expand Up @@ -145,19 +144,20 @@ if(NOT Windows)
HAVE_CXX_ATOMICS_UNSIGNED_WITH_LIB)
if(NOT HAVE_CXX_ATOMICS_INT_WITH_LIB)
message(
FATAL_ERROR "No native support for std::atomic<int>, or libatomic not found!"
)
FATAL_ERROR
"No native support for std::atomic<int>, or libatomic not found!")
elseif(NOT HAVE_CXX_ATOMICS_SIZE_T_WITH_LIB)
message(
FATAL_ERROR "No native support for std::atomic<size_t>, or libatomic not found!"
)
FATAL_ERROR
"No native support for std::atomic<size_t>, or libatomic not found!")
elseif(NOT HAVE_CXX_ATOMICS_VOID_PTR_WITH_LIB)
message(
FATAL_ERROR "No native support for std::atomic<void*>, or libatomic not found!"
)
FATAL_ERROR
"No native support for std::atomic<void*>, or libatomic not found!")
elseif(NOT HAVE_CXX_ATOMICS_UNSIGNED_WITH_LIB)
message(
FATAL_ERROR "No native support for std::atomic<unsigned>, or libatomic not found!"
FATAL_ERROR
"No native support for std::atomic<unsigned>, or libatomic not found!"
)
else()
message(STATUS "Linking with libatomic for atomics support")
Expand Down
28 changes: 28 additions & 0 deletions src/rapidfuzz/_common_py.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,31 @@ def conv_sequences(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> Sequence[H
return s1, s2

return conv_sequence(s1), conv_sequence(s2)


def common_prefix(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> int:
    """Return the number of leading elements that ``s1`` and ``s2`` share.

    The sequences are walked in lockstep from the front; counting stops at
    the first differing pair or when the shorter sequence is exhausted.
    """
    matched = 0
    for left, right in zip(s1, s2):
        if left != right:
            return matched
        matched += 1

    return matched


def common_suffix(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> int:
    """Return the number of trailing elements that ``s1`` and ``s2`` share.

    Both sequences are walked in lockstep from the back; counting stops at
    the first differing pair or when the shorter sequence is exhausted.
    """
    matched = 0
    for left, right in zip(reversed(s1), reversed(s2)):
        if left != right:
            return matched
        matched += 1

    return matched


def common_affix(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> tuple[int, int]:
    """Return ``(prefix_len, suffix_len)`` for the affixes shared by both inputs.

    The common prefix is measured first. The common suffix is then measured
    on what remains after that prefix is removed from both sequences, so the
    two counts never cover the same element twice.
    """
    prefix_len = common_prefix(s1, s2)
    # Slice the prefix off before looking at the suffix; otherwise equal
    # inputs such as "aa"/"aa" would report overlapping prefix and suffix.
    suffix_len = common_suffix(s1[prefix_len:], s2[prefix_len:])
    return (prefix_len, suffix_len)
1 change: 0 additions & 1 deletion src/rapidfuzz/distance/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -87,4 +87,3 @@ if(RAPIDFUZZ_ARCH_X86)
target_link_libraries(metrics_cpp_sse2 PRIVATE rapidfuzz::rapidfuzz)
install(TARGETS metrics_cpp_sse2 LIBRARY DESTINATION src/rapidfuzz/distance)
endif()

11 changes: 4 additions & 7 deletions src/rapidfuzz/distance/Indel_py.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
from rapidfuzz._utils import is_none
from rapidfuzz.distance._initialize_py import Editops, Opcodes
from rapidfuzz.distance.LCSseq_py import _block_similarity as lcs_seq_block_similarity
from rapidfuzz.distance.LCSseq_py import editops as lcs_seq_editops
from rapidfuzz.distance.LCSseq_py import opcodes as lcs_seq_opcodes
from rapidfuzz.distance.LCSseq_py import similarity as lcs_seq_similarity


Expand Down Expand Up @@ -300,12 +302,7 @@ def editops(
insert s1[4] s2[2]
insert s1[6] s2[5]
"""
if processor is not None:
s1 = processor(s1)
s2 = processor(s2)

s1, s2 = conv_sequences(s1, s2)
raise NotImplementedError
return lcs_seq_editops(s1, s2, processor=processor)


def opcodes(
Expand Down Expand Up @@ -358,4 +355,4 @@ def opcodes(
equal a[4:6] (cd) b[3:5] (cd)
insert a[6:6] () b[5:6] (f)
"""
return editops(s1, s2, processor=processor).as_opcodes()
return lcs_seq_opcodes(s1, s2, processor=processor)
80 changes: 76 additions & 4 deletions src/rapidfuzz/distance/LCSseq_py.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@

from typing import Callable, Hashable, Sequence

from rapidfuzz._common_py import conv_sequences
from rapidfuzz._common_py import common_affix, conv_sequences
from rapidfuzz._utils import is_none
from rapidfuzz.distance._initialize_py import Editops, Opcodes
from rapidfuzz.distance._initialize_py import Editop, Editops, Opcodes


def similarity(
Expand Down Expand Up @@ -254,6 +254,30 @@ def normalized_similarity(
return norm_sim if (score_cutoff is None or norm_sim >= score_cutoff) else 0


def _matrix(s1: Sequence[Hashable], s2: Sequence[Hashable]):
if not s1:
return (0, [])

S = (1 << len(s1)) - 1
block: dict[Hashable, int] = {}
block_get = block.get
x = 1
for ch1 in s1:
block[ch1] = block_get(ch1, 0) | x
x <<= 1

matrix = []
for ch2 in s2:
Matches = block_get(ch2, 0)
u = S & Matches
S = (S + u) | (S - u)
matrix.append(S)

# calculate the equivalent of popcount(~S) in C. This breaks for len(s1) == 0
sim = bin(S)[-len(s1) :].count("0")
return (sim, matrix)


def editops(
s1: Sequence[Hashable],
s2: Sequence[Hashable],
Expand Down Expand Up @@ -298,8 +322,56 @@ def editops(
insert s1[4] s2[2]
insert s1[6] s2[5]
"""
_ = s1, s2, processor
raise NotImplementedError
if processor is not None:
s1 = processor(s1)
s2 = processor(s2)

s1, s2 = conv_sequences(s1, s2)
prefix_len, suffix_len = common_affix(s1, s2)
s1 = s1[prefix_len : len(s1) - suffix_len]
s2 = s2[prefix_len : len(s2) - suffix_len]
sim, matrix = _matrix(s1, s2)

editops = Editops([], 0, 0)
editops._src_len = len(s1) + prefix_len + suffix_len
editops._dest_len = len(s2) + prefix_len + suffix_len

dist = len(s1) + len(s2) - 2 * sim
if dist == 0:
return editops

editop_list = [None] * dist
col = len(s1)
row = len(s2)
while row != 0 and col != 0:
# deletion
if matrix[row - 1] & (1 << (col - 1)):
dist -= 1
col -= 1
editop_list[dist] = Editop("delete", col + prefix_len, row + prefix_len)
else:
row -= 1

# insertion
if row and not (matrix[row - 1] & (1 << (col - 1))):
dist -= 1
editop_list[dist] = Editop("insert", col + prefix_len, row + prefix_len)
# match
else:
col -= 1

while col != 0:
dist -= 1
col -= 1
editop_list[dist] = Editop("delete", col + prefix_len, row + prefix_len)

while row != 0:
dist -= 1
row -= 1
editop_list[dist] = Editop("insert", col + prefix_len, row + prefix_len)

editops._editops = editop_list
return editops


def opcodes(
Expand Down
12 changes: 6 additions & 6 deletions src/rapidfuzz/distance/_initialize_py.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,7 @@ def as_list(self) -> list[Editop]:
This is the equivalent of ``[x for x in editops]``
"""
return self._editops
return [tuple(op) for op in self._editops]

def copy(self) -> Editops:
"""
Expand Down Expand Up @@ -472,15 +472,15 @@ def apply(self, source_string: str, destination_string: str) -> str:

for op in self._editops:
# matches between last and current editop
while src_pos < op.dest_pos:
while src_pos < op.src_pos:
res_str += source_string[src_pos]
src_pos += 1

if op.tag == "replace":
res_str += destination_string[src_pos]
res_str += destination_string[op.dest_pos]
src_pos += 1
elif op.tag == "insert":
res_str += destination_string[src_pos]
res_str += destination_string[op.dest_pos]
elif op.tag == "delete":
src_pos += 1

Expand Down Expand Up @@ -618,7 +618,7 @@ def __iter__(self) -> Iterator[int | str]:

def __repr__(self) -> str:
return (
f"Opcode(tag={self.tag}, src_start={self.src_start}, src_end={self.src_end}, "
f"Opcode(tag={self.tag!r}, src_start={self.src_start}, src_end={self.src_end}, "
f"dest_start={self.dest_start}, dest_end={self.dest_end})"
)

Expand Down Expand Up @@ -711,7 +711,7 @@ def as_list(self) -> list[Opcode]:
This is the equivalent of ``[x for x in opcodes]``
"""
return self._opcodes[::]
return [tuple(op) for op in self._opcodes]

def copy(self) -> Opcodes:
"""
Expand Down
5 changes: 2 additions & 3 deletions src/rapidfuzz/fuzz_py.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from __future__ import annotations

from math import ceil
import itertools
from typing import Any, Callable, Hashable, Sequence

from rapidfuzz._common_py import conv_sequences
Expand Down Expand Up @@ -35,7 +34,7 @@ def _norm_distance(dist: int, lensum: int, score_cutoff: float) -> float:


def _split_sequence(seq: Sequence[Hashable]) -> list[Sequence[Hashable]]:
if isinstance(seq, str) or isinstance(seq, bytes):
if isinstance(seq, (str, bytes)):
return seq.split()

splitted_seq = [[]]
Expand All @@ -60,7 +59,7 @@ def _join_splitted_sequence(seq_list: list[Sequence[Hashable]]):
joined = []
for seq in seq_list:
joined += seq
joined += [ord(' ')]
joined += [ord(" ")]
return joined[:-1]


Expand Down
2 changes: 1 addition & 1 deletion src/rapidfuzz/process_cpp.hpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#pragma once
#include "cpp_common.hpp"
#include "rapidfuzz.h"
#include "taskflow/taskflow.hpp"
#include "taskflow/algorithm/for_each.hpp"
#include "taskflow/taskflow.hpp"
#include <atomic>
#include <chrono>
#include <exception>
Expand Down
43 changes: 42 additions & 1 deletion tests/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,10 @@ def call_and_maybe_catch(call, *args, **kwargs):


def compare_exceptions(e1, e2):
return type(e1) is type(e2) and str(e1) == str(e2)
try:
return str(e1) == str(e2)
except Exception:
return False


def scorer_tester(scorer, s1, s2, **kwargs):
Expand Down Expand Up @@ -156,6 +159,8 @@ class Scorer:
similarity: Any
normalized_distance: Any
normalized_similarity: Any
editops: Any
opcodes: Any


class GenericScorer:
Expand Down Expand Up @@ -185,6 +190,28 @@ def validate_attrs(func1, func2):

self.get_scorer_flags = get_scorer_flags

def _editops(self, s1, s2, **kwargs):
    """Call ``editops`` on every scorer in ``self.scorers`` and cross-check them.

    Each outcome is compared against the first one with
    ``compare_exceptions``. If any outcome is an ``Exception`` instance the
    first outcome is raised; otherwise the first outcome is returned.
    """
    outcomes = []
    for scorer in self.scorers:
        outcomes.append(call_and_maybe_catch(scorer.editops, s1, s2, **kwargs))

    for outcome in outcomes:
        assert compare_exceptions(outcome, outcomes[0])

    for outcome in outcomes:
        if isinstance(outcome, Exception):
            raise outcomes[0]

    return outcomes[0]

def _opcodes(self, s1, s2, **kwargs):
    """Call ``opcodes`` on every scorer in ``self.scorers`` and cross-check them.

    Each outcome is compared against the first one with
    ``compare_exceptions``. If any outcome is an ``Exception`` instance the
    first outcome is raised; otherwise the first outcome is returned.
    """
    outcomes = []
    for scorer in self.scorers:
        outcomes.append(call_and_maybe_catch(scorer.opcodes, s1, s2, **kwargs))

    for outcome in outcomes:
        assert compare_exceptions(outcome, outcomes[0])

    for outcome in outcomes:
        if isinstance(outcome, Exception):
            raise outcomes[0]

    return outcomes[0]

def _distance(self, s1, s2, **kwargs):
symmetric = self.get_scorer_flags(s1, s2, **kwargs)["symmetric"]
tester = symmetric_scorer_tester if symmetric else scorer_tester
Expand Down Expand Up @@ -303,3 +330,17 @@ def normalized_similarity(self, s1, s2, **kwargs):
if "score_cutoff" not in kwargs:
return norm_sim
return self._normalized_similarity(s1, s2, **kwargs)

def editops(self, s1, s2, **kwargs):
    """Return the cross-checked editops for ``s1`` and ``s2``.

    Also computes the opcodes and asserts that converting in either
    direction gives the other representation.
    """
    ops = self._editops(s1, s2, **kwargs)
    codes = self._opcodes(s1, s2, **kwargs)

    # Round-trip consistency between the two representations.
    assert codes.as_editops() == ops
    assert codes == ops.as_opcodes()
    return ops

def opcodes(self, s1, s2, **kwargs):
    """Return the cross-checked opcodes for ``s1`` and ``s2``.

    Also computes the editops and asserts that converting in either
    direction gives the other representation.
    """
    ops = self._editops(s1, s2, **kwargs)
    codes = self._opcodes(s1, s2, **kwargs)

    # Round-trip consistency between the two representations.
    assert codes.as_editops() == ops
    assert codes == ops.as_opcodes()
    return codes
4 changes: 4 additions & 0 deletions tests/distance/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ def create_generic_scorer(func_name, get_scorer_flags):
similarity=getattr(metrics_py, func_name + "_similarity"),
normalized_distance=getattr(metrics_py, func_name + "_normalized_distance"),
normalized_similarity=getattr(metrics_py, func_name + "_normalized_similarity"),
editops=getattr(metrics_py, func_name + "_editops", None),
opcodes=getattr(metrics_py, func_name + "_opcodes", None),
)
]

Expand All @@ -30,6 +32,8 @@ def create_generic_scorer(func_name, get_scorer_flags):
similarity=getattr(mod, func_name + "_similarity"),
normalized_distance=getattr(mod, func_name + "_normalized_distance"),
normalized_similarity=getattr(mod, func_name + "_normalized_similarity"),
editops=getattr(metrics_cpp, func_name + "_editops", None),
opcodes=getattr(metrics_cpp, func_name + "_opcodes", None),
)
for mod in cpp_scorer_modules
]
Expand Down
Loading

0 comments on commit e4bfaec

Please sign in to comment.