Skip to content

Commit

Permalink
implement Levenshtein.editops
Browse files Browse the repository at this point in the history
  • Loading branch information
maxbachmann committed Aug 16, 2023
1 parent e4bfaec commit 1680e72
Show file tree
Hide file tree
Showing 4 changed files with 105 additions and 20 deletions.
2 changes: 1 addition & 1 deletion src/rapidfuzz/distance/LCSseq_py.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ def normalized_similarity(
return norm_sim if (score_cutoff is None or norm_sim >= score_cutoff) else 0


def _matrix(s1: Sequence[Hashable], s2: Sequence[Hashable]):
def _matrix(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> ([int], int):
if not s1:
return (0, [])

Expand Down
94 changes: 91 additions & 3 deletions src/rapidfuzz/distance/Levenshtein_py.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@

from typing import Callable, Hashable, Sequence

from rapidfuzz._common_py import conv_sequences
from rapidfuzz._common_py import common_affix, conv_sequences
from rapidfuzz._utils import is_none
from rapidfuzz.distance import Indel_py as Indel
from rapidfuzz.distance._initialize_py import Editops, Opcodes
from rapidfuzz.distance._initialize_py import Editop, Editops, Opcodes


def _levenshtein_maximum(s1: Sequence[Hashable], s2: Sequence[Hashable], weights: tuple[int, int, int]) -> int:
Expand Down Expand Up @@ -372,6 +372,47 @@ def normalized_similarity(
return norm_sim if (score_cutoff is None or norm_sim >= score_cutoff) else 0


def _matrix(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> int:
if not s1:
return (len(s2), [], [])

VP = (1 << len(s1)) - 1
VN = 0
currDist = len(s1)
mask = 1 << (len(s1) - 1)

block: dict[Hashable, int] = {}
block_get = block.get
x = 1
for ch1 in s1:
block[ch1] = block_get(ch1, 0) | x
x <<= 1

matrix_VP = []
matrix_VN = []
for ch2 in s2:
# Step 1: Computing D0
PM_j = block_get(ch2, 0)
X = PM_j
D0 = (((X & VP) + VP) ^ VP) | X | VN
# Step 2: Computing HP and HN
HP = VN | ~(D0 | VP)
HN = D0 & VP
# Step 3: Computing the value D[m,j]
currDist += (HP & mask) != 0
currDist -= (HN & mask) != 0
# Step 4: Computing Vp and VN
HP = (HP << 1) | 1
HN = HN << 1
VP = HN | ~(D0 | HP)
VN = HP & D0

matrix_VP.append(VP)
matrix_VN.append(VN)

return (currDist, matrix_VP, matrix_VN)


def editops(
s1: Sequence[Hashable],
s2: Sequence[Hashable],
Expand Down Expand Up @@ -425,7 +466,54 @@ def editops(
s2 = processor(s2)

s1, s2 = conv_sequences(s1, s2)
raise NotImplementedError
prefix_len, suffix_len = common_affix(s1, s2)
s1 = s1[prefix_len : len(s1) - suffix_len]
s2 = s2[prefix_len : len(s2) - suffix_len]
dist, VP, VN = _matrix(s1, s2)

editops = Editops([], 0, 0)
editops._src_len = len(s1) + prefix_len + suffix_len
editops._dest_len = len(s2) + prefix_len + suffix_len

if dist == 0:
return editops

editop_list = [None] * dist
col = len(s1)
row = len(s2)
while row != 0 and col != 0:
# deletion
if VP[row - 1] & (1 << (col - 1)):
dist -= 1
col -= 1
editop_list[dist] = Editop("delete", col + prefix_len, row + prefix_len)
else:
row -= 1

# insertion
if row and (VN[row - 1] & (1 << (col - 1))):
dist -= 1
editop_list[dist] = Editop("insert", col + prefix_len, row + prefix_len)
else:
col -= 1

# replace (Matches are not recorded)
if s1[col] != s2[row]:
dist -= 1
editop_list[dist] = Editop("replace", col + prefix_len, row + prefix_len)

while col != 0:
dist -= 1
col -= 1
editop_list[dist] = Editop("delete", col + prefix_len, row + prefix_len)

while row != 0:
dist -= 1
row -= 1
editop_list[dist] = Editop("insert", col + prefix_len, row + prefix_len)

editops._editops = editop_list
return editops


def opcodes(
Expand Down
19 changes: 8 additions & 11 deletions tests/distance/test_Levenshtein.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from __future__ import annotations

from rapidfuzz.distance import Opcode, Opcodes, metrics_cpp
from tests.distance.common import Levenshtein


Expand Down Expand Up @@ -82,18 +81,18 @@ def test_Editops():
"""
basic test for Levenshtein.editops
"""
assert metrics_cpp.levenshtein_editops("0", "").as_list() == [("delete", 0, 0)]
assert metrics_cpp.levenshtein_editops("", "0").as_list() == [("insert", 0, 0)]
assert Levenshtein.editops("0", "").as_list() == [("delete", 0, 0)]
assert Levenshtein.editops("", "0").as_list() == [("insert", 0, 0)]

assert metrics_cpp.levenshtein_editops("00", "0").as_list() == [("delete", 1, 1)]
assert metrics_cpp.levenshtein_editops("0", "00").as_list() == [("insert", 1, 1)]
assert Levenshtein.editops("00", "0").as_list() == [("delete", 1, 1)]
assert Levenshtein.editops("0", "00").as_list() == [("insert", 1, 1)]

assert metrics_cpp.levenshtein_editops("qabxcd", "abycdf").as_list() == [
assert Levenshtein.editops("qabxcd", "abycdf").as_list() == [
("delete", 0, 0),
("replace", 3, 2),
("insert", 6, 5),
]
assert metrics_cpp.levenshtein_editops("Lorem ipsum.", "XYZLorem ABC iPsum").as_list() == [
assert Levenshtein.editops("Lorem ipsum.", "XYZLorem ABC iPsum").as_list() == [
("insert", 0, 0),
("insert", 0, 1),
("insert", 0, 2),
Expand All @@ -105,7 +104,7 @@ def test_Editops():
("delete", 11, 18),
]

ops = metrics_cpp.levenshtein_editops("aaabaaa", "abbaaabba")
ops = Levenshtein.editops("aaabaaa", "abbaaabba")
assert ops.src_len == 7
assert ops.dest_len == 9

Expand All @@ -114,9 +113,7 @@ def test_Opcodes():
"""
basic test for Levenshtein.opcodes
"""
assert metrics_cpp.levenshtein_opcodes("", "abc") == Opcodes(
[Opcode(tag="insert", src_start=0, src_end=0, dest_start=0, dest_end=3)], 0, 3
)
assert Levenshtein.opcodes("", "abc").as_list() == [("insert", 0, 0, 0, 3)]


def test_mbleven():
Expand Down
10 changes: 5 additions & 5 deletions tests/test_hypothesis.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ def test_matching_blocks(s1, s2):
"""
test correct matching block conversion
"""
ops = metrics_cpp.levenshtein_editops(s1, s2)
ops = Levenshtein.editops(s1, s2)
assert ops.as_matching_blocks() == ops.as_opcodes().as_matching_blocks()


Expand All @@ -219,7 +219,7 @@ def test_levenshtein_editops(s1, s2):
"""
test Levenshtein.editops with any sizes
"""
ops = metrics_cpp.levenshtein_editops(s1, s2)
ops = Levenshtein.editops(s1, s2)
assert ops.apply(s1, s2) == s2


Expand All @@ -229,7 +229,7 @@ def test_levenshtein_editops_block(s1, s2):
"""
test Levenshtein.editops for long strings
"""
ops = metrics_cpp.levenshtein_editops(s1, s2)
ops = Levenshtein.editops(s1, s2)
assert ops.apply(s1, s2) == s2


Expand Down Expand Up @@ -259,7 +259,7 @@ def test_levenshtein_opcodes(s1, s2):
"""
test Levenshtein.opcodes with any sizes
"""
ops = metrics_cpp.levenshtein_opcodes(s1, s2)
ops = Levenshtein.opcodes(s1, s2)
assert ops.apply(s1, s2) == s2


Expand All @@ -269,7 +269,7 @@ def test_levenshtein_opcodes_block(s1, s2):
"""
test Levenshtein.opcodes for long strings
"""
ops = metrics_cpp.levenshtein_opcodes(s1, s2)
ops = Levenshtein.opcodes(s1, s2)
assert ops.apply(s1, s2) == s2


Expand Down

0 comments on commit 1680e72

Please sign in to comment.