Skip to content

Commit

Permalink
release v2.6.0
Browse files Browse the repository at this point in the history
  • Loading branch information
maxbachmann committed Aug 20, 2022
1 parent 397d7de commit dcf6746
Show file tree
Hide file tree
Showing 13 changed files with 1,694 additions and 12 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
## Changelog

### [2.6.0] - 2022-08-
### [2.6.0] - 2022-08-20
#### Fixed
- fix hashing for custom classes

Expand Down
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ else()
add_library(Taskflow::Taskflow ALIAS Taskflow)
endif()

find_package(rapidfuzz 1.1.1 QUIET)
find_package(rapidfuzz 1.2.0 QUIET)
if (rapidfuzz_FOUND)
message("Using system supplied version of rapidfuzz-cpp")
else()
Expand Down
46 changes: 46 additions & 0 deletions bench/benchmark_damerau_levenshtein.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# todo combine benchmarks of scorers into common code base
import timeit
import pandas

def benchmark(name, func, setup, lengths, count):
    """Time ``func`` once per entry of ``lengths`` and return the runtimes.

    Args:
        name: Label used in the start/finish messages printed to stdout.
        func: Statement handed to ``timeit.Timer`` as the code to measure.
        setup: Setup-code template. It is formatted with the current length
            as ``{0}`` and ``count`` as ``{1}`` before every measurement.
        lengths: Iterable of values to benchmark, one result per value.
        count: Value substituted for ``{1}`` in ``setup``; every measured
            time is divided by it.

    Returns:
        A list with one float per element of ``lengths``: the fastest of
        seven single-execution timings, divided by ``count``.
    """
    try:
        # tqdm only draws a progress bar; the measurement itself does not
        # depend on it, so fall back to plain iteration when it is missing.
        from tqdm import tqdm
    except ImportError:
        def tqdm(iterable):
            return iterable

    print(f"starting {name}")
    start = timeit.default_timer()
    results = []
    for length in tqdm(lengths):
        test = timeit.Timer(func, setup=setup.format(length, count))
        # Keep only the fastest of seven runs of a single execution each.
        results.append(min(test.repeat(repeat=7, number=1)) / count)
    stop = timeit.default_timer()
    print(f"finished {name}, Runtime: ", stop - start)
    return results

# Setup template run by timeit before each measurement: ``{0}`` receives the
# string length and ``{1}`` the number of strings placed in ``b_list``.
setup = """
from rapidfuzz.distance.DamerauLevenshtein import distance
from jellyfish import damerau_levenshtein_distance
import string
import random
random.seed(18)
characters = string.ascii_letters + string.digits + string.whitespace + string.punctuation
a = ''.join(random.choice(characters) for _ in range({0}))
b_list = [''.join(random.choice(characters) for _ in range({0})) for _ in range({1})]
"""

# Every odd length from 1 up to and including 255.
lengths = [*range(1, 256, 2)]
count = 1000

# Both libraries are measured with the same setup, lengths and count.
time_rapidfuzz = benchmark(
    "rapidfuzz", "[distance(a, b) for b in b_list]", setup, lengths, count
)

time_jellyfish = benchmark(
    "jellyfish",
    "[damerau_levenshtein_distance(a, b) for b in b_list]",
    setup,
    lengths,
    count,
)

# One row per length, one timing column per library.
df = pandas.DataFrame(
    data=dict(length=lengths, rapidfuzz=time_rapidfuzz, jellyfish=time_jellyfish)
)

df.to_csv("results/levenshtein_damerau.csv", sep=",", index=False)
20 changes: 20 additions & 0 deletions bench/benchmark_visualize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import pandas as pd
import matplotlib.pyplot as plt

# CSV with a "length" column plus one timing column per library.
df = pd.read_csv("results/levenshtein_damerau.csv")

# Scale only the timing columns by 10^6 so they match the "μs" axis label.
# "length" is the x-axis and is left untouched instead of being multiplied
# and divided back afterwards.
timing_columns = df.columns.drop("length")
df[timing_columns] = df[timing_columns] * (1000 * 1000)

ax = df.plot(x="length")

plt.xticks(list(range(0, 257, 64)))

plt.title("Performance comparison of the \nDamerauLevenshtein similarity in different libraries")
plt.xlabel("string length [in characters]")
plt.ylabel("runtime [μs]")
# Start both axes at zero.
ax.set_xlim(left=0)
ax.set_ylim(bottom=0)
plt.grid()
plt.show()
129 changes: 129 additions & 0 deletions bench/results/levenshtein_damerau.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
length,rapidfuzz,jellyfish
1,1.3186500291340052e-07,2.2917869937373324e-06
3,1.5496299602091313e-07,2.3576690000481903e-06
5,1.9305800378788262e-07,2.6081079995492475e-06
7,2.5326300237793474e-07,2.7545159973669795e-06
9,3.2603699946776033e-07,2.835453997249715e-06
11,4.1582799167372286e-07,3.0755670013604687e-06
13,5.237079894868657e-07,3.348548008943908e-06
15,6.503320037154481e-07,3.5760050086537376e-06
17,8.075779915088788e-07,4.094708987395279e-06
19,9.830609924392775e-07,4.471113003091886e-06
21,1.1693169944919646e-06,4.878691994235851e-06
23,1.3880559999961407e-06,5.320055002812296e-06
25,1.6101539949886501e-06,5.812328003230505e-06
27,1.8436819955240934e-06,6.365544002619572e-06
29,2.1035520039731635e-06,7.1151800075313074e-06
31,2.3763119970681144e-06,7.682306997594423e-06
33,2.6762380002764983e-06,8.129071007715539e-06
35,3.0065760074649004e-06,8.772588000283577e-06
37,3.334196007926948e-06,9.501428008661605e-06
39,3.7296579976100476e-06,1.0552032996201888e-05
41,4.054303994053043e-06,1.1043670005165041e-05
43,4.501789997448213e-06,1.1927387007744983e-05
45,4.832107006222941e-06,1.2792505003744736e-05
47,5.278729004203342e-06,1.3642278994666412e-05
49,5.832850991282612e-06,1.5023024010588416e-05
51,6.299954999121837e-06,1.601715901051648e-05
53,6.834098006947898e-06,1.6562082004384137e-05
55,8.818272995995358e-06,1.7615651988307947e-05
57,8.89724399894476e-06,1.8693470992729998e-05
59,9.524909997708164e-06,1.9757230998948215e-05
61,9.858479999820703e-06,2.0921860006637873e-05
63,1.0892339007114061e-05,2.2059237002395092e-05
65,1.1153511994052679e-05,2.3281863002921454e-05
67,1.1815436999313534e-05,2.4583477003034204e-05
69,1.2513476991443895e-05,2.5820324997766875e-05
71,1.2912972990307025e-05,2.7160885001649148e-05
73,1.3612305003334767e-05,2.8512088989373295e-05
75,1.4474694995442406e-05,2.994079899508506e-05
77,1.5050637011881918e-05,3.1362107998575085e-05
79,1.6278285998851062e-05,3.2830506999744104e-05
81,1.7094506009016185e-05,3.4311275012441914e-05
83,1.7455348002840766e-05,3.5927311007981186e-05
85,1.8258039010106587e-05,3.744289500173181e-05
87,1.910153501376044e-05,3.909152599226218e-05
89,1.9952228001784534e-05,4.0773032989818604e-05
91,2.1386908003478312e-05,4.260684800101444e-05
93,2.1697513002436608e-05,4.424196299805771e-05
95,2.2654768006759697e-05,4.599233198678121e-05
97,2.3587388001033103e-05,4.7841726001934146e-05
99,2.4578259995905682e-05,4.9719257003744136e-05
101,2.555219200439751e-05,5.1562903987360186e-05
103,2.6544245003606194e-05,5.344945400429424e-05
105,2.7561848997720517e-05,5.546425998909399e-05
107,2.8663237986620516e-05,5.749664599716198e-05
109,2.9656843005795963e-05,5.9485555000719614e-05
111,3.071375100989826e-05,6.161657799384556e-05
113,3.182354199816473e-05,6.362505399738438e-05
115,3.29423270013649e-05,6.59617450000951e-05
117,3.406323600211181e-05,6.807827700686175e-05
119,3.523617400787771e-05,7.033873800537548e-05
121,3.641733099357225e-05,7.259436600725166e-05
123,3.760053600126412e-05,7.488666100834962e-05
125,3.8775731009081935e-05,7.721088700054678e-05
127,4.053250300057698e-05,7.95256010023877e-05
129,4.242038099619094e-05,8.207386899448466e-05
131,4.2548033001367e-05,8.45927530026529e-05
133,4.381964699132368e-05,8.702767000067979e-05
135,4.640341601043474e-05,8.967516900156625e-05
137,4.6470957007841206e-05,9.215094700630289e-05
139,4.785054900276009e-05,9.477479199995287e-05
141,4.920196099556051e-05,9.737256199878174e-05
143,5.0588134006829936e-05,0.00010015238399500959
145,5.2004214012413286e-05,0.00010274529000162146
147,5.339522199938074e-05,0.0001055051699950127
149,5.4872838998562654e-05,0.00010825543300597928
151,5.630636200658046e-05,0.00011108420000527986
153,5.778362399723846e-05,0.00011400010000215843
155,5.929304200981278e-05,0.0001169092579948483
157,6.082800000149291e-05,0.00011980927299009637
159,6.234696898900438e-05,0.00012275251699611544
161,6.389497400959953e-05,0.00012580490300024395
163,6.546421999519225e-05,0.00012879740999778731
165,6.706594899878838e-05,0.00013189049999346025
167,6.86899949942017e-05,0.00013508607700350695
169,7.03255730040837e-05,0.0001384543679887429
171,7.194524399528745e-05,0.0001415441660064971
173,7.364011400204617e-05,0.00014470038999570535
175,7.530159399902914e-05,0.00014812490799522493
177,7.703056299942545e-05,0.00015151044201047625
179,7.877000598818994e-05,0.0001550259229989024
181,8.056354499422014e-05,0.00015840976098843384
183,8.230255900707561e-05,0.00016146046300127636
185,8.411060999787878e-05,0.0001648237220069859
187,8.593102199665736e-05,0.00016880192600365263
189,8.775954999146052e-05,0.00017214770999271421
191,8.957593599916436e-05,0.0001756833649997134
193,9.143969698925502e-05,0.00017933695799729322
195,9.335860000282991e-05,0.00018320062900602352
197,9.528399800183252e-05,0.00018721782499051188
199,9.717094000370707e-05,0.00019091358900186605
201,9.915663600258995e-05,0.00019457660699845293
203,0.00010111445700749755,0.00019843029799812938
205,0.0001030436420114711,0.00020250573099474421
207,0.00010511462000431493,0.0002057212560030166
209,0.00010707404400454834,0.00020976610900834202
211,0.00010923221000120975,0.00021393066599557643
213,0.00011123345700616483,0.000218106002008426
215,0.00011331091000465676,0.0002226826880068984
217,0.00011541820199636276,0.00022656015200482217
219,0.00011745439701189752,0.00023122245600097814
221,0.00011965750799572561,0.00023512257800030055
223,0.0001217683919967385,0.00023922805799520575
225,0.00012395016000664327,0.00024466573499375957
227,0.00012613116498687304,0.00024804236000636594
229,0.00012834949900570792,0.000252487404999556
231,0.00013057007901079486,0.0002572519809909863
233,0.00013282599799276795,0.0002619540010055061
235,0.00013514574500732125,0.00026779574999818577
237,0.00013738959899637848,0.0002701820999936899
239,0.00013969385100062937,0.00027524539300065955
241,0.00014208142399729695,0.0002794181229983224
243,0.00014448049099883065,0.00028530672899796625
245,0.00014690193100250326,0.00028930913399381096
247,0.0001492562710045604,0.00029400941300264095
249,0.0001517066380038159,0.00029908111700206063
251,0.00015404325399140362,0.0003059594859951176
253,0.0001565057249972597,0.0003120929349970538
255,0.00015890150600171183,0.00031580220701289365
31 changes: 31 additions & 0 deletions docs/Usage/distance/DamerauLevenshtein.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
Damerau Levenshtein
-------------------

Functions
^^^^^^^^^

distance
~~~~~~~~
.. autofunction:: rapidfuzz.distance.DamerauLevenshtein.distance

normalized_distance
~~~~~~~~~~~~~~~~~~~
.. autofunction:: rapidfuzz.distance.DamerauLevenshtein.normalized_distance

similarity
~~~~~~~~~~
.. autofunction:: rapidfuzz.distance.DamerauLevenshtein.similarity

normalized_similarity
~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: rapidfuzz.distance.DamerauLevenshtein.normalized_similarity

Performance
^^^^^^^^^^^
The following image shows a benchmark of the Damerau Levenshtein distance in
RapidFuzz and jellyfish. Both have a time complexity of ``O(NM)``. However, RapidFuzz
only requires ``O(N + M)`` memory, while the implementation in jellyfish
has a memory usage of ``O(NM)``.

.. image:: img/damerau_levenshtein.svg
:align: center
6 changes: 0 additions & 6 deletions docs/Usage/distance/Indel.rst
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,6 @@ opcodes

Performance
^^^^^^^^^^^
Since the Levenshtein module uses different implementations based on the weights
used, this leads to different performance characteristics. The following sections
show the performance for the different possible weights.

Indel
~~~~~
The following image shows a benchmark of the Indel distance in RapidFuzz
and python-Levenshtein. Similar to the normal Levenshtein distance
python-Levenshtein uses an implementation with a time complexity of ``O(NM)``,
Expand Down
Loading

0 comments on commit dcf6746

Please sign in to comment.