-
-
Notifications
You must be signed in to change notification settings - Fork 119
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
397d7de
commit dcf6746
Showing
13 changed files
with
1,694 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
## Changelog | ||
|
||
### [2.6.0] - 2022-08- | ||
### [2.6.0] - 2022-08-20 | ||
#### Fixed | ||
- fix hashing for custom classes | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
# todo combine benchmarks of scorers into common code base | ||
import timeit | ||
import pandas | ||
|
||
def benchmark(name, func, setup, lengths, count):
    """Time a statement for every input length and return per-call runtimes.

    Args:
        name: Label used in the progress messages printed to stdout.
        func: Statement handed to ``timeit.Timer`` as the code to time.
        setup: Setup-code template; ``setup.format(length, count)`` is
            built for every entry of ``lengths``.
        lengths: Iterable of values substituted for ``{0}`` in ``setup``.
        count: Value substituted for ``{1}`` in ``setup``; the best timing
            of each length is divided by it.

    Returns:
        A list with one float per entry of ``lengths``: the fastest of
        seven single executions of ``func``, divided by ``count``.
    """
    try:
        # tqdm only draws a progress bar, so the benchmark should still
        # run when it is not installed.
        from tqdm import tqdm
    except ImportError:
        def tqdm(iterable):
            return iterable

    print(f"starting {name}")
    start = timeit.default_timer()
    results = []
    for length in tqdm(lengths):
        timer = timeit.Timer(func, setup=setup.format(length, count))
        # Keep the minimum of the repeats: it is the run least disturbed
        # by other activity on the machine.
        results.append(min(timer.repeat(repeat=7, number=1)) / count)
    stop = timeit.default_timer()
    print(f"finished {name}, Runtime: ", stop - start)
    return results
|
||
# Setup template executed by timeit before each measurement: {0} is the
# string length, {1} the number of random strings compared against `a`.
setup ="""
from rapidfuzz.distance.DamerauLevenshtein import distance
from jellyfish import damerau_levenshtein_distance
import string
import random
random.seed(18)
characters = string.ascii_letters + string.digits + string.whitespace + string.punctuation
a = ''.join(random.choice(characters) for _ in range({0}))
b_list = [''.join(random.choice(characters) for _ in range({0})) for _ in range({1})]
"""

# Odd string lengths 1, 3, ..., 255; each length is compared against
# `count` random strings.
lengths = list(range(1, 256, 2))
count = 1000

# Statement timed for each library, in the order the CSV columns appear.
statements = {
    "rapidfuzz": '[distance(a, b) for b in b_list]',
    "jellyfish": '[damerau_levenshtein_distance(a, b) for b in b_list]',
}

timings = {
    library: benchmark(library, statement, setup, lengths, count)
    for library, statement in statements.items()
}
time_rapidfuzz = timings["rapidfuzz"]
time_jellyfish = timings["jellyfish"]

# One row per string length, one column per library.
df = pandas.DataFrame(data={"length": lengths, **timings})

df.to_csv("results/levenshtein_damerau.csv", sep=',', index=False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
import pandas as pd | ||
import matplotlib.pyplot as plt | ||
|
||
# Load the per-call runtimes written by the benchmark script.
df = pd.read_csv("results/levenshtein_damerau.csv")

# Scale only the timing columns by 1e6 (the y axis is labelled in
# microseconds); the "length" column stays untouched.
timing_columns = df.columns.drop("length")
df[timing_columns] = df[timing_columns] * (1000 * 1000)

ax = df.plot(x="length")

plt.xticks(list(range(0, 257, 64)))

plt.title("Performance comparison of the \nDamerauLevenshtein similarity in different libraries")
plt.xlabel("string length [in characters]")
plt.ylabel("runtime [μs]")
# Anchor both axes at the origin.
ax.set_xlim(left=0)
ax.set_ylim(bottom=0)
plt.grid()
plt.show()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
length,rapidfuzz,jellyfish | ||
1,1.3186500291340052e-07,2.2917869937373324e-06 | ||
3,1.5496299602091313e-07,2.3576690000481903e-06 | ||
5,1.9305800378788262e-07,2.6081079995492475e-06 | ||
7,2.5326300237793474e-07,2.7545159973669795e-06 | ||
9,3.2603699946776033e-07,2.835453997249715e-06 | ||
11,4.1582799167372286e-07,3.0755670013604687e-06 | ||
13,5.237079894868657e-07,3.348548008943908e-06 | ||
15,6.503320037154481e-07,3.5760050086537376e-06 | ||
17,8.075779915088788e-07,4.094708987395279e-06 | ||
19,9.830609924392775e-07,4.471113003091886e-06 | ||
21,1.1693169944919646e-06,4.878691994235851e-06 | ||
23,1.3880559999961407e-06,5.320055002812296e-06 | ||
25,1.6101539949886501e-06,5.812328003230505e-06 | ||
27,1.8436819955240934e-06,6.365544002619572e-06 | ||
29,2.1035520039731635e-06,7.1151800075313074e-06 | ||
31,2.3763119970681144e-06,7.682306997594423e-06 | ||
33,2.6762380002764983e-06,8.129071007715539e-06 | ||
35,3.0065760074649004e-06,8.772588000283577e-06 | ||
37,3.334196007926948e-06,9.501428008661605e-06 | ||
39,3.7296579976100476e-06,1.0552032996201888e-05 | ||
41,4.054303994053043e-06,1.1043670005165041e-05 | ||
43,4.501789997448213e-06,1.1927387007744983e-05 | ||
45,4.832107006222941e-06,1.2792505003744736e-05 | ||
47,5.278729004203342e-06,1.3642278994666412e-05 | ||
49,5.832850991282612e-06,1.5023024010588416e-05 | ||
51,6.299954999121837e-06,1.601715901051648e-05 | ||
53,6.834098006947898e-06,1.6562082004384137e-05 | ||
55,8.818272995995358e-06,1.7615651988307947e-05 | ||
57,8.89724399894476e-06,1.8693470992729998e-05 | ||
59,9.524909997708164e-06,1.9757230998948215e-05 | ||
61,9.858479999820703e-06,2.0921860006637873e-05 | ||
63,1.0892339007114061e-05,2.2059237002395092e-05 | ||
65,1.1153511994052679e-05,2.3281863002921454e-05 | ||
67,1.1815436999313534e-05,2.4583477003034204e-05 | ||
69,1.2513476991443895e-05,2.5820324997766875e-05 | ||
71,1.2912972990307025e-05,2.7160885001649148e-05 | ||
73,1.3612305003334767e-05,2.8512088989373295e-05 | ||
75,1.4474694995442406e-05,2.994079899508506e-05 | ||
77,1.5050637011881918e-05,3.1362107998575085e-05 | ||
79,1.6278285998851062e-05,3.2830506999744104e-05 | ||
81,1.7094506009016185e-05,3.4311275012441914e-05 | ||
83,1.7455348002840766e-05,3.5927311007981186e-05 | ||
85,1.8258039010106587e-05,3.744289500173181e-05 | ||
87,1.910153501376044e-05,3.909152599226218e-05 | ||
89,1.9952228001784534e-05,4.0773032989818604e-05 | ||
91,2.1386908003478312e-05,4.260684800101444e-05 | ||
93,2.1697513002436608e-05,4.424196299805771e-05 | ||
95,2.2654768006759697e-05,4.599233198678121e-05 | ||
97,2.3587388001033103e-05,4.7841726001934146e-05 | ||
99,2.4578259995905682e-05,4.9719257003744136e-05 | ||
101,2.555219200439751e-05,5.1562903987360186e-05 | ||
103,2.6544245003606194e-05,5.344945400429424e-05 | ||
105,2.7561848997720517e-05,5.546425998909399e-05 | ||
107,2.8663237986620516e-05,5.749664599716198e-05 | ||
109,2.9656843005795963e-05,5.9485555000719614e-05 | ||
111,3.071375100989826e-05,6.161657799384556e-05 | ||
113,3.182354199816473e-05,6.362505399738438e-05 | ||
115,3.29423270013649e-05,6.59617450000951e-05 | ||
117,3.406323600211181e-05,6.807827700686175e-05 | ||
119,3.523617400787771e-05,7.033873800537548e-05 | ||
121,3.641733099357225e-05,7.259436600725166e-05 | ||
123,3.760053600126412e-05,7.488666100834962e-05 | ||
125,3.8775731009081935e-05,7.721088700054678e-05 | ||
127,4.053250300057698e-05,7.95256010023877e-05 | ||
129,4.242038099619094e-05,8.207386899448466e-05 | ||
131,4.2548033001367e-05,8.45927530026529e-05 | ||
133,4.381964699132368e-05,8.702767000067979e-05 | ||
135,4.640341601043474e-05,8.967516900156625e-05 | ||
137,4.6470957007841206e-05,9.215094700630289e-05 | ||
139,4.785054900276009e-05,9.477479199995287e-05 | ||
141,4.920196099556051e-05,9.737256199878174e-05 | ||
143,5.0588134006829936e-05,0.00010015238399500959 | ||
145,5.2004214012413286e-05,0.00010274529000162146 | ||
147,5.339522199938074e-05,0.0001055051699950127 | ||
149,5.4872838998562654e-05,0.00010825543300597928 | ||
151,5.630636200658046e-05,0.00011108420000527986 | ||
153,5.778362399723846e-05,0.00011400010000215843 | ||
155,5.929304200981278e-05,0.0001169092579948483 | ||
157,6.082800000149291e-05,0.00011980927299009637 | ||
159,6.234696898900438e-05,0.00012275251699611544 | ||
161,6.389497400959953e-05,0.00012580490300024395 | ||
163,6.546421999519225e-05,0.00012879740999778731 | ||
165,6.706594899878838e-05,0.00013189049999346025 | ||
167,6.86899949942017e-05,0.00013508607700350695 | ||
169,7.03255730040837e-05,0.0001384543679887429 | ||
171,7.194524399528745e-05,0.0001415441660064971 | ||
173,7.364011400204617e-05,0.00014470038999570535 | ||
175,7.530159399902914e-05,0.00014812490799522493 | ||
177,7.703056299942545e-05,0.00015151044201047625 | ||
179,7.877000598818994e-05,0.0001550259229989024 | ||
181,8.056354499422014e-05,0.00015840976098843384 | ||
183,8.230255900707561e-05,0.00016146046300127636 | ||
185,8.411060999787878e-05,0.0001648237220069859 | ||
187,8.593102199665736e-05,0.00016880192600365263 | ||
189,8.775954999146052e-05,0.00017214770999271421 | ||
191,8.957593599916436e-05,0.0001756833649997134 | ||
193,9.143969698925502e-05,0.00017933695799729322 | ||
195,9.335860000282991e-05,0.00018320062900602352 | ||
197,9.528399800183252e-05,0.00018721782499051188 | ||
199,9.717094000370707e-05,0.00019091358900186605 | ||
201,9.915663600258995e-05,0.00019457660699845293 | ||
203,0.00010111445700749755,0.00019843029799812938 | ||
205,0.0001030436420114711,0.00020250573099474421 | ||
207,0.00010511462000431493,0.0002057212560030166 | ||
209,0.00010707404400454834,0.00020976610900834202 | ||
211,0.00010923221000120975,0.00021393066599557643 | ||
213,0.00011123345700616483,0.000218106002008426 | ||
215,0.00011331091000465676,0.0002226826880068984 | ||
217,0.00011541820199636276,0.00022656015200482217 | ||
219,0.00011745439701189752,0.00023122245600097814 | ||
221,0.00011965750799572561,0.00023512257800030055 | ||
223,0.0001217683919967385,0.00023922805799520575 | ||
225,0.00012395016000664327,0.00024466573499375957 | ||
227,0.00012613116498687304,0.00024804236000636594 | ||
229,0.00012834949900570792,0.000252487404999556 | ||
231,0.00013057007901079486,0.0002572519809909863 | ||
233,0.00013282599799276795,0.0002619540010055061 | ||
235,0.00013514574500732125,0.00026779574999818577 | ||
237,0.00013738959899637848,0.0002701820999936899 | ||
239,0.00013969385100062937,0.00027524539300065955 | ||
241,0.00014208142399729695,0.0002794181229983224 | ||
243,0.00014448049099883065,0.00028530672899796625 | ||
245,0.00014690193100250326,0.00028930913399381096 | ||
247,0.0001492562710045604,0.00029400941300264095 | ||
249,0.0001517066380038159,0.00029908111700206063 | ||
251,0.00015404325399140362,0.0003059594859951176 | ||
253,0.0001565057249972597,0.0003120929349970538 | ||
255,0.00015890150600171183,0.00031580220701289365 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
Damerau Levenshtein | ||
------------------- | ||
|
||
Functions | ||
^^^^^^^^^ | ||
|
||
distance | ||
~~~~~~~~ | ||
.. autofunction:: rapidfuzz.distance.DamerauLevenshtein.distance | ||
|
||
normalized_distance | ||
~~~~~~~~~~~~~~~~~~~ | ||
.. autofunction:: rapidfuzz.distance.DamerauLevenshtein.normalized_distance | ||
|
||
similarity | ||
~~~~~~~~~~ | ||
.. autofunction:: rapidfuzz.distance.DamerauLevenshtein.similarity | ||
|
||
normalized_similarity | ||
~~~~~~~~~~~~~~~~~~~~~ | ||
.. autofunction:: rapidfuzz.distance.DamerauLevenshtein.normalized_similarity | ||
|
||
Performance | ||
^^^^^^^^^^^ | ||
The following image shows a benchmark of the Damerau Levenshtein distance in | ||
RapidFuzz and jellyfish. Both have a time complexity of ``O(NM)``. However, RapidFuzz | ||
only requires ``O(N + M)`` memory, while the implementation in jellyfish | ||
has a memory usage of ``O(NM)``. | ||
|
||
.. image:: img/damerau_levenshtein.svg | ||
:align: center |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.