improve wrapping performance
maxbachmann committed Nov 1, 2023
1 parent b6069c7 commit 5b927f8
Showing 1 changed file with 161 additions and 146 deletions.
307 changes: 161 additions & 146 deletions src/Levenshtein/__init__.py
@@ -37,6 +37,27 @@
    seqratio,
)


def _copy_func(f, name, doc):
    """Based on https://stackoverflow.com/a/13503277/11335032"""
    import types
    import functools

    g = types.FunctionType(
        f.__code__,
        f.__globals__,
        name=f.__name__,
        argdefs=f.__defaults__,
        closure=f.__closure__,
    )
    g = functools.update_wrapper(g, f)
    g.__kwdefaults__ = f.__kwdefaults__
    g.__name__ = name
    g.__qualname__ = name
    g.__doc__ = doc
    return g
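
# A minimal usage sketch of the helper above (illustrative only; `_add` and
# `plus` are hypothetical names, not part of this module). The returned copy
# keeps the original behaviour but carries the new __name__ and __doc__:
#
#   >>> def _add(a, b=1):
#   ...     return a + b
#   >>> plus = _copy_func(_add, "plus", "Add two numbers.")
#   >>> plus(2)            # behaviour of the original is preserved
#   3
#   >>> plus.__name__      # metadata is replaced
#   'plus'
#   >>> plus.__doc__
#   'Add two numbers.'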


__all__ = [
"quickmedian",
"median",
@@ -57,152 +78,146 @@
"inverse",
]


def distance(s1, s2, *, weights=(1, 1, 1), processor=None, score_cutoff=None):
    """
    Calculates the minimum number of insertions, deletions, and substitutions
    required to change one sequence into the other according to Levenshtein with custom
    costs for insertion, deletion and substitution

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    weights : Tuple[int, int, int] or None, optional
        The weights for the three operations in the form
        (insertion, deletion, substitution). Default is (1, 1, 1),
        which gives all three operations a weight of 1.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : int, optional
        Maximum distance between s1 and s2, that is
        considered as a result. If the distance is bigger than score_cutoff,
        score_cutoff + 1 is returned instead. Default is None, which deactivates
        this behaviour.

    Returns
    -------
    distance : int
        distance between s1 and s2

    Raises
    ------
    ValueError
        If unsupported weights are provided a ValueError is thrown

    Examples
    --------
    Find the Levenshtein distance between two strings:

    >>> from Levenshtein import distance
    >>> distance("lewenstein", "levenshtein")
    2

    Setting a maximum distance allows the implementation to select
    a more efficient implementation:

    >>> distance("lewenstein", "levenshtein", score_cutoff=1)
    2

    It is possible to select different weights by passing a `weights`
    tuple.

    >>> distance("lewenstein", "levenshtein", weights=(1,1,2))
    3
    """
    return _Levenshtein.distance(
        s1, s2, weights=weights, processor=processor, score_cutoff=score_cutoff
    )


def ratio(s1, s2, *, processor=None, score_cutoff=None):
    """
    Calculates a normalized indel similarity in the range [0, 1].
    This is calculated as ``1 - normalized_distance``

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_sim < score_cutoff 0 is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    norm_sim : float
        normalized similarity between s1 and s2 as a float between 0 and 1.0

    Examples
    --------
    Find the normalized Indel similarity between two strings:

    >>> from Levenshtein import ratio
    >>> ratio("lewenstein", "levenshtein")
    0.8571428571428572

    Setting a score_cutoff allows the implementation to select
    a more efficient implementation:

    >>> ratio("lewenstein", "levenshtein", score_cutoff=0.9)
    0.0

    When a different processor is used s1 and s2 do not have to be strings

    >>> ratio(["lewenstein"], ["levenshtein"], processor=lambda s: s[0])
    0.8571428571428572
    """
    return _Indel.normalized_similarity(
        s1, s2, processor=processor, score_cutoff=score_cutoff
    )


def hamming(s1, s2, *, pad=True, processor=None, score_cutoff=None):
    """
    Calculates the Hamming distance between two strings.
    The Hamming distance is defined as the number of positions
    where the two strings differ. It describes the minimum
    number of substitutions required to transform s1 into s2.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    pad : bool, optional
        should strings be padded if there is a length difference.
        If pad is False and strings have a different length
        a ValueError is thrown instead. Default is True.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : int or None, optional
        Maximum distance between s1 and s2, that is
        considered as a result. If the distance is bigger than score_cutoff,
        score_cutoff + 1 is returned instead. Default is None, which deactivates
        this behaviour.

    Returns
    -------
    distance : int
        distance between s1 and s2

    Raises
    ------
    ValueError
        If s1 and s2 have a different length
    """
    return _Hamming.distance(s1, s2, pad=pad, processor=processor, score_cutoff=score_cutoff)
_distance_doc = """
Calculates the minimum number of insertions, deletions, and substitutions
required to change one sequence into the other according to Levenshtein with custom
costs for insertion, deletion and substitution
Parameters
----------
s1 : Sequence[Hashable]
First string to compare.
s2 : Sequence[Hashable]
Second string to compare.
weights : Tuple[int, int, int] or None, optional
The weights for the three operations in the form
(insertion, deletion, substitution). Default is (1, 1, 1),
which gives all three operations a weight of 1.
processor: callable, optional
Optional callable that is used to preprocess the strings before
comparing them. Default is None, which deactivates this behaviour.
score_cutoff : int, optional
Maximum distance between s1 and s2, that is
considered as a result. If the distance is bigger than score_cutoff,
score_cutoff + 1 is returned instead. Default is None, which deactivates
this behaviour.
Returns
-------
distance : int
distance between s1 and s2
Raises
------
ValueError
If unsupported weights are provided a ValueError is thrown
Examples
--------
Find the Levenshtein distance between two strings:
>>> from Levenshtein import distance
>>> distance("lewenstein", "levenshtein")
2
Setting a maximum distance allows the implementation to select
a more efficient implementation:
>>> distance("lewenstein", "levenshtein", score_cutoff=1)
2
It is possible to select different weights by passing a `weight`
tuple.
>>> distance("lewenstein", "levenshtein", weights=(1,1,2))
3
"""
distance = _copy_func(_Levenshtein.distance, "distance", _distance_doc)
distance._RF_OriginalScorer = distance
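
# Hedged sketch of the effect of the wrapping above (illustrative only): the
# module-level `distance` is now a renamed copy of _Levenshtein.distance
# rather than a `def` wrapper that forwards every call, so the extra Python
# call layer per invocation goes away while the public name and docstring are
# preserved. That this is the intent behind "improve wrapping performance" is
# an assumption here.
#
#   >>> distance.__name__
#   'distance'
#   >>> distance("lewenstein", "levenshtein")   # same result as before
#   2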


ratio_doc = """
Calculates a normalized indel similarity in the range [0, 1].
This is calculated as ``1 - normalized_distance``
Parameters
----------
s1 : Sequence[Hashable]
First string to compare.
s2 : Sequence[Hashable]
Second string to compare.
processor: callable, optional
Optional callable that is used to preprocess the strings before
comparing them. Default is None, which deactivates this behaviour.
score_cutoff : float, optional
Optional argument for a score threshold as a float between 0 and 1.0.
For norm_sim < score_cutoff 0 is returned instead. Default is 0,
which deactivates this behaviour.
Returns
-------
norm_sim : float
normalized similarity between s1 and s2 as a float between 0 and 1.0
Examples
--------
Find the normalized Indel similarity between two strings:
>>> from Levenshtein import ratio
>>> ratio("lewenstein", "levenshtein")
0.85714285714285
Setting a score_cutoff allows the implementation to select
a more efficient implementation:
>>> ratio("lewenstein", "levenshtein", score_cutoff=0.9)
0.0
When a different processor is used s1 and s2 do not have to be strings
>>> ratio(["lewenstein"], ["levenshtein"], processor=lambda s: s[0])
0.8571428571428572
"""
ratio = _copy_func(_Indel.normalized_similarity, "ratio", ratio_doc)
ratio._RF_OriginalScorer = ratio
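
# Illustrative check (values taken from the docstring above): the copied
# `ratio` is the Indel normalized similarity, i.e. ``1 - normalized_distance``.
#
#   >>> round(ratio("lewenstein", "levenshtein"), 4)
#   0.8571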

hamming_doc = """
Calculates the Hamming distance between two strings.
The hamming distance is defined as the number of positions
where the two strings differ. It describes the minimum
amount of substitutions required to transform s1 into s2.
Parameters
----------
s1 : Sequence[Hashable]
First string to compare.
s2 : Sequence[Hashable]
Second string to compare.
pad : bool, optional
should strings be padded if there is a length difference.
If pad is False and strings have a different length
a ValueError is thrown instead. Default is True.
processor: callable, optional
Optional callable that is used to preprocess the strings before
comparing them. Default is None, which deactivates this behaviour.
score_cutoff : int or None, optional
Maximum distance between s1 and s2, that is
considered as a result. If the distance is bigger than score_cutoff,
score_cutoff + 1 is returned instead. Default is None, which deactivates
this behaviour.
Returns
-------
distance : int
distance between s1 and s2
Raises
------
ValueError
If s1 and s2 have a different length
"""
hamming = _copy_func(_Hamming.distance, "hamming", hamming_doc)
hamming._RF_OriginalScorer = hamming
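
# Hypothetical usage example for the copied `hamming` (the docstring above has
# no Examples section): positions at which the sequences differ are counted,
# and with the default pad=True a length difference is padded rather than
# raising a ValueError.
#
#   >>> hamming("hamming", "humming")
#   1
#   >>> hamming("hamming", "hammings")    # padded; the extra "s" counts once
#   1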


def jaro(s1, s2, *, processor=None, score_cutoff=None) -> float:
