From 5b927f8da985e82ec3bf9a2d326bf00e8b164860 Mon Sep 17 00:00:00 2001 From: Max Bachmann Date: Wed, 1 Nov 2023 18:50:26 +0100 Subject: [PATCH] improve wrapping performance --- src/Levenshtein/__init__.py | 307 +++++++++++++++++++----------------- 1 file changed, 161 insertions(+), 146 deletions(-) diff --git a/src/Levenshtein/__init__.py b/src/Levenshtein/__init__.py index 064c9d0..fc93c06 100644 --- a/src/Levenshtein/__init__.py +++ b/src/Levenshtein/__init__.py @@ -37,6 +37,27 @@ seqratio, ) + +def _copy_func(f, name, doc): + """Based on https://stackoverflow.com/a/13503277/11335032""" + import types + import functools + + g = types.FunctionType( + f.__code__, + f.__globals__, + name=f.__name__, + argdefs=f.__defaults__, + closure=f.__closure__, + ) + g = functools.update_wrapper(g, f) + g.__kwdefaults__ = f.__kwdefaults__ + g.__name__ = name + g.__qualname__ = name + g.__doc__ = doc + return g + + __all__ = [ "quickmedian", "median", @@ -57,152 +78,146 @@ "inverse", ] - -def distance(s1, s2, *, weights=(1, 1, 1), processor=None, score_cutoff=None): - """ - Calculates the minimum number of insertions, deletions, and substitutions - required to change one sequence into the other according to Levenshtein with custom - costs for insertion, deletion and substitution - - Parameters - ---------- - s1 : Sequence[Hashable] - First string to compare. - s2 : Sequence[Hashable] - Second string to compare. - weights : Tuple[int, int, int] or None, optional - The weights for the three operations in the form - (insertion, deletion, substitution). Default is (1, 1, 1), - which gives all three operations a weight of 1. - processor: callable, optional - Optional callable that is used to preprocess the strings before - comparing them. Default is None, which deactivates this behaviour. - score_cutoff : int, optional - Maximum distance between s1 and s2, that is - considered as a result. If the distance is bigger than score_cutoff, - score_cutoff + 1 is returned instead. 
Default is None, which deactivates - this behaviour. - - Returns - ------- - distance : int - distance between s1 and s2 - - Raises - ------ - ValueError - If unsupported weights are provided a ValueError is thrown - - Examples - -------- - Find the Levenshtein distance between two strings: - - >>> from Levenshtein import distance - >>> distance("lewenstein", "levenshtein") - 2 - - Setting a maximum distance allows the implementation to select - a more efficient implementation: - - >>> distance("lewenstein", "levenshtein", score_cutoff=1) - 2 - - It is possible to select different weights by passing a `weight` - tuple. - - >>> distance("lewenstein", "levenshtein", weights=(1,1,2)) - 3 - """ - return _Levenshtein.distance( - s1, s2, weights=weights, processor=processor, score_cutoff=score_cutoff - ) - - -def ratio(s1, s2, *, processor=None, score_cutoff=None): - """ - Calculates a normalized indel similarity in the range [0, 1]. - This is calculated as ``1 - normalized_distance`` - - Parameters - ---------- - s1 : Sequence[Hashable] - First string to compare. - s2 : Sequence[Hashable] - Second string to compare. - processor: callable, optional - Optional callable that is used to preprocess the strings before - comparing them. Default is None, which deactivates this behaviour. - score_cutoff : float, optional - Optional argument for a score threshold as a float between 0 and 1.0. - For norm_sim < score_cutoff 0 is returned instead. Default is 0, - which deactivates this behaviour. 
- - Returns - ------- - norm_sim : float - normalized similarity between s1 and s2 as a float between 0 and 1.0 - - Examples - -------- - Find the normalized Indel similarity between two strings: - - >>> from Levenshtein import ratio - >>> ratio("lewenstein", "levenshtein") - 0.85714285714285 - - Setting a score_cutoff allows the implementation to select - a more efficient implementation: - - >>> ratio("lewenstein", "levenshtein", score_cutoff=0.9) - 0.0 - - When a different processor is used s1 and s2 do not have to be strings - - >>> ratio(["lewenstein"], ["levenshtein"], processor=lambda s: s[0]) - 0.8571428571428572 - """ - return _Indel.normalized_similarity( - s1, s2, processor=processor, score_cutoff=score_cutoff - ) - - -def hamming(s1, s2, *, pad=True, processor=None, score_cutoff=None): - """ - Calculates the Hamming distance between two strings. - The hamming distance is defined as the number of positions - where the two strings differ. It describes the minimum - amount of substitutions required to transform s1 into s2. - - Parameters - ---------- - s1 : Sequence[Hashable] - First string to compare. - s2 : Sequence[Hashable] - Second string to compare. - pad : bool, optional - should strings be padded if there is a length difference. - If pad is False and strings have a different length - a ValueError is thrown instead. Default is True. - processor: callable, optional - Optional callable that is used to preprocess the strings before - comparing them. Default is None, which deactivates this behaviour. - score_cutoff : int or None, optional - Maximum distance between s1 and s2, that is - considered as a result. If the distance is bigger than score_cutoff, - score_cutoff + 1 is returned instead. Default is None, which deactivates - this behaviour. 
- - Returns - ------- - distance : int - distance between s1 and s2 - - Raises - ------ - ValueError - If s1 and s2 have a different length - """ - return _Hamming.distance(s1, s2, pad=pad, processor=processor, score_cutoff=score_cutoff) +_distance_doc = """ +Calculates the minimum number of insertions, deletions, and substitutions +required to change one sequence into the other according to Levenshtein with custom +costs for insertion, deletion and substitution + +Parameters +---------- +s1 : Sequence[Hashable] + First string to compare. +s2 : Sequence[Hashable] + Second string to compare. +weights : Tuple[int, int, int] or None, optional + The weights for the three operations in the form + (insertion, deletion, substitution). Default is (1, 1, 1), + which gives all three operations a weight of 1. +processor: callable, optional + Optional callable that is used to preprocess the strings before + comparing them. Default is None, which deactivates this behaviour. +score_cutoff : int, optional + Maximum distance between s1 and s2, that is + considered as a result. If the distance is bigger than score_cutoff, + score_cutoff + 1 is returned instead. Default is None, which deactivates + this behaviour. + +Returns +------- +distance : int + distance between s1 and s2 + +Raises +------ +ValueError + If unsupported weights are provided a ValueError is thrown + +Examples +-------- +Find the Levenshtein distance between two strings: + +>>> from Levenshtein import distance +>>> distance("lewenstein", "levenshtein") +2 + +Setting a maximum distance allows the implementation to select +a more efficient implementation: + +>>> distance("lewenstein", "levenshtein", score_cutoff=1) +2 + +It is possible to select different weights by passing a `weights` +tuple. 
+ +>>> distance("lewenstein", "levenshtein", weights=(1,1,2)) +3 +""" +distance = _copy_func(_Levenshtein.distance, "distance", _distance_doc) +distance._RF_OriginalScorer = distance + + +ratio_doc = """ +Calculates a normalized indel similarity in the range [0, 1]. +This is calculated as ``1 - normalized_distance`` + +Parameters +---------- +s1 : Sequence[Hashable] + First string to compare. +s2 : Sequence[Hashable] + Second string to compare. +processor: callable, optional + Optional callable that is used to preprocess the strings before + comparing them. Default is None, which deactivates this behaviour. +score_cutoff : float, optional + Optional argument for a score threshold as a float between 0 and 1.0. + For norm_sim < score_cutoff 0 is returned instead. Default is 0, + which deactivates this behaviour. + +Returns +------- +norm_sim : float + normalized similarity between s1 and s2 as a float between 0 and 1.0 + +Examples +-------- +Find the normalized Indel similarity between two strings: + +>>> from Levenshtein import ratio +>>> ratio("lewenstein", "levenshtein") +0.8571428571428572 + +Setting a score_cutoff allows the implementation to select +a more efficient implementation: + +>>> ratio("lewenstein", "levenshtein", score_cutoff=0.9) +0.0 + +When a different processor is used s1 and s2 do not have to be strings + +>>> ratio(["lewenstein"], ["levenshtein"], processor=lambda s: s[0]) +0.8571428571428572 +""" +ratio = _copy_func(_Indel.normalized_similarity, "ratio", ratio_doc) +ratio._RF_OriginalScorer = ratio + +hamming_doc = """ +Calculates the Hamming distance between two strings. +The hamming distance is defined as the number of positions +where the two strings differ. It describes the minimum +amount of substitutions required to transform s1 into s2. + +Parameters +---------- +s1 : Sequence[Hashable] + First string to compare. +s2 : Sequence[Hashable] + Second string to compare. 
+pad : bool, optional + Whether the strings should be padded if there is a length difference. + If pad is False and strings have a different length + a ValueError is thrown instead. Default is True. +processor: callable, optional + Optional callable that is used to preprocess the strings before + comparing them. Default is None, which deactivates this behaviour. +score_cutoff : int or None, optional + Maximum distance between s1 and s2, that is + considered as a result. If the distance is bigger than score_cutoff, + score_cutoff + 1 is returned instead. Default is None, which deactivates + this behaviour. + +Returns +------- +distance : int + distance between s1 and s2 + +Raises +------ +ValueError + If s1 and s2 have a different length and pad is False +""" +hamming = _copy_func(_Hamming.distance, "hamming", hamming_doc) +hamming._RF_OriginalScorer = hamming def jaro(s1, s2, *, processor=None, score_cutoff=None) -> float: