From 5b927f8da985e82ec3bf9a2d326bf00e8b164860 Mon Sep 17 00:00:00 2001
From: Max Bachmann <oss@maxbachmann.de>
Date: Wed, 1 Nov 2023 18:50:26 +0100
Subject: [PATCH] improve wrapping performance

---
 src/Levenshtein/__init__.py | 307 +++++++++++++++++++-----------------
 1 file changed, 161 insertions(+), 146 deletions(-)

diff --git a/src/Levenshtein/__init__.py b/src/Levenshtein/__init__.py
index 064c9d0..fc93c06 100644
--- a/src/Levenshtein/__init__.py
+++ b/src/Levenshtein/__init__.py
@@ -37,6 +37,27 @@
     seqratio,
 )
 
+
+def _copy_func(f, name, doc):
+    """Return a copy of function ``f`` named ``name`` with docstring ``doc``.
+
+    The copy reuses ``f``'s code, globals, defaults and closure, but reports this
+    module as its ``__module__``. Based on https://stackoverflow.com/a/13503277/11335032
+    """
+    import types
+    import functools
+
+    g = types.FunctionType(
+        f.__code__, f.__globals__, name=name, argdefs=f.__defaults__, closure=f.__closure__
+    )
+    g = functools.update_wrapper(g, f)  # also overwrites the name and module with f's
+    g.__kwdefaults__ = f.__kwdefaults__
+    g.__name__ = g.__qualname__ = name
+    g.__module__ = __name__  # otherwise pickle/introspection resolve f instead of the copy
+    g.__doc__ = doc
+    return g
+
+
 __all__ = [
     "quickmedian",
     "median",
@@ -57,152 +78,146 @@
     "inverse",
 ]
 
-
-def distance(s1, s2, *, weights=(1, 1, 1), processor=None, score_cutoff=None):
-    """
-    Calculates the minimum number of insertions, deletions, and substitutions
-    required to change one sequence into the other according to Levenshtein with custom
-    costs for insertion, deletion and substitution
-
-    Parameters
-    ----------
-    s1 : Sequence[Hashable]
-        First string to compare.
-    s2 : Sequence[Hashable]
-        Second string to compare.
-    weights : Tuple[int, int, int] or None, optional
-        The weights for the three operations in the form
-        (insertion, deletion, substitution). Default is (1, 1, 1),
-        which gives all three operations a weight of 1.
-    processor: callable, optional
-        Optional callable that is used to preprocess the strings before
-        comparing them. Default is None, which deactivates this behaviour.
-    score_cutoff : int, optional
-        Maximum distance between s1 and s2, that is
-        considered as a result. If the distance is bigger than score_cutoff,
-        score_cutoff + 1 is returned instead. Default is None, which deactivates
-        this behaviour.
-
-    Returns
-    -------
-    distance : int
-        distance between s1 and s2
-
-    Raises
-    ------
-    ValueError
-        If unsupported weights are provided a ValueError is thrown
-
-    Examples
-    --------
-    Find the Levenshtein distance between two strings:
-
-    >>> from Levenshtein import distance
-    >>> distance("lewenstein", "levenshtein")
-    2
-
-    Setting a maximum distance allows the implementation to select
-    a more efficient implementation:
-
-    >>> distance("lewenstein", "levenshtein", score_cutoff=1)
-    2
-
-    It is possible to select different weights by passing a `weight`
-    tuple.
-
-    >>> distance("lewenstein", "levenshtein", weights=(1,1,2))
-    3
-    """
-    return _Levenshtein.distance(
-        s1, s2, weights=weights, processor=processor, score_cutoff=score_cutoff
-    )
-
-
-def ratio(s1, s2, *, processor=None, score_cutoff=None):
-    """
-    Calculates a normalized indel similarity in the range [0, 1].
-    This is calculated as ``1 - normalized_distance``
-
-    Parameters
-    ----------
-    s1 : Sequence[Hashable]
-        First string to compare.
-    s2 : Sequence[Hashable]
-        Second string to compare.
-    processor: callable, optional
-        Optional callable that is used to preprocess the strings before
-        comparing them. Default is None, which deactivates this behaviour.
-    score_cutoff : float, optional
-        Optional argument for a score threshold as a float between 0 and 1.0.
-        For norm_sim < score_cutoff 0 is returned instead. Default is 0,
-        which deactivates this behaviour.
-
-    Returns
-    -------
-    norm_sim : float
-        normalized similarity between s1 and s2 as a float between 0 and 1.0
-
-    Examples
-    --------
-    Find the normalized Indel similarity between two strings:
-
-    >>> from Levenshtein import ratio
-    >>> ratio("lewenstein", "levenshtein")
-    0.85714285714285
-
-    Setting a score_cutoff allows the implementation to select
-    a more efficient implementation:
-
-    >>> ratio("lewenstein", "levenshtein", score_cutoff=0.9)
-    0.0
-
-    When a different processor is used s1 and s2 do not have to be strings
-
-    >>> ratio(["lewenstein"], ["levenshtein"], processor=lambda s: s[0])
-    0.8571428571428572
-    """
-    return _Indel.normalized_similarity(
-        s1, s2, processor=processor, score_cutoff=score_cutoff
-    )
-
-
-def hamming(s1, s2, *, pad=True, processor=None, score_cutoff=None):
-    """
-    Calculates the Hamming distance between two strings.
-    The hamming distance is defined as the number of positions
-    where the two strings differ. It describes the minimum
-    amount of substitutions required to transform s1 into s2.
-
-    Parameters
-    ----------
-    s1 : Sequence[Hashable]
-        First string to compare.
-    s2 : Sequence[Hashable]
-        Second string to compare.
-    pad : bool, optional
-       should strings be padded if there is a length difference.
-       If pad is False and strings have a different length
-       a ValueError is thrown instead. Default is True.
-    processor: callable, optional
-        Optional callable that is used to preprocess the strings before
-        comparing them. Default is None, which deactivates this behaviour.
-    score_cutoff : int or None, optional
-        Maximum distance between s1 and s2, that is
-        considered as a result. If the distance is bigger than score_cutoff,
-        score_cutoff + 1 is returned instead. Default is None, which deactivates
-        this behaviour.
-
-    Returns
-    -------
-    distance : int
-        distance between s1 and s2
-
-    Raises
-    ------
-    ValueError
-        If s1 and s2 have a different length
-    """
-    return _Hamming.distance(s1, s2, pad=pad, processor=processor, score_cutoff=score_cutoff)
+_distance_doc = """
+Calculates the minimum number of insertions, deletions, and substitutions
+required to change one sequence into the other according to Levenshtein with custom
+costs for insertion, deletion and substitution
+
+Parameters
+----------
+s1 : Sequence[Hashable]
+    First string to compare.
+s2 : Sequence[Hashable]
+    Second string to compare.
+weights : Tuple[int, int, int] or None, optional
+    The weights for the three operations in the form
+    (insertion, deletion, substitution). Default is (1, 1, 1),
+    which gives all three operations a weight of 1.
+processor: callable, optional
+    Optional callable that is used to preprocess the strings before
+    comparing them. Default is None, which deactivates this behaviour.
+score_cutoff : int, optional
+    Maximum distance between s1 and s2, that is
+    considered as a result. If the distance is bigger than score_cutoff,
+    score_cutoff + 1 is returned instead. Default is None, which deactivates
+    this behaviour.
+
+Returns
+-------
+distance : int
+    distance between s1 and s2
+
+Raises
+------
+ValueError
+    If unsupported weights are provided a ValueError is thrown
+
+Examples
+--------
+Find the Levenshtein distance between two strings:
+
+>>> from Levenshtein import distance
+>>> distance("lewenstein", "levenshtein")
+2
+
+Setting a maximum distance allows the implementation to select
+a more efficient implementation:
+
+>>> distance("lewenstein", "levenshtein", score_cutoff=1)
+2
+
+It is possible to select different weights by passing a `weights`
+tuple.
+
+>>> distance("lewenstein", "levenshtein", weights=(1,1,2))
+3
+"""
+distance = _copy_func(_Levenshtein.distance, "distance", _distance_doc)
+distance._RF_OriginalScorer = distance  # re-point at the copy; update_wrapper carried over f's attributes (presumably read by rapidfuzz, TODO confirm)
+
+
+_ratio_doc = """
+Calculates a normalized indel similarity in the range [0, 1].
+This is calculated as ``1 - normalized_distance``
+
+Parameters
+----------
+s1 : Sequence[Hashable]
+    First string to compare.
+s2 : Sequence[Hashable]
+    Second string to compare.
+processor: callable, optional
+    Optional callable that is used to preprocess the strings before
+    comparing them. Default is None, which deactivates this behaviour.
+score_cutoff : float, optional
+    Optional argument for a score threshold as a float between 0 and 1.0.
+    For norm_sim < score_cutoff 0 is returned instead. Default is 0,
+    which deactivates this behaviour.
+
+Returns
+-------
+norm_sim : float
+    normalized similarity between s1 and s2 as a float between 0 and 1.0
+
+Examples
+--------
+Find the normalized Indel similarity between two strings:
+
+>>> from Levenshtein import ratio
+>>> ratio("lewenstein", "levenshtein")
+0.8571428571428572
+
+Setting a score_cutoff allows the implementation to select
+a more efficient implementation:
+
+>>> ratio("lewenstein", "levenshtein", score_cutoff=0.9)
+0.0
+
+When a different processor is used s1 and s2 do not have to be strings
+
+>>> ratio(["lewenstein"], ["levenshtein"], processor=lambda s: s[0])
+0.8571428571428572
+"""
+ratio = _copy_func(_Indel.normalized_similarity, "ratio", _ratio_doc)
+ratio._RF_OriginalScorer = ratio  # re-point at the copy; update_wrapper carried over f's attributes (presumably read by rapidfuzz, TODO confirm)
+
+_hamming_doc = """
+Calculates the Hamming distance between two strings.
+The hamming distance is defined as the number of positions
+where the two strings differ. It describes the minimum
+amount of substitutions required to transform s1 into s2.
+
+Parameters
+----------
+s1 : Sequence[Hashable]
+    First string to compare.
+s2 : Sequence[Hashable]
+    Second string to compare.
+pad : bool, optional
+    should strings be padded if there is a length difference.
+    If pad is False and strings have a different length
+    a ValueError is thrown instead. Default is True.
+processor: callable, optional
+    Optional callable that is used to preprocess the strings before
+    comparing them. Default is None, which deactivates this behaviour.
+score_cutoff : int or None, optional
+    Maximum distance between s1 and s2, that is
+    considered as a result. If the distance is bigger than score_cutoff,
+    score_cutoff + 1 is returned instead. Default is None, which deactivates
+    this behaviour.
+
+Returns
+-------
+distance : int
+    distance between s1 and s2
+
+Raises
+------
+ValueError
+    If pad is False and s1 and s2 have a different length
+"""
+hamming = _copy_func(_Hamming.distance, "hamming", _hamming_doc)
+hamming._RF_OriginalScorer = hamming  # re-point at the copy; update_wrapper carried over f's attributes (presumably read by rapidfuzz, TODO confirm)
 
 
 def jaro(s1, s2, *, processor=None, score_cutoff=None) -> float: