update return types of processors

rapidfuzz · Nov 16, 2020 · 9169444 · 9169444
1 parent 426fbb2
commit 9169444
Show file tree

Hide file tree

Showing 4 changed files with 28 additions and 80 deletions.
diff --git a/README.md b/README.md
@@ -129,9 +129,9 @@ pip install .
 ```console
 > choices = ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"]
 > process.extract("new york jets", choices, limit=2)
-[('new york jets', 100), ('new york giants', 78.57142639160156)]
+[('New York Jets', 100, 1), ('New York Giants', 78.57142639160156, 2)]
 > process.extractOne("cowboys", choices)
-("dallas cowboys", 90)
+("Dallas Cowboys", 90, 3)
 ```
 
 ## License

diff --git a/docs/usage/process.md b/docs/usage/process.md
@@ -43,17 +43,17 @@ Find the best matches in a list of choices.
 
     Returns:
 
-    - **matches**: *List[Tuple[str, float]] or List[Tuple[str, float, str]])*
+    - **matches**: *List[Tuple[str, float, Any]]*
 
         Returns a list of all matches that have a `score >= score_cutoff`. The list will
-        be of either `(<choice>, <ratio>)` when `choices` is a list of strings
+        be of either `(<choice>, <ratio>, <index of choice>)` when `choices` is a list of strings
         or `(<choice>, <ratio>, <key of choice>)` when `choices` is a mapping.
 
 
     ```console
     > choices = ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"]
     > process.extract("new york jets", choices, limit=2)
-    [('new york jets', 100), ('new york giants', 78.57142639160156)]
+    [('New York Jets', 100, 1), ('New York Giants', 78.57142639160156, 2)]
     ```
 
 === "C++"
@@ -62,7 +62,7 @@ Find the best matches in a list of choices.
     using rapidfuzz::process::extract;
 
     // matches is a vector of std::pairs
-    // [('new york jets', 100), ('new york giants', 78.57142639160156)]
+    // [('New York Jets', 100, 1), ('New York Giants', 78.57142639160156, 2)]
     auto matches = extract(
       "new york jets",
       std::vector<std::string>{"Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"},
@@ -80,15 +80,15 @@ Finds the best match in a list of choices by comparing them using the provided s
 
     Returns:
 
-    - **matches**: *Union[None, Tuple[str, float], Tuple[str, float, str]]*
+    - **matches**: *Union[None, Tuple[str, float, Any]]*
 
-        Returns the best match the best match in form of a tuple or None when there is no match with a score >= score_cutoff. The Tuple will be in the form`(<choice>, <ratio>)` when `choices` is a list of strings or `(<choice>, <ratio>, <key of choice>)` when `choices` is a mapping.
+        Returns the best match in the form of a tuple, or None when there is no match with a score >= score_cutoff. The tuple will be in the form `(<choice>, <ratio>, <index of choice>)` when `choices` is a list of strings or `(<choice>, <ratio>, <key of choice>)` when `choices` is a mapping.
 
 
     ```console
     > choices = ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"]
     > process.extractOne("cowboys", choices)
-    ("dallas cowboys", 90)
+    ("Dallas Cowboys", 90, 3)
     ```
 
 === "C++"
@@ -97,7 +97,7 @@ Finds the best match in a list of choices by comparing them using the provided s
     using rapidfuzz::process::extractOne;
 
     // matches is a boost::optional<std::pair>
-    // ("dallas cowboys", 90)
+    // ("Dallas Cowboys", 90, 3)
     auto matches = extractOne(
       "cowboys",
       std::vector<std::string>{"Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"});

diff --git a/src/py_abstraction.cpp b/src/py_abstraction.cpp
@@ -1,6 +1,5 @@
 /* SPDX-License-Identifier: MIT */
 /* Copyright © 2020 Max Bachmann */
-/* Copyright © 2011 Adam Cohen */
 
 #include "fuzz.hpp"
 #include "py_utils.hpp"
@@ -639,9 +638,9 @@ std::unique_ptr<CachedFuzz> get_matching_instance(PyObject* scorer)
 static PyObject* py_extractOne(PyObject* py_query, PyObject* py_choices,
     PyObject* scorer, PyObject* processor, double score_cutoff)
 {
-  bool match_found = false;
   PyObject* result_choice = NULL;
   PyObject* choice_key = NULL;
+  Py_ssize_t result_index = -1;
   std::vector<PyObject*> outer_owner_list;
 
   bool is_dict = false;
@@ -687,10 +686,9 @@ static PyObject* py_extractOne(PyObject* py_query, PyObject* py_choices,
   }
   outer_owner_list.push_back(choices);
 
-  std::size_t choice_count = PySequence_Fast_GET_SIZE(choices);
+  Py_ssize_t choice_count = PySequence_Fast_GET_SIZE(choices);
 
-
-  for (std::size_t i = 0; i < choice_count; ++i) {
+  for (Py_ssize_t i = 0; i < choice_count; ++i) {
     PyObject* py_choice = NULL;
     PyObject* py_match_choice = PySequence_Fast_GET_ITEM(choices, i);
 
@@ -741,9 +739,9 @@ static PyObject* py_extractOne(PyObject* py_query, PyObject* py_choices,
     if (comp == 1) {
       Py_DecRef(py_score_cutoff);
       py_score_cutoff = score;
-      match_found = true;
       result_choice = py_match_choice;
       choice_key = py_choice;
+      result_index = i;
     } else if (comp == 0) {
       Py_DecRef(score);
     } else if (comp == -1) {
@@ -758,7 +756,7 @@ static PyObject* py_extractOne(PyObject* py_query, PyObject* py_choices,
 
   free_owner_list(outer_owner_list);
 
-  if (!match_found) {
+  if (result_index != -1) {
     Py_DecRef(py_score_cutoff);
     Py_RETURN_NONE;
   }
@@ -769,7 +767,7 @@ static PyObject* py_extractOne(PyObject* py_query, PyObject* py_choices,
 
   PyObject* result = is_dict
     ? Py_BuildValue("(OOO)", result_choice, py_score_cutoff, choice_key)
-    : Py_BuildValue("(OO)", result_choice, py_score_cutoff);
+    : Py_BuildValue("(OOn)", result_choice, py_score_cutoff, result_index);
 
   Py_DecRef(py_score_cutoff);
   return result;
@@ -793,17 +791,17 @@ constexpr const char* extractOne_docstring =
   "Returns:\n"
   "    Optional[Tuple[str, float]]: returns the best match in form of a tuple or None when there is\n"
   "        no match with a score >= score_cutoff\n"
-  "    Union[None, Tuple[str, float], Tuple[str, float, str]]: Returns the best match the best match\n"
+  "    Union[None, Tuple[str, float, Any]]: Returns the best match the best match\n"
   "        in form of a tuple or None when there is no match with a score >= score_cutoff. The Tuple will\n"
-  "        be in the form`(<choice>, <ratio>)` when `choices` is a list of strings\n"
+  "        be in the form`(<choice>, <ratio>, <index of choice>)` when `choices` is a list of strings\n"
   "        or `(<choice>, <ratio>, <key of choice>)` when `choices` is a mapping.";
 
 static PyObject* extractOne(PyObject* /*self*/, PyObject* args, PyObject* keywds)
 {
-  bool match_found = false;
   PyObject* result_choice = NULL;
   PyObject* choice_key = NULL;
   double result_score;
+  Py_ssize_t result_index = -1;
   std::vector<PyObject*> outer_owner_list;
   python_string query;
   bool is_dict = false;
@@ -856,9 +854,9 @@ static PyObject* extractOne(PyObject* /*self*/, PyObject* args, PyObject* keywds
   }
   outer_owner_list.push_back(choices);
 
-  std::size_t choice_count = PySequence_Fast_GET_SIZE(choices);
+  Py_ssize_t choice_count = PySequence_Fast_GET_SIZE(choices);
 
-  for (std::size_t i = 0; i < choice_count; ++i) {
+  for (Py_ssize_t i = 0; i < choice_count; ++i) {
     PyObject* py_choice = NULL;
     PyObject* py_match_choice = PySequence_Fast_GET_ITEM(choices, i);
 
@@ -889,23 +887,23 @@ static PyObject* extractOne(PyObject* /*self*/, PyObject* args, PyObject* keywds
       // increase the value by a small step so it might be able to exit early
       score_cutoff = score + (float)0.00001;
       result_score = score;
-      match_found = true;
       result_choice = py_match_choice;
       choice_key = py_choice;
+      result_index = i;
     } 
     free_owner_list(inner_owner_list);
   }
 
   free_owner_list(outer_owner_list);
 
-  if (!match_found) {
+  if (result_index == -1) {
     Py_RETURN_NONE;
   }
 
   if (is_dict) {
     return Py_BuildValue("(OdO)", result_choice, result_score, choice_key);
   } else {
-    return Py_BuildValue("(Od)", result_choice, result_score);
+    return Py_BuildValue("(Odn)", result_choice, result_score, result_index);
   }
 }
 

diff --git a/src/rapidfuzz/process.py b/src/rapidfuzz/process.py
@@ -27,7 +27,7 @@ def iterExtract(query, choices, scorer = fuzz.WRatio, processor = utils.default_
             if score >= score_cutoff:
                 yield (match_choice, score, choice)
     else:
-        for choice in choices:
+        for i, choice in enumerate(choices):
             if choice is None:
                 continue
             b = processor(choice) if processor else choice
@@ -38,26 +38,7 @@ def iterExtract(query, choices, scorer = fuzz.WRatio, processor = utils.default_
                 score_cutoff=score_cutoff)
 
             if score >= score_cutoff:
-                yield (choice, score)
-
-def iterExtractIndices(query, choices, scorer = fuzz.WRatio, processor = utils.default_process, score_cutoff = 0):
-    if query is None:
-        return
-
-    a = processor(query) if processor else query
-
-    for (i, choice) in enumerate(choices):
-        if choice is None:
-            continue
-        b = processor(choice) if processor else choice
-        score = scorer(
-            a, b,
-            processor=None,
-            score_cutoff=score_cutoff)
-
-        if score >= score_cutoff:
-            yield (i, score)
-
+                yield (choice, score, i)
 
 def extract(query, choices, scorer = fuzz.WRatio, processor = utils.default_process, limit = 5, score_cutoff = 0):
     """ 
@@ -76,9 +57,9 @@ def extract(query, choices, scorer = fuzz.WRatio, processor = utils.default_proc
             a lower score than this number will not be returned. Defaults to 0
 
     Returns: 
-        Union[List[Tuple[str, float]], List[Tuple[str, float, str]]]: Returns a
+        List[Tuple[str, float, Any]]: Returns a
         list of all matches that have a `score >= score_cutoff`. The list will
-        be of either `(<choice>, <ratio>)` when `choices` is a list of strings
+        be of either `(<choice>, <ratio>, <index of choice>)` when `choices` is a list of strings
         or `(<choice>, <ratio>, <key of choice>)` when `choices` is a mapping.
     """
     results = iterExtract(query, choices, scorer, processor, score_cutoff)
@@ -87,34 +68,3 @@ def extract(query, choices, scorer = fuzz.WRatio, processor = utils.default_proc
         return sorted(results, key=lambda x: x[1], reverse=True)
 
     return heapq.nlargest(limit, results, key=lambda x: x[1])
-
-
-def extractIndices(query, choices, scorer = fuzz.WRatio, processor = utils.default_process, limit = 5, score_cutoff = 0):
-    """ 
-    Find the best matches in a list of choices
-
-    Args: 
-        query (str): string we want to find
-        choices (Iterable): list of all strings the query should be compared with
-        scorer (Callable): optional callable that is used to calculate the matching score between
-            the query and each choice. WRatio is used by default
-        processor (Callable): optional callable that reformats the strings. utils.default_process
-            is used by default, which lowercases the strings and trims whitespace
-        limit (int): maximum amount of results to return
-        score_cutoff (float): Optional argument for a score threshold. Matches with
-            a lower score than this number will not be returned. Defaults to 0
-
-    Returns: 
-        List[Tuple[int, float]]: returns a list of all incides in the list that have a score >= score_cutoff
-  
-    """
-    results = iterExtractIndices(query, choices, scorer, processor, score_cutoff)
-
-    if limit is None:
-        return sorted(results, key=lambda x: x[1], reverse=True)
-
-    return heapq.nlargest(limit, results, key=lambda x: x[1])
-
-
-def extractBests(query, choices, scorer = fuzz.WRatio, processor = utils.default_process, limit = 5, score_cutoff = 0):
-    return extract(query, choices, scorer, processor, limit, score_cutoff)