diff --git a/README.md b/README.md index 8653986a..277022c4 100644 --- a/README.md +++ b/README.md @@ -129,9 +129,9 @@ pip install . ```console > choices = ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"] > process.extract("new york jets", choices, limit=2) -[('new york jets', 100), ('new york giants', 78.57142639160156)] +[('New York Jets', 100, 1), ('New York Giants', 78.57142639160156, 2)] > process.extractOne("cowboys", choices) -("dallas cowboys", 90) +("Dallas Cowboys", 90, 3) ``` ## License diff --git a/docs/usage/process.md b/docs/usage/process.md index 50f5a40c..dc144a8f 100644 --- a/docs/usage/process.md +++ b/docs/usage/process.md @@ -43,17 +43,17 @@ Find the best matches in a list of choices. Returns: - - **matches**: *List[Tuple[str, float]] or List[Tuple[str, float, str]])* + - **matches**: *List[Tuple[str, float, Any]]* Returns a list of all matches that have a `score >= score_cutoff`. The list will - be of either `(<choice>, <ratio>)` when `choices` is a list of strings + be of either `(<choice>, <ratio>, <index of choice>)` when `choices` is a list of strings or `(<choice>, <ratio>, <key of choice>)` when `choices` is a mapping. ```console > choices = ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"] > process.extract("new york jets", choices, limit=2) - [('new york jets', 100), ('new york giants', 78.57142639160156)] + [('New York Jets', 100, 1), ('New York Giants', 78.57142639160156, 2)] ``` === "C++" @@ -62,7 +62,7 @@ Find the best matches in a list of choices. 
using rapidfuzz::process::extract; // matches is a vector of std::pairs - // [('new york jets', 100), ('new york giants', 78.57142639160156)] + // [('New York Jets', 100, 1), ('New York Giants', 78.57142639160156, 2)] auto matches = extract( "new york jets", std::vector{"Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"}, @@ -80,15 +80,15 @@ Finds the best match in a list of choices by comparing them using the provided s Returns: - - **matches**: *Union[None, Tuple[str, float], Tuple[str, float, str]]* + - **matches**: *Union[None, Tuple[str, float, Any]]* - Returns the best match the best match in form of a tuple or None when there is no match with a score >= score_cutoff. The Tuple will be in the form`(<choice>, <ratio>)` when `choices` is a list of strings or `(<choice>, <ratio>, <key of choice>)` when `choices` is a mapping. + Returns the best match in form of a tuple or None when there is no match with a score >= score_cutoff. The Tuple will be in the form `(<choice>, <ratio>, <index of choice>)` when `choices` is a list of strings or `(<choice>, <ratio>, <key of choice>)` when `choices` is a mapping. 
```console > choices = ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"] > process.extractOne("cowboys", choices) - ("dallas cowboys", 90) + ("Dallas Cowboys", 90, 3) ``` === "C++" @@ -97,7 +97,7 @@ Finds the best match in a list of choices by comparing them using the provided s using rapidfuzz::process::extractOne; // matches is a boost::optional - // ("dallas cowboys", 90) + // ("Dallas Cowboys", 90, 3) auto matches = extractOne( "cowboys", std::vector{"Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"}); diff --git a/src/py_abstraction.cpp b/src/py_abstraction.cpp index 937429b4..a6041d91 100644 --- a/src/py_abstraction.cpp +++ b/src/py_abstraction.cpp @@ -1,6 +1,5 @@ /* SPDX-License-Identifier: MIT */ /* Copyright © 2020 Max Bachmann */ -/* Copyright © 2011 Adam Cohen */ #include "fuzz.hpp" #include "py_utils.hpp" @@ -639,9 +638,9 @@ std::unique_ptr get_matching_instance(PyObject* scorer) static PyObject* py_extractOne(PyObject* py_query, PyObject* py_choices, PyObject* scorer, PyObject* processor, double score_cutoff) { - bool match_found = false; PyObject* result_choice = NULL; PyObject* choice_key = NULL; + Py_ssize_t result_index = -1; std::vector outer_owner_list; bool is_dict = false; @@ -687,10 +686,9 @@ static PyObject* py_extractOne(PyObject* py_query, PyObject* py_choices, } outer_owner_list.push_back(choices); - std::size_t choice_count = PySequence_Fast_GET_SIZE(choices); + Py_ssize_t choice_count = PySequence_Fast_GET_SIZE(choices); - - for (std::size_t i = 0; i < choice_count; ++i) { + for (Py_ssize_t i = 0; i < choice_count; ++i) { PyObject* py_choice = NULL; PyObject* py_match_choice = PySequence_Fast_GET_ITEM(choices, i); @@ -741,9 +739,9 @@ static PyObject* py_extractOne(PyObject* py_query, PyObject* py_choices, if (comp == 1) { Py_DecRef(py_score_cutoff); py_score_cutoff = score; - match_found = true; result_choice = py_match_choice; choice_key = py_choice; + result_index = i; } else if (comp == 0) 
{ Py_DecRef(score); } else if (comp == -1) { @@ -758,7 +756,7 @@ static PyObject* py_extractOne(PyObject* py_query, PyObject* py_choices, free_owner_list(outer_owner_list); - if (!match_found) { + if (result_index == -1) { Py_DecRef(py_score_cutoff); Py_RETURN_NONE; } @@ -769,7 +767,7 @@ static PyObject* py_extractOne(PyObject* py_query, PyObject* py_choices, PyObject* result = is_dict ? Py_BuildValue("(OOO)", result_choice, py_score_cutoff, choice_key) - : Py_BuildValue("(OO)", result_choice, py_score_cutoff); + : Py_BuildValue("(OOn)", result_choice, py_score_cutoff, result_index); Py_DecRef(py_score_cutoff); return result; @@ -793,17 +791,17 @@ constexpr const char* extractOne_docstring = "Returns:\n" " Optional[Tuple[str, float]]: returns the best match in form of a tuple or None when there is\n" " no match with a score >= score_cutoff\n" - " Union[None, Tuple[str, float], Tuple[str, float, str]]: Returns the best match the best match\n" + " Union[None, Tuple[str, float, Any]]: Returns the best match the best match\n" " in form of a tuple or None when there is no match with a score >= score_cutoff. 
The Tuple will\n" - " be in the form`(, )` when `choices` is a list of strings\n" + " be in the form`(, , )` when `choices` is a list of strings\n" " or `(, , )` when `choices` is a mapping."; static PyObject* extractOne(PyObject* /*self*/, PyObject* args, PyObject* keywds) { - bool match_found = false; PyObject* result_choice = NULL; PyObject* choice_key = NULL; double result_score; + Py_ssize_t result_index = -1; std::vector outer_owner_list; python_string query; bool is_dict = false; @@ -856,9 +854,9 @@ static PyObject* extractOne(PyObject* /*self*/, PyObject* args, PyObject* keywds } outer_owner_list.push_back(choices); - std::size_t choice_count = PySequence_Fast_GET_SIZE(choices); + Py_ssize_t choice_count = PySequence_Fast_GET_SIZE(choices); - for (std::size_t i = 0; i < choice_count; ++i) { + for (Py_ssize_t i = 0; i < choice_count; ++i) { PyObject* py_choice = NULL; PyObject* py_match_choice = PySequence_Fast_GET_ITEM(choices, i); @@ -889,23 +887,23 @@ static PyObject* extractOne(PyObject* /*self*/, PyObject* args, PyObject* keywds // increase the value by a small step so it might be able to exit early score_cutoff = score + (float)0.00001; result_score = score; - match_found = true; result_choice = py_match_choice; choice_key = py_choice; + result_index = i; } free_owner_list(inner_owner_list); } free_owner_list(outer_owner_list); - if (!match_found) { + if (result_index == -1) { Py_RETURN_NONE; } if (is_dict) { return Py_BuildValue("(OdO)", result_choice, result_score, choice_key); } else { - return Py_BuildValue("(Od)", result_choice, result_score); + return Py_BuildValue("(Odn)", result_choice, result_score, result_index); } } diff --git a/src/rapidfuzz/process.py b/src/rapidfuzz/process.py index b7bb7b16..377a79b7 100644 --- a/src/rapidfuzz/process.py +++ b/src/rapidfuzz/process.py @@ -27,7 +27,7 @@ def iterExtract(query, choices, scorer = fuzz.WRatio, processor = utils.default_ if score >= score_cutoff: yield (match_choice, score, choice) else: - for 
choice in choices: + for i, choice in enumerate(choices): if choice is None: continue b = processor(choice) if processor else choice @@ -38,26 +38,7 @@ def iterExtract(query, choices, scorer = fuzz.WRatio, processor = utils.default_ score_cutoff=score_cutoff) if score >= score_cutoff: - yield (choice, score) - -def iterExtractIndices(query, choices, scorer = fuzz.WRatio, processor = utils.default_process, score_cutoff = 0): - if query is None: - return - - a = processor(query) if processor else query - - for (i, choice) in enumerate(choices): - if choice is None: - continue - b = processor(choice) if processor else choice - score = scorer( - a, b, - processor=None, - score_cutoff=score_cutoff) - - if score >= score_cutoff: - yield (i, score) - + yield (choice, score, i) def extract(query, choices, scorer = fuzz.WRatio, processor = utils.default_process, limit = 5, score_cutoff = 0): """ @@ -76,9 +57,9 @@ def extract(query, choices, scorer = fuzz.WRatio, processor = utils.default_proc a lower score than this number will not be returned. Defaults to 0 Returns: - Union[List[Tuple[str, float]], List[Tuple[str, float, str]]]: Returns a + Union[List[Tuple[str, float, Any]]]: Returns a list of all matches that have a `score >= score_cutoff`. The list will - be of either `(, )` when `choices` is a list of strings + be of either `(, , )` when `choices` is a list of strings or `(, , )` when `choices` is a mapping. 
""" results = iterExtract(query, choices, scorer, processor, score_cutoff) @@ -87,34 +68,3 @@ def extract(query, choices, scorer = fuzz.WRatio, processor = utils.default_proc return sorted(results, key=lambda x: x[1], reverse=True) return heapq.nlargest(limit, results, key=lambda x: x[1]) - - -def extractIndices(query, choices, scorer = fuzz.WRatio, processor = utils.default_process, limit = 5, score_cutoff = 0): - """ - Find the best matches in a list of choices - - Args: - query (str): string we want to find - choices (Iterable): list of all strings the query should be compared with - scorer (Callable): optional callable that is used to calculate the matching score between - the query and each choice. WRatio is used by default - processor (Callable): optional callable that reformats the strings. utils.default_process - is used by default, which lowercases the strings and trims whitespace - limit (int): maximum amount of results to return - score_cutoff (float): Optional argument for a score threshold. Matches with - a lower score than this number will not be returned. Defaults to 0 - - Returns: - List[Tuple[int, float]]: returns a list of all incides in the list that have a score >= score_cutoff - - """ - results = iterExtractIndices(query, choices, scorer, processor, score_cutoff) - - if limit is None: - return sorted(results, key=lambda x: x[1], reverse=True) - - return heapq.nlargest(limit, results, key=lambda x: x[1]) - - -def extractBests(query, choices, scorer = fuzz.WRatio, processor = utils.default_process, limit = 5, score_cutoff = 0): - return extract(query, choices, scorer, processor, limit, score_cutoff)