Skip to content

Commit

Permalink
update return types of processors
Browse files Browse the repository at this point in the history
  • Loading branch information
maxbachmann committed Nov 16, 2020
1 parent 426fbb2 commit 9169444
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 80 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,9 @@ pip install .
```console
> choices = ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"]
> process.extract("new york jets", choices, limit=2)
[('new york jets', 100), ('new york giants', 78.57142639160156)]
[('New York Jets', 100, 1), ('New York Giants', 78.57142639160156, 2)]
> process.extractOne("cowboys", choices)
("dallas cowboys", 90)
("Dallas Cowboys", 90, 3)
```

## License
Expand Down
16 changes: 8 additions & 8 deletions docs/usage/process.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,17 +43,17 @@ Find the best matches in a list of choices.

Returns:

- **matches**: *List[Tuple[str, float]] or List[Tuple[str, float, str]])*
- **matches**: *List[Tuple[str, float, Any]]*

Returns a list of all matches that have a `score >= score_cutoff`. The list will
be of either `(<choice>, <ratio>)` when `choices` is a list of strings
be of either `(<choice>, <ratio>, <index of choice>)` when `choices` is a list of strings
or `(<choice>, <ratio>, <key of choice>)` when `choices` is a mapping.


```console
> choices = ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"]
> process.extract("new york jets", choices, limit=2)
[('new york jets', 100), ('new york giants', 78.57142639160156)]
[('New York Jets', 100, 1), ('New York Giants', 78.57142639160156, 2)]
```

=== "C++"
Expand All @@ -62,7 +62,7 @@ Find the best matches in a list of choices.
using rapidfuzz::process::extract;

// matches is a vector of std::pairs
// [('new york jets', 100), ('new york giants', 78.57142639160156)]
// [('New York Jets', 100, 1), ('New York Giants', 78.57142639160156, 2)]
auto matches = extract(
"new york jets",
std::vector<std::string>{"Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"},
Expand All @@ -80,15 +80,15 @@ Finds the best match in a list of choices by comparing them using the provided s

Returns:

- **matches**: *Union[None, Tuple[str, float], Tuple[str, float, str]]*
- **matches**: *Union[None, Tuple[str, float, Any]]*

Returns the best match the best match in form of a tuple or None when there is no match with a score >= score_cutoff. The Tuple will be in the form`(<choice>, <ratio>)` when `choices` is a list of strings or `(<choice>, <ratio>, <key of choice>)` when `choices` is a mapping.
Returns the best match in form of a tuple, or None when there is no match with a score >= score_cutoff. The tuple will be in the form `(<choice>, <ratio>, <index of choice>)` when `choices` is a list of strings or `(<choice>, <ratio>, <key of choice>)` when `choices` is a mapping.


```console
> choices = ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"]
> process.extractOne("cowboys", choices)
("dallas cowboys", 90)
("Dallas Cowboys", 90, 3)
```

=== "C++"
Expand All @@ -97,7 +97,7 @@ Finds the best match in a list of choices by comparing them using the provided s
using rapidfuzz::process::extractOne;

// matches is a boost::optional<std::pair>
// ("dallas cowboys", 90)
// ("Dallas Cowboys", 90, 3)
auto matches = extractOne(
"cowboys",
std::vector<std::string>{"Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"});
Expand Down
30 changes: 14 additions & 16 deletions src/py_abstraction.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
/* SPDX-License-Identifier: MIT */
/* Copyright © 2020 Max Bachmann */
/* Copyright © 2011 Adam Cohen */

#include "fuzz.hpp"
#include "py_utils.hpp"
Expand Down Expand Up @@ -639,9 +638,9 @@ std::unique_ptr<CachedFuzz> get_matching_instance(PyObject* scorer)
static PyObject* py_extractOne(PyObject* py_query, PyObject* py_choices,
PyObject* scorer, PyObject* processor, double score_cutoff)
{
bool match_found = false;
PyObject* result_choice = NULL;
PyObject* choice_key = NULL;
Py_ssize_t result_index = -1;
std::vector<PyObject*> outer_owner_list;

bool is_dict = false;
Expand Down Expand Up @@ -687,10 +686,9 @@ static PyObject* py_extractOne(PyObject* py_query, PyObject* py_choices,
}
outer_owner_list.push_back(choices);

std::size_t choice_count = PySequence_Fast_GET_SIZE(choices);
Py_ssize_t choice_count = PySequence_Fast_GET_SIZE(choices);


for (std::size_t i = 0; i < choice_count; ++i) {
for (Py_ssize_t i = 0; i < choice_count; ++i) {
PyObject* py_choice = NULL;
PyObject* py_match_choice = PySequence_Fast_GET_ITEM(choices, i);

Expand Down Expand Up @@ -741,9 +739,9 @@ static PyObject* py_extractOne(PyObject* py_query, PyObject* py_choices,
if (comp == 1) {
Py_DecRef(py_score_cutoff);
py_score_cutoff = score;
match_found = true;
result_choice = py_match_choice;
choice_key = py_choice;
result_index = i;
} else if (comp == 0) {
Py_DecRef(score);
} else if (comp == -1) {
Expand All @@ -758,7 +756,7 @@ static PyObject* py_extractOne(PyObject* py_query, PyObject* py_choices,

free_owner_list(outer_owner_list);

if (!match_found) {
if (result_index == -1) {
Py_DecRef(py_score_cutoff);
Py_RETURN_NONE;
}
Expand All @@ -769,7 +767,7 @@ static PyObject* py_extractOne(PyObject* py_query, PyObject* py_choices,

PyObject* result = is_dict
? Py_BuildValue("(OOO)", result_choice, py_score_cutoff, choice_key)
: Py_BuildValue("(OO)", result_choice, py_score_cutoff);
: Py_BuildValue("(OOn)", result_choice, py_score_cutoff, result_index);

Py_DecRef(py_score_cutoff);
return result;
Expand All @@ -793,17 +791,17 @@ constexpr const char* extractOne_docstring =
"Returns:\n"
" Optional[Tuple[str, float]]: returns the best match in form of a tuple or None when there is\n"
" no match with a score >= score_cutoff\n"
" Union[None, Tuple[str, float], Tuple[str, float, str]]: Returns the best match the best match\n"
" Union[None, Tuple[str, float, Any]]: Returns the best match the best match\n"
" in form of a tuple or None when there is no match with a score >= score_cutoff. The Tuple will\n"
" be in the form`(<choice>, <ratio>)` when `choices` is a list of strings\n"
" be in the form`(<choice>, <ratio>, <index of choice>)` when `choices` is a list of strings\n"
" or `(<choice>, <ratio>, <key of choice>)` when `choices` is a mapping.";

static PyObject* extractOne(PyObject* /*self*/, PyObject* args, PyObject* keywds)
{
bool match_found = false;
PyObject* result_choice = NULL;
PyObject* choice_key = NULL;
double result_score;
Py_ssize_t result_index = -1;
std::vector<PyObject*> outer_owner_list;
python_string query;
bool is_dict = false;
Expand Down Expand Up @@ -856,9 +854,9 @@ static PyObject* extractOne(PyObject* /*self*/, PyObject* args, PyObject* keywds
}
outer_owner_list.push_back(choices);

std::size_t choice_count = PySequence_Fast_GET_SIZE(choices);
Py_ssize_t choice_count = PySequence_Fast_GET_SIZE(choices);

for (std::size_t i = 0; i < choice_count; ++i) {
for (Py_ssize_t i = 0; i < choice_count; ++i) {
PyObject* py_choice = NULL;
PyObject* py_match_choice = PySequence_Fast_GET_ITEM(choices, i);

Expand Down Expand Up @@ -889,23 +887,23 @@ static PyObject* extractOne(PyObject* /*self*/, PyObject* args, PyObject* keywds
// increase the value by a small step so it might be able to exit early
score_cutoff = score + (float)0.00001;
result_score = score;
match_found = true;
result_choice = py_match_choice;
choice_key = py_choice;
result_index = i;
}
free_owner_list(inner_owner_list);
}

free_owner_list(outer_owner_list);

if (!match_found) {
if (result_index == -1) {
Py_RETURN_NONE;
}

if (is_dict) {
return Py_BuildValue("(OdO)", result_choice, result_score, choice_key);
} else {
return Py_BuildValue("(Od)", result_choice, result_score);
return Py_BuildValue("(Odn)", result_choice, result_score, result_index);
}
}

Expand Down
58 changes: 4 additions & 54 deletions src/rapidfuzz/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def iterExtract(query, choices, scorer = fuzz.WRatio, processor = utils.default_
if score >= score_cutoff:
yield (match_choice, score, choice)
else:
for choice in choices:
for i, choice in enumerate(choices):
if choice is None:
continue
b = processor(choice) if processor else choice
Expand All @@ -38,26 +38,7 @@ def iterExtract(query, choices, scorer = fuzz.WRatio, processor = utils.default_
score_cutoff=score_cutoff)

if score >= score_cutoff:
yield (choice, score)

def iterExtractIndices(query, choices, scorer = fuzz.WRatio, processor = utils.default_process, score_cutoff = 0):
    """
    Lazily yield ``(index, score)`` pairs for the choices that match the query.

    Args:
        query: value to search for; nothing is yielded when it is None
        choices (Iterable): candidates to compare against. Entries that are
            None are skipped, but still occupy their position in the numbering
        scorer (Callable): called as ``scorer(a, b, processor=None, score_cutoff=...)``
            to rate the preprocessed query against each preprocessed choice
        processor (Callable): optional preprocessing applied to the query and to
            every choice; a falsy value disables preprocessing
        score_cutoff (float): only results with ``score >= score_cutoff`` are yielded

    Yields:
        Tuple[int, float]: position of the choice in ``choices`` and its score
    """
    if query is None:
        return

    def prepare(text):
        # Preprocessing is optional: a falsy processor leaves the value untouched.
        return processor(text) if processor else text

    prepared_query = prepare(query)

    for position, candidate in enumerate(choices):
        if candidate is None:
            continue

        # The inputs are already preprocessed, so the scorer must not do it again.
        similarity = scorer(
            prepared_query, prepare(candidate),
            processor=None,
            score_cutoff=score_cutoff)

        if similarity >= score_cutoff:
            yield (position, similarity)

yield (choice, score, i)

def extract(query, choices, scorer = fuzz.WRatio, processor = utils.default_process, limit = 5, score_cutoff = 0):
"""
Expand All @@ -76,9 +57,9 @@ def extract(query, choices, scorer = fuzz.WRatio, processor = utils.default_proc
a lower score than this number will not be returned. Defaults to 0
Returns:
Union[List[Tuple[str, float]], List[Tuple[str, float, str]]]: Returns a
List[Tuple[str, float, Any]]: Returns a
list of all matches that have a `score >= score_cutoff`. The list will
be of either `(<choice>, <ratio>)` when `choices` is a list of strings
be of either `(<choice>, <ratio>, <index of choice>)` when `choices` is a list of strings
or `(<choice>, <ratio>, <key of choice>)` when `choices` is a mapping.
"""
results = iterExtract(query, choices, scorer, processor, score_cutoff)
Expand All @@ -87,34 +68,3 @@ def extract(query, choices, scorer = fuzz.WRatio, processor = utils.default_proc
return sorted(results, key=lambda x: x[1], reverse=True)

return heapq.nlargest(limit, results, key=lambda x: x[1])


def extractIndices(query, choices, scorer = fuzz.WRatio, processor = utils.default_process, limit = 5, score_cutoff = 0):
    """
    Find the positions of the best matches in a list of choices

    Args:
        query (str): string we want to find
        choices (Iterable): list of all strings the query should be compared with
        scorer (Callable): optional callable that is used to calculate the matching score between
            the query and each choice. WRatio is used by default
        processor (Callable): optional callable that reformats the strings. utils.default_process
            is used by default
        limit (int): maximum amount of results to return. Pass None to get every result
        score_cutoff (float): Optional argument for a score threshold. Matches with
            a lower score than this number will not be returned. Defaults to 0

    Returns:
        List[Tuple[int, float]]: ``(index, score)`` pairs for all indices in the list that
            have a score >= score_cutoff, ordered from highest to lowest score
    """
    scored = iterExtractIndices(query, choices, scorer, processor, score_cutoff)

    if limit is not None:
        # Only the top `limit` entries are needed, so avoid sorting everything.
        return heapq.nlargest(limit, scored, key=lambda entry: entry[1])

    return sorted(scored, key=lambda entry: entry[1], reverse=True)


def extractBests(query, choices, scorer = fuzz.WRatio, processor = utils.default_process, limit = 5, score_cutoff = 0):
    """
    Alternative name for `extract`: forwards every argument unchanged and
    returns whatever `extract` returns.
    """
    return extract(
        query,
        choices,
        scorer=scorer,
        processor=processor,
        limit=limit,
        score_cutoff=score_cutoff)

0 comments on commit 9169444

Please sign in to comment.