Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix #88 and #93: multiple decimal places and multiple decimal numbers #91

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 58 additions & 23 deletions lingua_franca/lang/parse_en.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def generate_plurals_en(originals):
_STRING_LONG_ORDINAL_EN = invert_dict(_LONG_ORDINAL_EN)


def _convert_words_to_numbers_en(text, short_scale=True, ordinals=False):
def _convert_words_to_numbers_en(text, short_scale=True, ordinals=False, places=None):
ChanceNCounter marked this conversation as resolved.
Show resolved Hide resolved
"""
Convert words in a string into their equivalent numbers.
Args:
Expand All @@ -94,7 +94,8 @@ def _convert_words_to_numbers_en(text, short_scale=True, ordinals=False):
text = text.lower()
tokens = tokenize(text)
numbers_to_replace = \
_extract_numbers_with_text_en(tokens, short_scale, ordinals)
_extract_numbers_with_text_en(
tokens, short_scale, ordinals, places=places)
numbers_to_replace.sort(key=lambda number: number.start_index)

results = []
Expand All @@ -114,7 +115,8 @@ def _convert_words_to_numbers_en(text, short_scale=True, ordinals=False):


def _extract_numbers_with_text_en(tokens, short_scale=True,
ordinals=False, fractional_numbers=True):
ordinals=False, fractional_numbers=True,
places=None):
"""
Extract all numbers from a list of Tokens, with the words that
represent them.
Expand All @@ -138,7 +140,8 @@ def _extract_numbers_with_text_en(tokens, short_scale=True,
while True:
to_replace = \
_extract_number_with_text_en(tokens, short_scale,
ordinals, fractional_numbers)
ordinals, fractional_numbers,
places=places)

if not to_replace:
break
Expand All @@ -156,7 +159,8 @@ def _extract_numbers_with_text_en(tokens, short_scale=True,


def _extract_number_with_text_en(tokens, short_scale=True,
ordinals=False, fractional_numbers=True):
ordinals=False, fractional_numbers=True,
places=None):
"""
This function extracts a number from a list of Tokens.

Expand All @@ -172,15 +176,17 @@ def _extract_number_with_text_en(tokens, short_scale=True,
"""
number, tokens = \
_extract_number_with_text_en_helper(tokens, short_scale,
ordinals, fractional_numbers)
ordinals, fractional_numbers,
places=places)
while tokens and tokens[0].word in _ARTICLES_EN:
tokens.pop(0)
return ReplaceableNumber(number, tokens)


def _extract_number_with_text_en_helper(tokens,
short_scale=True, ordinals=False,
fractional_numbers=True):
fractional_numbers=True,
places=None):
"""
Helper for _extract_number_with_text_en.

Expand All @@ -205,7 +211,8 @@ def _extract_number_with_text_en_helper(tokens,
return fraction, fraction_text

decimal, decimal_text = \
_extract_decimal_with_text_en(tokens, short_scale, ordinals)
_extract_decimal_with_text_en(
tokens, short_scale, ordinals, places=places)
if decimal:
return decimal, decimal_text

Expand Down Expand Up @@ -254,7 +261,7 @@ def _extract_fraction_with_text_en(tokens, short_scale, ordinals):
return None, None


def _extract_decimal_with_text_en(tokens, short_scale, ordinals):
def _extract_decimal_with_text_en(tokens, short_scale, ordinals, places=None):
"""
Extract decimal numbers from a string.

Expand All @@ -271,6 +278,7 @@ def _extract_decimal_with_text_en(tokens, short_scale, ordinals):
tokens [Token]: The text to parse.
short_scale boolean:
ordinals boolean:
places [int]: Number of decimal places to return

Returns:
(float, [Token])
Expand All @@ -281,24 +289,48 @@ def _extract_decimal_with_text_en(tokens, short_scale, ordinals):
for c in _DECIMAL_MARKER:
partitions = partition_list(tokens, lambda t: t.word == c)

if len(partitions) == 3:
if len(partitions) >= 3:
ChanceNCounter marked this conversation as resolved.
Show resolved Hide resolved
numbers1 = \
_extract_numbers_with_text_en(partitions[0], short_scale,
ordinals, fractional_numbers=False)
ordinals, fractional_numbers=False,
places=places)
ChanceNCounter marked this conversation as resolved.
Show resolved Hide resolved
numbers2 = \
_extract_numbers_with_text_en(partitions[2], short_scale,
ordinals, fractional_numbers=False)

ordinals, fractional_numbers=False,
places=places)
if not numbers1 or not numbers2:
return None, None

idx = 1
stop = False
while idx < len(numbers2) and not stop:
if numbers2[idx].tokens[0].index != numbers2[idx-1].tokens[0].index + 1 or \
numbers2[idx].value is None:
ChanceNCounter marked this conversation as resolved.
Show resolved Hide resolved
stop = True
else:
idx += 1
numbers2 = numbers2[:idx]

number = numbers1[-1]
decimal = numbers2[0]
# decimal = numbers2[0]

# TODO handle number dot number number number
ChanceNCounter marked this conversation as resolved.
Show resolved Hide resolved
if "." not in str(decimal.text):
return number.value + float('0.' + str(decimal.value)), \
number.tokens + partitions[1] + decimal.tokens
if "." not in str(numbers2[0].text):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be updated to check for the new _DECIMAL_MARKER?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By this point, _DECIMAL_MARKER has done its job. This bit is easier to look at in a debugger than it is to explain, but numbers2 is a list of ReplaceableNumber objects corresponding to the digits of our decimal part. It's going to be joined, then appended to a decimal point, then cast to a float and summed with the whole number part of our result.

The if statement here goes back to Core, and it makes sure the first element of numbers2 doesn't already have a decimal point. It's hypothetically possible, but I can't figure out how, and removing the if clause doesn't break any test cases. Still, I left it in place and only removed a superfluous variable.

I'll see if I can work backwards through git blame and ask whoever wrote the if statement. I was assuming it had something to do with the way the tokenizer handles certain input. If it's just a relic, I'm all for removing line 318 entirely.

Copy link
Collaborator

@JarbasAl JarbasAl Apr 28, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If it doesn't break test cases, I say let's remove it or find a test case for it.

return_value = float('0.' + "".join([str(
decimal.value) for decimal in numbers2]))
return_value = number.value + return_value
if return_value == int(return_value):
return_value = int(return_value)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I was calling an extract_decimal function I'd expect a float is returned. What's the benefit of returning an integer instead?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The condition should be `and not places`, good catch! That is, this is fundamentally a helper invoked by extract_number(). Unless the user has asked for decimal places, I had figured that no rounding means a whole number is just that. On the other hand, you're right, that doesn't work either: "one" is a whole number, and "one point zero" equals a whole number.

What if decimal_places=None does no rounding, decimal_places=0 always returns an int, and decimal_places > 0 does what it does?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍 Yeah that sounds intuitive to me


# out_part2 = partitions[2]
# for n in numbers2:
# out_part2[n.index] = n.value
ChanceNCounter marked this conversation as resolved.
Show resolved Hide resolved

return_tokens = number.tokens + partitions[1]
for n in numbers2:
return_tokens += n.tokens

return (round(return_value, places) if places else return_value), return_tokens
return None, None


Expand All @@ -319,8 +351,8 @@ def _extract_whole_number_with_text_en(tokens, short_scale, ordinals):
The value parsed, and tokens that it corresponds to.

"""
multiplies, string_num_ordinal, string_num_scale = \
_initialize_number_data(short_scale)
multiplies, string_num_ordinal, string_num_scale = _initialize_number_data(
short_scale)

number_words = [] # type: [Token]
val = False
Expand Down Expand Up @@ -560,7 +592,7 @@ def _initialize_number_data(short_scale):
return multiplies, string_num_ordinal_en, string_num_scale_en


def extractnumber_en(text, short_scale=True, ordinals=False):
def extractnumber_en(text, short_scale=True, ordinals=False, decimal_places=None):
"""
This function extracts a number from a text string,
handles pronunciations in long scale and short scale
Expand All @@ -571,13 +603,15 @@ def extractnumber_en(text, short_scale=True, ordinals=False):
text (str): the string to normalize
short_scale (bool): use short scale if True, long scale if False
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
decimal_places (int or None): rounds to # decimal places. uses builtin round()
Returns:
(int) or (float) or False: The extracted number or False if no number
was found

"""
return _extract_number_with_text_en(tokenize(text.lower()),
short_scale, ordinals).value
short_scale, ordinals,
places=decimal_places).value


def extract_duration_en(text):
Expand Down Expand Up @@ -1476,7 +1510,7 @@ def isFractional_en(input_str, short_scale=True):
return False


def extract_numbers_en(text, short_scale=True, ordinals=False):
def extract_numbers_en(text, short_scale=True, ordinals=False, decimal_places=None):
"""
Takes in a string and extracts a list of numbers.

Expand All @@ -1487,11 +1521,12 @@ def extract_numbers_en(text, short_scale=True, ordinals=False):
is now common in most English speaking countries.
See https://en.wikipedia.org/wiki/Names_of_large_numbers
ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
decimal_places (int or False): rounds to # decimal places. uses builtin round()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One more int or False

Returns:
list: list of extracted numbers as floats
"""
results = _extract_numbers_with_text_en(tokenize(text),
short_scale, ordinals)
short_scale, ordinals, places=decimal_places)
return [float(result.value) for result in results]


Expand Down
17 changes: 13 additions & 4 deletions lingua_franca/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,12 @@ def match_one(query, choices):
else:
return best

# TODO update these docstrings when decimal_places has been implemented
# in all parsers

def extract_numbers(text, short_scale=True, ordinals=False, lang=None):

def extract_numbers(text, short_scale=True, ordinals=False, lang=None,
decimal_places=False):
"""
Takes in a string and extracts a list of numbers.

Expand All @@ -90,12 +94,14 @@ def extract_numbers(text, short_scale=True, ordinals=False, lang=None):
See https://en.wikipedia.org/wiki/Names_of_large_numbers
ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
lang (str): the BCP-47 code for the language to use, None uses default
decimal_places (int or False): rounds to # decimal places. Not yet implemented
in all languages. False performs no rounding. Uses builtin round()
Returns:
list: list of extracted numbers as floats, or empty list if none found
"""
lang_code = get_primary_lang_code(lang)
if lang_code == "en":
return extract_numbers_en(text, short_scale, ordinals)
return extract_numbers_en(text, short_scale, ordinals, decimal_places)
elif lang_code == "de":
return extract_numbers_de(text, short_scale, ordinals)
elif lang_code == "fr":
Expand All @@ -112,7 +118,8 @@ def extract_numbers(text, short_scale=True, ordinals=False, lang=None):
return []


def extract_number(text, short_scale=True, ordinals=False, lang=None):
def extract_number(text, short_scale=True, ordinals=False, lang=None,
decimal_places=False):
"""Takes in a string and extracts a number.

Args:
Expand All @@ -123,14 +130,16 @@ def extract_number(text, short_scale=True, ordinals=False, lang=None):
See https://en.wikipedia.org/wiki/Names_of_large_numbers
ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
lang (str): the BCP-47 code for the language to use, None uses default
decimal_places (int or False): rounds to # decimal places. Not yet implemented
ChanceNCounter marked this conversation as resolved.
Show resolved Hide resolved
in all languages. False performs no rounding. Uses builtin round()
Returns:
(int, float or False): The number extracted or False if the input
text contains no numbers
"""
lang_code = get_primary_lang_code(lang)
if lang_code == "en":
return extractnumber_en(text, short_scale=short_scale,
ordinals=ordinals)
ordinals=ordinals, decimal_places=decimal_places)
elif lang_code == "es":
return extractnumber_es(text)
elif lang_code == "pt":
Expand Down
17 changes: 13 additions & 4 deletions test/test_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,10 +151,12 @@ def test_extract_number(self):
self.assertEqual(extract_number("eight hundred trillion two hundred \
fifty seven"), 800000000000257.0)

# TODO handle this case
# self.assertEqual(
# extract_number("6 dot six six six"),
# 6.666)
self.assertEqual(extract_number("6 dot six six six"), 6.666)
self.assertEqual(extract_number(
"6 dot six six six", decimal_places=2), round(6.666, 2))
self.assertEqual(extract_number(
"6 point seventy", decimal_places=2), 6.7)

self.assertTrue(extract_number("The tennis player is fast") is False)
self.assertTrue(extract_number("fraggle") is False)

Expand Down Expand Up @@ -735,6 +737,13 @@ def test_multiple_numbers(self):
self.assertEqual(extract_numbers("this is a seven eight nine and a"
" half test"),
[7.0, 8.0, 9.5])
self.assertEqual(extract_numbers("this is a six point five seven nine"
" bingo ten nancy forty six test"),
[6.579, 10.0, 46.0])
self.assertEqual(extract_numbers("this is a six point five seven nine"
" bingo ten nancy forty six test"
" with decimal rounding", decimal_places=2),
[round(6.579, 2), 10, 46])

def test_contractions(self):
self.assertEqual(normalize("ain't"), "is not")
Expand Down