Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat/number_spans #38

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion lingua_franca/lang/common_data_en.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@


# negate next number (-2 = 0 - 2)
_NEGATIVES_EN = {"negative", "minus"}
_NEGATIVES_EN = {"negative", "minus", "-"}

# sum the next number (twenty two = 20 + 2)
_SUMS_EN = {'twenty', '20', 'thirty', '30', 'forty', '40', 'fifty', '50',
Expand Down
12 changes: 12 additions & 0 deletions lingua_franca/lang/parse_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,18 @@ def normalize(self, utterance="", remove_articles=None):
return utterance


def normalize_decimals(text, decimal):
    """
    Replace 'decimal' with decimal periods so Python can floatify them.

    Only separators that sit directly between two runs of digits, with
    the whole number delimited by word boundaries, are rewritten
    (e.g. '3,5' -> '3.5' when decimal is ','). Every other occurrence of
    'decimal' in the text is left alone.

    Args:
        text (str): the string that may contain decimal numbers
        decimal (str): the separator used as decimal point in 'text'
    Returns:
        (str): 'text' with the separator inside matched numbers replaced
               by '.'
    """
    # Nothing to do for an empty separator (it would match between any two
    # digits) or when the separator already is a period.
    if not decimal or decimal == '.':
        return text
    # Escape the separator so regex metacharacters such as '|', '+' or '*'
    # are matched literally instead of breaking or altering the pattern.
    number_pattern = re.compile(
        r"\b(\d+)" + re.escape(decimal) + r"(\d+)\b")
    # Substitute in place at each match position; a plain str.replace on the
    # matched substring would also rewrite identical substrings that are not
    # word-boundary-delimited numbers (e.g. the '1,5' inside 'a1,5b').
    return number_pattern.sub(r"\1.\2", text)


def match_yes_or_no(text, lang):
resource_file = resolve_resource_file(f"text/{lang}/yesno.json")
if not resource_file:
Expand Down
31 changes: 21 additions & 10 deletions lingua_franca/lang/parse_cs.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
_LONG_ORDINAL_CS, _LONG_SCALE_CS, _SHORT_SCALE_CS, _SHORT_ORDINAL_CS, \
_FRACTION_STRING_CS, _MONTHS_CONVERSION, _MONTHS_CZECH, _TIME_UNITS_CONVERSION, \
_ORDINAL_BASE_CS # _ARTICLES_CS

from lingua_franca.lang.parse_common import normalize_decimals
import re
import json
from lingua_franca import resolve_resource_file
Expand Down Expand Up @@ -579,7 +579,7 @@ def _initialize_number_data(short_scale):
return multiplies, string_num_ordinal_cs, string_num_scale_cs


def extract_number_cs(text, short_scale=True, ordinals=False):
def extract_number_cs(text, short_scale=True, ordinals=False, decimal='.'):
"""
This function extracts a number from a text string,
handles pronunciations in long scale and short scale
Expand All @@ -590,11 +590,17 @@ def extract_number_cs(text, short_scale=True, ordinals=False):
text (str): the string to normalize
short_scale (bool): use short scale if True, long scale if False
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
decimal (str): character to use as decimal point. defaults to '.'
Returns:
(int) or (float) or False: The extracted number or False if no number
was found
Note:
will always extract numbers formatted with a decimal dot/full stop,
such as '3.5', even if 'decimal' is specified.

"""
if decimal != '.':
text = normalize_decimals(text, decimal)
return _extract_number_with_text_cs(tokenize(text.lower()),
short_scale, ordinals).value

Expand Down Expand Up @@ -1560,20 +1566,25 @@ def isFractional_cs(input_str, short_scale=True):
return False


def extract_numbers_cs(text, short_scale=True, ordinals=False):
def extract_numbers_cs(text, short_scale=True, ordinals=False, decimal='.'):
"""
Takes in a string and extracts a list of numbers.

Args:
text (str): the string to extract a number from
short_scale (bool): Use "short scale" or "long scale" for large
numbers -- over a million. The default is short scale, which
is now common in most English speaking countries.
See https://en.wikipedia.org/wiki/Names_of_large_numbers
ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
text (str): the string to normalize
short_scale (bool): use short scale if True, long scale if False
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
decimal (str): character to use as decimal point. defaults to '.'
Returns:
list: list of extracted numbers as floats
(int) or (float) or False: The extracted number or False if no number
was found
Note:
will always extract numbers formatted with a decimal dot/full stop,
such as '3.5', even if 'decimal' is specified.

"""
if decimal != '.':
text = normalize_decimals(text, decimal)
results = _extract_numbers_with_text_cs(tokenize(text),
short_scale, ordinals)
return [float(result.value) for result in results]
Expand Down
48 changes: 31 additions & 17 deletions lingua_franca/lang/parse_da.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,22 +20,31 @@
from lingua_franca.lang.common_data_da import _DA_NUMBERS
from lingua_franca.lang.format_da import pronounce_number_da
from lingua_franca.time import now_local
from lingua_franca.lang.parse_common import normalize_decimals


def extract_number_da(text, short_scale=True, ordinals=False):
def extract_number_da(text, short_scale=True, ordinals=False, decimal='.'):
"""
This function prepares the given text for parsing by making
numbers consistent, getting rid of contractions, etc.
This function extracts a number from a text string,
handles pronunciations in long scale and short scale

https://en.wikipedia.org/wiki/Names_of_large_numbers

Args:
text (str): the string to normalize
short_scale (bool): use short scale if True, long scale if False
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
decimal (str): character to use as decimal point. defaults to '.'
Returns:
(int) or (float): The value of extracted number


undefined articles cannot be suppressed in German:
'ein Pferd' means 'one horse' and 'a horse'
(int) or (float) or False: The extracted number or False if no number
was found
Note:
will always extract numbers formatted with a decimal dot/full stop,
such as '3.5', even if 'decimal' is specified.

"""
if decimal != '.':
text = normalize_decimals(text, decimal)
# TODO: short_scale and ordinals don't do anything here.
# The parameters are present in the function signature for API compatibility
# reasons.
Expand Down Expand Up @@ -869,20 +878,25 @@ def normalize_da(text, remove_articles=True):
return normalized[1:] # strip the initial space


def extract_numbers_da(text, short_scale=True, ordinals=False):
def extract_numbers_da(text, short_scale=True, ordinals=False, decimal='.'):
"""
Takes in a string and extracts a list of numbers.

Args:
text (str): the string to extract a number from
short_scale (bool): Use "short scale" or "long scale" for large
numbers -- over a million. The default is short scale, which
is now common in most English speaking countries.
See https://en.wikipedia.org/wiki/Names_of_large_numbers
ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
Args:
text (str): the string to normalize
short_scale (bool): use short scale if True, long scale if False
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
decimal (str): character to use as decimal point. defaults to '.'
Returns:
list: list of extracted numbers as floats
(int) or (float) or False: The extracted number or False if no number
was found
Note:
will always extract numbers formatted with a decimal dot/full stop,
such as '3.5', even if 'decimal' is specified.

"""
if decimal != '.':
text = normalize_decimals(text, decimal)
return extract_numbers_generic(text, pronounce_number_da, extract_number_da,
short_scale=short_scale, ordinals=ordinals)

Expand Down
59 changes: 38 additions & 21 deletions lingua_franca/lang/parse_de.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from lingua_franca.lang.common_data_de import _DE_NUMBERS
from lingua_franca.lang.format_de import pronounce_number_de
from lingua_franca.time import now_local
from lingua_franca.lang.parse_common import normalize_decimals


de_numbers = {
Expand Down Expand Up @@ -143,20 +144,28 @@ def repl(match):
return (duration, text)


def extract_number_de(text, short_scale=True, ordinals=False):
def extract_number_de(text, short_scale=True, ordinals=False, decimal='.'):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The default value of `decimal` should be a comma, as that is the common form.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is part of #43, which was accidentally dragged into this PR.

"""
This function prepares the given text for parsing by making
numbers consistent, getting rid of contractions, etc.
This function extracts a number from a text string,
handles pronunciations in long scale and short scale

https://en.wikipedia.org/wiki/Names_of_large_numbers

Args:
text (str): the string to normalize
short_scale (bool): use short scale if True, long scale if False
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
decimal (str): character to use as decimal point. defaults to '.'
Returns:
(int) or (float): The value of extracted number


undefined articles cannot be suppressed in German:
'ein Pferd' means 'one horse' and 'a horse'
(int) or (float) or False: The extracted number or False if no number
was found
Note:
will always extract numbers formatted with a decimal dot/full stop,
such as '3.5', even if 'decimal' is specified.

"""
if decimal != '.':
text = normalize_decimals(text, decimal)
# TODO: short_scale and ordinals don't do anything here.
# The parameters are present in the function signature for API compatibility
# reasons.
Expand Down Expand Up @@ -1003,20 +1012,28 @@ def normalize_de(text, remove_articles=True):
return normalized[1:] # strip the initial space


def extract_numbers_de(text, short_scale=True, ordinals=False):
"""
Takes in a string and extracts a list of numbers.

Args:
text (str): the string to extract a number from
short_scale (bool): Use "short scale" or "long scale" for large
numbers -- over a million. The default is short scale, which
is now common in most English speaking countries.
See https://en.wikipedia.org/wiki/Names_of_large_numbers
ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
Returns:
list: list of extracted numbers as floats
def extract_numbers_de(text, short_scale=True, ordinals=False, decimal='.'):
"""
This function extracts a number from a text string,
handles pronunciations in long scale and short scale

https://en.wikipedia.org/wiki/Names_of_large_numbers

Args:
text (str): the string to normalize
short_scale (bool): use short scale if True, long scale if False
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
decimal (str): character to use as decimal point. defaults to '.'
Returns:
(int) or (float) or False: The extracted number or False if no number
was found
Note:
will always extract numbers formatted with a decimal dot/full stop,
such as '3.5', even if 'decimal' is specified.

"""
if decimal != '.':
text = normalize_decimals(text, decimal)
return extract_numbers_generic(text, pronounce_number_de, extract_number_de,
short_scale=short_scale, ordinals=ordinals)

Expand Down
Loading