Skip to content

Commit

Permalink
support decimal markers
Browse files Browse the repository at this point in the history
rebase of MycroftAI#69
  • Loading branch information
JarbasAl committed Nov 27, 2022
1 parent 08ed3c6 commit 6626587
Show file tree
Hide file tree
Showing 18 changed files with 350 additions and 133 deletions.
8 changes: 8 additions & 0 deletions lingua_franca/internal.py
Original file line number Diff line number Diff line change
Expand Up @@ -561,6 +561,14 @@ def _call_localized_function(func, *args, **kwargs):
# If we didn't find a localized function to correspond with
# the wrapped function, we cached NotImplementedError in its
# place.

# First, account for the function not being present in any module,
# meaning all modules are falling back to a catch-all parser.
# This usually means the function will only need localization
# in future languages that are not currently supported.
if func_name not in _localized_functions[_module_name][lang_code]:
raise FunctionNotLocalizedError(func_name, lang_code)

loc_signature = _localized_functions[_module_name][lang_code][func_name]
if isinstance(loc_signature, type(NotImplementedError())):
raise loc_signature
Expand Down
12 changes: 12 additions & 0 deletions lingua_franca/lang/parse_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,18 @@ def normalize(self, utterance="", remove_articles=None):
return utterance


def normalize_decimals(text, decimal, lang=""):
    """
    Replace a custom decimal marker inside numbers with a period so
    Python can floatify them.

    Only standalone numbers of the form <digits><decimal><digits>
    (delimited by word boundaries) are rewritten; any other occurrence
    of 'decimal' in the text is left untouched.

    Args:
        text (str): the string to sanitize
        decimal (str): the decimal marker used in 'text', e.g. ','
        lang (str): currently unused by this function
    Returns:
        str: 'text' with the marker of every matched number replaced
             by '.'
    """
    # An empty marker would make the pattern match any run of two or more
    # digits and split it with a period, so there is nothing sensible to do.
    if not decimal:
        return text
    # Escape the marker: it is literal text, not a regex fragment
    # (markers such as '|', '+' or '?' would otherwise break the pattern).
    number_with_marker = re.compile(
        r"\b(\d+)" + re.escape(decimal) + r"(\d+)\b")
    # Substitute in place so that only the matched spans change, rather
    # than every occurrence of the matched substring elsewhere in the text.
    return number_with_marker.sub(r"\1.\2", text)


def match_yes_or_no(text, lang):
resource_file = resolve_resource_file(f"text/{lang}/yesno.json")
if not resource_file:
Expand Down
28 changes: 19 additions & 9 deletions lingua_franca/lang/parse_cs.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
_LONG_ORDINAL_CS, _LONG_SCALE_CS, _SHORT_SCALE_CS, _SHORT_ORDINAL_CS, \
_FRACTION_STRING_CS, _MONTHS_CONVERSION, _MONTHS_CZECH, _TIME_UNITS_CONVERSION, \
_ORDINAL_BASE_CS # _ARTICLES_CS

from lingua_franca.lang.parse_common import normalize_decimals
import re
import json
from lingua_franca import resolve_resource_file
Expand Down Expand Up @@ -579,7 +579,7 @@ def _initialize_number_data(short_scale):
return multiplies, string_num_ordinal_cs, string_num_scale_cs


def extract_number_cs(text, short_scale=True, ordinals=False):
def extract_number_cs(text, short_scale=True, ordinals=False, decimal='.'):
"""
This function extracts a number from a text string,
handles pronunciations in long scale and short scale
Expand All @@ -590,11 +590,17 @@ def extract_number_cs(text, short_scale=True, ordinals=False):
text (str): the string to normalize
short_scale (bool): use short scale if True, long scale if False
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
decimal (str): character to use as decimal point. defaults to '.'
Returns:
(int) or (float) or False: The extracted number or False if no number
was found
Note:
will always extract numbers formatted with a decimal dot/full stop,
such as '3.5', even if 'decimal' is specified.
"""
if decimal != '.':
text = normalize_decimals(text, decimal)
return _extract_number_with_text_cs(tokenize(text.lower()),
short_scale, ordinals).value

Expand Down Expand Up @@ -1560,20 +1566,24 @@ def isFractional_cs(input_str, short_scale=True):
return False


def extract_numbers_cs(text, short_scale=True, ordinals=False):
def extract_numbers_cs(text, short_scale=True, ordinals=False, decimal='.'):
"""
Takes in a string and extracts a list of numbers.
Args:
text (str): the string to extract a number from
short_scale (bool): Use "short scale" or "long scale" for large
numbers -- over a million. The default is short scale, which
is now common in most English speaking countries.
See https://en.wikipedia.org/wiki/Names_of_large_numbers
ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
text (str): the string to normalize
short_scale (bool): use short scale if True, long scale if False
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
decimal (str): character to use as decimal point. defaults to '.'
Returns:
list: list of extracted numbers as floats
Note:
will always extract numbers formatted with a decimal dot/full stop,
such as '3.5', even if 'decimal' is specified.
"""
if decimal != '.':
text = normalize_decimals(text, decimal)
results = _extract_numbers_with_text_cs(tokenize(text),
short_scale, ordinals)
return [float(result.value) for result in results]
Expand Down
45 changes: 29 additions & 16 deletions lingua_franca/lang/parse_da.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,22 +20,31 @@
from lingua_franca.lang.common_data_da import _DA_NUMBERS
from lingua_franca.lang.format_da import pronounce_number_da
from lingua_franca.time import now_local
from lingua_franca.lang.parse_common import normalize_decimals


def extract_number_da(text, short_scale=True, ordinals=False):
def extract_number_da(text, short_scale=True, ordinals=False, decimal='.'):
"""
This function prepares the given text for parsing by making
numbers consistent, getting rid of contractions, etc.
This function extracts a number from a text string,
handles pronunciations in long scale and short scale
https://en.wikipedia.org/wiki/Names_of_large_numbers
Args:
text (str): the string to normalize
short_scale (bool): use short scale if True, long scale if False
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
decimal (str): character to use as decimal point. defaults to '.'
Returns:
(int) or (float): The value of extracted number
undefined articles cannot be suppressed in German:
'ein Pferd' means 'one horse' and 'a horse'
(int) or (float) or False: The extracted number or False if no number
was found
Note:
will always extract numbers formatted with a decimal dot/full stop,
such as '3.5', even if 'decimal' is specified.
"""
if decimal != '.':
text = normalize_decimals(text, decimal)
# TODO: short_scale and ordinals don't do anything here.
# The parameters are present in the function signature for API compatibility
# reasons.
Expand Down Expand Up @@ -869,20 +878,24 @@ def normalize_da(text, remove_articles=True):
return normalized[1:] # strip the initial space


def extract_numbers_da(text, short_scale=True, ordinals=False):
def extract_numbers_da(text, short_scale=True, ordinals=False, decimal='.'):
"""
Takes in a string and extracts a list of numbers.
Args:
text (str): the string to extract a number from
short_scale (bool): Use "short scale" or "long scale" for large
numbers -- over a million. The default is short scale, which
is now common in most English speaking countries.
See https://en.wikipedia.org/wiki/Names_of_large_numbers
ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
Args:
text (str): the string to normalize
short_scale (bool): use short scale if True, long scale if False
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
decimal (str): character to use as decimal point. defaults to '.'
Returns:
list: list of extracted numbers as floats
Note:
will always extract numbers formatted with a decimal dot/full stop,
such as '3.5', even if 'decimal' is specified.
"""
if decimal != '.':
text = normalize_decimals(text, decimal)
return extract_numbers_generic(text, pronounce_number_da, extract_number_da,
short_scale=short_scale, ordinals=ordinals)

Expand Down
58 changes: 37 additions & 21 deletions lingua_franca/lang/parse_de.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from lingua_franca.lang.common_data_de import _DE_NUMBERS
from lingua_franca.lang.format_de import pronounce_number_de
from lingua_franca.time import now_local
from lingua_franca.lang.parse_common import normalize_decimals


de_numbers = {
Expand Down Expand Up @@ -143,20 +144,28 @@ def repl(match):
return (duration, text)


def extract_number_de(text, short_scale=True, ordinals=False):
def extract_number_de(text, short_scale=True, ordinals=False, decimal='.'):
"""
This function prepares the given text for parsing by making
numbers consistent, getting rid of contractions, etc.
This function extracts a number from a text string,
handles pronunciations in long scale and short scale
https://en.wikipedia.org/wiki/Names_of_large_numbers
Args:
text (str): the string to normalize
short_scale (bool): use short scale if True, long scale if False
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
decimal (str): character to use as decimal point. defaults to '.'
Returns:
(int) or (float): The value of extracted number
undefined articles cannot be suppressed in German:
'ein Pferd' means 'one horse' and 'a horse'
(int) or (float) or False: The extracted number or False if no number
was found
Note:
will always extract numbers formatted with a decimal dot/full stop,
such as '3.5', even if 'decimal' is specified.
"""
if decimal != '.':
text = normalize_decimals(text, decimal)
# TODO: short_scale and ordinals don't do anything here.
# The parameters are present in the function signature for API compatibility
# reasons.
Expand Down Expand Up @@ -1003,20 +1012,27 @@ def normalize_de(text, remove_articles=True):
return normalized[1:] # strip the initial space


def extract_numbers_de(text, short_scale=True, ordinals=False):
"""
Takes in a string and extracts a list of numbers.
Args:
text (str): the string to extract a number from
short_scale (bool): Use "short scale" or "long scale" for large
numbers -- over a million. The default is short scale, which
is now common in most English speaking countries.
See https://en.wikipedia.org/wiki/Names_of_large_numbers
ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
Returns:
list: list of extracted numbers as floats
def extract_numbers_de(text, short_scale=True, ordinals=False, decimal='.'):
"""
This function extracts a number from a text string,
handles pronunciations in long scale and short scale
https://en.wikipedia.org/wiki/Names_of_large_numbers
Args:
text (str): the string to normalize
short_scale (bool): use short scale if True, long scale if False
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
decimal (str): character to use as decimal point. defaults to '.'
Returns:
list: list of extracted numbers as floats
Note:
will always extract numbers formatted with a decimal dot/full stop,
such as '3.5', even if 'decimal' is specified.
"""
if decimal != '.':
text = normalize_decimals(text, decimal)
return extract_numbers_generic(text, pronounce_number_de, extract_number_de,
short_scale=short_scale, ordinals=ordinals)

Expand Down
17 changes: 15 additions & 2 deletions lingua_franca/lang/parse_en.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from lingua_franca.lang.parse_common import is_numeric, look_for_fractions, \
invert_dict, ReplaceableNumber, partition_list, tokenize, Token, Normalizer
from lingua_franca.time import now_local
from lingua_franca.lang.parse_common import normalize_decimals


def _convert_words_to_numbers_en(text, short_scale=True, ordinals=False):
Expand Down Expand Up @@ -529,7 +530,7 @@ def _initialize_number_data_en(short_scale, speech=True):
return multiplies, string_num_ordinal_en, string_num_scale_en


def extract_number_en(text, short_scale=True, ordinals=False):
def extract_number_en(text, short_scale=True, ordinals=False, decimal='.'):
"""
This function extracts a number from a text string,
handles pronunciations in long scale and short scale
Expand All @@ -540,11 +541,17 @@ def extract_number_en(text, short_scale=True, ordinals=False):
text (str): the string to normalize
short_scale (bool): use short scale if True, long scale if False
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
decimal (str): character to use as decimal point. defaults to '.'
Returns:
(int) or (float) or False: The extracted number or False if no number
was found
Note:
will always extract numbers formatted with a decimal dot/full stop,
such as '3.5', even if 'decimal' is specified.
"""
if decimal != '.':
text = normalize_decimals(text, decimal)
return _extract_number_with_text_en(tokenize(text.lower()),
short_scale, ordinals).value

Expand Down Expand Up @@ -1655,7 +1662,7 @@ def is_fractional_en(input_str, short_scale=True, spoken=True):
return False


def extract_numbers_en(text, short_scale=True, ordinals=False):
def extract_numbers_en(text, short_scale=True, ordinals=False, decimal='.'):
"""
Takes in a string and extracts a list of numbers.
Expand All @@ -1666,9 +1673,15 @@ def extract_numbers_en(text, short_scale=True, ordinals=False):
is now common in most English speaking countries.
See https://en.wikipedia.org/wiki/Names_of_large_numbers
ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
decimal (str): character to use as decimal point. defaults to '.'
Returns:
list: list of extracted numbers as floats
Note:
will always extract numbers formatted with a decimal dot/full stop,
such as '3.5', even if 'decimal' is specified.
"""
if decimal != '.':
text = normalize_decimals(text, decimal)
results = _extract_numbers_with_text_en(tokenize(text),
short_scale, ordinals)
return [float(result.value) for result in results]
Expand Down
41 changes: 29 additions & 12 deletions lingua_franca/lang/parse_es.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from lingua_franca.lang.format_es import pronounce_number_es
from lingua_franca.lang.parse_common import *
from lingua_franca.lang.common_data_es import _ARTICLES_ES, _STRING_NUM_ES
from lingua_franca.lang.parse_common import normalize_decimals


def is_fractional_es(input_str, short_scale=True):
Expand Down Expand Up @@ -56,16 +57,28 @@ def is_fractional_es(input_str, short_scale=True):
return False


def extract_number_es(text, short_scale=True, ordinals=False):
def extract_number_es(text, short_scale=True, ordinals=False, decimal='.'):
"""
This function prepares the given text for parsing by making
numbers consistent, getting rid of contractions, etc.
This function extracts a number from a text string,
handles pronunciations in long scale and short scale
https://en.wikipedia.org/wiki/Names_of_large_numbers
Args:
text (str): the string to normalize
short_scale (bool): use short scale if True, long scale if False
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
decimal (str): character to use as decimal point. defaults to '.'
Returns:
(int) or (float): The value of extracted number
(int) or (float) or False: The extracted number or False if no number
was found
Note:
will always extract numbers formatted with a decimal dot/full stop,
such as '3.5', even if 'decimal' is specified.
"""
if decimal != '.':
text = normalize_decimals(text, decimal)
# TODO: short_scale and ordinals don't do anything here.
# The parameters are present in the function signature for API compatibility
# reasons.
Expand Down Expand Up @@ -268,20 +281,24 @@ def es_number(i):
return es_number(i)


def extract_numbers_es(text, short_scale=True, ordinals=False):
def extract_numbers_es(text, short_scale=True, ordinals=False, decimal='.'):
"""
Takes in a string and extracts a list of numbers.
Args:
text (str): the string to extract a number from
short_scale (bool): Use "short scale" or "long scale" for large
numbers -- over a million. The default is short scale, which
is now common in most English speaking countries.
See https://en.wikipedia.org/wiki/Names_of_large_numbers
ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
Args:
text (str): the string to normalize
short_scale (bool): use short scale if True, long scale if False
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
decimal (str): character to use as decimal point. defaults to '.'
Returns:
list: list of extracted numbers as floats
Note:
will always extract numbers formatted with a decimal dot/full stop,
such as '3.5', even if 'decimal' is specified.
"""
if decimal != '.':
text = normalize_decimals(text, decimal)
return extract_numbers_generic(text, pronounce_number_es,
extract_number_es, short_scale=short_scale,
ordinals=ordinals)
Expand Down
Loading

0 comments on commit 6626587

Please sign in to comment.