From 66265876997cc16864e5542d319930e5ce81a902 Mon Sep 17 00:00:00 2001 From: jarbasai Date: Sun, 9 May 2021 13:20:35 +0100 Subject: [PATCH] support decimal markers rebase of https://github.com/MycroftAI/lingua-franca/pull/69 --- lingua_franca/internal.py | 8 +++++ lingua_franca/lang/parse_common.py | 12 +++++++ lingua_franca/lang/parse_cs.py | 28 ++++++++++----- lingua_franca/lang/parse_da.py | 45 ++++++++++++++--------- lingua_franca/lang/parse_de.py | 58 +++++++++++++++++++----------- lingua_franca/lang/parse_en.py | 17 +++++++-- lingua_franca/lang/parse_es.py | 41 ++++++++++++++------- lingua_franca/lang/parse_eu.py | 24 ++++++++----- lingua_franca/lang/parse_fa.py | 29 ++++++++++----- lingua_franca/lang/parse_fr.py | 44 ++++++++++++++++------- lingua_franca/lang/parse_it.py | 29 ++++++++++----- lingua_franca/lang/parse_nl.py | 34 ++++++++++++------ lingua_franca/lang/parse_pl.py | 32 ++++++++++++----- lingua_franca/lang/parse_pt.py | 21 ++++++++--- lingua_franca/lang/parse_ru.py | 24 ++++++++----- lingua_franca/lang/parse_sv.py | 11 +++++- lingua_franca/parse.py | 15 ++++++-- test/unittests/test_parse_en.py | 11 ++++++ 18 files changed, 350 insertions(+), 133 deletions(-) diff --git a/lingua_franca/internal.py b/lingua_franca/internal.py index 3e389181..497c0db1 100644 --- a/lingua_franca/internal.py +++ b/lingua_franca/internal.py @@ -561,6 +561,14 @@ def _call_localized_function(func, *args, **kwargs): # If we didn't find a localized function to correspond with # the wrapped function, we cached NotImplementedError in its # place. 
+ + # first account for the function not being present in any + # module, meaning all modules are falling back to a catch all + # parser, this usually means the function will need localization + # only in future languages not currently supported + if func_name not in _localized_functions[_module_name][lang_code]: + raise FunctionNotLocalizedError(func_name, lang_code) + loc_signature = _localized_functions[_module_name][lang_code][func_name] if isinstance(loc_signature, type(NotImplementedError())): raise loc_signature diff --git a/lingua_franca/lang/parse_common.py b/lingua_franca/lang/parse_common.py index 97cf5be7..f140ac9d 100644 --- a/lingua_franca/lang/parse_common.py +++ b/lingua_franca/lang/parse_common.py @@ -192,6 +192,18 @@ def normalize(self, utterance="", remove_articles=None): return utterance +def normalize_decimals(text, decimal, lang=""): + """ + Replace 'decimal' with decimal periods so Python can floatify them + """ + regex = r"\b\d+" + decimal + r"{1}\d+\b" + sanitize_decimals = re.compile(regex) + for _, match in enumerate(re.finditer(sanitize_decimals, text)): + text = text.replace(match.group( + 0), match.group(0).replace(decimal, '.')) + return text + + def match_yes_or_no(text, lang): resource_file = resolve_resource_file(f"text/{lang}/yesno.json") if not resource_file: diff --git a/lingua_franca/lang/parse_cs.py b/lingua_franca/lang/parse_cs.py index e0144b02..7119f738 100644 --- a/lingua_franca/lang/parse_cs.py +++ b/lingua_franca/lang/parse_cs.py @@ -23,7 +23,7 @@ _LONG_ORDINAL_CS, _LONG_SCALE_CS, _SHORT_SCALE_CS, _SHORT_ORDINAL_CS, \ _FRACTION_STRING_CS, _MONTHS_CONVERSION, _MONTHS_CZECH, _TIME_UNITS_CONVERSION, \ _ORDINAL_BASE_CS # _ARTICLES_CS - +from lingua_franca.lang.parse_common import normalize_decimals import re import json from lingua_franca import resolve_resource_file @@ -579,7 +579,7 @@ def _initialize_number_data(short_scale): return multiplies, string_num_ordinal_cs, string_num_scale_cs -def extract_number_cs(text, 
short_scale=True, ordinals=False): +def extract_number_cs(text, short_scale=True, ordinals=False, decimal='.'): """ This function extracts a number from a text string, handles pronunciations in long scale and short scale @@ -590,11 +590,17 @@ def extract_number_cs(text, short_scale=True, ordinals=False): text (str): the string to normalize short_scale (bool): use short scale if True, long scale if False ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: (int) or (float) or False: The extracted number or False if no number was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) return _extract_number_with_text_cs(tokenize(text.lower()), short_scale, ordinals).value @@ -1560,20 +1566,24 @@ def isFractional_cs(input_str, short_scale=True): return False -def extract_numbers_cs(text, short_scale=True, ordinals=False): +def extract_numbers_cs(text, short_scale=True, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: list: list of extracted numbers as floats + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. 
+ """ + if decimal != '.': + text = normalize_decimals(text, decimal) results = _extract_numbers_with_text_cs(tokenize(text), short_scale, ordinals) return [float(result.value) for result in results] diff --git a/lingua_franca/lang/parse_da.py b/lingua_franca/lang/parse_da.py index 14b18132..c1c30020 100644 --- a/lingua_franca/lang/parse_da.py +++ b/lingua_franca/lang/parse_da.py @@ -20,22 +20,31 @@ from lingua_franca.lang.common_data_da import _DA_NUMBERS from lingua_franca.lang.format_da import pronounce_number_da from lingua_franca.time import now_local +from lingua_franca.lang.parse_common import normalize_decimals -def extract_number_da(text, short_scale=True, ordinals=False): +def extract_number_da(text, short_scale=True, ordinals=False, decimal='.'): """ - This function prepares the given text for parsing by making - numbers consistent, getting rid of contractions, etc. + This function extracts a number from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers + Args: text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - (int) or (float): The value of extracted number - - - undefined articles cannot be suppressed in German: - 'ein Pferd' means 'one horse' and 'a horse' + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) # TODO: short_scale and ordinals don't do anything here. # The parameters are present in the function signature for API compatibility # reasons. 
@@ -869,20 +878,24 @@ def normalize_da(text, remove_articles=True): return normalized[1:] # strip the initial space -def extract_numbers_da(text, short_scale=True, ordinals=False): +def extract_numbers_da(text, short_scale=True, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. - Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + Args: + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: list: list of extracted numbers as floats + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. 
+ """ + if decimal != '.': + text = normalize_decimals(text, decimal) return extract_numbers_generic(text, pronounce_number_da, extract_number_da, short_scale=short_scale, ordinals=ordinals) diff --git a/lingua_franca/lang/parse_de.py b/lingua_franca/lang/parse_de.py index 95fda48e..2af17e77 100644 --- a/lingua_franca/lang/parse_de.py +++ b/lingua_franca/lang/parse_de.py @@ -21,6 +21,7 @@ from lingua_franca.lang.common_data_de import _DE_NUMBERS from lingua_franca.lang.format_de import pronounce_number_de from lingua_franca.time import now_local +from lingua_franca.lang.parse_common import normalize_decimals de_numbers = { @@ -143,20 +144,28 @@ def repl(match): return (duration, text) -def extract_number_de(text, short_scale=True, ordinals=False): +def extract_number_de(text, short_scale=True, ordinals=False, decimal='.'): """ - This function prepares the given text for parsing by making - numbers consistent, getting rid of contractions, etc. + This function extracts a number from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers + Args: text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - (int) or (float): The value of extracted number - - - undefined articles cannot be suppressed in German: - 'ein Pferd' means 'one horse' and 'a horse' + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) # TODO: short_scale and ordinals don't do anything here. # The parameters are present in the function signature for API compatibility # reasons. 
@@ -1003,20 +1012,27 @@ def normalize_de(text, remove_articles=True): return normalized[1:] # strip the initial space -def extract_numbers_de(text, short_scale=True, ordinals=False): - """ - Takes in a string and extracts a list of numbers. - - Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 - Returns: - list: list of extracted numbers as floats +def extract_numbers_de(text, short_scale=True, ordinals=False, decimal='.'): """ + This function extracts a number from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers + + Args: + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' + Returns: + list: list of extracted numbers as floats + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. 
+ + """ + if decimal != '.': + text = normalize_decimals(text, decimal) return extract_numbers_generic(text, pronounce_number_de, extract_number_de, short_scale=short_scale, ordinals=ordinals) diff --git a/lingua_franca/lang/parse_en.py b/lingua_franca/lang/parse_en.py index a51ee02c..d7d4902c 100644 --- a/lingua_franca/lang/parse_en.py +++ b/lingua_franca/lang/parse_en.py @@ -29,6 +29,7 @@ from lingua_franca.lang.parse_common import is_numeric, look_for_fractions, \ invert_dict, ReplaceableNumber, partition_list, tokenize, Token, Normalizer from lingua_franca.time import now_local +from lingua_franca.lang.parse_common import normalize_decimals def _convert_words_to_numbers_en(text, short_scale=True, ordinals=False): @@ -529,7 +530,7 @@ def _initialize_number_data_en(short_scale, speech=True): return multiplies, string_num_ordinal_en, string_num_scale_en -def extract_number_en(text, short_scale=True, ordinals=False): +def extract_number_en(text, short_scale=True, ordinals=False, decimal='.'): """ This function extracts a number from a text string, handles pronunciations in long scale and short scale @@ -540,11 +541,17 @@ def extract_number_en(text, short_scale=True, ordinals=False): text (str): the string to normalize short_scale (bool): use short scale if True, long scale if False ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: (int) or (float) or False: The extracted number or False if no number was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. 
""" + if decimal != '.': + text = normalize_decimals(text, decimal) return _extract_number_with_text_en(tokenize(text.lower()), short_scale, ordinals).value @@ -1655,7 +1662,7 @@ def is_fractional_en(input_str, short_scale=True, spoken=True): return False -def extract_numbers_en(text, short_scale=True, ordinals=False): +def extract_numbers_en(text, short_scale=True, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. @@ -1666,9 +1673,15 @@ def extract_numbers_en(text, short_scale=True, ordinals=False): is now common in most English speaking countries. See https://en.wikipedia.org/wiki/Names_of_large_numbers ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: list: list of extracted numbers as floats + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) results = _extract_numbers_with_text_en(tokenize(text), short_scale, ordinals) return [float(result.value) for result in results] diff --git a/lingua_franca/lang/parse_es.py b/lingua_franca/lang/parse_es.py index 0a810cc4..de718d12 100644 --- a/lingua_franca/lang/parse_es.py +++ b/lingua_franca/lang/parse_es.py @@ -20,6 +20,7 @@ from lingua_franca.lang.format_es import pronounce_number_es from lingua_franca.lang.parse_common import * from lingua_franca.lang.common_data_es import _ARTICLES_ES, _STRING_NUM_ES +from lingua_franca.lang.parse_common import normalize_decimals def is_fractional_es(input_str, short_scale=True): @@ -56,16 +57,28 @@ def is_fractional_es(input_str, short_scale=True): return False -def extract_number_es(text, short_scale=True, ordinals=False): +def extract_number_es(text, short_scale=True, ordinals=False, decimal='.'): """ - This function prepares the given text for parsing by making - numbers consistent, getting rid of 
contractions, etc. + This function extracts a number from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers + Args: text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - (int) or (float): The value of extracted number + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) # TODO: short_scale and ordinals don't do anything here. # The parameters are present in the function signature for API compatibility # reasons. @@ -268,20 +281,24 @@ def es_number(i): return es_number(i) -def extract_numbers_es(text, short_scale=True, ordinals=False): +def extract_numbers_es(text, short_scale=True, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. - Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + Args: + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: list: list of extracted numbers as floats + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. 
+ """ + if decimal != '.': + text = normalize_decimals(text, decimal) return extract_numbers_generic(text, pronounce_number_es, extract_number_es, short_scale=short_scale, ordinals=ordinals) diff --git a/lingua_franca/lang/parse_eu.py b/lingua_franca/lang/parse_eu.py index 8d10162b..46ec0c80 100644 --- a/lingua_franca/lang/parse_eu.py +++ b/lingua_franca/lang/parse_eu.py @@ -23,6 +23,7 @@ from lingua_franca.lang.format_eu import pronounce_number_eu from lingua_franca.lang.parse_common import * from lingua_franca.lang.common_data_eu import _NUM_STRING_EU +from lingua_franca.lang.parse_common import normalize_decimals def isFractional_eu(input_str): @@ -283,20 +284,27 @@ def eu_number(i): return eu_number(i) -def extract_numbers_eu(text, short_scale=True, ordinals=False): +def extract_numbers_eu(text, short_scale=True, ordinals=False, decimal='.'): """ - Takes in a string and extracts a list of numbers. + This function extracts a number from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: list: list of extracted numbers as floats + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. 
+ """ + if decimal != '.': + text = normalize_decimals(text, decimal) return extract_numbers_generic(text, pronounce_number_eu, extract_number_eu, short_scale=short_scale, ordinals=ordinals) diff --git a/lingua_franca/lang/parse_fa.py b/lingua_franca/lang/parse_fa.py index 753ac8eb..b888ae32 100644 --- a/lingua_franca/lang/parse_fa.py +++ b/lingua_franca/lang/parse_fa.py @@ -19,6 +19,7 @@ _FARSI_ONES, _FARSI_TENS, _FORMAL_VARIANT) from lingua_franca.time import now_local +from lingua_franca.lang.parse_common import normalize_decimals def _is_number(s): @@ -307,20 +308,24 @@ def extract_datetime_fa(text, anchorDate=None, default_time=None): return (result, " ".join(remainder)) -def extract_numbers_fa(text, short_scale=True, ordinals=False): +def extract_numbers_fa(text, short_scale=True, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. - Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + Args: + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: list: list of extracted numbers as floats + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. 
+ """ + if decimal != '.': + text = normalize_decimals(text, decimal) ar = _parse_sentence(text) result = [] @@ -330,7 +335,7 @@ def extract_numbers_fa(text, short_scale=True, ordinals=False): return result -def extract_number_fa(text, ordinals=False): +def extract_number_fa(text, short_scale=True, ordinals=False, decimal='.'): """ This function extracts a number from a text string, handles pronunciations in long scale and short scale @@ -341,11 +346,17 @@ def extract_number_fa(text, ordinals=False): text (str): the string to normalize short_scale (bool): use short scale if True, long scale if False ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: (int) or (float) or False: The extracted number or False if no number was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) x = extract_numbers_fa(text, ordinals=ordinals) if (len(x) == 0): return False diff --git a/lingua_franca/lang/parse_fr.py b/lingua_franca/lang/parse_fr.py index 9728653f..97364182 100644 --- a/lingua_franca/lang/parse_fr.py +++ b/lingua_franca/lang/parse_fr.py @@ -23,6 +23,7 @@ from lingua_franca.lang.common_data_fr import _ARTICLES_FR, _NUMBERS_FR, \ _ORDINAL_ENDINGS_FR from lingua_franca.time import now_local +from lingua_franca.lang.parse_common import normalize_decimals def extract_duration_fr(text): @@ -369,13 +370,28 @@ def _number_ordinal_fr(words, i): return None -def extract_number_fr(text, short_scale=True, ordinals=False): - """Takes in a string and extracts a number. 
+def extract_number_fr(text, short_scale=True, ordinals=False, decimal='.'): + """ + This function extracts a number from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers + Args: - text (str): the string to extract a number from + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - (str): The number extracted or the original text. + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. + """ + if decimal != '.': + text = normalize_decimals(text, decimal) # TODO: short_scale and ordinals don't do anything here. # The parameters are present in the function signature for API compatibility # reasons. @@ -1067,20 +1083,24 @@ def normalize_fr(text, remove_articles=True): return normalized[1:] # strip the initial space -def extract_numbers_fr(text, short_scale=True, ordinals=False): +def extract_numbers_fr(text, short_scale=True, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. - Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + Args: + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' 
Returns: list: list of extracted numbers as floats + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. + """ + if decimal != '.': + text = normalize_decimals(text, decimal) return extract_numbers_generic(text, pronounce_number_fr, extract_number_fr, short_scale=short_scale, ordinals=ordinals) diff --git a/lingua_franca/lang/parse_it.py b/lingua_franca/lang/parse_it.py index 88c7455d..c7eaea35 100644 --- a/lingua_franca/lang/parse_it.py +++ b/lingua_franca/lang/parse_it.py @@ -28,6 +28,7 @@ pronounce_number_it from lingua_franca.lang.common_data_it import _SHORT_ORDINAL_STRING_IT, \ _ARTICLES_IT, _LONG_ORDINAL_STRING_IT, _STRING_NUM_IT +from lingua_franca.lang.parse_common import normalize_decimals def is_fractional_it(input_str, short_scale=False): @@ -224,7 +225,7 @@ def _extract_number_long_it(word): return value -def extract_number_it(text, short_scale=False, ordinals=False): +def extract_number_it(text, short_scale=False, ordinals=False, decimal='.'): """ This function extracts a number from a text string, handles pronunciations in long scale and short scale @@ -235,11 +236,17 @@ def extract_number_it(text, short_scale=False, ordinals=False): text (str): the string to normalize short_scale (bool): use short scale if True, long scale if False ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: (int) or (float) or False: The extracted number or False if no number was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. 
""" + if decimal != '.': + text = normalize_decimals(text, decimal) text = text.lower() string_num_ordinal_it = {} @@ -1148,20 +1155,24 @@ def get_gender_it(word, context=""): return gender -def extract_numbers_it(text, short_scale=False, ordinals=False): +def extract_numbers_it(text, short_scale=False, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. - Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + Args: + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: list: list of extracted numbers as floats + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. 
+ """ + if decimal != '.': + text = normalize_decimals(text, decimal) return extract_numbers_generic(text, pronounce_number_it, extract_number_it, short_scale=short_scale, ordinals=ordinals) diff --git a/lingua_franca/lang/parse_nl.py b/lingua_franca/lang/parse_nl.py index ba197704..8a991b77 100644 --- a/lingua_franca/lang/parse_nl.py +++ b/lingua_franca/lang/parse_nl.py @@ -25,6 +25,7 @@ _NEGATIVES_NL, _SHORT_SCALE_NL, _STRING_LONG_ORDINAL_NL, _STRING_NUM_NL, \ _STRING_SHORT_ORDINAL_NL, _SUMS_NL from lingua_franca.time import now_local +from lingua_franca.lang.parse_common import normalize_decimals import re @@ -414,10 +415,10 @@ def _initialize_number_data_nl(short_scale): return multiplies, string_num_ordinal_nl, string_num_scale_nl -def extract_number_nl(text, short_scale=True, ordinals=False): - """Extract a number from a text string - - The function handles pronunciations in long scale and short scale +def extract_number_nl(text, short_scale=True, ordinals=False, decimal='.'): + """ + This function extracts a number from a text string, + handles pronunciations in long scale and short scale https://en.wikipedia.org/wiki/Names_of_large_numbers @@ -425,10 +426,17 @@ def extract_number_nl(text, short_scale=True, ordinals=False): text (str): the string to normalize short_scale (bool): use short scale if True, long scale if False ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: (int) or (float) or False: The extracted number or False if no number was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. 
+ """ + if decimal != '.': + text = normalize_decimals(text, decimal) return _extract_number_with_text_nl(tokenize(text.lower()), short_scale, ordinals).value @@ -1294,19 +1302,23 @@ def is_fractional_nl(input_str, short_scale=True): return False -def extract_numbers_nl(text, short_scale=True, ordinals=False): +def extract_numbers_nl(text, short_scale=True, ordinals=False, decimal='.'): """Takes in a string and extracts a list of numbers. Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: list: list of extracted numbers as floats + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. 
+ """ + if decimal != '.': + text = normalize_decimals(text, decimal) results = _extract_numbers_with_text_nl(tokenize(text), short_scale, ordinals) return [float(result.value) for result in results] diff --git a/lingua_franca/lang/parse_pl.py b/lingua_franca/lang/parse_pl.py index 84f83bc8..53f2f43f 100644 --- a/lingua_franca/lang/parse_pl.py +++ b/lingua_franca/lang/parse_pl.py @@ -24,6 +24,7 @@ _TIME_UNITS_NORMALIZATION, _MONTHS_TO_EN, _DAYS_TO_EN, _ORDINAL_BASE_PL, \ _ALT_ORDINALS_PL from lingua_franca.time import now_local +from lingua_franca.lang.parse_common import normalize_decimals import re @@ -576,7 +577,7 @@ def _initialize_number_data(short_scale): return multiplies, _STRING_SHORT_ORDINAL_PL, string_num_scale -def extract_number_pl(text, short_scale=True, ordinals=False): +def extract_number_pl(text, short_scale=True, ordinals=False, decimal='.'): """ This function extracts a number from a text string, handles pronunciations in long scale and short scale @@ -587,11 +588,17 @@ def extract_number_pl(text, short_scale=True, ordinals=False): text (str): the string to normalize short_scale (bool): use short scale if True, long scale if False ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: (int) or (float) or False: The extracted number or False if no number was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) return _extract_number_with_text_pl(tokenize(text.lower()), True, ordinals).value @@ -1333,20 +1340,27 @@ def isFractional_pl(input_str, short_scale=True): return False -def extract_numbers_pl(text, short_scale=True, ordinals=False): +def extract_numbers_pl(text, short_scale=True, ordinals=False, decimal='.'): """ - Takes in a string and extracts a list of numbers. 
+ This function extracts a list of numbers from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + text (str): the string to extract numbers from + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: list: list of extracted numbers as floats + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. + """ + if decimal != '.': + text = normalize_decimals(text, decimal) results = _extract_numbers_with_text_pl(tokenize(text), short_scale, ordinals) return [float(result.value) for result in results] diff --git a/lingua_franca/lang/parse_pt.py b/lingua_franca/lang/parse_pt.py index 356c1e83..c4533063 100644 --- a/lingua_franca/lang/parse_pt.py +++ b/lingua_franca/lang/parse_pt.py @@ -29,6 +29,7 @@ from lingua_franca.internal import resolve_resource_file from lingua_franca.lang.parse_common import Normalizer from lingua_franca.time import now_local +from lingua_franca.lang.parse_common import normalize_decimals import json import re import unicodedata @@ -77,16 +78,28 @@ def is_fractional_pt(input_str, short_scale=True): return False -def extract_number_pt(text, short_scale=True, ordinals=False): +def extract_number_pt(text, short_scale=True, ordinals=False, decimal='.'): """ - This function prepares the given text for parsing by making - numbers consistent, getting rid of contractions, etc. 
+ This function extracts a number from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers + Args: text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - (int) or (float): The value of extracted number + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) # TODO: short_scale and ordinals don't do anything here. # The parameters are present in the function signature for API compatibility # reasons. diff --git a/lingua_franca/lang/parse_ru.py b/lingua_franca/lang/parse_ru.py index cd041ec7..d5345747 100644 --- a/lingua_franca/lang/parse_ru.py +++ b/lingua_franca/lang/parse_ru.py @@ -28,6 +28,7 @@ import json from lingua_franca import resolve_resource_file from lingua_franca.time import now_local +from lingua_franca.lang.parse_common import normalize_decimals def generate_plurals_ru(originals): @@ -1577,20 +1578,27 @@ def is_fractional_ru(input_str, short_scale=True): return False -def extract_numbers_ru(text, short_scale=True, ordinals=False): +def extract_numbers_ru(text, short_scale=True, ordinals=False, decimal='.'): """ - Takes in a string and extracts a list of numbers. + This function extracts a list of numbers from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. 
- See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + text (str): the string to extract numbers from + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: list: list of extracted numbers as floats + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. + """ + if decimal != '.': + text = normalize_decimals(text, decimal) results = _extract_numbers_with_text_ru(tokenize(text), short_scale, ordinals) return [float(result.value) for result in results] diff --git a/lingua_franca/lang/parse_sv.py b/lingua_franca/lang/parse_sv.py index 02164111..bb23f2ee 100644 --- a/lingua_franca/lang/parse_sv.py +++ b/lingua_franca/lang/parse_sv.py @@ -17,6 +17,7 @@ from dateutil.relativedelta import relativedelta from lingua_franca.time import now_local +from lingua_franca.lang.parse_common import normalize_decimals from .parse_common import (is_numeric, look_for_fractions, Normalizer, tokenize, Token) @@ -156,15 +157,23 @@ def extract_duration_sv(text): return (td, remainder) if valid else None -def extract_number_sv(text, short_scale=True, ordinals=False): +def extract_number_sv(text, short_scale=True, ordinals=False, decimal='.'): """ This function prepares the given text for parsing by making numbers consistent, getting rid of contractions, etc. Args: text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: (int) or (float): The value of extracted number + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. 
""" + if decimal != '.': + text = normalize_decimals(text, decimal) # TODO: short_scale and ordinals don't do anything here. # The parameters are present in the function signature for API # compatibility reasons. diff --git a/lingua_franca/parse.py b/lingua_franca/parse.py index f1602717..4870a8a7 100644 --- a/lingua_franca/parse.py +++ b/lingua_franca/parse.py @@ -16,6 +16,7 @@ import json from lingua_franca.util import match_one, fuzzy_match, MatchStrategy from lingua_franca.lang.parse_common import match_yes_or_no +import re from difflib import SequenceMatcher from warnings import warn from lingua_franca.time import now_local @@ -56,7 +57,8 @@ def extract_langcode(text, lang=""): @localized_function() -def extract_numbers(text, short_scale=True, ordinals=False, lang=''): +def extract_numbers(text, short_scale=True, ordinals=False, lang='', + decimal='.'): """ Takes in a string and extracts a list of numbers. @@ -69,13 +71,18 @@ def extract_numbers(text, short_scale=True, ordinals=False, lang=''): ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 lang (str, optional): an optional BCP-47 language code, if omitted the default language will be used. + decimal (str): character to use as decimal point. defaults to '.' Returns: list: list of extracted numbers as floats, or empty list if none found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ @localized_function() -def extract_number(text, short_scale=True, ordinals=False, lang=''): +def extract_number(text, short_scale=True, ordinals=False, lang='', + decimal='.'): """Takes in a string and extracts a number. Args: @@ -87,9 +94,13 @@ def extract_number(text, short_scale=True, ordinals=False, lang=''): ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 lang (str, optional): an optional BCP-47 language code, if omitted the default language will be used. 
+ decimal (str): character to use as decimal point. defaults to '.' Returns: (int, float or False): The number extracted or False if the input text contains no numbers + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ diff --git a/test/unittests/test_parse_en.py b/test/unittests/test_parse_en.py index caae8999..7aeb3df9 100644 --- a/test/unittests/test_parse_en.py +++ b/test/unittests/test_parse_en.py @@ -290,6 +290,17 @@ def test_combinations(self): class TestExtractNumber(unittest.TestCase): + def test_extract_number_decimal_markers(self): + # Test decimal normalization + self.assertEqual(extract_number("4,4", decimal=','), 4.4) + self.assertEqual(extract_number("we have 3,5 kilometers to go", + decimal=','), 3.5) + self.assertEqual(extract_numbers("this is a seven eight 9,5 test", + decimal=','), + [7.0, 8.0, 9.5]) + self.assertEqual(extract_numbers("this is a 7,0 8.0 9,6 test", + decimal=','), [7.0, 8.0, 9.6]) + def test_extract_number_priority(self): # sanity check self.assertEqual(extract_number("third", ordinals=True), 3)