From eb257dac273761b9696d72ffd0d95ff6353490ef Mon Sep 17 00:00:00 2001 From: jarbasal Date: Mon, 10 May 2021 19:05:28 +0100 Subject: [PATCH 1/2] feat/number_spans feat/normalize_decimals port lingua_nostra/pull/20 - support decimal markers rebase of https://github.com/MycroftAI/lingua-franca/pull/69 Co-authored-by: jarbasal --- lingua_franca/lang/common_data_en.py | 2 +- lingua_franca/lang/parse_common.py | 12 + lingua_franca/lang/parse_cs.py | 31 ++- lingua_franca/lang/parse_da.py | 48 ++-- lingua_franca/lang/parse_de.py | 59 +++-- lingua_franca/lang/parse_en.py | 248 ++++++++++++++++++- lingua_franca/lang/parse_es.py | 44 +++- lingua_franca/lang/parse_fa.py | 32 ++- lingua_franca/lang/parse_fr.py | 47 +++- lingua_franca/lang/parse_it.py | 32 ++- lingua_franca/lang/parse_nl.py | 37 ++- lingua_franca/lang/parse_pl.py | 36 ++- lingua_franca/lang/parse_pt.py | 22 +- lingua_franca/lang/parse_sv.py | 11 +- lingua_franca/parse.py | 135 +++++++++-- requirements/requirements.txt | 3 +- test/unittests/test_format_pt.py | 11 + test/unittests/test_parse_en.py | 344 ++++++++++++++++++++++++++- 18 files changed, 1001 insertions(+), 153 deletions(-) diff --git a/lingua_franca/lang/common_data_en.py b/lingua_franca/lang/common_data_en.py index f2f8de1a..25de4cd9 100644 --- a/lingua_franca/lang/common_data_en.py +++ b/lingua_franca/lang/common_data_en.py @@ -247,7 +247,7 @@ # negate next number (-2 = 0 - 2) -_NEGATIVES_EN = {"negative", "minus"} +_NEGATIVES_EN = {"negative", "minus", "-"} # sum the next number (twenty two = 20 + 2) _SUMS_EN = {'twenty', '20', 'thirty', '30', 'forty', '40', 'fifty', '50', diff --git a/lingua_franca/lang/parse_common.py b/lingua_franca/lang/parse_common.py index 97cf5be7..aef9e199 100644 --- a/lingua_franca/lang/parse_common.py +++ b/lingua_franca/lang/parse_common.py @@ -192,6 +192,18 @@ def normalize(self, utterance="", remove_articles=None): return utterance +def normalize_decimals(text, decimal): + """ + Replace 'decimal' with decimal periods so Python can floatify them + """ + regex = r"\b\d+" + decimal + r"{1}\d+\b" + sanitize_decimals = re.compile(regex) + for _, match in enumerate(re.finditer(sanitize_decimals, text)): + text = text.replace(match.group( + 0), match.group(0).replace(decimal, '.')) + return text + + def match_yes_or_no(text, lang): resource_file = resolve_resource_file(f"text/{lang}/yesno.json") if not resource_file: diff --git a/lingua_franca/lang/parse_cs.py b/lingua_franca/lang/parse_cs.py index e0144b02..0590dc12 100644 --- a/lingua_franca/lang/parse_cs.py +++ b/lingua_franca/lang/parse_cs.py @@ -23,7 +23,7 @@ _LONG_ORDINAL_CS, _LONG_SCALE_CS, _SHORT_SCALE_CS, _SHORT_ORDINAL_CS, \ _FRACTION_STRING_CS, _MONTHS_CONVERSION, _MONTHS_CZECH, _TIME_UNITS_CONVERSION, \ _ORDINAL_BASE_CS # _ARTICLES_CS - +from lingua_franca.lang.parse_common import normalize_decimals import re import json from lingua_franca import resolve_resource_file @@ -579,7 +579,7 @@ def _initialize_number_data(short_scale): return multiplies, string_num_ordinal_cs, string_num_scale_cs -def extract_number_cs(text, short_scale=True, ordinals=False): +def extract_number_cs(text, short_scale=True, ordinals=False, decimal='.'): """ This function extracts a number from a text string, handles pronunciations in long scale and short scale @@ -590,11 +590,17 @@ def extract_number_cs(text, short_scale=True, ordinals=False): text (str): the string to normalize short_scale (bool): use short scale if True, long scale if False ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: (int) or (float) or False: The extracted number or False if no number was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) return _extract_number_with_text_cs(tokenize(text.lower()), short_scale, ordinals).value @@ -1560,20 +1566,25 @@ def isFractional_cs(input_str, short_scale=True): return False -def extract_numbers_cs(text, short_scale=True, ordinals=False): +def extract_numbers_cs(text, short_scale=True, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - list: list of extracted numbers as floats + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. + """ + if decimal != '.': + text = normalize_decimals(text, decimal) results = _extract_numbers_with_text_cs(tokenize(text), short_scale, ordinals) return [float(result.value) for result in results] diff --git a/lingua_franca/lang/parse_da.py b/lingua_franca/lang/parse_da.py index 14b18132..13f0eff1 100644 --- a/lingua_franca/lang/parse_da.py +++ b/lingua_franca/lang/parse_da.py @@ -20,22 +20,31 @@ from lingua_franca.lang.common_data_da import _DA_NUMBERS from lingua_franca.lang.format_da import pronounce_number_da from lingua_franca.time import now_local +from lingua_franca.lang.parse_common import normalize_decimals -def extract_number_da(text, short_scale=True, ordinals=False): +def extract_number_da(text, short_scale=True, ordinals=False, decimal='.'): """ - This function prepares the given text for parsing by making - numbers consistent, getting rid of contractions, etc. + This function extracts a number from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers + Args: text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - (int) or (float): The value of extracted number - - - undefined articles cannot be suppressed in German: - 'ein Pferd' means 'one horse' and 'a horse' + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) # TODO: short_scale and ordinals don't do anything here. # The parameters are present in the function signature for API compatibility # reasons. @@ -869,20 +878,25 @@ def normalize_da(text, remove_articles=True): return normalized[1:] # strip the initial space -def extract_numbers_da(text, short_scale=True, ordinals=False): +def extract_numbers_da(text, short_scale=True, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. - Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + Args: + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - list: list of extracted numbers as floats + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. + """ + if decimal != '.': + text = normalize_decimals(text, decimal) return extract_numbers_generic(text, pronounce_number_da, extract_number_da, short_scale=short_scale, ordinals=ordinals) diff --git a/lingua_franca/lang/parse_de.py b/lingua_franca/lang/parse_de.py index 95fda48e..81528e68 100644 --- a/lingua_franca/lang/parse_de.py +++ b/lingua_franca/lang/parse_de.py @@ -21,6 +21,7 @@ from lingua_franca.lang.common_data_de import _DE_NUMBERS from lingua_franca.lang.format_de import pronounce_number_de from lingua_franca.time import now_local +from lingua_franca.lang.parse_common import normalize_decimals de_numbers = { @@ -143,20 +144,28 @@ def repl(match): return (duration, text) -def extract_number_de(text, short_scale=True, ordinals=False): +def extract_number_de(text, short_scale=True, ordinals=False, decimal='.'): """ - This function prepares the given text for parsing by making - numbers consistent, getting rid of contractions, etc. + This function extracts a number from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers + Args: text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - (int) or (float): The value of extracted number - - - undefined articles cannot be suppressed in German: - 'ein Pferd' means 'one horse' and 'a horse' + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) # TODO: short_scale and ordinals don't do anything here. # The parameters are present in the function signature for API compatibility # reasons. @@ -1003,20 +1012,28 @@ def normalize_de(text, remove_articles=True): return normalized[1:] # strip the initial space -def extract_numbers_de(text, short_scale=True, ordinals=False): - """ - Takes in a string and extracts a list of numbers. - - Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 - Returns: - list: list of extracted numbers as floats +def extract_numbers_de(text, short_scale=True, ordinals=False, decimal='.'): """ + This function extracts a number from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers + + Args: + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' + Returns: + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. + + """ + if decimal != '.': + text = normalize_decimals(text, decimal) return extract_numbers_generic(text, pronounce_number_de, extract_number_de, short_scale=short_scale, ordinals=ordinals) diff --git a/lingua_franca/lang/parse_en.py b/lingua_franca/lang/parse_en.py index a51ee02c..761c14aa 100644 --- a/lingua_franca/lang/parse_en.py +++ b/lingua_franca/lang/parse_en.py @@ -18,6 +18,7 @@ from datetime import datetime, timedelta, time from dateutil.relativedelta import relativedelta +from quebra_frases import span_indexed_word_tokenize from lingua_franca.internal import resolve_resource_file from lingua_franca.lang.common_data_en import _ARTICLES_EN, _LONG_ORDINAL_EN, _LONG_SCALE_EN, _SHORT_SCALE_EN, \ @@ -29,6 +30,7 @@ from lingua_franca.lang.parse_common import is_numeric, look_for_fractions, \ invert_dict, ReplaceableNumber, partition_list, tokenize, Token, Normalizer from lingua_franca.time import now_local +from lingua_franca.lang.parse_common import normalize_decimals def _convert_words_to_numbers_en(text, short_scale=True, ordinals=False): @@ -368,9 +370,9 @@ def _extract_whole_number_with_text_en(tokens, short_scale, ordinals): # is the prev word a number and should we sum it? # twenty two, fifty six - if (prev_word in _SUMS_EN and val and val < 10) or all([prev_word in - multiplies, - val < prev_val if prev_val else False]): + if (prev_word in _SUMS_EN and val and val < 10) or \ + all([prev_word in multiplies, + val < prev_val if prev_val else False]): val = prev_val + val # is the prev word a number and should we multiply it? @@ -529,7 +531,231 @@ def _initialize_number_data_en(short_scale, speech=True): return multiplies, string_num_ordinal_en, string_num_scale_en -def extract_number_en(text, short_scale=True, ordinals=False): +def extract_number_spans_en(utterance, short_scale=True, ordinals=False, + fractional_numbers=True, decimal="."): + """ + This function tags numbers in an utterance. + + Args: + utterance (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + fractional_numbers (bool): True if we should look for fractions and + decimals. + decimal (str): decimal marker + Returns: + (list): list of tuples with detected number and span of the + number in parent utterance [(number, (start_idx, end_idx))] + + """ + number_spans = [] + if isinstance(utterance, str): + spans = span_indexed_word_tokenize(utterance) + else: + spans = utterance + + # load language number data + multiplies, string_num_ordinal, string_num_scale = \ + _initialize_number_data_en(short_scale, speech=ordinals is not None) + + num_ended = False # number string ended, save prev number + num = None + num2 = None + + num_start = -1 + num_end = -1 + for idx, (start, end, word) in enumerate(spans): + if end <= num_end: + # token consumed already + continue + + prev_span = spans[idx - 1] if idx > 0 else (-1, -1, "") + next_span = spans[idx + 1] if idx + 1 < len(spans) else (-1, -1, "") + next_next_span = spans[idx + 2] if idx + 2 < len(spans) else (-1, -1, "") + + word = word.lower() + prev_word = prev_span[-1].lower() + next_word = next_span[-1].lower() + + def found_number(): + nonlocal num, number_spans, num_end, num_start + # found a number! + number_spans.append((num, (num_start, num_end))) + num = None + + # is the word a number already ? + if not num and is_numeric(word): + num = int(word) + num_start = start + num_end = end + + # spoken/text number? + elif not is_numeric(word): + # let's see if this continuation or end of a previous number + if num is not None: + # is this word the name of a number ? + if word in _STRING_NUM_EN: + num2 = _STRING_NUM_EN.get(word) + elif word in string_num_scale: + num2 = string_num_scale.get(word) + elif ordinals and word in string_num_ordinal: + num2 = string_num_ordinal[word] + + ## how do num and num2 relate + if num is not None and num_ended: + # found a number! + found_number() + continue + + # let's see if this word is the start of a number + else: + # explicit ordinals, 1st, 2nd, 3rd, 4th.... Nth + if is_numeric(word[:-2]) and \ + (word.endswith("st") or word.endswith("nd") or + word.endswith("rd") or word.endswith("th")): + num = int(word[:-2]) + num_start = start + num_end = end + + # handle nth one + if next_word == "one": + # consume next span + # would return 1 instead otherwise + spans[idx + 1] = (-1, -1, "") + + # found a number! + found_number() + continue + + # is this word the name of a number ? + if word in _STRING_NUM_EN: + num = _STRING_NUM_EN.get(word) + elif word in string_num_scale: + num = string_num_scale.get(word) + elif ordinals and word in string_num_ordinal: + num = string_num_ordinal[word] + + + # is this a spoken fraction? + # half cup + #elif not (ordinals is None and word in string_num_ordinal): + # num = is_fractional_en(word, short_scale=short_scale, + # spoken=ordinals is not None) + + # process the number we found + if num is not None: + # take note of span + num_start = start + num_end = end + + # negative number marker + if prev_word in _NEGATIVES_EN: + num = 0 - num + num_start = prev_span[0] + + ## is this a final number? + num_ended = False + + # explicit ordinals, 1st, 2nd, 3rd, 4th.... Nth + if is_numeric(next_word[:-2]) and \ + (next_word.endswith("st") or next_word.endswith("nd") or + next_word.endswith("rd") or next_word.endswith("th")): + # new number coming up, invalid continuation + num_ended = True + + if next_word in _NEGATIVES_EN: + # a new negative sign is an invalid number continuation + num_ended = True + + # end of sentence + if idx == len(spans) - 1: + num_ended = True + + if num is not None and num_ended: + found_number() + + continue + + # handle # and fraction, eg. "2 and 3/4" + if fractional_numbers and num is not None and \ + next_span[-1] in _FRACTION_MARKER_EN and \ + prev_span[-1] not in [decimal, "/"]: + fractional_piece = extract_number_spans_en(spans[end:], + short_scale, + ordinals, + fractional_numbers, + decimal) + if fractional_piece: + frac_num = fractional_piece[0][0] + # ensure first is not a fraction and second is a fraction + if num >= 1 and frac_num < 1: + num += frac_num + num_end = fractional_piece[0][1][1] + number_spans.append((num, (num_start, num_end))) + # return all parsed numbers after the marker + # (do not reparse) + return number_spans + fractional_piece[1:] + + # handle # symbol #, eg. 1.5 or 3/4 + elif word.isdigit() and \ + next_next_span[-1].isdigit() and \ + next_span[-1] in [decimal, "/"] and \ + prev_span[-1] not in [decimal, "/"]: + num = int(word) + num_start = start + num_end = end + num2 = int(next_next_span[-1]) + + # negative number marker + if prev_word in _NEGATIVES_EN: + num = 0 - num + num_start = prev_span[0] + + # handle #/#, eg. "1/5" + if next_span[-1] == "/": + num_start = start + num = num / num2 + num_end = next_next_span[1] + # found a number! + found_number() + continue + + # handle #.#, eg. "1.5" + elif next_span[-1] == decimal: + num2 = float(f"0.{num2}") + num = num + num2 + num_end = next_next_span[1] + # found a number! + found_number() + continue + + # handle #, eg. "123" + elif is_numeric(word): + if word.isdigit(): # doesn't work with decimals + num = int(word) + else: + num = float(word) + num_start = start + num_end = end + # negative number marker + if prev_word in _NEGATIVES_EN: + num = 0 - num + num_start = prev_span[0] + # found a number! + found_number() + continue + + return number_spans + + +def extract_number_en_v2(*args, **kwargs): + spans = extract_number_spans_en(*args, **kwargs) + if not spans: + return False + return extract_number_spans_en(*args, **kwargs)[0][0] + + +def extract_number_en(text, short_scale=True, ordinals=False, decimal='.'): """ This function extracts a number from a text string, handles pronunciations in long scale and short scale @@ -540,11 +766,17 @@ def extract_number_en(text, short_scale=True, ordinals=False): text (str): the string to normalize short_scale (bool): use short scale if True, long scale if False ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: (int) or (float) or False: The extracted number or False if no number was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) return _extract_number_with_text_en(tokenize(text.lower()), short_scale, ordinals).value @@ -1655,7 +1887,7 @@ def is_fractional_en(input_str, short_scale=True, spoken=True): return False -def extract_numbers_en(text, short_scale=True, ordinals=False): +def extract_numbers_en(text, short_scale=True, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. @@ -1666,9 +1898,15 @@ def extract_numbers_en(text, short_scale=True, ordinals=False): is now common in most English speaking countries. See https://en.wikipedia.org/wiki/Names_of_large_numbers ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: list: list of extracted numbers as floats + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) results = _extract_numbers_with_text_en(tokenize(text), short_scale, ordinals) return [float(result.value) for result in results] diff --git a/lingua_franca/lang/parse_es.py b/lingua_franca/lang/parse_es.py index 0a810cc4..f2f471f1 100644 --- a/lingua_franca/lang/parse_es.py +++ b/lingua_franca/lang/parse_es.py @@ -20,6 +20,7 @@ from lingua_franca.lang.format_es import pronounce_number_es from lingua_franca.lang.parse_common import * from lingua_franca.lang.common_data_es import _ARTICLES_ES, _STRING_NUM_ES +from lingua_franca.lang.parse_common import normalize_decimals def is_fractional_es(input_str, short_scale=True): @@ -56,16 +57,28 @@ def is_fractional_es(input_str, short_scale=True): return False -def extract_number_es(text, short_scale=True, ordinals=False): +def extract_number_es(text, short_scale=True, ordinals=False, decimal='.'): """ - This function prepares the given text for parsing by making - numbers consistent, getting rid of contractions, etc. + This function extracts a number from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers + Args: text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - (int) or (float): The value of extracted number + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) # TODO: short_scale and ordinals don't do anything here. # The parameters are present in the function signature for API compatibility # reasons. @@ -268,20 +281,25 @@ def es_number(i): return es_number(i) -def extract_numbers_es(text, short_scale=True, ordinals=False): +def extract_numbers_es(text, short_scale=True, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. - Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + Args: + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - list: list of extracted numbers as floats + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. + """ + if decimal != '.': + text = normalize_decimals(text, decimal) return extract_numbers_generic(text, pronounce_number_es, extract_number_es, short_scale=short_scale, ordinals=ordinals) diff --git a/lingua_franca/lang/parse_fa.py b/lingua_franca/lang/parse_fa.py index 753ac8eb..8d0be089 100644 --- a/lingua_franca/lang/parse_fa.py +++ b/lingua_franca/lang/parse_fa.py @@ -19,6 +19,7 @@ _FARSI_ONES, _FARSI_TENS, _FORMAL_VARIANT) from lingua_franca.time import now_local +from lingua_franca.lang.parse_common import normalize_decimals def _is_number(s): @@ -307,20 +308,25 @@ def extract_datetime_fa(text, anchorDate=None, default_time=None): return (result, " ".join(remainder)) -def extract_numbers_fa(text, short_scale=True, ordinals=False): +def extract_numbers_fa(text, short_scale=True, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. - Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + Args: + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - list: list of extracted numbers as floats + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. + """ + if decimal != '.': + text = normalize_decimals(text, decimal) ar = _parse_sentence(text) result = [] @@ -330,7 +336,7 @@ def extract_numbers_fa(text, short_scale=True, ordinals=False): return result -def extract_number_fa(text, ordinals=False): +def extract_number_fa(text, short_scale=True, ordinals=False, decimal='.'): """ This function extracts a number from a text string, handles pronunciations in long scale and short scale @@ -341,11 +347,17 @@ def extract_number_fa(text, ordinals=False): text (str): the string to normalize short_scale (bool): use short scale if True, long scale if False ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: (int) or (float) or False: The extracted number or False if no number was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) x = extract_numbers_fa(text, ordinals=ordinals) if (len(x) == 0): return False diff --git a/lingua_franca/lang/parse_fr.py b/lingua_franca/lang/parse_fr.py index 9728653f..87579c22 100644 --- a/lingua_franca/lang/parse_fr.py +++ b/lingua_franca/lang/parse_fr.py @@ -23,6 +23,7 @@ from lingua_franca.lang.common_data_fr import _ARTICLES_FR, _NUMBERS_FR, \ _ORDINAL_ENDINGS_FR from lingua_franca.time import now_local +from lingua_franca.lang.parse_common import normalize_decimals def extract_duration_fr(text): @@ -369,13 +370,28 @@ def _number_ordinal_fr(words, i): return None -def extract_number_fr(text, short_scale=True, ordinals=False): - """Takes in a string and extracts a number. +def extract_number_fr(text, short_scale=True, ordinals=False, decimal='.'): + """ + This function extracts a number from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers + Args: - text (str): the string to extract a number from + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - (str): The number extracted or the original text. + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. + """ + if decimal != '.': + text = normalize_decimals(text, decimal) # TODO: short_scale and ordinals don't do anything here. # The parameters are present in the function signature for API compatibility # reasons. @@ -1067,20 +1083,25 @@ def normalize_fr(text, remove_articles=True): return normalized[1:] # strip the initial space -def extract_numbers_fr(text, short_scale=True, ordinals=False): +def extract_numbers_fr(text, short_scale=True, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. - Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + Args: + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - list: list of extracted numbers as floats + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. + """ + if decimal != '.': + text = normalize_decimals(text, decimal) return extract_numbers_generic(text, pronounce_number_fr, extract_number_fr, short_scale=short_scale, ordinals=ordinals) diff --git a/lingua_franca/lang/parse_it.py b/lingua_franca/lang/parse_it.py index 88c7455d..297445f3 100644 --- a/lingua_franca/lang/parse_it.py +++ b/lingua_franca/lang/parse_it.py @@ -28,6 +28,7 @@ pronounce_number_it from lingua_franca.lang.common_data_it import _SHORT_ORDINAL_STRING_IT, \ _ARTICLES_IT, _LONG_ORDINAL_STRING_IT, _STRING_NUM_IT +from lingua_franca.lang.parse_common import normalize_decimals def is_fractional_it(input_str, short_scale=False): @@ -224,7 +225,7 @@ def _extract_number_long_it(word): return value -def extract_number_it(text, short_scale=False, ordinals=False): +def extract_number_it(text, short_scale=False, ordinals=False, decimal='.'): """ This function extracts a number from a text string, handles pronunciations in long scale and short scale @@ -235,11 +236,17 @@ def extract_number_it(text, short_scale=False, ordinals=False): text (str): the string to normalize short_scale (bool): use short scale if True, long scale if False ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: (int) or (float) or False: The extracted number or False if no number was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) text = text.lower() string_num_ordinal_it = {} @@ -1148,20 +1155,25 @@ def get_gender_it(word, context=""): return gender -def extract_numbers_it(text, short_scale=False, ordinals=False): +def extract_numbers_it(text, short_scale=False, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. - Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + Args: + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - list: list of extracted numbers as floats + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. + """ + if decimal != '.': + text = normalize_decimals(text, decimal) return extract_numbers_generic(text, pronounce_number_it, extract_number_it, short_scale=short_scale, ordinals=ordinals) diff --git a/lingua_franca/lang/parse_nl.py b/lingua_franca/lang/parse_nl.py index ba197704..a1f074f7 100644 --- a/lingua_franca/lang/parse_nl.py +++ b/lingua_franca/lang/parse_nl.py @@ -26,6 +26,7 @@ _STRING_SHORT_ORDINAL_NL, _SUMS_NL from lingua_franca.time import now_local import re +from lingua_franca.lang.parse_common import normalize_decimals def _convert_words_to_numbers_nl(text, short_scale=True, ordinals=False): @@ -414,10 +415,10 @@ def _initialize_number_data_nl(short_scale): return multiplies, string_num_ordinal_nl, string_num_scale_nl -def extract_number_nl(text, short_scale=True, ordinals=False): - """Extract a number from a text string - - The function handles pronunciations in long scale and short scale +def extract_number_nl(text, short_scale=True, ordinals=False, decimal='.'): + """ + This function extracts a number from a text string, + handles pronunciations in long scale and short scale https://en.wikipedia.org/wiki/Names_of_large_numbers @@ -425,10 +426,17 @@ def extract_number_nl(text, short_scale=True, ordinals=False): text (str): the string to normalize short_scale (bool): use short scale if True, long scale if False ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: (int) or (float) or False: The extracted number or False if no number was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. + """ + if decimal != '.': + text = normalize_decimals(text, decimal) return _extract_number_with_text_nl(tokenize(text.lower()), short_scale, ordinals).value @@ -1294,19 +1302,24 @@ def is_fractional_nl(input_str, short_scale=True): return False -def extract_numbers_nl(text, short_scale=True, ordinals=False): +def extract_numbers_nl(text, short_scale=True, ordinals=False, decimal='.'): """Takes in a string and extracts a list of numbers. Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - list: list of extracted numbers as floats + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. + """ + if decimal != '.': + text = normalize_decimals(text, decimal) results = _extract_numbers_with_text_nl(tokenize(text), short_scale, ordinals) return [float(result.value) for result in results] diff --git a/lingua_franca/lang/parse_pl.py b/lingua_franca/lang/parse_pl.py index 84f83bc8..67518d25 100644 --- a/lingua_franca/lang/parse_pl.py +++ b/lingua_franca/lang/parse_pl.py @@ -24,6 +24,8 @@ _TIME_UNITS_NORMALIZATION, _MONTHS_TO_EN, _DAYS_TO_EN, _ORDINAL_BASE_PL, \ _ALT_ORDINALS_PL from lingua_franca.time import now_local +from lingua_franca.lang.parse_common import normalize_decimals + import re @@ -576,7 +578,7 @@ def _initialize_number_data(short_scale): return multiplies, _STRING_SHORT_ORDINAL_PL, string_num_scale -def extract_number_pl(text, short_scale=True, ordinals=False): +def extract_number_pl(text, short_scale=True, ordinals=False, decimal='.'): """ This function extracts a number from a text string, handles pronunciations in long scale and short scale @@ -587,11 +589,17 @@ def extract_number_pl(text, short_scale=True, ordinals=False): text (str): the string to normalize short_scale (bool): use short scale if True, long scale if False ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: (int) or (float) or False: The extracted number or False if no number was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) return _extract_number_with_text_pl(tokenize(text.lower()), True, ordinals).value @@ -1333,20 +1341,28 @@ def isFractional_pl(input_str, short_scale=True): return False -def extract_numbers_pl(text, short_scale=True, ordinals=False): +def extract_numbers_pl(text, short_scale=True, ordinals=False, decimal='.'): """ - Takes in a string and extracts a list of numbers. + This function extracts a number from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - list: list of extracted numbers as floats + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. + """ + if decimal != '.': + text = normalize_decimals(text, decimal) results = _extract_numbers_with_text_pl(tokenize(text), short_scale, ordinals) return [float(result.value) for result in results] diff --git a/lingua_franca/lang/parse_pt.py b/lingua_franca/lang/parse_pt.py index 356c1e83..ab1dd94c 100644 --- a/lingua_franca/lang/parse_pt.py +++ b/lingua_franca/lang/parse_pt.py @@ -29,6 +29,8 @@ from lingua_franca.internal import resolve_resource_file from lingua_franca.lang.parse_common import Normalizer from lingua_franca.time import now_local +from lingua_franca.lang.parse_common import normalize_decimals + import json import re import unicodedata @@ -77,16 +79,28 @@ def is_fractional_pt(input_str, short_scale=True): return False -def extract_number_pt(text, short_scale=True, ordinals=False): +def extract_number_pt(text, short_scale=True, ordinals=False, decimal='.'): """ - This function prepares the given text for parsing by making - numbers consistent, getting rid of contractions, etc. + This function extracts a number from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers + Args: text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - (int) or (float): The value of extracted number + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) # TODO: short_scale and ordinals don't do anything here. # The parameters are present in the function signature for API compatibility # reasons. diff --git a/lingua_franca/lang/parse_sv.py b/lingua_franca/lang/parse_sv.py index 02164111..bb23f2ee 100644 --- a/lingua_franca/lang/parse_sv.py +++ b/lingua_franca/lang/parse_sv.py @@ -17,6 +17,7 @@ from dateutil.relativedelta import relativedelta from lingua_franca.time import now_local +from lingua_franca.lang.parse_common import normalize_decimals from .parse_common import (is_numeric, look_for_fractions, Normalizer, tokenize, Token) @@ -156,15 +157,23 @@ def extract_duration_sv(text): return (td, remainder) if valid else None -def extract_number_sv(text, short_scale=True, ordinals=False): +def extract_number_sv(text, short_scale=True, ordinals=False, decimal='.'): """ This function prepares the given text for parsing by making numbers consistent, getting rid of contractions, etc. Args: text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: (int) or (float): The value of extracted number + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) # TODO: short_scale and ordinals don't do anything here. # The parameters are present in the function signature for API # compatibility reasons. diff --git a/lingua_franca/parse.py b/lingua_franca/parse.py index f1602717..76dbad4e 100644 --- a/lingua_franca/parse.py +++ b/lingua_franca/parse.py @@ -13,9 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import re import json from lingua_franca.util import match_one, fuzzy_match, MatchStrategy -from lingua_franca.lang.parse_common import match_yes_or_no +from lingua_franca.lang.parse_common import match_yes_or_no, is_numeric from difflib import SequenceMatcher from warnings import warn from lingua_franca.time import now_local @@ -24,13 +25,17 @@ get_default_lang, localized_function, _raise_unsupported_language, UnsupportedLanguageError,\ resolve_resource_file, FunctionNotLocalizedError import unicodedata +from quebra_frases import span_indexed_word_tokenize +# TODO deprecate extract_number and extract_numbers in favor of +# extract_number_spans to rule them all _REGISTERED_FUNCTIONS = ("extract_numbers", "extract_number", "extract_duration", "extract_datetime", "extract_langcode", + "extract_number_spans", "normalize", "get_gender", "yes_or_no", @@ -40,6 +45,44 @@ populate_localized_function_dict("parse", langs=get_active_langs()) +@localized_function(run_own_code_on=[FunctionNotLocalizedError]) +def extract_number_spans(utterance, short_scale=True, ordinals=False, + fractional_numbers=True, decimal=".", lang=''): + """ + This function tags numbers in an utterance. + + Args: + utterance (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + fractional_numbers (bool): True if we should look for fractions and + decimals. + decimal (str): decimal marker + lang (str, optional): an optional BCP-47 language code, if omitted + the default language will be used. + Returns: + (list): list of tuples with detected number and span of the + number in parent utterance [(number, (start_idx, end_idx))] + + """ + number_spans = [] + spans = span_indexed_word_tokenize(utterance) + for idx, (start, end, word) in enumerate(spans): + next_span = spans[idx + 1] if idx + 1 < len(spans) else () + next_next_span = spans[idx + 2] if idx + 2 < len(spans) else () + if is_numeric(word): + if next_span and next_next_span and \ + next_span[-1] == decimal and \ + is_numeric(next_next_span[-1]): + end = next_next_span[1] + num = float("".join([word, next_span[-1], next_next_span[-1]])) + spans[idx + 1] = spans[idx + 2] = (-1, -1, "") + else: + num = int(word) + number_spans.append((num, (start, end))) + return number_spans + + @localized_function(run_own_code_on=[FunctionNotLocalizedError]) def yes_or_no(text, lang=""): text = normalize(text, lang=lang, remove_articles=True).lower() @@ -55,8 +98,9 @@ def extract_langcode(text, lang=""): return match_one(text, LANGUAGES, strategy=MatchStrategy.TOKEN_SET_RATIO) -@localized_function() -def extract_numbers(text, short_scale=True, ordinals=False, lang=''): +@localized_function(run_own_code_on=[FunctionNotLocalizedError]) +def extract_numbers(text, short_scale=True, ordinals=False, lang='', + decimal='.', fractional_numbers=True): """ Takes in a string and extracts a list of numbers. @@ -69,28 +113,83 @@ def extract_numbers(text, short_scale=True, ordinals=False, lang=''): ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 lang (str, optional): an optional BCP-47 language code, if omitted the default language will be used. + decimal (str): character to use as decimal point. defaults to '.' Returns: list: list of extracted numbers as floats, or empty list if none found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + spans = extract_number_spans(text, short_scale=short_scale, lang=lang, + ordinals=ordinals, decimal=decimal, + fractional_numbers=fractional_numbers) + if spans: + return [a[0] for a in spans] + return [] -@localized_function() -def extract_number(text, short_scale=True, ordinals=False, lang=''): +@localized_function(run_own_code_on=[FunctionNotLocalizedError]) +def extract_number(text, short_scale=True, ordinals=False, lang='', + decimal='.', fractional_numbers=True): + """backwards compat, use extract_first_number instead""" + return extract_first_number(text, short_scale, ordinals, + lang, decimal, fractional_numbers) + + +@localized_function(run_own_code_on=[FunctionNotLocalizedError]) +def extract_first_number(text, short_scale=True, ordinals=False, lang='', + decimal='.', fractional_numbers=True): """Takes in a string and extracts a number. - Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 - lang (str, optional): an optional BCP-47 language code, if omitted - the default language will be used. - Returns: - (int, float or False): The number extracted or False if the input - text contains no numbers - """ + Args: + text (str): the string to extract a number from + short_scale (bool): Use "short scale" or "long scale" for large + numbers -- over a million. The default is short scale, which + is now common in most English speaking countries. + See https://en.wikipedia.org/wiki/Names_of_large_numbers + ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + lang (str, optional): an optional BCP-47 language code, if omitted + the default language will be used. + decimal (str): character to use as decimal point. defaults to '.' + Returns: + (int, float or False): The number extracted or False if the input + text contains no numbers + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. + """ + numbers = extract_numbers(text, short_scale, ordinals, lang, decimal, fractional_numbers) + if numbers: + return numbers[0] + return False + + +@localized_function(run_own_code_on=[FunctionNotLocalizedError]) +def extract_last_number(text, short_scale=True, ordinals=False, lang='', + decimal='.', fractional_numbers=True): + """Takes in a string and extracts a number. + + Args: + text (str): the string to extract a number from + short_scale (bool): Use "short scale" or "long scale" for large + numbers -- over a million. The default is short scale, which + is now common in most English speaking countries. + See https://en.wikipedia.org/wiki/Names_of_large_numbers + ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + lang (str, optional): an optional BCP-47 language code, if omitted + the default language will be used. + decimal (str): character to use as decimal point. defaults to '.' + Returns: + (int, float or False): The number extracted or False if the input + text contains no numbers + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. + """ + numbers = extract_numbers(text, short_scale, ordinals, lang, decimal, fractional_numbers) + if numbers: + return numbers[-1] + return False @localized_function() diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 5d943a5d..2df4433c 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,2 +1,3 @@ python-dateutil~=2.6 -rapidfuzz \ No newline at end of file +rapidfuzz +quebra_frases>=0.3.4 \ No newline at end of file diff --git a/test/unittests/test_format_pt.py b/test/unittests/test_format_pt.py index 61c94406..931afeca 100644 --- a/test/unittests/test_format_pt.py +++ b/test/unittests/test_format_pt.py @@ -21,6 +21,7 @@ from lingua_franca.format import nice_time from lingua_franca.format import pronounce_number from lingua_franca.time import default_timezone +from lingua_franca.parse import extract_number_spans def setUpModule(): @@ -64,6 +65,16 @@ def tearDownModule(): } +class TestSpans(unittest.TestCase): + def test_number_spans(self): + self.assertEqual(extract_number_spans("este é o teste 1 2 3 666 1.5"), + [(1, (15, 16)), + (2, (17, 18)), + (3, (19, 20)), + (666, (21, 24)), + (1.5, (25, 28))]) + + class TestPronounceNumber(unittest.TestCase): def test_convert_int(self): self.assertEqual(pronounce_number(0, lang="pt"), "zero") diff --git a/test/unittests/test_parse_en.py b/test/unittests/test_parse_en.py index caae8999..4bd066bf 100644 --- a/test/unittests/test_parse_en.py +++ b/test/unittests/test_parse_en.py @@ -20,14 +20,10 @@ from lingua_franca import load_language, unload_language, set_default_lang from lingua_franca.internal import FunctionNotLocalizedError -from lingua_franca.parse import extract_datetime -from lingua_franca.parse import extract_duration -from lingua_franca.parse import extract_number, extract_numbers -from lingua_franca.parse import get_gender -from lingua_franca.parse import normalize from lingua_franca.time import default_timezone, to_local -from lingua_franca.parse import extract_langcode -from lingua_franca.parse import yes_or_no +from lingua_franca.parse import extract_datetime, extract_duration, extract_number, \ + extract_numbers, get_gender, normalize, extract_langcode, yes_or_no, extract_number_spans +from lingua_franca.lang.parse_en import extract_number_en_v2 def setUpModule(): @@ -290,6 +286,17 @@ def test_combinations(self): class TestExtractNumber(unittest.TestCase): + def test_extract_number_decimal_markers(self): + # Test decimal normalization + self.assertEqual(extract_number("4,4", decimal=','), 4.4) + self.assertEqual(extract_number("we have 3,5 kilometers to go", + decimal=','), 3.5) + self.assertEqual(extract_numbers("this is a seven eight 9,5 test", + decimal=','), + [7.0, 8.0, 9.5]) + self.assertEqual(extract_numbers("this is a 7,0 8.0 9,6 test", + decimal=','), [7.0, 8.0, 9.6]) + def test_extract_number_priority(self): # sanity check self.assertEqual(extract_number("third", ordinals=True), 3) @@ -1742,5 +1749,328 @@ def test_with_conf(text, expected_lang, min_conf=0.8): test_with_conf("American", 'en-us') +class TestNumberSpans(unittest.TestCase): + def test_number_spans(self): + self.assertEqual(extract_number_spans("this is test 1 2 3 666 1.5"), + [(1, (13, 14)), + (2, (15, 16)), + (3, (17, 18)), + (666, (19, 22)), + (1.5, (23, 26))]) + self.assertEqual(extract_number_spans("this is test 1.5.5"), + [(1.5, (13, 16)), + (5, (17, 18))]) + + def test_number_spans_frac(self): + self.assertEqual(extract_number_spans("2 and 3/4"), + [(2.75, (0, 9))]) + self.assertEqual(extract_number_spans("2 and 3/4 and after that " + "comes 1.5"), + [(2.75, (0, 9)), + (1.5, (31, 34))]) + self.assertEqual(extract_number_spans("2 and 3/4 and after that " + "comes 0.5"), + [(2.75, (0, 9)), + (0.5, (31, 34))]) + self.assertEqual(extract_number_spans("2 and 3/4 and 27"), + [(2.75, (0, 9)), + (27, (14, 16))]) + + def test_number_spoken_frac(self): + self.assertEqual(extract_number_spans("half cup"), + [(0.5, (0, 4))]) + self.assertEqual(extract_number_spans("third cup"), + [(1 / 3, (0, 5))]) + + def test_number_ordinals(self): + self.assertEqual(extract_number_spans("this is the 1st the 2nd the " + "3rd the 4th and the Nth"), + [(1, (12, 15)), + (2, (20, 23)), + (3, (28, 31)), + (4, (36, 39))]) + + def test_number_spoken_ordinals(self): + self.assertEqual(extract_number_spans("fourth cup", ordinals=True), + [(4, (0, 6))]) + self.assertEqual(extract_number_spans("third cup", ordinals=True), + [(3, (0, 5))]) + + def test_integers(self): + self.assertEqual(extract_number_spans("number one"), + [(1, (7, 10))]) + self.assertEqual(extract_number_spans("number one number two number " + "four"), + [(1, (7, 10)), + (2, (20, 23)), + (3, (28, 31)), + (4, (36, 39))]) + + def test_scale(self): + self.assertEqual(extract_number_spans("a trillion numbers"), + [(1e+12, (2, 10))]) + self.assertEqual(extract_number_spans("a trillion numbers", + short_scale=False), + [(1e+18, (2, 10))]) + + +class TestExtractNumberV2(unittest.TestCase): + def test_extract_percent(self): + self.assertEqual(extract_number_en_v2("totally 100%"), 100) + + def test_extract_number_decimal_markers(self): + # Test decimal normalization + self.assertEqual(extract_number_en_v2("4,4", decimal=','), 4.4) + self.assertEqual(extract_number_en_v2("we have 3,5 kilometers to go", + decimal=','), 3.5) + + def test_extract_number_priority(self): + # sanity check + self.assertEqual(extract_number_en_v2("third", ordinals=True), 3) + self.assertEqual(extract_number_en_v2("sixth", ordinals=True), 6) + self.assertEqual(extract_number_en_v2("sixth third", ordinals=True), 6) + self.assertEqual(extract_number_en_v2("third sixth", ordinals=True), 3) + + # TODO FIXME + self.assertEqual(extract_number_en_v2("Twenty two and Three Fifths", + ordinals=True), 22) + + def test_extract_number_explicit_ordinals(self): + # test explicit ordinals + self.assertEqual(extract_number_en_v2("this is the 1st", + ordinals=True), 1) + self.assertEqual(extract_number_en_v2("this is the 2nd", + ordinals=False), 2) + self.assertEqual(extract_number_en_v2("this is the 3rd", + ordinals=None), 3) + self.assertEqual(extract_number_en_v2("this is the 4th", + ordinals=None), 4) + self.assertEqual(extract_number_en_v2("this is the 7th test", + ordinals=True), 7) + self.assertEqual(extract_number_en_v2("this is the 7th test", + ordinals=False), 7) + self.assertEqual(extract_number_en_v2("this is the 1st test"), 1) + self.assertEqual(extract_number_en_v2("this is the 2nd test"), 2) + self.assertEqual(extract_number_en_v2("this is the 3rd test"), 3) + self.assertEqual(extract_number_en_v2("this is the 31st test"), 31) + self.assertEqual(extract_number_en_v2("this is the 32nd test"), 32) + self.assertEqual(extract_number_en_v2("this is the 33rd test"), 33) + self.assertEqual(extract_number_en_v2("this is the 34th test"), 34) + + self.assertTrue(extract_number_en_v2("this is the nth test") is False) + + def test_extract_number_spoken_ordinals(self): + # test non ambiguous ordinals + self.assertEqual(extract_number_en_v2("this is the first test", + ordinals=True), 1) + self.assertEqual(extract_number_en_v2("this is the first test", + ordinals=False), False) + self.assertEqual(extract_number_en_v2("this is the first test", + ordinals=None), False) + + # test ambiguous ordinal/time unit + self.assertEqual(extract_number_en_v2("this is second test", + ordinals=True), 2) + self.assertEqual(extract_number_en_v2("this is second test", + ordinals=False), False) + self.assertEqual(extract_number_en_v2("remind me in a second", + ordinals=True), 2) + self.assertEqual(extract_number_en_v2("remind me in a second", + ordinals=False), False) + self.assertEqual(extract_number_en_v2("remind me in a second", + ordinals=None), False) + + # test ambiguous ordinal/fractional + self.assertEqual(extract_number_en_v2("this is the third test", + ordinals=True), 3.0) + self.assertEqual(extract_number_en_v2("this is the third test", + ordinals=False), 1.0 / 3.0) + self.assertEqual(extract_number_en_v2("this is the third test", + ordinals=None), False) + + # TODO FIXME + self.assertEqual(extract_number_en_v2("one third of a cup", + ordinals=False), 1.0 / 3.0) + self.assertEqual(extract_number_en_v2("one third of a cup", + ordinals=True), 3) + self.assertEqual(extract_number_en_v2("one third of a cup", + ordinals=None), 1) + + def test_extract_number_nth_one(self): + # test the Nth one + self.assertEqual(extract_number_en_v2("the fourth one", + ordinals=True), 4.0) + self.assertEqual(extract_number_en_v2("you are the second one", + ordinals=False), 1) + self.assertEqual(extract_number_en_v2("you are the second one", + ordinals=True), 2) + self.assertEqual(extract_number_en_v2("you are the 1st one", + ordinals=None), 1) + self.assertEqual(extract_number_en_v2("you are the 2nd one", + ordinals=None), 2) + self.assertEqual(extract_number_en_v2("you are the 3rd one", + ordinals=None), 3) + self.assertEqual(extract_number_en_v2("you are the 8th one", + ordinals=None), 8) + + # TODO FIXME + self.assertEqual(extract_number_en_v2("the thirty sixth one", + ordinals=True), 36.0) + + def test_scale(self): + # test big numbers / short vs long scale + self.assertEqual(extract_number_en_v2("this is the billionth test", + ordinals=True), 1e09) + self.assertEqual(extract_number_en_v2("this is the billionth test", + ordinals=None), False) + + self.assertEqual(extract_number_en_v2("this is the billionth test", + ordinals=False), 1e-9) + self.assertEqual(extract_number_en_v2("this is the billionth test", + ordinals=True, + short_scale=False), 1e12) + self.assertEqual(extract_number_en_v2("this is the billionth test", + ordinals=None, + short_scale=False), False) + self.assertEqual(extract_number_en_v2("this is the billionth test", + short_scale=False), 1e-12) + + def test_extract_number_ambiguous_fraction_ordinal(self): + # confirm these are not cumulative, prev version would multiple them + self.assertEqual(extract_number_en_v2("sixth third", ordinals=False), + 1 / 6) + + # test plurals + # NOTE plurals are never considered ordinals, but also not + # considered explicit fractions + self.assertEqual(extract_number_en_v2("2 fifths", + ordinals=True), 2) + self.assertEqual(extract_number_en_v2("2 fifth", + ordinals=True), 5) + self.assertEqual(extract_number_en_v2("2 fifths", + ordinals=False), 2 / 5) + self.assertEqual(extract_number_en_v2("2 fifths", + ordinals=None), 2) + + self.assertEqual(extract_number_en_v2("Twenty two and Three Fifths"), + 22.6) + + # test multiple ambiguous + self.assertEqual(extract_number_en_v2("sixth third", ordinals=None), + False) + self.assertEqual(extract_number_en_v2("thirty second", ordinals=False), + 30) + self.assertEqual(extract_number_en_v2("thirty second", ordinals=None), + 30) + self.assertEqual(extract_number_en_v2("thirty second", ordinals=True), + 32) + + self.assertEqual(extract_number_en_v2("sixth third", ordinals=False), + 6) + + def test_extract_number_negative(self): + self.assertEqual(extract_number_en_v2("minus two"), -2) + self.assertEqual(extract_number_en_v2("minus 2"), -2) + self.assertEqual(extract_number_en_v2("negative two"), -2) + self.assertEqual(extract_number_en_v2("minus 1/3"), - 1 / 3) + self.assertEqual(extract_number_en_v2("-2"), -2) + self.assertEqual(extract_number_en_v2("- 2"), -2) + self.assertEqual(extract_number_en_v2("-1/3"), - 1/3) + self.assertEqual(extract_number_en_v2("- 1/3"), - 1 / 3) + + def test_extract_number_fracs(self): + self.assertEqual(extract_number_en_v2("1/3 cups"), 1.0 / 3.0) + self.assertEqual(extract_number_en_v2("quarter cup"), 0.25) + self.assertEqual(extract_number_en_v2("1/4 cup"), 0.25) + self.assertEqual(extract_number_en_v2("2/3 cups"), 2.0 / 3.0) + self.assertEqual(extract_number_en_v2("3/4 cups"), 3.0 / 4.0) + self.assertEqual(extract_number_en_v2("1 and 3/4 cups"), 1.75) + + # TODO FIXME + self.assertEqual(extract_number_en_v2("three quarter cups"), 3.0 / 4.0) + self.assertEqual(extract_number_en_v2("three quarters cups"), + 3.0 / 4.0) + self.assertEqual(extract_number_en_v2("one and one half cups"), 1.5) + self.assertEqual(extract_number_en_v2("one and a half cups"), 1.5) + self.assertEqual(extract_number_en_v2("one cup and a half"), 1.5) + self.assertEqual(extract_number_en_v2("1 cup and a half"), 1.5) + self.assertEqual(extract_number_en_v2("one fourth cup"), 0.25) + + def test_extract_number(self): + self.assertEqual(extract_number_en_v2("this is 2 test"), 2) + self.assertEqual(extract_number_en_v2("this is test number 4"), 4) + self.assertEqual(extract_number_en_v2("three cups"), 3) + self.assertEqual(extract_number_en_v2("twenty two"), 22) + self.assertEqual(extract_number_en_v2( + "Twenty two with a leading capital letter"), 22) + self.assertEqual(extract_number_en_v2( + "twenty Two with Two capital letters"), 22) + self.assertEqual(extract_number_en_v2( + "twenty Two with mixed capital letters"), 22) + self.assertEqual(extract_number_en_v2("two hundred"), 200) + self.assertEqual(extract_number_en_v2("nine thousand"), 9000) + self.assertEqual(extract_number_en_v2("six hundred sixty six"), 666) + self.assertEqual(extract_number_en_v2("two million"), 2000000) + self.assertEqual(extract_number_en_v2( + "two million five hundred thousand tons of spinning metal"), + 2500000) + self.assertEqual(extract_number_en_v2("six trillion"), 6000000000000.0) + self.assertEqual(extract_number_en_v2("six trillion", + short_scale=False), 6e+18) + self.assertEqual(extract_number_en_v2("one point five"), 1.5) + self.assertEqual(extract_number_en_v2("three dot fourteen"), 3.14) + self.assertEqual(extract_number_en_v2("zero point two"), 0.2) + self.assertEqual(extract_number_en_v2("billions of years older"), + 1000000000.0) + self.assertEqual(extract_number_en_v2( + "billions of years older", short_scale=False), 1000000000000.0) + self.assertEqual(extract_number_en_v2("one hundred thousand"), 100000) + + self.assertEqual(extract_number_en_v2("negative seventy"), -70) + self.assertEqual(extract_number_en_v2("thousand million"), 1000000000) + + # Verify non-power multiples of ten no longer discard + # adjacent multipliers + self.assertEqual(extract_number_en_v2("twenty thousand"), 20000) + self.assertEqual(extract_number_en_v2("fifty million"), 50000000) + + # Verify smaller powers of ten no longer cause miscalculation of larger + # powers of ten (see MycroftAI#86) + self.assertEqual(extract_number_en_v2("twenty billion three hundred million \ + nine hundred fifty thousand six hundred \ + seventy five point eight"), + 20300950675.8) + self.assertEqual(extract_number_en_v2("nine hundred ninety nine million nine \ + hundred ninety nine thousand nine \ + hundred ninety nine point nine"), + 999999999.9) + + # TODO why does "trillion" result in xxxx.0? + self.assertEqual(extract_number_en_v2("eight hundred trillion two hundred \ + fifty seven"), 800000000000257.0) + + # TODO handle this case + # self.assertEqual( + # extract_number_en_v2("6 dot six six six"), + # 6.666) + + def test_extract_no_number(self): + self.assertTrue( + extract_number_en_v2("The tennis player is fast") is False) + self.assertTrue(extract_number_en_v2("fraggle") is False) + + self.assertTrue(extract_number_en_v2("grobo 0") is not False) + self.assertEqual(extract_number_en_v2("grobo 0"), 0) + + self.assertTrue(extract_number_en_v2("fraggle zero") is not False) + self.assertEqual(extract_number_en_v2("fraggle zero"), 0) + + def test_extract_couple_number(self): + # TODO FIXME + self.assertEqual(extract_number_en_v2("a couple of beers"), 2) + self.assertEqual(extract_number_en_v2("a couple hundred beers"), 200) + self.assertEqual(extract_number_en_v2("a couple thousand beers"), 2000) + + if __name__ == "__main__": unittest.main() From 4eebadaf44b239569267db3c69428065c3e284c3 Mon Sep 17 00:00:00 2001 From: jarbasai Date: Sun, 27 Nov 2022 14:57:30 +0000 Subject: [PATCH 2/2] cleanup --- lingua_franca/lang/parse_en.py | 23 ++++++++++++----------- lingua_franca/parse.py | 14 ++++++++++---- test/unittests/test_parse_en.py | 2 +- 3 files changed, 23 insertions(+), 16 deletions(-) diff --git a/lingua_franca/lang/parse_en.py b/lingua_franca/lang/parse_en.py index 761c14aa..b4203bbf 100644 --- a/lingua_franca/lang/parse_en.py +++ b/lingua_franca/lang/parse_en.py @@ -531,28 +531,28 @@ def _initialize_number_data_en(short_scale, speech=True): return multiplies, string_num_ordinal_en, string_num_scale_en -def extract_number_spans_en(utterance, short_scale=True, ordinals=False, - fractional_numbers=True, decimal="."): +def extract_number_spans_en(text, short_scale=True, ordinals=False, + decimal=".", fractional_numbers=True): """ This function tags numbers in an utterance. Args: - utterance (str): the string to normalize + text (str): the string to normalize short_scale (bool): use short scale if True, long scale if False ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): decimal marker fractional_numbers (bool): True if we should look for fractions and decimals. - decimal (str): decimal marker Returns: (list): list of tuples with detected number and span of the number in parent utterance [(number, (start_idx, end_idx))] """ number_spans = [] - if isinstance(utterance, str): - spans = span_indexed_word_tokenize(utterance) + if isinstance(text, str): + spans = span_indexed_word_tokenize(text) else: - spans = utterance + spans = text # load language number data multiplies, string_num_ordinal, string_num_scale = \ @@ -748,14 +748,15 @@ def found_number(): return number_spans -def extract_number_en_v2(*args, **kwargs): - spans = extract_number_spans_en(*args, **kwargs) +def extract_number_en_v2(text, short_scale=True, ordinals=False, decimal='.', fractional_numbers=True): + spans = extract_number_spans_en(text, short_scale=short_scale, ordinals=ordinals, + decimal=decimal, fractional_numbers=fractional_numbers) if not spans: return False - return extract_number_spans_en(*args, **kwargs)[0][0] + return spans[0][0] -def extract_number_en(text, short_scale=True, ordinals=False, decimal='.'): +def extract_number_en(text, short_scale=True, ordinals=False, decimal='.', fractional_numbers=True): """ This function extracts a number from a text string, handles pronunciations in long scale and short scale diff --git a/lingua_franca/parse.py b/lingua_franca/parse.py index 76dbad4e..ee1cadec 100644 --- a/lingua_franca/parse.py +++ b/lingua_franca/parse.py @@ -46,13 +46,13 @@ @localized_function(run_own_code_on=[FunctionNotLocalizedError]) -def extract_number_spans(utterance, short_scale=True, ordinals=False, - fractional_numbers=True, decimal=".", lang=''): +def extract_number_spans(text, short_scale=True, ordinals=False, + decimal=".", fractional_numbers=True, lang=''): """ This function tags numbers in an utterance. Args: - utterance (str): the string to normalize + text (str): the string to normalize short_scale (bool): use short scale if True, long scale if False ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 fractional_numbers (bool): True if we should look for fractions and @@ -66,7 +66,7 @@ def extract_number_spans(utterance, short_scale=True, ordinals=False, """ number_spans = [] - spans = span_indexed_word_tokenize(utterance) + spans = span_indexed_word_tokenize(text) for idx, (start, end, word) in enumerate(spans): next_span = spans[idx + 1] if idx + 1 < len(spans) else () next_next_span = spans[idx + 2] if idx + 2 < len(spans) else () @@ -114,6 +114,8 @@ def extract_numbers(text, short_scale=True, ordinals=False, lang='', lang (str, optional): an optional BCP-47 language code, if omitted the default language will be used. decimal (str): character to use as decimal point. defaults to '.' + fractional_numbers (bool): True if we should look for fractions and + decimals. Returns: list: list of extracted numbers as floats, or empty list if none found Note: @@ -151,6 +153,8 @@ def extract_first_number(text, short_scale=True, ordinals=False, lang='', lang (str, optional): an optional BCP-47 language code, if omitted the default language will be used. decimal (str): character to use as decimal point. defaults to '.' + fractional_numbers (bool): True if we should look for fractions and + decimals. Returns: (int, float or False): The number extracted or False if the input text contains no numbers @@ -179,6 +183,8 @@ def extract_last_number(text, short_scale=True, ordinals=False, lang='', lang (str, optional): an optional BCP-47 language code, if omitted the default language will be used. decimal (str): character to use as decimal point. defaults to '.' + fractional_numbers (bool): True if we should look for fractions and + decimals. Returns: (int, float or False): The number extracted or False if the input text contains no numbers diff --git a/test/unittests/test_parse_en.py b/test/unittests/test_parse_en.py index 4bd066bf..2c1aedef 100644 --- a/test/unittests/test_parse_en.py +++ b/test/unittests/test_parse_en.py @@ -1936,7 +1936,7 @@ def test_scale(self): short_scale=False), 1e-12) def test_extract_number_ambiguous_fraction_ordinal(self): - # confirm these are not cumulative, prev version would multiple them + # confirm these are not cumulative, prev version would multiply them self.assertEqual(extract_number_en_v2("sixth third", ordinals=False), 1 / 6)