move utils to parse_common

OpenVoiceOS · Dec 1, 2022 · f9f7019 · f9f7019
1 parent d5e5cb6
commit f9f7019
Show file tree

Hide file tree

Showing 4 changed files with 58 additions and 53 deletions.
diff --git a/lingua_franca/internal.py b/lingua_franca/internal.py
@@ -561,14 +561,6 @@ def _call_localized_function(func, *args, **kwargs):
             # If we didn't find a localized function to correspond with
             # the wrapped function, we cached NotImplementedError in its
             # place.
-
-            # first account for the function not being present in any
-            # module, meaning all modules are falling back to a catch all
-            # parser, this usually means the function will need localization
-            # only in future languages not currently supported
-            if func_name not in _localized_functions[_module_name][lang_code]:
-                raise FunctionNotLocalizedError(func_name, lang_code)
-
             loc_signature = _localized_functions[_module_name][lang_code][func_name]
             if isinstance(loc_signature, type(NotImplementedError())):
                 raise loc_signature

diff --git a/lingua_franca/lang/parse_common.py b/lingua_franca/lang/parse_common.py
@@ -18,6 +18,8 @@
 import json
 from lingua_franca.internal import  resolve_resource_file, FunctionNotLocalizedError
 import unicodedata
+from quebra_frases import span_indexed_empty_space_tokenize
+
 
 ROMAN_NUMERALS = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
 
@@ -462,3 +464,36 @@ def replace_right(source, target, replacement, replacements=None):
             extract = extract_handler(to_parse, short_scale, ordinals)
     numbers.reverse()
     return numbers
+
+
+
+def roman_to_int(word):
+    """
+    Convert a roman numeral string to its integer value.
+
+    Only membership of each character in ROMAN_NUMERALS is checked (through
+    is_roman_numeral); the ordering of the symbols is not validated.
+
+    Args:
+        word (str): candidate roman numeral, e.g. "XIV"
+    Returns:
+        (int): the numeric value, or None when is_roman_numeral(word)
+               is false
+    """
+    if not is_roman_numeral(word):
+        return None
+    total = 0
+    previous = None
+    for symbol in word:
+        value = ROMAN_NUMERALS[symbol]
+        if previous is not None and value > previous:
+            # subtractive pair (e.g. "IV"): the smaller symbol was already
+            # added on the previous step, so it is taken back out twice
+            total += value - 2 * previous
+        else:
+            total += value
+        previous = value
+    return total
+
+
+def is_roman_numeral(word):
+    """
+    Check whether a word consists solely of roman numeral symbols.
+
+    Args:
+        word (str): the token to test
+    Returns:
+        (bool): True if word is non-empty and every character is a key of
+                ROMAN_NUMERALS, False otherwise
+    """
+    # all() is True for an empty iterable, so an empty token has to be
+    # rejected explicitly or it would be reported as a numeral worth 0
+    return bool(word) and all(char in ROMAN_NUMERALS for char in word)
+
+
+def extract_roman_numeral_spans(utterance):
+    """
+    Locate the roman numerals in an utterance.
+
+    The utterance is split with span_indexed_empty_space_tokenize and every
+    token accepted by is_roman_numeral is converted with roman_to_int.
+
+    Args:
+        utterance (str): the string to scan
+    Returns:
+        (list): list of tuples with detected number and span of the
+                number in parent utterance [(number, (start_idx, end_idx))]
+
+    """
+    tagged = []
+    for start, end, word in span_indexed_empty_space_tokenize(utterance):
+        if not is_roman_numeral(word):
+            continue
+        tagged.append((roman_to_int(word), (start, end)))
+    return tagged
diff --git a/lingua_franca/lang/parse_en.py b/lingua_franca/lang/parse_en.py
@@ -27,7 +27,8 @@
     _STRING_NUM_EN, _STRING_SHORT_ORDINAL_EN, _STRING_LONG_ORDINAL_EN, \
     _generate_plurals_en, _SPOKEN_EXTRA_NUM_EN
 from lingua_franca.lang.parse_common import is_numeric, look_for_fractions, \
-    invert_dict, ReplaceableNumber, partition_list, tokenize, Token, Normalizer
+    invert_dict, ReplaceableNumber, partition_list, tokenize, Token, Normalizer,\
+    extract_roman_numeral_spans
 from lingua_franca.time import now_local
 
 
@@ -1685,3 +1686,21 @@ def numbers_to_digits(self, utterance):
 def normalize_en(text, remove_articles=True):
     """ English string normalization """
     return EnglishNormalizer().normalize(text, remove_articles)
+
+
+def normalize_roman_numerals_en(utterance, ordinals=False):
+    """
+    Replace roman numerals in an English utterance with digits.
+
+    Args:
+        utterance (str): the string to normalize
+        ordinals (bool): if True, append the English ordinal suffix
+                         ("st", "nd", "rd", "th") to each replaced number
+    Returns:
+        (str): the utterance with every span reported by
+               extract_roman_numeral_spans replaced by its number
+    """
+    norm_utt = utterance
+    # walk the spans right-to-left so that a replacement of different length
+    # does not shift the indices of the spans still to be processed
+    for num, (start, end) in reversed(extract_roman_numeral_spans(utterance)):
+        if ordinals:
+            if num % 100 in (11, 12, 13):
+                # the teens (11th, 12th, 13th, 111th, ...) always take "th",
+                # whatever their last digit is
+                suffix = "th"
+            else:
+                suffix = {1: "st", 2: "nd", 3: "rd"}.get(num % 10, "th")
+            num = f"{num}{suffix}"
+        norm_utt = norm_utt[:start] + f"{num}" + norm_utt[end:]
+    return norm_utt
+
diff --git a/lingua_franca/parse.py b/lingua_franca/parse.py
@@ -15,16 +15,15 @@
 #
 import json
 from lingua_franca.util import match_one, fuzzy_match, MatchStrategy
-from lingua_franca.lang.parse_common import match_yes_or_no, ROMAN_NUMERALS
+from lingua_franca.lang.parse_common import match_yes_or_no, extract_roman_numeral_spans, is_roman_numeral
+
 from difflib import SequenceMatcher
 from warnings import warn
 from lingua_franca.time import now_local
 from lingua_franca.internal import populate_localized_function_dict, \
     get_active_langs, get_full_lang_code, get_primary_lang_code, \
     get_default_lang, localized_function, _raise_unsupported_language, UnsupportedLanguageError,\
     resolve_resource_file, FunctionNotLocalizedError
-import unicodedata
-from quebra_frases import span_indexed_empty_space_tokenize
 
 
 _REGISTERED_FUNCTIONS = ("extract_numbers",
@@ -41,52 +40,12 @@
 populate_localized_function_dict("parse", langs=get_active_langs())
 
 
-def roman_to_int(word):
-    if not is_roman_numeral(word):
-        return None
-    number = 0
-    for i in range(len(word)):
-        if i > 0 and ROMAN_NUMERALS[word[i]] > ROMAN_NUMERALS[word[i - 1]]:
-            number += ROMAN_NUMERALS[word[i]] - 2 * ROMAN_NUMERALS[word[i - 1]]
-        else:
-            number += ROMAN_NUMERALS[word[i]]
-    return number
-
-
-def is_roman_numeral(word):
-    return all(char in ROMAN_NUMERALS for char in word)
-
-
-def extract_roman_numeral_spans(utterance):
-    """
-    This function tags roman numerals in an utterance.
-
-    Args:
-        utterance (str): the string to normalize
-    Returns:
-        (list): list of tuples with detected number and span of the
-                number in parent utterance [(number, (start_idx, end_idx))]
-
-    """
-    spans = span_indexed_empty_space_tokenize(utterance)
-    return [(roman_to_int(word), (start, end)) for start, end, word in spans
-            if is_roman_numeral(word)]
-
-
 @localized_function(run_own_code_on=[FunctionNotLocalizedError])
 def normalize_roman_numerals(utterance, ordinals=False, lang=""):
     # localization might be needed for ordinals flag
     norm_utt = utterance
     for num, (start, end) in reversed(extract_roman_numeral_spans(utterance)):
-        if ordinals:
-            if str(num)[-1] == "1":
-                num = f"{num}st"
-            elif str(num)[-1] == "2":
-                num = f"{num}nd"
-            elif str(num)[-1] == "3":
-                num = f"{num}rd"
-            else:
-                num = f"{num}th"
+        # TODO: `ordinals` is ignored here - ordinal formatting is lang specific
         norm_utt = norm_utt[:start] + f"{num}" + norm_utt[end:]
     return norm_utt