OpenVoiceOS · JarbasAl · May 19, 2021 · Dec 1, 2022 · Jul 17, 2024 · Jul 17, 2024
diff --git a/lingua_franca/lang/parse_common.py b/lingua_franca/lang/parse_common.py
@@ -18,6 +18,10 @@
 import json
 from lingua_franca.internal import  resolve_resource_file, FunctionNotLocalizedError
 import unicodedata
+from quebra_frases import span_indexed_empty_space_tokenize
+
+
+# Integer value of each roman numeral symbol; the keys are upper case only.
+ROMAN_NUMERALS = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}
 
 
 class Normalizer:
@@ -460,3 +464,36 @@ def replace_right(source, target, replacement, replacements=None):
             extract = extract_handler(to_parse, short_scale, ordinals)
     numbers.reverse()
     return numbers
+
+
+
+def roman_to_int(word):
+    """
+    Convert a roman numeral to its integer value.
+
+    A symbol that is larger than the one right before it is treated as the
+    end of a subtractive pair (e.g. the "V" in "IV"). The order of the
+    symbols is not validated any further.
+
+    Args:
+        word (str): candidate roman numeral
+    Returns:
+        (int): value of the numeral, or None if is_roman_numeral rejects
+               the word
+    """
+    if not is_roman_numeral(word):
+        return None
+    total = 0
+    previous = None
+    for current in (ROMAN_NUMERALS[symbol] for symbol in word):
+        total += current
+        if previous is not None and current > previous:
+            # previous was already added as a plain value; take it back twice
+            # so that the pair contributes (current - previous)
+            total -= 2 * previous
+        previous = current
+    return total
+
+
+def is_roman_numeral(word):
+    """
+    Check whether a word is made up exclusively of roman numeral symbols.
+
+    Only membership of each character in ROMAN_NUMERALS is checked; the
+    order of the symbols is not validated.
+
+    Args:
+        word (str): the token to check
+    Returns:
+        (bool): True if word is non-empty and every character is a key of
+                ROMAN_NUMERALS, False otherwise
+    """
+    # all() is vacuously True for an empty iterable, so an empty (or None)
+    # word has to be rejected explicitly
+    return bool(word) and all(char in ROMAN_NUMERALS for char in word)
+
+
+def extract_roman_numeral_spans(utterance):
+    """
+    Locate the roman numerals contained in an utterance.
+
+    The utterance is split with span_indexed_empty_space_tokenize and every
+    token accepted by is_roman_numeral is converted with roman_to_int.
+
+    Args:
+        utterance (str): the string to search for roman numerals
+    Returns:
+        (list): list of tuples with detected number and span of the
+                number in parent utterance [(number, (start_idx, end_idx))]
+
+    """
+    found = []
+    for start, end, word in span_indexed_empty_space_tokenize(utterance):
+        if not is_roman_numeral(word):
+            continue
+        found.append((roman_to_int(word), (start, end)))
+    return found
diff --git a/lingua_franca/lang/parse_en.py b/lingua_franca/lang/parse_en.py
@@ -27,7 +27,8 @@
     _STRING_NUM_EN, _STRING_SHORT_ORDINAL_EN, _STRING_LONG_ORDINAL_EN, \
     _generate_plurals_en, _SPOKEN_EXTRA_NUM_EN
 from lingua_franca.lang.parse_common import is_numeric, look_for_fractions, \
-    invert_dict, ReplaceableNumber, partition_list, tokenize, Token, Normalizer
+    invert_dict, ReplaceableNumber, partition_list, tokenize, Token, Normalizer,\
+    extract_roman_numeral_spans
 from lingua_franca.time import now_local
 
 
@@ -1685,3 +1686,21 @@ def numbers_to_digits(self, utterance):
 def normalize_en(text, remove_articles=True):
     """ English string normalization """
     return EnglishNormalizer().normalize(text, remove_articles)
+
+
+def normalize_roman_numerals_en(utterance, ordinals=False):
+    """
+    Replace the roman numerals of an English utterance with digits.
+
+    Args:
+        utterance (str): the string to normalize
+        ordinals (bool): if True, append the English ordinal suffix to each
+                         replaced number ("21st", "12th", ...)
+    Returns:
+        (str): the utterance with every detected roman numeral replaced
+    """
+    # NOTE(review): a bare "I" is a valid token for is_roman_numeral, so the
+    # English pronoun is rewritten to "1" as well - confirm whether callers
+    # need it excluded.
+    norm_utt = utterance
+    # replace back to front so the spans of earlier numerals stay valid
+    for num, (start, end) in reversed(extract_roman_numeral_spans(utterance)):
+        if ordinals:
+            if num % 100 in (11, 12, 13):
+                # the teens are irregular: 11th, 12th, 13th (also 111th, ...)
+                suffix = "th"
+            else:
+                suffix = {1: "st", 2: "nd", 3: "rd"}.get(num % 10, "th")
+            num = f"{num}{suffix}"
+        norm_utt = norm_utt[:start] + f"{num}" + norm_utt[end:]
+    return norm_utt
+
diff --git a/lingua_franca/parse.py b/lingua_franca/parse.py
@@ -15,15 +15,15 @@
 #
 import json
 from lingua_franca.util import match_one, fuzzy_match, MatchStrategy
-from lingua_franca.lang.parse_common import match_yes_or_no
+from lingua_franca.lang.parse_common import match_yes_or_no, extract_roman_numeral_spans, is_roman_numeral
+
 from difflib import SequenceMatcher
 from warnings import warn
 from lingua_franca.time import now_local
 from lingua_franca.internal import populate_localized_function_dict, \
     get_active_langs, get_full_lang_code, get_primary_lang_code, \
     get_default_lang, localized_function, _raise_unsupported_language, UnsupportedLanguageError,\
     resolve_resource_file, FunctionNotLocalizedError
-import unicodedata
 
 
 _REGISTERED_FUNCTIONS = ("extract_numbers",
@@ -40,6 +40,16 @@
 populate_localized_function_dict("parse", langs=get_active_langs())
 
 
+@localized_function(run_own_code_on=[FunctionNotLocalizedError])
+def normalize_roman_numerals(utterance, ordinals=False, lang=""):
+    """
+    Replace the roman numerals of an utterance with digits.
+
+    NOTE(review): presumably this body is the generic fallback used when no
+    language specific implementation exists - confirm against
+    localized_function.
+
+    Args:
+        utterance (str): the string to normalize
+        ordinals (bool): currently ignored by this implementation
+        lang (str): language code, not read by this body
+    Returns:
+        (str): the utterance with every detected roman numeral replaced
+    """
+    # localization might be needed for ordinals flag
+    result = utterance
+    spans = extract_roman_numeral_spans(utterance)
+    # walk the spans back to front so earlier offsets are not shifted
+    for value, (begin, stop) in spans[::-1]:
+        # if ordinals: # TODO - this is lang specific
+        result = "".join((result[:begin], str(value), result[stop:]))
+    return result
+
+
 @localized_function(run_own_code_on=[FunctionNotLocalizedError])
 def yes_or_no(text, lang=""):
     text = normalize(text, lang=lang, remove_articles=True).lower()

diff --git a/requirements/requirements.txt b/requirements/requirements.txt
@@ -1,2 +1,3 @@
 python-dateutil~=2.6
-rapidfuzz
+rapidfuzz
+quebra_frases>=0.3.5
diff --git a/test/unittests/test_parse_common.py b/test/unittests/test_parse_common.py
@@ -0,0 +1,71 @@
+#
+# Copyright 2019 Mycroft AI Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from lingua_franca import load_language
+from lingua_franca.parse import roman_to_int, is_roman_numeral,\
+    extract_roman_numeral_spans, normalize_roman_numerals
+
+
+class TestParseCommon(unittest.TestCase):
+    """Tests for roman numeral detection, conversion and normalization."""
+
+    # NOTE(review): roman_to_int is imported from lingua_franca.parse at the
+    # top of this module - confirm that lingua_franca.parse really exposes it,
+    # otherwise the whole module fails at import time.
+
+    def setUp(self) -> None:
+        # setUp is an instance method and runs before every test
+        load_language("en")
+
+    def test_roman(self):
+        # valid numerals
+        self.assertEqual(roman_to_int("III"), 3)
+        self.assertEqual(roman_to_int("IV"), 4)
+        self.assertEqual(roman_to_int("V"), 5)
+        self.assertEqual(roman_to_int("MCMLXXIV"), 1974)
+        self.assertEqual(roman_to_int("MCMLXXV"), 1975)
+        self.assertEqual(is_roman_numeral("IV"), True)
+
+        # invalid numerals
+        self.assertEqual(roman_to_int("v"), None)
+        self.assertEqual(is_roman_numeral("ii"), False)
+        self.assertEqual(is_roman_numeral("the IV century"), False)
+
+        # test spans
+        self.assertEqual(extract_roman_numeral_spans("the IV century"),
+                         [(4, (4, 6))])
+        self.assertEqual(extract_roman_numeral_spans("the XIV century"),
+                         [(14, (4, 7))])
+
+        # test normalization
+        self.assertEqual(normalize_roman_numerals("the XV century"),
+                         "the 15 century")
+
+        # test ordinals
+        self.assertEqual(normalize_roman_numerals("the XXI century",
+                                                  ordinals=True),
+                         "the 21st century")
+        self.assertEqual(normalize_roman_numerals("the XXIII century",
+                                                  ordinals=True),
+                         "the 23rd century")
+        self.assertEqual(normalize_roman_numerals("the XXIV century",
+                                                  ordinals=True),
+                         "the 24th century")
+
+        # test irregular ordinals: 11, 12 and 13 take "th" in English
+        self.assertEqual(normalize_roman_numerals("the XI century",
+                                                  ordinals=True),
+                         "the 11th century")
+        self.assertEqual(normalize_roman_numerals("the XII century",
+                                                  ordinals=True),
+                         "the 12th century")
+        self.assertEqual(normalize_roman_numerals("the XIII century",
+                                                  ordinals=True),
+                         "the 13th century")
+
+        # test space
+        self.assertEqual(is_roman_numeral("I V"), False)
+        self.assertEqual(normalize_roman_numerals("the X V century"),
+                         "the 10 5 century")
+        self.assertEqual(extract_roman_numeral_spans("the X V century"),
+                         [(10, (4, 5)),
+                          (5, (6, 7))])
+
+