Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/roman numerals #40

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions lingua_franca/lang/parse_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@
import json
from lingua_franca.internal import resolve_resource_file, FunctionNotLocalizedError
import unicodedata
from quebra_frases import span_indexed_empty_space_tokenize


# Integer value of each roman numeral symbol; only uppercase symbols are keys
ROMAN_NUMERALS = {
    'I': 1,
    'V': 5,
    'X': 10,
    'L': 50,
    'C': 100,
    'D': 500,
    'M': 1000,
}


class Normalizer:
Expand Down Expand Up @@ -460,3 +464,36 @@ def replace_right(source, target, replacement, replacements=None):
extract = extract_handler(to_parse, short_scale, ordinals)
numbers.reverse()
return numbers



def roman_to_int(word):
    """
    Convert a roman numeral string to its integer value.

    Args:
        word (str): candidate roman numeral, e.g. "XIV"; only the symbols
            present in ROMAN_NUMERALS are accepted
    Returns:
        (int): value of the numeral, or None if word is empty or contains
            any character that is not a roman numeral symbol
    """
    # the symbol check is vacuously satisfied by "", so reject the empty
    # string explicitly instead of reporting it as the number 0
    if not word or not is_roman_numeral(word):
        return None
    number = 0
    previous = 0
    for char in word:
        value = ROMAN_NUMERALS[char]
        number += value
        if value > previous:
            # subtractive notation (e.g. "IV"): the smaller symbol on the
            # left was already added once, so take it back twice.
            # For the first symbol previous is 0 and nothing is removed.
            number -= 2 * previous
        previous = value
    return number


def is_roman_numeral(word):
    """
    Check if a string consists exclusively of roman numeral symbols.

    Only membership in ROMAN_NUMERALS is checked; the order of the symbols
    is not validated.

    Args:
        word (str): the string to check
    Returns:
        (bool): True if word is non-empty and every character is a key of
            ROMAN_NUMERALS, False otherwise
    """
    # all() is True for an empty iterable, which would make "" a numeral
    return bool(word) and all(char in ROMAN_NUMERALS for char in word)


def extract_roman_numeral_spans(utterance):
    """
    Locate the roman numerals contained in an utterance.

    The utterance is tokenized with span_indexed_empty_space_tokenize and
    every token accepted by is_roman_numeral is reported.

    Args:
        utterance (str): the text to scan
    Returns:
        (list): one tuple per detected numeral holding its value and its
                position in the utterance
                [(number, (start_idx, end_idx))]

    """
    numerals = []
    for start, end, token in span_indexed_empty_space_tokenize(utterance):
        if not is_roman_numeral(token):
            continue
        numerals.append((roman_to_int(token), (start, end)))
    return numerals
21 changes: 20 additions & 1 deletion lingua_franca/lang/parse_en.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@
_STRING_NUM_EN, _STRING_SHORT_ORDINAL_EN, _STRING_LONG_ORDINAL_EN, \
_generate_plurals_en, _SPOKEN_EXTRA_NUM_EN
from lingua_franca.lang.parse_common import is_numeric, look_for_fractions, \
invert_dict, ReplaceableNumber, partition_list, tokenize, Token, Normalizer
invert_dict, ReplaceableNumber, partition_list, tokenize, Token, Normalizer,\
extract_roman_numeral_spans
from lingua_franca.time import now_local


Expand Down Expand Up @@ -1685,3 +1686,21 @@ def numbers_to_digits(self, utterance):
def normalize_en(text, remove_articles=True):
""" English string normalization """
return EnglishNormalizer().normalize(text, remove_articles)


def normalize_roman_numerals_en(utterance, ordinals=False):
    """
    Replace the roman numerals of an English utterance with digits.

    Args:
        utterance (str): the string to normalize
        ordinals (bool): if True the numerals are written as English
            ordinals ("XXI" -> "21st") instead of plain numbers
    Returns:
        (str): the utterance with every detected roman numeral replaced
    """
    norm_utt = utterance
    # spans index into the original utterance; replacing from the last span
    # to the first keeps the spans still to be processed valid
    for num, (start, end) in reversed(extract_roman_numeral_spans(utterance)):
        replacement = f"{num}"
        if ordinals:
            if num % 100 in (11, 12, 13):
                # 11, 12, 13 (and 111, 112, ...) are irregular: always "th",
                # never "11st" / "12nd" / "13rd"
                suffix = "th"
            else:
                suffix = {1: "st", 2: "nd", 3: "rd"}.get(num % 10, "th")
            replacement += suffix
        norm_utt = norm_utt[:start] + replacement + norm_utt[end:]
    return norm_utt
Comment on lines +1779 to +1793
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Review of the new function normalize_roman_numerals_en.

The function normalize_roman_numerals_en is designed to normalize Roman numerals within English text. The implementation uses the extract_roman_numeral_spans function to locate and replace Roman numeral spans with their numeric equivalents, optionally formatting them as ordinals.

  1. Correctness and Logic:

    • The function correctly handles the conversion of Roman numerals to numbers and adjusts them based on the ordinals flag.
    • The use of reversed ensures that replacements do not affect the indices of subsequent replacements, which is crucial for correctness.
  2. Performance:

    • The function iterates over each numeral span once, which is efficient. However, the repeated slicing and concatenation of strings might be less efficient for very large strings. Consider using a list and joining it at the end for potentially better performance.
  3. Readability and Maintainability:

    • The function is well-structured and the logic is clear. Inline comments or a docstring explaining the parameters and the return value could enhance readability and maintainability.
  4. Error Handling:

    • There is no explicit error handling. It would be beneficial to add error handling to manage cases where extract_roman_numeral_spans might return unexpected values or fail.
  5. Best Practices:

    • Using f-strings for string operations is a good practice and is used here effectively.

Overall, the function meets the objectives of the PR and is implemented correctly, but there are opportunities for optimization and improved error handling.

def normalize_roman_numerals_en(utterance, ordinals=False):
    """
    Replace the roman numerals of an English utterance with digits.

    Args:
        utterance (str): the string to normalize
        ordinals (bool): if True the numerals are written as English
            ordinals ("XXI" -> "21st") instead of plain numbers
    Returns:
        (str): the utterance with every detected roman numeral replaced
    """
    norm_utt = []
    last_end = 0
    # the pieces are assembled left to right, so the spans must be walked in
    # ascending order (walking them reversed would scramble the text)
    spans = sorted(extract_roman_numeral_spans(utterance),
                   key=lambda item: item[1])
    for num, (start, end) in spans:
        norm_utt.append(utterance[last_end:start])  # text before the numeral
        if ordinals:
            # num is an int: use arithmetic rather than indexing it
            if num % 100 in (11, 12, 13):
                ordinal_suffix = 'th'  # 11th, 12th, 13th are irregular
            else:
                ordinal_suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(num % 10, 'th')
            norm_utt.append(f"{num}{ordinal_suffix}")
        else:
            norm_utt.append(f"{num}")
        last_end = end
    norm_utt.append(utterance[last_end:])  # remaining text after the last numeral
    return ''.join(norm_utt)


14 changes: 12 additions & 2 deletions lingua_franca/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,15 @@
#
import json
from lingua_franca.util import match_one, fuzzy_match, MatchStrategy
from lingua_franca.lang.parse_common import match_yes_or_no
from lingua_franca.lang.parse_common import match_yes_or_no, extract_roman_numeral_spans, is_roman_numeral
JarbasAl marked this conversation as resolved.
Show resolved Hide resolved

from difflib import SequenceMatcher
from warnings import warn
from lingua_franca.time import now_local
from lingua_franca.internal import populate_localized_function_dict, \
get_active_langs, get_full_lang_code, get_primary_lang_code, \
get_default_lang, localized_function, _raise_unsupported_language, UnsupportedLanguageError,\
resolve_resource_file, FunctionNotLocalizedError
import unicodedata


_REGISTERED_FUNCTIONS = ("extract_numbers",
Expand All @@ -40,6 +40,16 @@
populate_localized_function_dict("parse", langs=get_active_langs())


@localized_function(run_own_code_on=[FunctionNotLocalizedError])
def normalize_roman_numerals(utterance, ordinals=False, lang=""):
    """
    Replace the roman numerals of an utterance with plain digits.

    NOTE(review): the decorator presumably dispatches to a language specific
    implementation and only runs this body when none exists - confirm
    against localized_function.

    Args:
        utterance (str): the string to normalize
        ordinals (bool): ignored by this body
        lang (str): not used by this body
    Returns:
        (str): the utterance with every detected roman numeral replaced
    """
    # TODO - ordinals are lang specific, localization might be needed
    result = utterance
    spans = extract_roman_numeral_spans(utterance)
    # last span first, so the indexes of the remaining spans stay valid
    for number, span in reversed(spans):
        begin, stop = span
        result = "".join((result[:begin], str(number), result[stop:]))
    return result
JarbasAl marked this conversation as resolved.
Show resolved Hide resolved


@localized_function(run_own_code_on=[FunctionNotLocalizedError])
def yes_or_no(text, lang=""):
text = normalize(text, lang=lang, remove_articles=True).lower()
Expand Down
3 changes: 2 additions & 1 deletion requirements/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
python-dateutil~=2.6
rapidfuzz
rapidfuzz
quebra_frases>=0.3.5
71 changes: 71 additions & 0 deletions test/unittests/test_parse_common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#
# Copyright 2019 Mycroft AI Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
from lingua_franca import load_language
from lingua_franca.parse import roman_to_int, is_roman_numeral,\
extract_roman_numeral_spans, normalize_roman_numerals


class TestParseCommon(unittest.TestCase):
    """Tests for the roman numeral helpers."""

    def setUp(self) -> None:
        # setUp is an instance method that runs before every test
        load_language("en")

    def test_roman(self):
        # valid numerals
        self.assertEqual(roman_to_int("III"), 3)
        self.assertEqual(roman_to_int("IV"), 4)
        self.assertEqual(roman_to_int("V"), 5)
        self.assertEqual(roman_to_int("MCMLXXIV"), 1974)
        self.assertEqual(roman_to_int("MCMLXXV"), 1975)
        self.assertTrue(is_roman_numeral("IV"))

        # invalid numerals: lowercase symbols and whole sentences are rejected
        self.assertIsNone(roman_to_int("v"))
        self.assertFalse(is_roman_numeral("ii"))
        self.assertFalse(is_roman_numeral("the IV century"))

Comment on lines +26 to +39
Copy link

@coderabbitai coderabbitai bot Jul 17, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Comprehensive tests for Roman numeral functionalities.

The method effectively tests both valid and invalid inputs. Consider adding more edge cases, such as numerals at the boundary of valid ranges or mixed-case inputs.

Would you like me to help by adding more edge case tests?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JarbasAl, I've prepared some additional edge case tests for the Roman numeral functionalities. These tests cover boundary values, mixed-case inputs, and numerals embedded in text to ensure robustness. Here's the updated test code:

class TestParseCommon(unittest.TestCase):
    """Tests for the roman numeral helpers, including edge cases."""

    def setUp(self) -> None:
        # setUp is an instance method that runs before every test
        load_language("en")

    def test_roman(self):
        # valid numerals
        self.assertEqual(roman_to_int("III"), 3)
        self.assertEqual(roman_to_int("IV"), 4)
        self.assertEqual(roman_to_int("V"), 5)
        self.assertEqual(roman_to_int("MCMLXXIV"), 1974)
        self.assertEqual(roman_to_int("MCMLXXV"), 1975)
        self.assertTrue(is_roman_numeral("IV"))

        # invalid numerals
        self.assertIsNone(roman_to_int("v"))
        self.assertFalse(is_roman_numeral("ii"))
        self.assertFalse(is_roman_numeral("the IV century"))

        # edge cases
        self.assertEqual(roman_to_int("MCMXCIX"), 1999)  # several subtractive pairs
        self.assertEqual(roman_to_int("I"), 1)  # smallest numeral
        # only uppercase symbols are recognised, so mixed case is rejected
        # just like the all-lowercase inputs above
        self.assertIsNone(roman_to_int("iV"))
        self.assertIsNone(roman_to_int("xIv"))
        self.assertTrue(is_roman_numeral("MDCCLXXVI"))  # long valid numeral
        self.assertFalse(is_roman_numeral("123IVabc"))  # numeral mixed with other characters

Feel free to integrate these into your test suite or let me know if there are any other specific scenarios you'd like to cover!

# test spans: each result is (number, (start_idx, end_idx)); the expected
# values show end_idx pointing one past the last character of the numeral
self.assertEqual(extract_roman_numeral_spans("the IV century"),
[(4, (4, 6))])
self.assertEqual(extract_roman_numeral_spans("the XIV century"),
[(14, (4, 7))])

# test normalization: without ordinals the numeral becomes a plain number
self.assertEqual(normalize_roman_numerals("the XV century"),
"the 15 century")

# test ordinals: the expected suffix follows the last digit of the number
self.assertEqual(normalize_roman_numerals("the XXI century",
ordinals=True),
"the 21st century")
# NOTE(review): "12nd" is not a valid English ordinal (12 -> "12th"); this
# assertion pins the last-digit-only suffix rule - confirm it is intended
self.assertEqual(normalize_roman_numerals("the XII century",
ordinals=True),
"the 12nd century")
self.assertEqual(normalize_roman_numerals("the XXIII century",
ordinals=True),
"the 23rd century")
self.assertEqual(normalize_roman_numerals("the XXIV century",
ordinals=True),
"the 24th century")
JarbasAl marked this conversation as resolved.
Show resolved Hide resolved

# test space: a string containing a space is not a numeral, and numerals
# separated by a space are detected and replaced as independent numbers
self.assertEqual(is_roman_numeral("I V"), False)
self.assertEqual(normalize_roman_numerals("the X V century"),
"the 10 5 century")
self.assertEqual(extract_roman_numeral_spans("the X V century"),
[(10, (4, 5)),
(5, (6, 7))])