Skip to content

Commit

Permalink
move utils to parse_common
Browse files Browse the repository at this point in the history
  • Loading branch information
JarbasAl committed Dec 1, 2022
1 parent d5e5cb6 commit f9f7019
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 53 deletions.
8 changes: 0 additions & 8 deletions lingua_franca/internal.py
Original file line number Diff line number Diff line change
Expand Up @@ -561,14 +561,6 @@ def _call_localized_function(func, *args, **kwargs):
# If we didn't find a localized function to correspond with
# the wrapped function, we cached NotImplementedError in its
# place.

# first account for the function not being present in any
# module, meaning all modules are falling back to a catch all
# parser, this usually means the function will need localization
# only in future languages not currently supported
if func_name not in _localized_functions[_module_name][lang_code]:
raise FunctionNotLocalizedError(func_name, lang_code)

loc_signature = _localized_functions[_module_name][lang_code][func_name]
if isinstance(loc_signature, type(NotImplementedError())):
raise loc_signature
Expand Down
35 changes: 35 additions & 0 deletions lingua_franca/lang/parse_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
import json
from lingua_franca.internal import resolve_resource_file, FunctionNotLocalizedError
import unicodedata
from quebra_frases import span_indexed_empty_space_tokenize


ROMAN_NUMERALS = {'I': 1, 'V': 5, 'X': 10, 'L': 50, 'C': 100, 'D': 500, 'M': 1000}

Expand Down Expand Up @@ -462,3 +464,36 @@ def replace_right(source, target, replacement, replacements=None):
extract = extract_handler(to_parse, short_scale, ordinals)
numbers.reverse()
return numbers



def roman_to_int(word):
    """Convert a roman numeral string to its integer value.

    Args:
        word (str): candidate roman numeral; every character must be a
            key of ROMAN_NUMERALS
    Returns:
        (int): value of the numeral, or None if word is empty or is
            rejected by is_roman_numeral
    """
    # an empty string carries no value; without this guard the loop below
    # would never run and 0 would be returned as if it were a numeral
    if not word or not is_roman_numeral(word):
        return None
    number = 0
    previous = None
    for value in (ROMAN_NUMERALS[char] for char in word):
        if previous is not None and value > previous:
            # subtractive notation (e.g. "IV"): the smaller symbol was
            # already added on the previous step, so take it back twice
            number += value - 2 * previous
        else:
            number += value
        previous = value
    return number


def is_roman_numeral(word):
    """Check whether a word is made up only of roman numeral symbols.

    Only membership of each character in ROMAN_NUMERALS is checked; the
    ordering of the symbols is not validated.

    Args:
        word (str): the token to test
    Returns:
        (bool): True if word is non-empty and every character is a key of
            ROMAN_NUMERALS, False otherwise
    """
    # all() is vacuously True for an empty iterable, so an empty word has
    # to be rejected explicitly
    return bool(word) and all(char in ROMAN_NUMERALS for char in word)


def extract_roman_numeral_spans(utterance):
    """
    Locate the roman numerals contained in an utterance.
    Args:
        utterance (str): the string to scan for roman numerals
    Returns:
        (list): one tuple per detected numeral, holding its integer value
                and its span in the utterance
                [(number, (start_idx, end_idx))]
    """
    tagged = []
    # the tokenizer yields (start, end, word) triples
    for start, end, word in span_indexed_empty_space_tokenize(utterance):
        if not is_roman_numeral(word):
            continue
        value = roman_to_int(word)
        tagged.append((value, (start, end)))
    return tagged
21 changes: 20 additions & 1 deletion lingua_franca/lang/parse_en.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@
_STRING_NUM_EN, _STRING_SHORT_ORDINAL_EN, _STRING_LONG_ORDINAL_EN, \
_generate_plurals_en, _SPOKEN_EXTRA_NUM_EN
from lingua_franca.lang.parse_common import is_numeric, look_for_fractions, \
invert_dict, ReplaceableNumber, partition_list, tokenize, Token, Normalizer
invert_dict, ReplaceableNumber, partition_list, tokenize, Token, Normalizer,\
extract_roman_numeral_spans
from lingua_franca.time import now_local


Expand Down Expand Up @@ -1685,3 +1686,21 @@ def numbers_to_digits(self, utterance):
def normalize_en(text, remove_articles=True):
    """Normalize an English string with a fresh EnglishNormalizer.

    Args:
        text (str): the string to normalize
        remove_articles (bool): forwarded unchanged to
            EnglishNormalizer.normalize
    Returns:
        whatever EnglishNormalizer.normalize returns for the given input
    """
    normalizer = EnglishNormalizer()
    normalized = normalizer.normalize(text, remove_articles)
    return normalized


def normalize_roman_numerals_en(utterance, ordinals=False):
    """Replace the roman numerals of an utterance with arabic digits.

    Args:
        utterance (str): the string to normalize
        ordinals (bool): if True, append the English ordinal suffix
            ("st", "nd", "rd" or "th") to every replaced number
    Returns:
        (str): the utterance with each detected roman numeral replaced
    """
    # localization might be needed for ordinals flag
    norm_utt = utterance
    # walk the spans back to front so that replacing one numeral does not
    # shift the indices of the spans still to be processed
    for num, (start, end) in reversed(extract_roman_numeral_spans(utterance)):
        replacement = str(num)
        if ordinals:
            if replacement[-2:] in ("11", "12", "13"):
                # 11th, 12th, 13th (also 111th, 212th, ...) take "th" even
                # though they end in 1, 2 or 3
                suffix = "th"
            else:
                suffix = {"1": "st", "2": "nd", "3": "rd"}.get(
                    replacement[-1], "th")
            replacement += suffix
        norm_utt = norm_utt[:start] + replacement + norm_utt[end:]
    return norm_utt

47 changes: 3 additions & 44 deletions lingua_franca/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,15 @@
#
import json
from lingua_franca.util import match_one, fuzzy_match, MatchStrategy
from lingua_franca.lang.parse_common import match_yes_or_no, ROMAN_NUMERALS
from lingua_franca.lang.parse_common import match_yes_or_no, extract_roman_numeral_spans, is_roman_numeral

from difflib import SequenceMatcher
from warnings import warn
from lingua_franca.time import now_local
from lingua_franca.internal import populate_localized_function_dict, \
get_active_langs, get_full_lang_code, get_primary_lang_code, \
get_default_lang, localized_function, _raise_unsupported_language, UnsupportedLanguageError,\
resolve_resource_file, FunctionNotLocalizedError
import unicodedata
from quebra_frases import span_indexed_empty_space_tokenize


_REGISTERED_FUNCTIONS = ("extract_numbers",
Expand All @@ -41,52 +40,12 @@
populate_localized_function_dict("parse", langs=get_active_langs())


def roman_to_int(word):
    """Convert a roman numeral string to its integer value.

    Args:
        word (str): candidate roman numeral; every character must be a
            key of ROMAN_NUMERALS
    Returns:
        (int): value of the numeral, or None if word is empty or is
            rejected by is_roman_numeral
    """
    # an empty string carries no value; without this guard the loop below
    # would never run and 0 would be returned as if it were a numeral
    if not word or not is_roman_numeral(word):
        return None
    number = 0
    previous = None
    for value in (ROMAN_NUMERALS[char] for char in word):
        if previous is not None and value > previous:
            # subtractive notation (e.g. "IV"): the smaller symbol was
            # already added on the previous step, so take it back twice
            number += value - 2 * previous
        else:
            number += value
        previous = value
    return number


def is_roman_numeral(word):
    """Check whether a word is made up only of roman numeral symbols.

    Only membership of each character in ROMAN_NUMERALS is checked; the
    ordering of the symbols is not validated.

    Args:
        word (str): the token to test
    Returns:
        (bool): True if word is non-empty and every character is a key of
            ROMAN_NUMERALS, False otherwise
    """
    # all() is vacuously True for an empty iterable, so an empty word has
    # to be rejected explicitly
    return bool(word) and all(char in ROMAN_NUMERALS for char in word)


def extract_roman_numeral_spans(utterance):
    """
    Locate the roman numerals contained in an utterance.
    Args:
        utterance (str): the string to scan for roman numerals
    Returns:
        (list): one tuple per detected numeral, holding its integer value
                and its span in the utterance
                [(number, (start_idx, end_idx))]
    """
    tagged = []
    # the tokenizer yields (start, end, word) triples
    for start, end, word in span_indexed_empty_space_tokenize(utterance):
        if not is_roman_numeral(word):
            continue
        value = roman_to_int(word)
        tagged.append((value, (start, end)))
    return tagged


@localized_function(run_own_code_on=[FunctionNotLocalizedError])
def normalize_roman_numerals(utterance, ordinals=False, lang=""):
    """Replace the roman numerals of an utterance with arabic digits.

    NOTE(review): the body below presumably acts as the fallback used when
    no localized implementation exists (run_own_code_on) - confirm against
    the localized_function decorator.

    Args:
        utterance (str): the string to normalize
        ordinals (bool): if True, append an English ordinal suffix
            ("st", "nd", "rd" or "th") to every replaced number
        lang (str): language code; not read by this body, presumably
            consumed by the decorator - TODO confirm
    Returns:
        (str): the utterance with each detected roman numeral replaced
    """
    # localization might be needed for ordinals flag
    norm_utt = utterance
    # walk the spans back to front so that replacing one numeral does not
    # shift the indices of the spans still to be processed
    for num, (start, end) in reversed(extract_roman_numeral_spans(utterance)):
        replacement = str(num)
        # TODO - the ordinal suffixes below are lang specific (English)
        if ordinals:
            if replacement[-2:] in ("11", "12", "13"):
                # 11th, 12th, 13th (also 111th, 212th, ...) take "th" even
                # though they end in 1, 2 or 3
                suffix = "th"
            else:
                suffix = {"1": "st", "2": "nd", "3": "rd"}.get(
                    replacement[-1], "th")
            replacement += suffix
        norm_utt = norm_utt[:start] + replacement + norm_utt[end:]
    return norm_utt

Expand Down

0 comments on commit f9f7019

Please sign in to comment.