Skip to content

Commit

Permalink
feat: add case preservation option to mappings
Browse files Browse the repository at this point in the history
  • Loading branch information
roedoejet committed Jul 18, 2023
1 parent 2e75984 commit 7da4f16
Show file tree
Hide file tree
Showing 6 changed files with 88 additions and 2 deletions.
6 changes: 6 additions & 0 deletions g2p/mappings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,9 +109,11 @@ def __init__( # noqa: C901
"out_delimiter",
"as_is",
"case_sensitive",
"case_equivalencies",
"rule_ordering",
"escape_special",
"norm_form",
"preserve_case",
"prevent_feeding",
"reverse",
"type",
Expand Down Expand Up @@ -313,6 +315,10 @@ def process_kwargs(self, mapping): # noqa: C901
self.kwargs["rule_ordering"] = "as-written"
if "case_sensitive" not in self.kwargs:
self.kwargs["case_sensitive"] = True
if "case_equivalencies" not in self.kwargs:
self.kwargs["case_equivalencies"] = {}
if "preserve_case" not in self.kwargs:
self.kwargs["preserve_case"] = False
if "escape_special" not in self.kwargs:
self.kwargs["escape_special"] = False
if "norm_form" not in self.kwargs:
Expand Down
2 changes: 2 additions & 0 deletions g2p/mappings/langs/kwk/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ mappings:
out_lang: kwk-umista
rule_ordering: apply-longest-first
prevent_feeding: true
case_sensitive: false
preserve_case: true
authors:
- Fineen Davis
- Olivia Chen
Expand Down
Binary file modified g2p/mappings/langs/langs.pkl
Binary file not shown.
2 changes: 1 addition & 1 deletion g2p/tests/public/data/kwk.psv
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ kwk-boas|kwk-umista|g·āyaxalisē|gayax̱alisi
kwk-boas|kwk-umista|x\u0323wēlaxᵋw\u1D07sdes|xwilax̱ʼwa̱sdis
kwk-boas|kwk-umista|ăwŭnagwīsē ʟ̣ēg̣adēs|a̱wunagwisi dłig̱adis
kwk-boas|kwk-umista|yîx ōmpas ōᵋmaxt!ālaʟēᵋyēxa|yix̱ umpas uʼmax̱t̓alatłiʼyix̱a
kwk-boas|kwk-umista|tsāg̣ᴇmas g·ōkwas Ts!ᴇxᵋēdē|tsag̱a̱mas gukwas Ts!a̱x̱ʼidi
kwk-boas|kwk-umista|tsāg̣ᴇmas g·ōkwas Ts!ᴇxᵋēdē|tsag̱a̱mas gukwas Tʼsa̱x̱ʼidi
kwk-boas|kwk-umista|lāx̣wa ᵋnāx̣wax|laxwa ʼnaxwax̱
kwk-boas|kwk-umista|g·ig̣ŭmaᵋyasa ᵋnᴇᵋmēmotasa|gig̱umaʼyasa ʼna̱ʼmimutasa
kwk-boas|kwk-umista|yîxs sēsᴇyūʟaēs|yix̱s sisa̱yutłaʼis
Expand Down
23 changes: 23 additions & 0 deletions g2p/tests/test_transducer.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,29 @@ def test_deletion(self):
self.assertEqual(self.test_deletion_transducer_csv("a").output_string, "")
self.assertEqual(self.test_deletion_transducer_json("a").output_string, "")

def test_case_preservation(self):
    """Check that ``preserve_case=True`` makes the output follow the input's case.

    The mapping is case-insensitive, so upper- and lower-case inputs hit the
    same rules; the transducer is then expected to re-case each output
    segment, using ``case_equivalencies`` where the regular Unicode case
    mapping does not apply.
    """
    mapping = Mapping(
        [
            {"in": "'a", "out": "b"},
            {"in": "e\u0301", "out": "f"},
            {"in": "tl", "out": "λ"},
        ],
        case_sensitive=False,
        preserve_case=True,
        norm_form="NFC",
        case_equivalencies={"λ": "\u2144"},
    )
    transducer = Transducer(mapping)
    # Plain ASCII output follows the case of the cased input character.
    self.assertEqual(transducer("'a").output_string, "b")
    self.assertEqual(transducer("'A").output_string, "B")
    # Combining-accent input, in both cases (the original asserted the
    # upper-case input twice and never exercised the lower-case one).
    self.assertEqual(transducer("E\u0301").output_string, "F")
    self.assertEqual(transducer("e\u0301").output_string, "f")
    # Test what happens in Heiltsuk. \u03BB should be capitalized as \u2144
    self.assertEqual(transducer("TLaba").output_string, "\u2144aba")
    self.assertEqual(transducer("tlaba").output_string, "λaba")
    # Mixed-case input is arguable; the chosen behaviour is to change case
    # if any of the input characters are differently cased.
    self.assertEqual(transducer("Tlaba").output_string, "\u2144aba")

def test_normalize_edges(self):
# Remove non-deletion edges with the same index as deletions
bad_edges = [
Expand Down
57 changes: 56 additions & 1 deletion g2p/transducer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,7 @@ class Transducer:
def __init__(self, mapping: Mapping):
self.mapping = mapping
self.case_sensitive = mapping.kwargs["case_sensitive"]
self.preserve_case = mapping.kwargs["preserve_case"]
self.norm_form = mapping.kwargs.get("norm_form", "none")
self.out_delimiter = mapping.kwargs.get("out_delimiter", "")
self._index_match_pattern = re.compile(r"(?<={)\d+(?=})")
Expand All @@ -438,7 +439,11 @@ def __call__(self, to_convert: str):
and output characters and their corresponding edges representing the indices
of the transformation.
"""
return self.apply_rules(to_convert)
tg = self.apply_rules(to_convert)
if self.preserve_case:
return preserve_case(tg, self.mapping.kwargs.get("case_equivalencies", {}))
else:
return tg

@staticmethod
def _pua_to_index(string: str) -> int:
Expand Down Expand Up @@ -1249,3 +1254,53 @@ def check(self, tg: TransductionGraph, shallow=False, display_warnings=False):
else:
return False
return result


def preserve_case(
    tg: "TransductionGraph", case_equivalencies: Dict[str, str] = None
) -> "TransductionGraph":
    """Re-case the output of a transduction so it follows the input's case.

    Each ``(input, output)`` pair from ``tg.substring_alignments()`` is
    handled independently and the re-cased pieces are joined into a new
    ``tg.output_string``. Only ``output_string`` is rewritten by this
    function.

    Args:
        tg: The transduction graph to update. It is modified in place.
        case_equivalencies: Optional mapping from a lower-case string to its
            upper-case equivalent, for characters whose case pair is not
            given by ``str.upper()``/``str.lower()`` (e.g. ``{"λ": "\u2144"}``).
            ``None`` is treated as an empty mapping.

    Returns:
        The same ``tg`` object, with ``output_string`` replaced.
    """
    lower_to_upper = case_equivalencies or {}
    # Reverse lookup table: gives O(1) membership tests for the upper-case
    # equivalents instead of scanning ``dict.values()`` on every segment.
    upper_to_lower = {upper: lower for lower, upper in lower_to_upper.items()}

    pieces = []
    for item in tg.substring_alignments():
        in_sub = item[0]
        out_sub = item[1]
        in_has_upper = any(ch.isupper() for ch in in_sub)
        in_has_lower = any(ch.islower() for ch in in_sub)
        out_has_upper = any(ch.isupper() for ch in out_sub)
        out_has_lower = any(ch.islower() for ch in out_sub)

        # The explicit equivalencies are consulted first, before looking at
        # the Unicode case of the output: an equivalent such as U+2144 has
        # no Unicode case of its own, so it must not be dismissed as
        # "un-caseable" before the table has been checked.
        if out_sub in upper_to_lower and (in_has_lower or in_sub in lower_to_upper):
            # Lower-case input, upper-case equivalent in the output.
            pieces.append(upper_to_lower[out_sub])
        elif out_sub in lower_to_upper and (in_has_upper or in_sub in upper_to_lower):
            # Upper-case input, lower-case equivalent in the output.
            pieces.append(lower_to_upper[out_sub])
        elif in_has_upper and out_has_lower:
            # Regular Unicode casing: any upper-case input character
            # upper-cases the whole output segment.
            pieces.append(out_sub.upper())
        elif in_has_lower and out_has_upper:
            pieces.append(out_sub.lower())
        else:
            # Un-caseable output, or the case already agrees: keep as-is.
            pieces.append(out_sub)

    tg.output_string = "".join(pieces)
    return tg

0 comments on commit 7da4f16

Please sign in to comment.