fix: add exception for case equivalencies of different lengths
roedoejet committed Jul 18, 2023
1 parent 7da4f16 commit d231a3a
Showing 3 changed files with 36 additions and 5 deletions.
6 changes: 6 additions & 0 deletions g2p/mappings/__init__.py
@@ -346,6 +346,12 @@ def process_kwargs(self, mapping):  # noqa: C901
                    io[k] = normalize(v, self.kwargs["norm_form"])
            elif kwarg == "reverse" and val:
                mapping = self.reverse_mappings(mapping)
            elif kwarg == "case_equivalencies":
                for k, v in val.items():
                    if len(k) != len(v):
                        raise exceptions.MalformedMapping(
                            f"Sorry, the case equivalency between {k} and {v} is not valid because they are not the same length. Please write more rules such that each case equivalent is of equal length."
                        )

# After all processing is done, turn into regex
for i, io in enumerate(mapping):
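
A minimal sketch of the new behaviour from the caller's side, based on the constructor call used in the tests below; the equal-length example is an illustrative assumption, not taken from this commit:

from g2p.exceptions import MalformedMapping
from g2p.mappings import Mapping

# Equal-length equivalency: passes the new length check.
Mapping([{"in": "a", "out": "b"}], case_equivalencies={"a": "A"})

# Unequal-length equivalency: now rejected at construction time with
# MalformedMapping instead of surfacing later as an indexing error.
try:
    Mapping([{"in": "a", "out": "b"}], case_equivalencies={"a": "AA"})
except MalformedMapping as err:
    print(err)
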
21 changes: 16 additions & 5 deletions g2p/tests/test_mappings.py
@@ -163,22 +163,33 @@ def test_case_sensitive(self):
        self.assertEqual(transducer_case_sensitive("a").output_string, "a")
        self.assertEqual(transducer("A").output_string, "b")

    def test_case_equivalencies(self):
        with self.assertRaises(exceptions.MalformedMapping):
            Mapping([{"in": "a", "out": "b"}], case_equivalencies={"a": "AA"})

    def test_escape_special(self):
        mapping = Mapping([{"in": r"\d", "out": "digit"}])
        mapping_escaped = Mapping([{"in": r"\d", "out": "b"}], escape_special=True)
        mapping_input_and_output_special_escaped = Mapping([{"in": "&", "out": "&"}], escape_special=True)
        mapping_specific_from_fpcc = Mapping([{"in": r"^", "out": "A"}, {"in": "o", "out": r"."}], rule_ordering="apply-longest-first", escape_special=True)
        mapping_input_and_output_special_escaped = Mapping(
            [{"in": "&", "out": "&"}], escape_special=True
        )
        mapping_specific_from_fpcc = Mapping(
            [{"in": r"^", "out": "A"}, {"in": "o", "out": r"."}],
            rule_ordering="apply-longest-first",
            escape_special=True,
        )
        transducer = Transducer(mapping)
        transducer_escaped = Transducer(mapping_escaped)
        transducer_escaped_input_output = Transducer(mapping_input_and_output_special_escaped)
        transducer_escaped_input_output = Transducer(
            mapping_input_and_output_special_escaped
        )
        transducer_fpcc = Transducer(mapping_specific_from_fpcc)
        self.assertEqual(transducer("1").output_string, "digit")
        self.assertEqual(transducer(r"\d").output_string, r"\d")
        self.assertEqual(transducer_escaped("1").output_string, "1")
        self.assertEqual(transducer_escaped(r"\d").output_string, "b")
        self.assertEqual(transducer_escaped_input_output('&').output_string, "&")
        self.assertEqual(transducer_escaped_input_output("&").output_string, "&")
        self.assertEqual(transducer_fpcc("^o").output_string, "A.")


    def test_norm_form(self):
        mapping_nfc = Mapping([{"in": "a\u0301", "out": "a"}])  # Defaults to NFC
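
For contrast with the new negative test above, here is a hedged sketch of an equivalency that does satisfy the equal-length requirement, modelled on the test_case_preservation assertions further down; the single rule and the expected outputs are illustrative assumptions:

from g2p.mappings import Mapping
from g2p.transducer import Transducer

# "λ" and "\u2144" are one character each, so this equivalency passes the check.
mapping = Mapping(
    [{"in": "tl", "out": "λ"}],
    case_sensitive=False,
    preserve_case=True,
    case_equivalencies={"λ": "\u2144"},
)
transducer = Transducer(mapping)
print(transducer("tlaba").output_string)  # expected: λaba
print(transducer("Tlaba").output_string)  # expected: \u2144aba
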
14 changes: 14 additions & 0 deletions g2p/tests/test_transducer.py
@@ -3,6 +3,7 @@
import os
from unittest import TestCase, main

from g2p.exceptions import MalformedMapping
from g2p.mappings import Mapping
from g2p.tests.public import PUBLIC_DIR
from g2p.transducer import CompositeTransducer, Transducer, normalize_edges
@@ -231,6 +232,19 @@ def test_case_preservation(self):
        self.assertEqual(transducer("tlaba").output_string, "λaba")
        # I guess it's arguable what should happen here, but I'll just change case if any of the characters are differently cased
        self.assertEqual(transducer("Tlaba").output_string, "\u2144aba")
        # case equivalencies that are not the same length cause indexing errors in the current implementation
        with self.assertRaises(MalformedMapping):
            Mapping(
                [
                    {"in": "'a", "out": "b"},
                    {"in": "e\u0301", "out": "f"},
                    {"in": "tl", "out": "λ"},
                ],
                case_sensitive=False,
                preserve_case=True,
                norm_form="NFC",
                case_equivalencies={"λ": "\u2144\u2144\u2144"},
            )

    def test_normalize_edges(self):
        # Remove non-deletion edges with the same index as deletions
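
The comment above about indexing errors is the motivation for the commit: case restoration pairs the two sides of an equivalency character by character. The following is a conceptual sketch of that one-to-one alignment, not g2p's actual implementation, assuming a simple per-character lookup table:

def swap_case_by_index(text: str, lower: str, upper: str) -> str:
    # Only well-defined when len(lower) == len(upper): each character on one
    # side must line up with exactly one character on the other.
    table = dict(zip(lower, upper))
    table.update(dict(zip(upper, lower)))
    return "".join(table.get(ch, ch) for ch in text)

print(swap_case_by_index("λaba", "λ", "\u2144"))  # ⅄aba
# A 1-to-3 pair like {"λ": "\u2144\u2144\u2144"} breaks this alignment,
# which is why the Mapping constructor now rejects it up front.
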
