fix: add exception for case equivalencies of different lengths
roedoejet committed Jul 18, 2023
1 parent 7da4f16 commit d231a3a
Showing 3 changed files with 36 additions and 5 deletions.
6 changes: 6 additions & 0 deletions g2p/mappings/__init__.py
@@ -346,6 +346,12 @@ def process_kwargs(self, mapping):  # noqa: C901
                    io[k] = normalize(v, self.kwargs["norm_form"])
            elif kwarg == "reverse" and val:
                mapping = self.reverse_mappings(mapping)
            elif kwarg == "case_equivalencies":
                for k, v in val.items():
                    if len(k) != len(v):
                        raise exceptions.MalformedMapping(
                            f"Sorry, the case equivalency between {k} and {v} is not valid because they are not the same length. Please write more rules such that each case equivalent is of equal length."
                        )

# After all processing is done, turn into regex
for i, io in enumerate(mapping):
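
A minimal sketch of the new behaviour from the caller's side, based on the constructor call used in the tests below; the equal-length example is an illustrative assumption, not taken from this commit:

from g2p.exceptions import MalformedMapping
from g2p.mappings import Mapping

# Equal-length equivalency: passes the new length check.
Mapping([{"in": "a", "out": "b"}], case_equivalencies={"a": "A"})

# Unequal-length equivalency: now rejected at construction time with
# MalformedMapping instead of surfacing later as an indexing error.
try:
    Mapping([{"in": "a", "out": "b"}], case_equivalencies={"a": "AA"})
except MalformedMapping as err:
    print(err)
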
21 changes: 16 additions & 5 deletions g2p/tests/test_mappings.py
@@ -163,22 +163,33 @@ def test_case_sensitive(self):
        self.assertEqual(transducer_case_sensitive("a").output_string, "a")
        self.assertEqual(transducer("A").output_string, "b")

    def test_case_equivalencies(self):
        with self.assertRaises(exceptions.MalformedMapping):
            Mapping([{"in": "a", "out": "b"}], case_equivalencies={"a": "AA"})

    def test_escape_special(self):
        mapping = Mapping([{"in": r"\d", "out": "digit"}])
        mapping_escaped = Mapping([{"in": r"\d", "out": "b"}], escape_special=True)
        mapping_input_and_output_special_escaped = Mapping([{"in": "&", "out": "&"}], escape_special=True)
        mapping_specific_from_fpcc = Mapping([{"in": r"^", "out": "A"}, {"in": "o", "out": r"."}], rule_ordering="apply-longest-first", escape_special=True)
        mapping_input_and_output_special_escaped = Mapping(
            [{"in": "&", "out": "&"}], escape_special=True
        )
        mapping_specific_from_fpcc = Mapping(
            [{"in": r"^", "out": "A"}, {"in": "o", "out": r"."}],
            rule_ordering="apply-longest-first",
            escape_special=True,
        )
        transducer = Transducer(mapping)
        transducer_escaped = Transducer(mapping_escaped)
        transducer_escaped_input_output = Transducer(mapping_input_and_output_special_escaped)
        transducer_escaped_input_output = Transducer(
            mapping_input_and_output_special_escaped
        )
        transducer_fpcc = Transducer(mapping_specific_from_fpcc)
        self.assertEqual(transducer("1").output_string, "digit")
        self.assertEqual(transducer(r"\d").output_string, r"\d")
        self.assertEqual(transducer_escaped("1").output_string, "1")
        self.assertEqual(transducer_escaped(r"\d").output_string, "b")
        self.assertEqual(transducer_escaped_input_output('&').output_string, "&")
        self.assertEqual(transducer_escaped_input_output("&").output_string, "&")
        self.assertEqual(transducer_fpcc("^o").output_string, "A.")


    def test_norm_form(self):
        mapping_nfc = Mapping([{"in": "a\u0301", "out": "a"}])  # Defaults to NFC
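
For contrast with the new negative test above, here is a hedged sketch of an equivalency that does satisfy the equal-length requirement, modelled on the test_case_preservation assertions further down; the single rule and the expected outputs are illustrative assumptions:

from g2p.mappings import Mapping
from g2p.transducer import Transducer

# "λ" and "\u2144" are one character each, so this equivalency passes the check.
mapping = Mapping(
    [{"in": "tl", "out": "λ"}],
    case_sensitive=False,
    preserve_case=True,
    case_equivalencies={"λ": "\u2144"},
)
transducer = Transducer(mapping)
print(transducer("tlaba").output_string)  # expected: λaba
print(transducer("Tlaba").output_string)  # expected: \u2144aba
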
14 changes: 14 additions & 0 deletions g2p/tests/test_transducer.py
@@ -3,6 +3,7 @@
import os
from unittest import TestCase, main

from g2p.exceptions import MalformedMapping
from g2p.mappings import Mapping
from g2p.tests.public import PUBLIC_DIR
from g2p.transducer import CompositeTransducer, Transducer, normalize_edges
@@ -231,6 +232,19 @@ def test_case_preservation(self):
        self.assertEqual(transducer("tlaba").output_string, "λaba")
        # I guess it's arguable what should happen here, but I'll just change case if any of the characters are differently cased
        self.assertEqual(transducer("Tlaba").output_string, "\u2144aba")
        # case equivalencies that are not the same length cause indexing errors in the current implementation
        with self.assertRaises(MalformedMapping):
            Mapping(
                [
                    {"in": "'a", "out": "b"},
                    {"in": "e\u0301", "out": "f"},
                    {"in": "tl", "out": "λ"},
                ],
                case_sensitive=False,
                preserve_case=True,
                norm_form="NFC",
                case_equivalencies={"λ": "\u2144\u2144\u2144"},
            )

    def test_normalize_edges(self):
        # Remove non-deletion edges with the same index as deletions
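
The comment above about indexing errors is the motivation for the commit: case restoration pairs the two sides of an equivalency character by character. The following is a conceptual sketch of that one-to-one alignment, not g2p's actual implementation, assuming a simple per-character lookup table:

def swap_case_by_index(text: str, lower: str, upper: str) -> str:
    # Only well-defined when len(lower) == len(upper): each character on one
    # side must line up with exactly one character on the other.
    table = dict(zip(lower, upper))
    table.update(dict(zip(upper, lower)))
    return "".join(table.get(ch, ch) for ch in text)

print(swap_case_by_index("λaba", "λ", "\u2144"))  # ⅄aba
# A 1-to-3 pair like {"λ": "\u2144\u2144\u2144"} breaks this alignment,
# which is why the Mapping constructor now rejects it up front.
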
