Commit c120841
Merge remote-tracking branch 'upstream/dev.deletion-alignment' into dev.apiv2
dhdaines committed Mar 29, 2023
2 parents 58de7a4 + f989584 commit c120841
Showing 4 changed files with 54 additions and 19 deletions.
1 change: 1 addition & 0 deletions g2p/tests/public/mappings/hello.aligned.txt
@@ -1,2 +1,3 @@
 h}HH e}EH l|l}L o}OW
 y}Y o|u}UH '}_ r|e}R
+b}_ o}_ g}_ u}_ s}_
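For readers unfamiliar with these aligned-lexicon files: each whitespace-separated chunk appears to pair input characters with an output phone as INPUT}OUTPUT, "|" joins several input characters aligned to a single phone, and "_" marks a deletion (no output). The added line gives the nonsense word "bogus" an alignment in which every letter is deleted, which the new tests below exercise. A minimal sketch of reading that format (a hypothetical helper, not the project's own parser):

def parse_aligned(line: str):
    # Split one aligned-lexicon line into (graphemes, phone) pairs,
    # treating "_" as an empty (deleted) output.
    pairs = []
    for chunk in line.split():
        graphemes, phone = chunk.rsplit("}", 1)
        pairs.append((graphemes.replace("|", ""), "" if phone == "_" else phone))
    return pairs

print(parse_aligned("h}HH e}EH l|l}L o}OW"))
# [('h', 'HH'), ('e', 'EH'), ('ll', 'L'), ('o', 'OW')]
print(parse_aligned("b}_ o}_ g}_ u}_ s}_"))
# [('b', ''), ('o', ''), ('g', ''), ('u', ''), ('s', '')]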
13 changes: 13 additions & 0 deletions g2p/tests/test_indices.py
@@ -501,6 +501,19 @@ def test_case_nine(self):
         self.assertEqual(transducer.edges, [(0, None), (1, None)])
         # Support deletions in substring_alignments
         self.assertEqual(transducer.substring_alignments(), [("aa", "")])
+        transducer = self.trans_nine("aabbaab")
+        self.assertEqual(transducer.output_string, "bbb")
+        self.assertEqual(
+            transducer.edges,
+            [(0, 0), (1, 0), (2, 0), (3, 1), (4, 1), (5, 1), (6, 2)],
+        )
+        # Support deletions in substring_alignments. NOTE: these
+        # alignments are quite bogus due to the ad-hoc treatment of
+        # deletions by rule-based mappings
+        self.assertEqual(
+            transducer.substring_alignments(),
+            [("aab", "b"), ("baa", "b"), ("b", "b")],
+        )

     def test_case_ten(self):
         transducer = self.trans_ten("abc")
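The expected substring alignments above follow from grouping edges by output index: input indices that share an output index become one substring pair. A rough sketch of that grouping (a hypothetical helper; the library's actual logic lives in find_monotonic_segments and handles more cases):

def group_by_output(in_str, out_str, edges):
    # Collect the input indices attached to each output index
    segments = {}
    for i, o in edges:
        segments.setdefault(o, []).append(i)
    return [
        ("".join(in_str[i] for i in sorted(ins)), out_str[o])
        for o, ins in sorted(segments.items())
    ]

print(group_by_output("aabbaab", "bbb",
                      [(0, 0), (1, 0), (2, 0), (3, 1), (4, 1), (5, 1), (6, 2)]))
# [('aab', 'b'), ('baa', 'b'), ('b', 'b')]  -- matches the new test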
22 changes: 15 additions & 7 deletions g2p/tests/test_lexicon_transducer.py
@@ -35,7 +35,17 @@ def test_lexicon_mapping(self):
         self.assertEqual(tg.output_string, "Y UH R ")
         self.assertEqual(
             tg.edges,
-            [(0, 0), (1, 2), (1, 3), (2, 2), (2, 3), (3, None), (4, 5), (5, 5)],
+            [(0, 0), (1, 2), (1, 3), (2, 2), (2, 3), (3, 5), (4, 5), (5, 5)],
         )
+        tg = t("bogus")
+        self.assertEqual(tg.output_string, "")
+        self.assertEqual(
+            tg.edges,
+            [(0, None), (1, None), (2, None), (3, None), (4, None)],
+        )
+        self.assertEqual(
+            tg.substring_alignments(),
+            [("bogus", "")],
+        )

     def test_load_lexicon_mapping(self):
@@ -74,9 +84,7 @@ def test_eng_lexicon(self):
         self.assertEqual(tg.edges, [(0, 0), (1, 1), (2, 2), (3, 2), (4, 3), (4, 4)])
         tg = t("you're")
         self.assertEqual(tg.output_string, "jʊɹ")
-        self.assertEqual(
-            tg.edges, [(0, 0), (1, None), (2, 1), (3, None), (4, 2), (5, None)]
-        )
+        self.assertEqual(tg.edges, [(0, 0), (1, 1), (2, 1), (3, 2), (4, 2), (5, 2)])
         tg = t("change")
         self.assertEqual(tg.output_string, "tʃeɪndʒ")
         self.assertEqual(tg.input_string, "change")
@@ -85,20 +93,20 @@ def test_eng_lexicon(self):
             [
                 (0, 0),
                 (0, 1),
-                (1, None),
+                (1, 2),
                 (2, 2),
                 (2, 3),
                 (3, 4),
                 (4, 5),
                 (4, 6),
-                (5, None),
+                (5, 6),
             ],
         )
         tg = t("chain")
         # These alignments are weird but they are the ones EM gave us
         self.assertEqual(tg.output_string, "tʃeɪn")
         self.assertEqual(tg.input_string, "chain")
-        self.assertEqual(tg.edges, [(0, 0), (0, 1), (1, None), (2, 2), (3, 3), (4, 4)])
+        self.assertEqual(tg.edges, [(0, 0), (0, 1), (1, 2), (2, 2), (3, 3), (4, 4)])
         tg = t("xtra")
         self.assertEqual(tg.output_string, "ɛkstɹʌ")
         self.assertEqual(tg.input_string, "xtra")
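The updated expectations in this file capture the heart of the deletion-alignment change: deleted input characters no longer map to None but attach to a neighbouring real output index. For "you're" mapping to "jʊɹ", every input character now reaches a phone, which is what makes substring_alignments() well defined. A quick way to see the effect of the new edges (values taken directly from the test above):

edges = [(0, 0), (1, 1), (2, 1), (3, 2), (4, 2), (5, 2)]
word, phones = "you're", "jʊɹ"
for o in range(len(phones)):
    # Gather the input characters attached to each output phone
    ins = "".join(word[i] for i, t in edges if t == o)
    print(ins, "->", phones[o])
# y -> j
# ou -> ʊ
# 're -> ɹ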
37 changes: 25 additions & 12 deletions g2p/transducer/__init__.py
@@ -8,7 +8,7 @@
 import re
 import unicodedata
 from collections import OrderedDict, defaultdict
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, List, Tuple, Union

 import text_unidecode

@@ -239,6 +239,8 @@ def find_monotonic_segments(alignments):
     osort = sorted(
         alignments, key=lambda x: (x[0], x[0]) if x[1] is None else (x[1], x[0])
     )
+    # print("isort:", isort)
+    # print("osort:", osort)
     # Use -1 as flag value because None has a meaning in alignments
     istart = ostart = iend = oend = -1
     for iedge, oedge in zip(isort, osort):
@@ -264,7 +266,10 @@
         else:
             assert oedge[0] is not None
             iend = max(iend, oedge[0])
-            if iedge[1] is not None:
+            # Replace None with not-None
+            if oend is None:
+                oend = iedge[1]
+            elif iedge[1] is not None:
                 oend = max(oend, iedge[1])
     if istart != -1:
         assert iend != -1
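The extra branch matters because oend can hold None when the current segment was opened by a deletion edge (alignments use None to mean "deleted"), and max() cannot compare None with an int in Python 3. The guard replaces None with the first real output index before any comparison happens. In isolation, as a simplified illustration with a hypothetical helper (not the surrounding function):

def update_oend(oend, new_oend):
    # Replace None with a real index before comparing
    if oend is None:
        return new_oend
    if new_oend is not None:
        return max(oend, new_oend)
    return oend

assert update_oend(None, 3) == 3   # segment opened by a deletion
assert update_oend(2, 5) == 5      # normal monotonic extension
assert update_oend(2, None) == 2   # deletion edge: keep current end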
@@ -751,29 +756,37 @@ def apply_lexicon(self, to_convert: str):
                 tg.output_string = ""
         else:
             tg.output_string = ""
-            edges: List[Tuple[Optional[int], Optional[int]]] = []
+            edges: List[Tuple[int, int]] = []
             in_pos = 0
             out_pos = 0
-            # Mappings are flat to save space
-            for n_inputs, outtxt in zip(alignment[::2], alignment[1::2]):
+            # Mappings are flattened to save space
+            for idx in range(0, len(alignment), 2):
+                (n_inputs, outtxt) = alignment[idx : idx + 2]
                 for i in range(n_inputs):
                     for j in range(len(outtxt)):
                         edges.append((in_pos + i, out_pos + j))
-                    if len(outtxt) == 0:  # Deletions
-                        edges.append((in_pos + i, None))
+                    if len(outtxt) == 0:
+                        # Attach deletions to the next input and the
+                        # previous output (fixed below if it does not
+                        # exist)
+                        edges.append((in_pos + i, out_pos))
                 if n_inputs == 0:
                     # Insertions are treated differently because many
                     # parts of the code assume that they cannot exist
+                    # Attach insertions to the previous input
                     for j in range(len(outtxt)):
                         edges.append((in_pos, out_pos + j))

                 in_pos += n_inputs
                 if len(outtxt) != 0:
                     out_pos += len(outtxt) + len(self.out_delimiter)
                     # Be bug-compatible with mappings and add an extra delimiter
                     tg.output_string += outtxt + self.out_delimiter

-            tg.edges = edges
+            # Fix up bogus indices here
+            out_len = len(tg.output_string)
+            tg.edges = []
+            for in_pos, out_pos in edges:
+                if out_pos >= out_len:
+                    tg.edges.append((in_pos, None if out_len == 0 else out_len - 1))
+                else:
+                    tg.edges.append((in_pos, out_pos))
         return tg

     def apply_rules(self, to_convert: str):  # noqa: C901
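To see the flattened-alignment handling end to end, here is the loop above replayed standalone with a plausible alignment for "you're" (the [n_inputs, output, n_inputs, output, ...] values are assumed, not read from the real lexicon; out_delimiter is taken to be a single space, and the insertion branch is omitted since this alignment has none). It reproduces exactly the edges asserted in test_lexicon_mapping:

alignment = [1, "Y", 2, "UH", 3, "R"]  # assumed flattened alignment for "you're"
edges, in_pos, out_pos, output = [], 0, 0, ""
for idx in range(0, len(alignment), 2):
    n_inputs, outtxt = alignment[idx : idx + 2]
    for i in range(n_inputs):
        for j in range(len(outtxt)):
            edges.append((in_pos + i, out_pos + j))
        if len(outtxt) == 0:
            edges.append((in_pos + i, out_pos))  # deletion: provisional target
    in_pos += n_inputs
    if outtxt:
        out_pos += len(outtxt) + 1  # +1 for the " " delimiter
        output += outtxt + " "
print(output)  # 'Y UH R '
print(edges)   # [(0, 0), (1, 2), (1, 3), (2, 2), (2, 3), (3, 5), (4, 5), (5, 5)]

For "you're" no provisional target reaches the output length, so the fix-up loop leaves these edges unchanged. For an all-deletion entry like "bogus" the output string stays empty, every provisional edge is (i, 0), and the fix-up rewrites each to (i, None), which is what the new test in test_lexicon_transducer.py asserts.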
