From 18bdc6ba1260a402f8845030dc5c56b23fd9b646 Mon Sep 17 00:00:00 2001
From: David Huggins-Daines
Date: Wed, 29 Mar 2023 12:24:09 -0400
Subject: [PATCH 1/3] fix: update treatment of deletions in lexicon to match rules

---
 g2p/tests/test_indices.py            | 13 +++++++++++++
 g2p/tests/test_lexicon_transducer.py | 12 ++++++------
 g2p/transducer/__init__.py           | 30 ++++++++++++++++++++++++------
 3 files changed, 42 insertions(+), 13 deletions(-)

diff --git a/g2p/tests/test_indices.py b/g2p/tests/test_indices.py
index 5ecbc46b..96f7d5e0 100755
--- a/g2p/tests/test_indices.py
+++ b/g2p/tests/test_indices.py
@@ -501,6 +501,19 @@ def test_case_nine(self):
         self.assertEqual(transducer.edges, [(0, None), (1, None)])
         # Support deletions in substring_alignments
         self.assertEqual(transducer.substring_alignments(), [("aa", "")])
+        transducer = self.trans_nine("aabbaab")
+        self.assertEqual(transducer.output_string, "bbb")
+        self.assertEqual(
+            transducer.edges,
+            [(0, 0), (1, 0), (2, 0), (3, 1), (4, 1), (5, 1), (6, 2)],
+        )
+        # Support deletions in substring_alignments. NOTE: these
+        # alignments are quite bogus due to the ad-hoc treatment of
+        # deletions by rule-based mappings
+        self.assertEqual(
+            transducer.substring_alignments(),
+            [("aab", "b"), ("baa", "b"), ("b", "b")],
+        )
 
     def test_case_ten(self):
         transducer = self.trans_ten("abc")
diff --git a/g2p/tests/test_lexicon_transducer.py b/g2p/tests/test_lexicon_transducer.py
index b2d84b92..828af1ce 100644
--- a/g2p/tests/test_lexicon_transducer.py
+++ b/g2p/tests/test_lexicon_transducer.py
@@ -35,7 +35,7 @@ def test_lexicon_mapping(self):
         self.assertEqual(tg.output_string, "Y UH R ")
         self.assertEqual(
             tg.edges,
-            [(0, 0), (1, 2), (1, 3), (2, 2), (2, 3), (3, None), (4, 5), (5, 5)],
+            [(0, 0), (1, 2), (1, 3), (2, 2), (2, 3), (3, 5), (4, 5), (5, 5)],
         )
 
     def test_load_lexicon_mapping(self):
@@ -74,9 +74,7 @@ def test_eng_lexicon(self):
         self.assertEqual(tg.edges, [(0, 0), (1, 1), (2, 2), (3, 2), (4, 3), (4, 4)])
         tg = t("you're")
         self.assertEqual(tg.output_string, "jʊɹ")
-        self.assertEqual(
-            tg.edges, [(0, 0), (1, None), (2, 1), (3, None), (4, 2), (5, None)]
-        )
+        self.assertEqual(tg.edges, [(0, 0), (1, 1), (2, 1), (3, 2), (4, 2), (5, 2)])
         tg = t("change")
         self.assertEqual(tg.output_string, "tʃeɪndʒ")
         self.assertEqual(tg.input_string, "change")
@@ -85,20 +83,20 @@
             [
                 (0, 0),
                 (0, 1),
-                (1, None),
+                (1, 2),
                 (2, 2),
                 (2, 3),
                 (3, 4),
                 (4, 5),
                 (4, 6),
-                (5, None),
+                (5, 6),
             ],
         )
         tg = t("chain")
         # These aligments are weird but they are the ones EM gave us
         self.assertEqual(tg.output_string, "tʃeɪn")
         self.assertEqual(tg.input_string, "chain")
-        self.assertEqual(tg.edges, [(0, 0), (0, 1), (1, None), (2, 2), (3, 3), (4, 4)])
+        self.assertEqual(tg.edges, [(0, 0), (0, 1), (1, 2), (2, 2), (3, 3), (4, 4)])
         tg = t("xtra")
         self.assertEqual(tg.output_string, "ɛkstɹʌ")
         self.assertEqual(tg.input_string, "xtra")
diff --git a/g2p/transducer/__init__.py b/g2p/transducer/__init__.py
index 2415f73c..6322548a 100644
--- a/g2p/transducer/__init__.py
+++ b/g2p/transducer/__init__.py
@@ -239,6 +239,8 @@ def find_monotonic_segments(alignments):
     osort = sorted(
         alignments, key=lambda x: (x[0], x[0]) if x[1] is None else (x[1], x[0])
     )
+    # print("isort:", isort)
+    # print("osort:", osort)
     # Use -1 as flag value because None has a meaning in alignments
     istart = ostart = iend = oend = -1
     for iedge, oedge in zip(isort, osort):
@@ -264,7 +266,10 @@
         else:
             assert oedge[0] is not None
             iend = max(iend, oedge[0])
-        if iedge[1] is not None:
+        # Replace None with not-None
+        if oend is None:
+            oend = iedge[1]
+        elif iedge[1] is not None:
             oend = max(oend, iedge[1])
         if istart != -1:
             assert iend != -1
@@ -755,15 +760,28 @@ def apply_lexicon(self, to_convert: str):
             in_pos = 0
             out_pos = 0
             # Mappings are flat to save space
-            for n_inputs, outtxt in zip(alignment[::2], alignment[1::2]):
+            for idx in range(0, len(alignment), 2):
+                (n_inputs, outtxt) = alignment[idx : idx + 2]
                 for i in range(n_inputs):
                     for j in range(len(outtxt)):
                         edges.append((in_pos + i, out_pos + j))
-                    if len(outtxt) == 0:  # Deletions
-                        edges.append((in_pos + i, None))
+                    if len(outtxt) == 0:
+                        # Match the (dubious) behaviour of rule-based
+                        # mappings which will always attach deletions
+                        # to an adjacent output unless the output is
+                        # empty, in which case the output index is None
+                        if idx == len(alignment) - 2:
+                            # Previous output at end
+                            edges.append(
+                                (in_pos + i, None if out_pos == 0 else out_pos - 1)
+                            )
+                        else:
+                            # Otherwise next output... this is very
+                            # ad-hoc but so is the behaviour of
+                            # rule-based mappings ;-(
+                            edges.append((in_pos + i, out_pos))
                 if n_inputs == 0:
-                    # Insertions are treated differently because many
-                    # parts of the code assume that they cannot exist
+                    # Attach insertions to the previous input
                     for j in range(len(outtxt)):
                         edges.append((in_pos, out_pos + j))

From 20ac41e166c4755a9e977dfb721ad42dddd616cb Mon Sep 17 00:00:00 2001
From: David Huggins-Daines
Date: Wed, 29 Mar 2023 12:50:58 -0400
Subject: [PATCH 2/3] fix: make sure we do not output bogus edges

---
 g2p/tests/public/mappings/hello.aligned.txt |  1 +
 g2p/tests/test_lexicon_transducer.py        | 10 ++++++++++
 g2p/transducer/__init__.py                  | 34 ++++++++++++++--------------------
 3 files changed, 25 insertions(+), 20 deletions(-)

diff --git a/g2p/tests/public/mappings/hello.aligned.txt b/g2p/tests/public/mappings/hello.aligned.txt
index 48e46cea..a081bb42 100644
--- a/g2p/tests/public/mappings/hello.aligned.txt
+++ b/g2p/tests/public/mappings/hello.aligned.txt
@@ -1,2 +1,3 @@
 h}HH e}EH l|l}L o}OW
 y}Y o|u}UH '}_ r|e}R
+b}_ o}_ g}_ u}_ s}_
diff --git a/g2p/tests/test_lexicon_transducer.py b/g2p/tests/test_lexicon_transducer.py
index 828af1ce..ac3cb1b2 100644
--- a/g2p/tests/test_lexicon_transducer.py
+++ b/g2p/tests/test_lexicon_transducer.py
@@ -37,6 +37,16 @@ def test_lexicon_mapping(self):
             tg.edges,
             [(0, 0), (1, 2), (1, 3), (2, 2), (2, 3), (3, 5), (4, 5), (5, 5)],
         )
+        tg = t("bogus")
+        self.assertEqual(tg.output_string, "")
+        self.assertEqual(
+            tg.edges,
+            [(0, None), (1, None), (2, None), (3, None), (4, None)],
+        )
+        self.assertEqual(
+            tg.substring_alignments(),
+            [("bogus", "")],
+        )
 
     def test_load_lexicon_mapping(self):
         """Test loading a lexicon mapping through a config file."""
diff --git a/g2p/transducer/__init__.py b/g2p/transducer/__init__.py
index 6322548a..8a49e814 100644
--- a/g2p/transducer/__init__.py
+++ b/g2p/transducer/__init__.py
@@ -8,7 +8,7 @@
 import re
 import unicodedata
 from collections import OrderedDict, defaultdict
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, List, Tuple, Union
 
 import text_unidecode
 
@@ -756,42 +756,36 @@ def apply_lexicon(self, to_convert: str):
             tg.output_string = ""
         else:
             tg.output_string = ""
-            edges: List[Tuple[Optional[int], Optional[int]]] = []
+            edges: List[Tuple[int, int]] = []
             in_pos = 0
             out_pos = 0
-            # Mappings are flat to save space
+            # Mappings are flattened to save space
             for idx in range(0, len(alignment), 2):
                 (n_inputs, outtxt) = alignment[idx : idx + 2]
                 for i in range(n_inputs):
                     for j in range(len(outtxt)):
                         edges.append((in_pos + i, out_pos + j))
                     if len(outtxt) == 0:
-                        # Match the (dubious) behaviour of rule-based
-                        # mappings which will always attach deletions
-                        # to an adjacent output unless the output is
-                        # empty, in which case the output index is None
-                        if idx == len(alignment) - 2:
-                            # Previous output at end
-                            edges.append(
-                                (in_pos + i, None if out_pos == 0 else out_pos - 1)
-                            )
-                        else:
-                            # Otherwise next output... this is very
-                            # ad-hoc but so is the behaviour of
-                            # rule-based mappings ;-(
-                            edges.append((in_pos + i, out_pos))
+                        # Attach deletions to the next input (we will
+                        # fix this below if it does not exist)
+                        edges.append((in_pos + i, out_pos))
                 if n_inputs == 0:
                     # Attach insertions to the previous input
                     for j in range(len(outtxt)):
                         edges.append((in_pos, out_pos + j))
                 in_pos += n_inputs
                 if len(outtxt) != 0:
                     out_pos += len(outtxt) + len(self.out_delimiter)
                     # Be bug-compatible with mappings and add an extra delimiter
                     tg.output_string += outtxt + self.out_delimiter
-
-        tg.edges = edges
+            # Fix up bogus indices here
+            out_len = len(tg.output_string)
+            tg.edges = []
+            for in_pos, out_pos in edges:
+                if out_pos >= out_len:
+                    tg.edges.append((in_pos, None if out_len == 0 else out_len - 1))
+                else:
+                    tg.edges.append((in_pos, out_pos))
         return tg
 
     def apply_rules(self, to_convert: str):  # noqa: C901

From f989584032e17d4027293d342e8a5f3ae03a6d3b Mon Sep 17 00:00:00 2001
From: David Huggins-Daines
Date: Wed, 29 Mar 2023 13:04:06 -0400
Subject: [PATCH 3/3] docs: correct a comment

---
 g2p/transducer/__init__.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/g2p/transducer/__init__.py b/g2p/transducer/__init__.py
index 8a49e814..d668ba96 100644
--- a/g2p/transducer/__init__.py
+++ b/g2p/transducer/__init__.py
@@ -766,8 +766,9 @@ def apply_lexicon(self, to_convert: str):
                     for j in range(len(outtxt)):
                         edges.append((in_pos + i, out_pos + j))
                     if len(outtxt) == 0:
-                        # Attach deletions to the next input (we will
-                        # fix this below if it does not exist)
+                        # Attach deletions to the next input and the
+                        # previous output (fixed below if it does not
+                        # exist)
                         edges.append((in_pos + i, out_pos))
                 if n_inputs == 0:
                     # Attach insertions to the previous input