From 18bdc6ba1260a402f8845030dc5c56b23fd9b646 Mon Sep 17 00:00:00 2001
From: David Huggins-Daines
Date: Wed, 29 Mar 2023 12:24:09 -0400
Subject: [PATCH 1/3] fix: update treatment of deletions in lexicon to match rules

---
 g2p/tests/test_indices.py            | 13 +++++++++++++
 g2p/tests/test_lexicon_transducer.py | 12 ++++++------
 g2p/transducer/__init__.py           | 30 ++++++++++++++++++++++++------
 3 files changed, 42 insertions(+), 13 deletions(-)

diff --git a/g2p/tests/test_indices.py b/g2p/tests/test_indices.py
index 5ecbc46b..96f7d5e0 100755
--- a/g2p/tests/test_indices.py
+++ b/g2p/tests/test_indices.py
@@ -501,6 +501,19 @@ def test_case_nine(self):
         self.assertEqual(transducer.edges, [(0, None), (1, None)])
         # Support deletions in substring_alignments
         self.assertEqual(transducer.substring_alignments(), [("aa", "")])
+        transducer = self.trans_nine("aabbaab")
+        self.assertEqual(transducer.output_string, "bbb")
+        self.assertEqual(
+            transducer.edges,
+            [(0, 0), (1, 0), (2, 0), (3, 1), (4, 1), (5, 1), (6, 2)],
+        )
+        # Support deletions in substring_alignments. NOTE: these
+        # alignments are quite bogus due to the ad-hoc treatment of
+        # deletions by rule-based mappings
+        self.assertEqual(
+            transducer.substring_alignments(),
+            [("aab", "b"), ("baa", "b"), ("b", "b")],
+        )
 
     def test_case_ten(self):
         transducer = self.trans_ten("abc")
diff --git a/g2p/tests/test_lexicon_transducer.py b/g2p/tests/test_lexicon_transducer.py
index b2d84b92..828af1ce 100644
--- a/g2p/tests/test_lexicon_transducer.py
+++ b/g2p/tests/test_lexicon_transducer.py
@@ -35,7 +35,7 @@ def test_lexicon_mapping(self):
         self.assertEqual(tg.output_string, "Y UH R ")
         self.assertEqual(
             tg.edges,
-            [(0, 0), (1, 2), (1, 3), (2, 2), (2, 3), (3, None), (4, 5), (5, 5)],
+            [(0, 0), (1, 2), (1, 3), (2, 2), (2, 3), (3, 5), (4, 5), (5, 5)],
         )
 
     def test_load_lexicon_mapping(self):
@@ -74,9 +74,7 @@ def test_eng_lexicon(self):
         self.assertEqual(tg.edges, [(0, 0), (1, 1), (2, 2), (3, 2), (4, 3), (4, 4)])
         tg = t("you're")
         self.assertEqual(tg.output_string, "jʊɹ")
-        self.assertEqual(
-            tg.edges, [(0, 0), (1, None), (2, 1), (3, None), (4, 2), (5, None)]
-        )
+        self.assertEqual(tg.edges, [(0, 0), (1, 1), (2, 1), (3, 2), (4, 2), (5, 2)])
         tg = t("change")
         self.assertEqual(tg.output_string, "tʃeɪndʒ")
         self.assertEqual(tg.input_string, "change")
@@ -85,20 +83,20 @@
             [
                 (0, 0),
                 (0, 1),
-                (1, None),
+                (1, 2),
                 (2, 2),
                 (2, 3),
                 (3, 4),
                 (4, 5),
                 (4, 6),
-                (5, None),
+                (5, 6),
             ],
         )
         tg = t("chain")
         # These aligments are weird but they are the ones EM gave us
         self.assertEqual(tg.output_string, "tʃeɪn")
         self.assertEqual(tg.input_string, "chain")
-        self.assertEqual(tg.edges, [(0, 0), (0, 1), (1, None), (2, 2), (3, 3), (4, 4)])
+        self.assertEqual(tg.edges, [(0, 0), (0, 1), (1, 2), (2, 2), (3, 3), (4, 4)])
         tg = t("xtra")
         self.assertEqual(tg.output_string, "ɛkstɹʌ")
         self.assertEqual(tg.input_string, "xtra")
diff --git a/g2p/transducer/__init__.py b/g2p/transducer/__init__.py
index 2415f73c..6322548a 100644
--- a/g2p/transducer/__init__.py
+++ b/g2p/transducer/__init__.py
@@ -239,6 +239,8 @@ def find_monotonic_segments(alignments):
     osort = sorted(
         alignments, key=lambda x: (x[0], x[0]) if x[1] is None else (x[1], x[0])
     )
+    # print("isort:", isort)
+    # print("osort:", osort)
     # Use -1 as flag value because None has a meaning in alignments
     istart = ostart = iend = oend = -1
     for iedge, oedge in zip(isort, osort):
@@ -264,7 +266,10 @@
         else:
             assert oedge[0] is not None
             iend = max(iend, oedge[0])
-        if iedge[1] is not None:
+        # Replace None with not-None
+        if oend is None:
+            oend = iedge[1]
+        elif iedge[1] is not None:
             oend = max(oend, iedge[1])
         if istart != -1:
             assert iend != -1
@@ -755,15 +760,28 @@ def apply_lexicon(self, to_convert: str):
             in_pos = 0
             out_pos = 0
             # Mappings are flat to save space
-            for n_inputs, outtxt in zip(alignment[::2], alignment[1::2]):
+            for idx in range(0, len(alignment), 2):
+                (n_inputs, outtxt) = alignment[idx : idx + 2]
                 for i in range(n_inputs):
                     for j in range(len(outtxt)):
                         edges.append((in_pos + i, out_pos + j))
-                    if len(outtxt) == 0:  # Deletions
-                        edges.append((in_pos + i, None))
+                    if len(outtxt) == 0:
+                        # Match the (dubious) behaviour of rule-based
+                        # mappings which will always attach deletions
+                        # to an adjacent output unless the output is
+                        # empty, in which case the output index is None
+                        if idx == len(alignment) - 2:
+                            # Previous output at end
+                            edges.append(
+                                (in_pos + i, None if out_pos == 0 else out_pos - 1)
+                            )
+                        else:
+                            # Otherwise next output... this is very
+                            # ad-hoc but so is the behaviour of
+                            # rule-based mappings ;-(
+                            edges.append((in_pos + i, out_pos))
                 if n_inputs == 0:
-                    # Insertions are treated differently because many
-                    # parts of the code assume that they cannot exist
+                    # Attach insertions to the previous input
                     for j in range(len(outtxt)):
                         edges.append((in_pos, out_pos + j))

From 20ac41e166c4755a9e977dfb721ad42dddd616cb Mon Sep 17 00:00:00 2001
From: David Huggins-Daines
Date: Wed, 29 Mar 2023 12:50:58 -0400
Subject: [PATCH 2/3] fix: make sure we do not output bogus edges

---
 g2p/tests/public/mappings/hello.aligned.txt |  1 +
 g2p/tests/test_lexicon_transducer.py        | 10 ++++++++++
 g2p/transducer/__init__.py                  | 34 ++++++++++++++--------------------
 3 files changed, 25 insertions(+), 20 deletions(-)

diff --git a/g2p/tests/public/mappings/hello.aligned.txt b/g2p/tests/public/mappings/hello.aligned.txt
index 48e46cea..a081bb42 100644
--- a/g2p/tests/public/mappings/hello.aligned.txt
+++ b/g2p/tests/public/mappings/hello.aligned.txt
@@ -1,2 +1,3 @@
 h}HH e}EH l|l}L o}OW
 y}Y o|u}UH '}_ r|e}R
+b}_ o}_ g}_ u}_ s}_
diff --git a/g2p/tests/test_lexicon_transducer.py b/g2p/tests/test_lexicon_transducer.py
index 828af1ce..ac3cb1b2 100644
--- a/g2p/tests/test_lexicon_transducer.py
+++ b/g2p/tests/test_lexicon_transducer.py
@@ -37,6 +37,16 @@ def test_lexicon_mapping(self):
             tg.edges,
             [(0, 0), (1, 2), (1, 3), (2, 2), (2, 3), (3, 5), (4, 5), (5, 5)],
         )
+        tg = t("bogus")
+        self.assertEqual(tg.output_string, "")
+        self.assertEqual(
+            tg.edges,
+            [(0, None), (1, None), (2, None), (3, None), (4, None)],
+        )
+        self.assertEqual(
+            tg.substring_alignments(),
+            [("bogus", "")],
+        )
 
     def test_load_lexicon_mapping(self):
         """Test loading a lexicon mapping through a config file."""
diff --git a/g2p/transducer/__init__.py b/g2p/transducer/__init__.py
index 6322548a..8a49e814 100644
--- a/g2p/transducer/__init__.py
+++ b/g2p/transducer/__init__.py
@@ -8,7 +8,7 @@
 import re
 import unicodedata
 from collections import OrderedDict, defaultdict
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, List, Tuple, Union
 
 import text_unidecode
 
@@ -756,42 +756,36 @@ def apply_lexicon(self, to_convert: str):
             tg.output_string = ""
         else:
             tg.output_string = ""
-            edges: List[Tuple[Optional[int], Optional[int]]] = []
+            edges: List[Tuple[int, int]] = []
             in_pos = 0
             out_pos = 0
-            # Mappings are flat to save space
+            # Mappings are flattened to save space
             for idx in range(0, len(alignment), 2):
                 (n_inputs, outtxt) = alignment[idx : idx + 2]
                 for i in range(n_inputs):
                     for j in range(len(outtxt)):
                         edges.append((in_pos + i, out_pos + j))
                     if len(outtxt) == 0:
-                        # Match the (dubious) behaviour of rule-based
-                        # mappings which will always attach deletions
-                        # to an adjacent output unless the output is
-                        # empty, in which case the output index is None
-                        if idx == len(alignment) - 2:
-                            # Previous output at end
-                            edges.append(
-                                (in_pos + i, None if out_pos == 0 else out_pos - 1)
-                            )
-                        else:
-                            # Otherwise next output... this is very
-                            # ad-hoc but so is the behaviour of
-                            # rule-based mappings ;-(
-                            edges.append((in_pos + i, out_pos))
+                        # Attach deletions to the next input (we will
+                        # fix this below if it does not exist)
+                        edges.append((in_pos + i, out_pos))
                 if n_inputs == 0:
                     # Attach insertions to the previous input
                     for j in range(len(outtxt)):
                         edges.append((in_pos, out_pos + j))
                 in_pos += n_inputs
                 if len(outtxt) != 0:
                     out_pos += len(outtxt) + len(self.out_delimiter)
                     # Be bug-compatible with mappings and add an extra delimiter
                     tg.output_string += outtxt + self.out_delimiter
-
-        tg.edges = edges
+            # Fix up bogus indices here
+            out_len = len(tg.output_string)
+            tg.edges = []
+            for in_pos, out_pos in edges:
+                if out_pos >= out_len:
+                    tg.edges.append((in_pos, None if out_len == 0 else out_len - 1))
+                else:
+                    tg.edges.append((in_pos, out_pos))
         return tg
 
     def apply_rules(self, to_convert: str):  # noqa: C901

From f989584032e17d4027293d342e8a5f3ae03a6d3b Mon Sep 17 00:00:00 2001
From: David Huggins-Daines
Date: Wed, 29 Mar 2023 13:04:06 -0400
Subject: [PATCH 3/3] docs: correct a comment

---
 g2p/transducer/__init__.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/g2p/transducer/__init__.py b/g2p/transducer/__init__.py
index 8a49e814..d668ba96 100644
--- a/g2p/transducer/__init__.py
+++ b/g2p/transducer/__init__.py
@@ -766,8 +766,9 @@ def apply_lexicon(self, to_convert: str):
                     for j in range(len(outtxt)):
                         edges.append((in_pos + i, out_pos + j))
                     if len(outtxt) == 0:
-                        # Attach deletions to the next input (we will
-                        # fix this below if it does not exist)
+                        # Attach deletions to the next input and the
+                        # previous output (fixed below if it does not
+                        # exist)
                         edges.append((in_pos + i, out_pos))
                 if n_inputs == 0:
                     # Attach insertions to the previous input