Commit c120841
Merge remote-tracking branch 'upstream/dev.deletion-alignment' into dev.apiv2
dhdaines committed Mar 29, 2023
2 parents 58de7a4 + f989584 commit c120841
Showing 4 changed files with 54 additions and 19 deletions.
1 change: 1 addition & 0 deletions g2p/tests/public/mappings/hello.aligned.txt
@@ -1,2 +1,3 @@
 h}HH e}EH l|l}L o}OW
 y}Y o|u}UH '}_ r|e}R
+b}_ o}_ g}_ u}_ s}_
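For readers unfamiliar with these aligned-lexicon files: each whitespace-separated chunk appears to pair input characters with an output phone as INPUT}OUTPUT, "|" joins several input characters aligned to a single phone, and "_" marks a deletion (no output). The added line gives the nonsense word "bogus" an alignment in which every letter is deleted, which the new tests below exercise. A minimal sketch of reading that format (a hypothetical helper, not the project's own parser):

def parse_aligned(line: str):
    # Split one aligned-lexicon line into (graphemes, phone) pairs,
    # treating "_" as an empty (deleted) output.
    pairs = []
    for chunk in line.split():
        graphemes, phone = chunk.rsplit("}", 1)
        pairs.append((graphemes.replace("|", ""), "" if phone == "_" else phone))
    return pairs

print(parse_aligned("h}HH e}EH l|l}L o}OW"))
# [('h', 'HH'), ('e', 'EH'), ('ll', 'L'), ('o', 'OW')]
print(parse_aligned("b}_ o}_ g}_ u}_ s}_"))
# [('b', ''), ('o', ''), ('g', ''), ('u', ''), ('s', '')]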
13 changes: 13 additions & 0 deletions g2p/tests/test_indices.py
@@ -501,6 +501,19 @@ def test_case_nine(self):
         self.assertEqual(transducer.edges, [(0, None), (1, None)])
         # Support deletions in substring_alignments
         self.assertEqual(transducer.substring_alignments(), [("aa", "")])
+        transducer = self.trans_nine("aabbaab")
+        self.assertEqual(transducer.output_string, "bbb")
+        self.assertEqual(
+            transducer.edges,
+            [(0, 0), (1, 0), (2, 0), (3, 1), (4, 1), (5, 1), (6, 2)],
+        )
+        # Support deletions in substring_alignments. NOTE: these
+        # alignments are quite bogus due to the ad-hoc treatment of
+        # deletions by rule-based mappings
+        self.assertEqual(
+            transducer.substring_alignments(),
+            [("aab", "b"), ("baa", "b"), ("b", "b")],
+        )

     def test_case_ten(self):
         transducer = self.trans_ten("abc")
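The expected substring alignments above follow from grouping edges by output index: input indices that share an output index become one substring pair. A rough sketch of that grouping (a hypothetical helper; the library's actual logic lives in find_monotonic_segments and handles more cases):

def group_by_output(in_str, out_str, edges):
    # Collect the input indices attached to each output index
    segments = {}
    for i, o in edges:
        segments.setdefault(o, []).append(i)
    return [
        ("".join(in_str[i] for i in sorted(ins)), out_str[o])
        for o, ins in sorted(segments.items())
    ]

print(group_by_output("aabbaab", "bbb",
                      [(0, 0), (1, 0), (2, 0), (3, 1), (4, 1), (5, 1), (6, 2)]))
# [('aab', 'b'), ('baa', 'b'), ('b', 'b')]  -- matches the new test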
22 changes: 15 additions & 7 deletions g2p/tests/test_lexicon_transducer.py
@@ -35,7 +35,17 @@ def test_lexicon_mapping(self):
         self.assertEqual(tg.output_string, "Y UH R ")
         self.assertEqual(
             tg.edges,
-            [(0, 0), (1, 2), (1, 3), (2, 2), (2, 3), (3, None), (4, 5), (5, 5)],
+            [(0, 0), (1, 2), (1, 3), (2, 2), (2, 3), (3, 5), (4, 5), (5, 5)],
         )
+        tg = t("bogus")
+        self.assertEqual(tg.output_string, "")
+        self.assertEqual(
+            tg.edges,
+            [(0, None), (1, None), (2, None), (3, None), (4, None)],
+        )
+        self.assertEqual(
+            tg.substring_alignments(),
+            [("bogus", "")],
+        )

     def test_load_lexicon_mapping(self):
@@ -74,9 +84,7 @@ def test_eng_lexicon(self):
         self.assertEqual(tg.edges, [(0, 0), (1, 1), (2, 2), (3, 2), (4, 3), (4, 4)])
         tg = t("you're")
         self.assertEqual(tg.output_string, "jʊɹ")
-        self.assertEqual(
-            tg.edges, [(0, 0), (1, None), (2, 1), (3, None), (4, 2), (5, None)]
-        )
+        self.assertEqual(tg.edges, [(0, 0), (1, 1), (2, 1), (3, 2), (4, 2), (5, 2)])
         tg = t("change")
         self.assertEqual(tg.output_string, "tʃeɪndʒ")
         self.assertEqual(tg.input_string, "change")
@@ -85,20 +93,20 @@ def test_eng_lexicon(self):
             [
                 (0, 0),
                 (0, 1),
-                (1, None),
+                (1, 2),
                 (2, 2),
                 (2, 3),
                 (3, 4),
                 (4, 5),
                 (4, 6),
-                (5, None),
+                (5, 6),
             ],
         )
         tg = t("chain")
         # These alignments are weird but they are the ones EM gave us
         self.assertEqual(tg.output_string, "tʃeɪn")
         self.assertEqual(tg.input_string, "chain")
-        self.assertEqual(tg.edges, [(0, 0), (0, 1), (1, None), (2, 2), (3, 3), (4, 4)])
+        self.assertEqual(tg.edges, [(0, 0), (0, 1), (1, 2), (2, 2), (3, 3), (4, 4)])
         tg = t("xtra")
         self.assertEqual(tg.output_string, "ɛkstɹʌ")
         self.assertEqual(tg.input_string, "xtra")
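The updated expectations in this file capture the heart of the deletion-alignment change: deleted input characters no longer map to None but attach to a neighbouring real output index. For "you're" mapping to "jʊɹ", every input character now reaches a phone, which is what makes substring_alignments() well defined. A quick way to see the effect of the new edges (values taken directly from the test above):

edges = [(0, 0), (1, 1), (2, 1), (3, 2), (4, 2), (5, 2)]
word, phones = "you're", "jʊɹ"
for o in range(len(phones)):
    # Gather the input characters attached to each output phone
    ins = "".join(word[i] for i, t in edges if t == o)
    print(ins, "->", phones[o])
# y -> j
# ou -> ʊ
# 're -> ɹ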
37 changes: 25 additions & 12 deletions g2p/transducer/__init__.py
@@ -8,7 +8,7 @@
 import re
 import unicodedata
 from collections import OrderedDict, defaultdict
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, List, Tuple, Union

 import text_unidecode

@@ -239,6 +239,8 @@ def find_monotonic_segments(alignments):
     osort = sorted(
         alignments, key=lambda x: (x[0], x[0]) if x[1] is None else (x[1], x[0])
     )
+    # print("isort:", isort)
+    # print("osort:", osort)
     # Use -1 as flag value because None has a meaning in alignments
     istart = ostart = iend = oend = -1
     for iedge, oedge in zip(isort, osort):
@@ -264,7 +266,10 @@
         else:
             assert oedge[0] is not None
             iend = max(iend, oedge[0])
-            if iedge[1] is not None:
+            # Replace None with not-None
+            if oend is None:
+                oend = iedge[1]
+            elif iedge[1] is not None:
                 oend = max(oend, iedge[1])
     if istart != -1:
         assert iend != -1
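The extra branch matters because oend can hold None when the current segment was opened by a deletion edge (alignments use None to mean "deleted"), and max() cannot compare None with an int in Python 3. The guard replaces None with the first real output index before any comparison happens. In isolation, as a simplified illustration with a hypothetical helper (not the surrounding function):

def update_oend(oend, new_oend):
    # Replace None with a real index before comparing
    if oend is None:
        return new_oend
    if new_oend is not None:
        return max(oend, new_oend)
    return oend

assert update_oend(None, 3) == 3   # segment opened by a deletion
assert update_oend(2, 5) == 5      # normal monotonic extension
assert update_oend(2, None) == 2   # deletion edge: keep current end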
@@ -751,29 +756,37 @@ def apply_lexicon(self, to_convert: str):
                 tg.output_string = ""
         else:
             tg.output_string = ""
-            edges: List[Tuple[Optional[int], Optional[int]]] = []
+            edges: List[Tuple[int, int]] = []
             in_pos = 0
             out_pos = 0
-            # Mappings are flat to save space
-            for n_inputs, outtxt in zip(alignment[::2], alignment[1::2]):
+            # Mappings are flattened to save space
+            for idx in range(0, len(alignment), 2):
+                (n_inputs, outtxt) = alignment[idx : idx + 2]
                 for i in range(n_inputs):
                     for j in range(len(outtxt)):
                         edges.append((in_pos + i, out_pos + j))
-                    if len(outtxt) == 0:  # Deletions
-                        edges.append((in_pos + i, None))
+                    if len(outtxt) == 0:
+                        # Attach deletions to the next input and the
+                        # previous output (fixed below if it does not
+                        # exist)
+                        edges.append((in_pos + i, out_pos))
                 if n_inputs == 0:
                     # Insertions are treated differently because many
                     # parts of the code assume that they cannot exist
+                    # Attach insertions to the previous input
                     for j in range(len(outtxt)):
                         edges.append((in_pos, out_pos + j))

                 in_pos += n_inputs
                 if len(outtxt) != 0:
                     out_pos += len(outtxt) + len(self.out_delimiter)
                     # Be bug-compatible with mappings and add an extra delimiter
                     tg.output_string += outtxt + self.out_delimiter

-            tg.edges = edges
+            # Fix up bogus indices here
+            out_len = len(tg.output_string)
+            tg.edges = []
+            for in_pos, out_pos in edges:
+                if out_pos >= out_len:
+                    tg.edges.append((in_pos, None if out_len == 0 else out_len - 1))
+                else:
+                    tg.edges.append((in_pos, out_pos))
         return tg

     def apply_rules(self, to_convert: str):  # noqa: C901
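To see the flattened-alignment handling end to end, here is the loop above replayed standalone with a plausible alignment for "you're" (the [n_inputs, output, n_inputs, output, ...] values are assumed, not read from the real lexicon; out_delimiter is taken to be a single space, and the insertion branch is omitted since this alignment has none). It reproduces exactly the edges asserted in test_lexicon_mapping:

alignment = [1, "Y", 2, "UH", 3, "R"]  # assumed flattened alignment for "you're"
edges, in_pos, out_pos, output = [], 0, 0, ""
for idx in range(0, len(alignment), 2):
    n_inputs, outtxt = alignment[idx : idx + 2]
    for i in range(n_inputs):
        for j in range(len(outtxt)):
            edges.append((in_pos + i, out_pos + j))
        if len(outtxt) == 0:
            edges.append((in_pos + i, out_pos))  # deletion: provisional target
    in_pos += n_inputs
    if outtxt:
        out_pos += len(outtxt) + 1  # +1 for the " " delimiter
        output += outtxt + " "
print(output)  # 'Y UH R '
print(edges)   # [(0, 0), (1, 2), (1, 3), (2, 2), (2, 3), (3, 5), (4, 5), (5, 5)]

For "you're" no provisional target reaches the output length, so the fix-up loop leaves these edges unchanged. For an all-deletion entry like "bogus" the output string stays empty, every provisional edge is (i, 0), and the fix-up rewrites each to (i, None), which is what the new test in test_lexicon_transducer.py asserts.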
