Skip to content

Commit

Permalink
Merge pull request #253 from ReadAlongs/dev.ej/convert_api
Browse files Browse the repository at this point in the history
Add a convert_to_readalong() function for use by other projects, e.g., EveryVoice
  • Loading branch information
joanise authored Dec 10, 2024
2 parents 2afc316 + eb1e7b3 commit c6d0f64
Show file tree
Hide file tree
Showing 9 changed files with 186 additions and 40 deletions.
2 changes: 1 addition & 1 deletion readalongs/_version.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
VERSION = "1.1.0"
VERSION = "1.2.0"

READALONG_FILE_FORMAT_VERSION = "1.2"
2 changes: 1 addition & 1 deletion readalongs/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -1191,7 +1191,7 @@ def convert_to_xhtml(tokenized_xml, title="Book"):
"""


def create_ras_from_text(lines: Iterable[str], text_languages=Sequence[str]) -> str:
def create_ras_from_text(lines: Iterable[str], text_languages: Sequence[str]) -> str:
"""Create input xml in ReadAlong XML format (see static/read-along-1.2.dtd)
Uses the line sequence to infer paragraph and sentence structure from plain text:
Assumes a double blank line marks a page break, and a single blank line
Expand Down
122 changes: 112 additions & 10 deletions readalongs/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,29 +31,48 @@ class like pathlib.Path. Warning: don't just use "/some/path/config.json"
                you come across such an exception and you believe the
problem is not in your own code.
- log: any logging messages issued during execution
Additional API function:
convert_to_readalong(sentences: Sequence[Sequence[Token]], language: Sequence[str]) -> str:
convert a list of sentences into a readalong XML string ready to print to file.
Just like align and make_xml, this function expects a black line (empty list) to
make a paragraph break, and two consecutive blank lines to make a page break.
Unlike the other functions here, this function is not a wrapper around the CLI and
it just returns the string, non status.
"""

import io
import logging
from typing import Optional, Tuple
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Sequence, Tuple, Union

import click

from readalongs import cli
from readalongs.align import create_ras_from_text
from readalongs.log import LOGGER
from readalongs.text.add_ids_to_xml import add_ids
from readalongs.text.util import parse_xml
from readalongs.util import JoinerCallbackForClick, get_langs_deferred


def align(
textfile, audiofile, output_base, language=(), output_formats=(), **kwargs
textfile: Union[str, Path],
audiofile: Union[str, Path],
output_base: Union[str, Path],
language: Sequence[str] = (),
output_formats: Sequence[str] = (),
**kwargs
) -> Tuple[int, Optional[Exception], str]:
"""Run the "readalongs align" command from within a Python script.
Args:
textfile (str | Path): input text file (XML or plain text)
audiofile (str | Path): input audio file (format supported by ffmpeg)
output_base (str | Path): basename for output files
language (List[str]): Specify only of textfile is plain text;
textfile: input text file (XML or plain text)
audiofile: input audio file (format supported by ffmpeg)
output_base: basename for output files
language: Specify only if textfile is plain text;
list of languages for g2p and g2p cascade
save_temps (bool): Optional; whether to save temporary files
Expand Down Expand Up @@ -100,14 +119,17 @@ def align(


def make_xml(
plaintextfile, xmlfile, language, **kwargs
plaintextfile: Union[str, Path],
xmlfile: Union[str, Path],
language: Sequence[str],
**kwargs
) -> Tuple[int, Optional[Exception], str]:
"""Run the "readalongs make-xml" command from within a Python script.
Args:
plaintextfile (str | Path): input plain text file
xmlfile (str | Path): output XML file
language (List[str]): list of languages for g2p and g2p cascade
plaintextfile: input plain text file
xmlfile: output XML file
language: list of languages for g2p and g2p cascade
Run "readalongs make-xml -h" or consult
https://readalong-studio.readthedocs.io/en/latest/cli-ref.html#readalongs-make-xml
Expand All @@ -116,11 +138,13 @@ def make_xml(
Returns: (status, exception, log_text)
"""
# plaintextfile is not a file object if passed from click

plaintextfile = (
plaintextfile.name
if isinstance(plaintextfile, click.utils.LazyFile)
else plaintextfile
)
xmlfile = str(xmlfile) if isinstance(xmlfile, Path) else xmlfile
logging_stream = io.StringIO()
logging_handler = logging.StreamHandler(logging_stream)
try:
Expand Down Expand Up @@ -157,3 +181,81 @@ def prepare(*args, **kwargs):
"readalongs.api.prepare() is deprecated. Please use make_xml() instead."
)
return make_xml(*args, **kwargs)


@dataclass
class Token:
    """A token in a readalong: a word has a time and dur, a non-word does not.

    Create a word token:
        t = Token("asdf", time=1.3, dur=.34) or t = Token("asdf", 1.3, .34)
    Create a non-word token (e.g., punctuation, spacing):
        t = Token(", ")
    """

    text: str  # the token's text content
    time: Optional[float] = None  # start time in seconds (word tokens only)
    dur: Optional[float] = None  # duration in seconds (word tokens only)
    # True for word tokens; when not given, inferred from the presence of `time`.
    is_word: Optional[bool] = None

    def __post_init__(self):
        # A token created with a start time is assumed to be a word unless the
        # caller explicitly said otherwise; keeps the generated __init__
        # signature identical to the previous hand-written one.
        if self.is_word is None:
            self.is_word = self.time is not None


def convert_to_readalong(
    sentences: Sequence[Sequence[Token]],
    language: Sequence[str] = ("und",),
) -> str:
    """Convert a list of sentences/paragraphs/pages of tokens into a readalong XML string.

    Args:
        sentences: a list of sentences, each of which is a list of Token objects
            Paragraph breaks are marked by an empty sentence (i.e., an empty list)
            Page breaks are marked by two empty sentences in a row
        language: list of languages to declare at the top of the readalong
            (has no functional effect since g2p is not applied, it's only metadata)

    Returns:
        str: the readalong XML string, ready to print to a .readalong file
    """
    from lxml import etree

    # Build the skeleton document from the plain text of each sentence; empty
    # sentences become the paragraph/page breaks described above.
    raw_xml = create_ras_from_text(
        ["".join(tok.text for tok in sent) for sent in sentences],
        language,
    )
    root = parse_xml(raw_xml)

    # The skeleton only has <s> elements for non-empty sentences, so pair them up.
    non_empty = [sent for sent in sentences if sent]
    for tokens, s_el in zip(non_empty, root.findall(".//s")):
        s_el.text = ""
        for tok in tokens:
            if not tok.is_word:
                # Non-word text hangs off the tail of the last <w>, or off the
                # sentence's own text when no <w> has been emitted yet.
                if len(s_el):  # if it has children
                    last_child = s_el[-1]
                    if not last_child.tail:
                        last_child.tail = ""
                    last_child.tail += tok.text
                else:
                    s_el.text += tok.text
            else:
                w_el = etree.Element("w")
                w_el.text = tok.text
                w_el.attrib["time"] = str(tok.time)
                w_el.attrib["dur"] = str(tok.dur)
                s_el.append(w_el)

    root = add_ids(root)
    serialized = etree.tostring(
        root,
        encoding="utf-8",
        xml_declaration=True,
    ).decode("utf8")

    return serialized + "\n"
4 changes: 1 addition & 3 deletions readalongs/text/tokenize_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
from lxml import etree

from readalongs.log import LOGGER
from readalongs.text.util import get_lang_attrib, is_do_not_align, unicode_normalize_xml
from readalongs.text.util import get_lang_attrib, is_do_not_align


def tokenize_xml_in_place(xml):
Expand Down Expand Up @@ -115,8 +115,6 @@ def add_word_children(element):
def tokenize_xml(xml):
"""Returns a deep copy of xml with all words wrapped in a "w" XML element"""
xml = deepcopy(xml)
# FIXME: different langs have different normalizations, is this necessary?
unicode_normalize_xml(xml)
words = xml.xpath(".//w")
if words:
LOGGER.info("Words (<w>) already present; skipping tokenization")
Expand Down
10 changes: 0 additions & 10 deletions readalongs/text/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
from io import TextIOWrapper
from pathlib import Path
from typing import IO, Union
from unicodedata import normalize

from lxml import etree

Expand Down Expand Up @@ -275,15 +274,6 @@ def save_minimal_index_html(
)


def unicode_normalize_xml(element):
    """Recursively normalize all text in an XML tree to Unicode NFD form.

    Mutates the tree in place: element.text and every descendant's .text and
    .tail are replaced by their NFD (decomposed) equivalents.

    Args:
        element: an lxml/ElementTree element (root of the subtree to normalize)
    """
    if element.text:
        # Fix: the original called the Python-2-only builtin unicode(), which
        # raises NameError on Python 3; .text is already str under lxml.
        element.text = normalize("NFD", element.text)
    # Iterate children directly: getchildren() is deprecated/removed in
    # modern ElementTree and deprecated in lxml.
    for child in element:
        unicode_normalize_xml(child)
        if child.tail:
            child.tail = normalize("NFD", child.tail)


def parse_time(time_string: str) -> int:
"""Parse a time stamp in h/m/s(default)/ms or any combination of these units.
Expand Down
58 changes: 57 additions & 1 deletion test/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"""

import os
import re
from contextlib import redirect_stderr
from io import StringIO
from unittest import main
Expand All @@ -13,7 +14,7 @@
from basic_test_case import BasicTestCase
from sound_swallower_stub import SoundSwallowerStub

import readalongs.api as api
from readalongs import api
from readalongs.log import LOGGER


Expand Down Expand Up @@ -96,6 +97,61 @@ def test_deprecated_prepare(self):
api.prepare(self.data_dir / "ej-fra.txt", os.devnull, ("fra",))
self.assertIn("deprecated", "\n".join(cm.output))

    def test_convert_to_readalong(self):
        """convert_to_readalong() should produce the same XML structure as a
        full align() run on the same text, modulo ids, times, and ARPABET."""
        # Two sentences on page 1 / paragraph 1, one empty list for a
        # paragraph break, two empty lists for a page break.
        sentences = [
            [
                api.Token("Bonjöûr,", 0.2, 1.0),
                api.Token(" "),
                api.Token("hello", 1.0, 0.2),
                api.Token("!"),
            ],
            [api.Token("Sentence2", 4.2, 0.2), api.Token("!")],
            [],
            [api.Token("Paragraph2", 4.2, 0.2), api.Token(".")],
            [],
            [],
            [
                api.Token("("),
                api.Token('"'),
                api.Token("Page2", 5.2, 0.2),
                api.Token("."),
                api.Token('"'),
                api.Token(")"),
            ],
        ]

        readalong = api.convert_to_readalong(sentences)
        # print(readalong)

        # Make the reference by calling align with the same text and adjusting
        # things we expect to be different.
        sentences_as_text = "\n".join(
            "".join(token.text for token in sentence) for sentence in sentences
        )
        with open(self.tempdir / "sentences.txt", "w", encoding="utf8") as f:
            f.write(sentences_as_text)
        with redirect_stderr(StringIO()):
            result = api.align(
                self.tempdir / "sentences.txt",
                self.data_dir / "noise.mp3",
                self.tempdir / "output",
                ("und",),
            )
        if result[0] != 0:
            print("align error:", result)
        with open(self.tempdir / "output/www/output.readalong", encoding="utf8") as f:
            align_result = f.read()

        # align() emits ARPABET attributes that convert_to_readalong() does not.
        align_result = re.sub(r" ARPABET=\".*?\"", "", align_result)
        # Normalize attribute order and values: align's <w> has id first with
        # real times; rewrite to the time/dur-first form with placeholders.
        align_result = re.sub(
            r'<w (id=".*?") time=".*?" dur=".*?"',
            r'<w time="ttt" dur="ddd" \1',
            align_result,
        )
        # Blank out the hand-specified times in the converted output too, so
        # only structure and ids are compared.
        readalong = re.sub(r"time=\".*?\"", 'time="ttt"', readalong)
        readalong = re.sub(r"dur=\".*?\"", 'dur="ddd"', readalong)
        self.assertEqual(readalong, align_result)


if __name__ == "__main__":
main()
24 changes: 12 additions & 12 deletions test/test_dna_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ def test_tok_all_words(self):
# print(etree.tounicode(tokenized))

ref = """<document xml:lang="fra">
<s><w>Bonjour</w>! <w>Comment</w> <w>ça</w> <w>va</w>?</s>
<s><w>Voici</w> <w>une</w> <w>deuxième</w> <w>phrase</w>.</s>
<s><w>Bonjour</w>! <w>Comment</w> <w>ça</w> <w>va</w>?</s>
<s><w>Voici</w> <w>une</w> <w>deuxième</w> <w>phrase</w>.</s>
</document>"""
# print('as_txt="' + as_txt +'"')
# print('ref="' + ref +'"')
Expand All @@ -42,8 +42,8 @@ def test_tok_all_words(self):
ids_as_txt = etree.tounicode(with_ids)
# print('with ids="' + ids_as_txt + '"')
ref_with_ids = """<document xml:lang="fra">
<s id="s0"><w id="s0w0">Bonjour</w>! <w id="s0w1">Comment</w> <w id="s0w2">ça</w> <w id="s0w3">va</w>?</s>
<s id="s1"><w id="s1w0">Voici</w> <w id="s1w1">une</w> <w id="s1w2">deuxième</w> <w id="s1w3">phrase</w>.</s>
<s id="s0"><w id="s0w0">Bonjour</w>! <w id="s0w1">Comment</w> <w id="s0w2">ça</w> <w id="s0w3">va</w>?</s>
<s id="s1"><w id="s1w0">Voici</w> <w id="s1w1">une</w> <w id="s1w2">deuxième</w> <w id="s1w3">phrase</w>.</s>
</document>"""
self.assertEqual(ids_as_txt, ref_with_ids)

Expand All @@ -63,21 +63,21 @@ def test_tok_some_words(self):
# print('as_txt="' + as_txt +'"')

ref = """<document xml:lang="fra">
<p><s><w>Bonjour</w>! <w>Comment</w> <w>ça</w> <w>va</w>?</s></p>
<p do-not-align="true"><s>Bonjour! Comment ça va?</s></p>
<s do-not-align="TRUE">Voici une deuxième phrase.</s>
<s><w>Un</w> <foo do-not-align="1">mot ou deux</foo> <w></w> <w>exclure</w>.</s>
<p><s><w>Bonjour</w>! <w>Comment</w> <w>ça</w> <w>va</w>?</s></p>
<p do-not-align="true"><s>Bonjour! Comment ça va?</s></p>
<s do-not-align="TRUE">Voici une deuxième phrase.</s>
<s><w>Un</w> <foo do-not-align="1">mot ou deux</foo> <w>à</w> <w>exclure</w>.</s>
</document>"""
self.assertEqual(as_txt, ref)

with_ids = add_ids(tokenized)
ids_as_txt = etree.tounicode(with_ids)
# print('with ids="' + ids_as_txt + '"')
ref_with_ids = """<document xml:lang="fra">
<p id="p0"><s id="p0s0"><w id="p0s0w0">Bonjour</w>! <w id="p0s0w1">Comment</w> <w id="p0s0w2">ça</w> <w id="p0s0w3">va</w>?</s></p>
<p do-not-align="true"><s>Bonjour! Comment ça va?</s></p>
<s do-not-align="TRUE">Voici une deuxième phrase.</s>
<s id="s0"><w id="s0w0">Un</w> <foo do-not-align="1">mot ou deux</foo> <w id="s0w1"></w> <w id="s0w2">exclure</w>.</s>
<p id="p0"><s id="p0s0"><w id="p0s0w0">Bonjour</w>! <w id="p0s0w1">Comment</w> <w id="p0s0w2">ça</w> <w id="p0s0w3">va</w>?</s></p>
<p do-not-align="true"><s>Bonjour! Comment ça va?</s></p>
<s do-not-align="TRUE">Voici une deuxième phrase.</s>
<s id="s0"><w id="s0w0">Un</w> <foo do-not-align="1">mot ou deux</foo> <w id="s0w1">à</w> <w id="s0w2">exclure</w>.</s>
</document>"""
self.assertEqual(ids_as_txt, ref_with_ids)

Expand Down
2 changes: 1 addition & 1 deletion test/test_tokenize_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def test_mixed_lang(self):
"""
ref = """<document>
<s xml:lang="atj"><w>Kwei</w>! <w>Tan</w> <w>e</w> <w>ici</w> <w>matisihin</w>?</s>
<s xml:lang="fra"><w>Bonjour</w>! <w>Comment</w> <w>ça</w> <w>va</w>?</s>
<s xml:lang="fra"><w>Bonjour</w>! <w>Comment</w> <w>ça</w> <w>va</w>?</s>
</document>"""
xml = parse_xml(txt)
with redirect_stderr(StringIO()):
Expand Down
2 changes: 1 addition & 1 deletion test/test_web_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ def test_logs(self):
response = self.API_CLIENT.post("/api/v1/assemble", json=request)
content = response.json()
# print("Content", content)
self.assertIn('Could not g2p "ña" as French', content["log"])
self.assertIn('Could not g2p "ña" as French', content["log"])

def test_debug(self):
# Test the assemble endpoint with debug mode on
Expand Down

0 comments on commit c6d0f64

Please sign in to comment.