diff --git a/readalongs/_version.py b/readalongs/_version.py
index a17ef292..a4438160 100644
--- a/readalongs/_version.py
+++ b/readalongs/_version.py
@@ -1,3 +1,3 @@
-VERSION = "1.1.0"
+VERSION = "1.2.0"
 READALONG_FILE_FORMAT_VERSION = "1.2"
diff --git a/readalongs/align.py b/readalongs/align.py
index 88df880a..9c2764b7 100644
--- a/readalongs/align.py
+++ b/readalongs/align.py
@@ -1191,7 +1191,7 @@ def convert_to_xhtml(tokenized_xml, title="Book"):
     """
 
-def create_ras_from_text(lines: Iterable[str], text_languages=Sequence[str]) -> str:
+def create_ras_from_text(lines: Iterable[str], text_languages: Sequence[str]) -> str:
     """Create input xml in ReadAlong XML format (see static/read-along-1.2.dtd)
     Uses the line sequence to infer paragraph and sentence structure from plain text:
     Assumes a double blank line marks a page break, and a single blank line
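Note on the align.py change above: the old signature used "=" where it meant ":", which silently made the typing.Sequence[str] object itself the parameter's default value instead of annotating the parameter's type. A standalone sketch of the difference (illustrative only, not code from this PR):

    from typing import Sequence

    def buggy(lines, text_languages=Sequence[str]):
        # "=" sets a default: callers omitting the argument get the typing object
        return text_languages

    def fixed(lines, text_languages: Sequence[str]):
        # ":" annotates the parameter; the argument is now effectively required
        return text_languages

    print(buggy([]))                  # typing.Sequence[str], a surprising value
    print(fixed([], ("fra", "eng")))  # ('fra', 'eng')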
diff --git a/readalongs/api.py b/readalongs/api.py
index 7ae413f3..8ca72b6d 100644
--- a/readalongs/api.py
+++ b/readalongs/api.py
@@ -31,29 +31,48 @@ class like pathlib.Path. Warning: don't just use "/some/path/config.json"
 you come accross such an exception and you believe the problem is not in your own code.
  - log: any logging messages issued during execution
+
+Additional API function:
+
+convert_to_readalong(sentences: Sequence[Sequence[Token]], language: Sequence[str]) -> str:
+    convert a list of sentences into a readalong XML string ready to print to file.
+    Just like align and make_xml, this function expects a blank line (empty list) to
+    make a paragraph break, and two consecutive blank lines to make a page break.
+    Unlike the other functions here, this function is not a wrapper around the CLI;
+    it just returns the string, not a status tuple.
 """
 
 import io
 import logging
 
-from typing import Optional, Tuple
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Optional, Sequence, Tuple, Union
 
 import click
 
 from readalongs import cli
+from readalongs.align import create_ras_from_text
 from readalongs.log import LOGGER
+from readalongs.text.add_ids_to_xml import add_ids
+from readalongs.text.util import parse_xml
 from readalongs.util import JoinerCallbackForClick, get_langs_deferred
 
 
 def align(
-    textfile, audiofile, output_base, language=(), output_formats=(), **kwargs
+    textfile: Union[str, Path],
+    audiofile: Union[str, Path],
+    output_base: Union[str, Path],
+    language: Sequence[str] = (),
+    output_formats: Sequence[str] = (),
+    **kwargs
 ) -> Tuple[int, Optional[Exception], str]:
     """Run the "readalongs align" command from within a Python script.
 
     Args:
-        textfile (str | Path): input text file (XML or plain text)
-        audiofile (str | Path): input audio file (format supported by ffmpeg)
-        output_base (str | Path): basename for output files
-        language (List[str]): Specify only of textfile is plain text;
+        textfile: input text file (XML or plain text)
+        audiofile: input audio file (format supported by ffmpeg)
+        output_base: basename for output files
+        language: Specify only if textfile is plain text;
             list of languages for g2p and g2p cascade
         save_temps (bool): Optional; whether to save temporary files
 
@@ -100,14 +119,17 @@ def align(
 
 
 def make_xml(
-    plaintextfile, xmlfile, language, **kwargs
+    plaintextfile: Union[str, Path],
+    xmlfile: Union[str, Path],
+    language: Sequence[str],
+    **kwargs
 ) -> Tuple[int, Optional[Exception], str]:
     """Run the "readalongs make-xml" command from within a Python script.
 
     Args:
-        plaintextfile (str | Path): input plain text file
-        xmlfile (str | Path): output XML file
-        language (List[str]): list of languages for g2p and g2p cascade
+        plaintextfile: input plain text file
+        xmlfile: output XML file
+        language: list of languages for g2p and g2p cascade
 
     Run "readalongs make-xml -h" or consult
     https://readalong-studio.readthedocs.io/en/latest/cli-ref.html#readalongs-make-xml
@@ -116,11 +138,13 @@ def make_xml(
     Returns: (status, exception, log_text)
     """
     # plaintextfile is not a file object if passed from click
+
     plaintextfile = (
         plaintextfile.name
         if isinstance(plaintextfile, click.utils.LazyFile)
         else plaintextfile
     )
+    xmlfile = str(xmlfile) if isinstance(xmlfile, Path) else xmlfile
     logging_stream = io.StringIO()
     logging_handler = logging.StreamHandler(logging_stream)
     try:
@@ -157,3 +181,81 @@ def prepare(*args, **kwargs):
         "readalongs.api.prepare() is deprecated. Please use make_xml() instead."
     )
     return make_xml(*args, **kwargs)
+
+
+@dataclass
+class Token:
+    """A token in a readalong: a word has a time and dur, a non-word does not."""
+
+    text: str
+    time: Optional[float]
+    dur: Optional[float]
+    is_word: bool
+
+    def __init__(
+        self,
+        text: str,
+        time: Optional[float] = None,
+        dur: Optional[float] = None,
+        is_word: Optional[bool] = None,
+    ):
+        """Create a word token:
+            t = Token("asdf", time=1.3, dur=.34) or t = Token("asdf", 1.3, .34)
+        Create a non-word token (e.g., punctuation, spacing):
+            t = Token(", ")
+        """
+        self.text = text
+        self.time = time
+        self.dur = dur
+        self.is_word = is_word if is_word is not None else bool(time is not None)
+
+
+def convert_to_readalong(
+    sentences: Sequence[Sequence[Token]],
+    language: Sequence[str] = ("und",),
+) -> str:
+    """Convert a list of sentences/paragraphs/pages of tokens into a readalong XML string.
+
+    Args:
+        sentences: a list of sentences, each of which is a list of Token objects
+            Paragraph breaks are marked by an empty sentence (i.e., an empty list)
+            Page breaks are marked by two empty sentences in a row
+        language: list of languages to declare at the top of the readalong
+            (has no functional effect since g2p is not applied; it's only metadata)
+
+    Returns:
+        str: the readalong XML string, ready to print to a .readalong file
+    """
+    from lxml import etree
+
+    xml_text = create_ras_from_text(
+        ["".join(token.text for token in sentence) for sentence in sentences],
+        language,
+    )
+    xml = parse_xml(xml_text)
+    filtered_sentences = [sentence for sentence in sentences if sentence]
+    for sentence, sentence_xml in zip(filtered_sentences, xml.findall(".//s")):
+        sentence_xml.text = ""
+        for token in sentence:
+            if token.is_word:
+                w = etree.Element("w")
+                w.text = token.text
+                w.attrib["time"] = str(token.time)
+                w.attrib["dur"] = str(token.dur)
+                sentence_xml.append(w)
+            else:
+                if len(sentence_xml):  # if it has children
+                    if not sentence_xml[-1].tail:
+                        sentence_xml[-1].tail = ""
+                    sentence_xml[-1].tail += token.text
+                else:
+                    sentence_xml.text += token.text
+
+    xml = add_ids(xml)
+    xml_text = etree.tostring(
+        xml,
+        encoding="utf-8",
+        xml_declaration=True,
+    ).decode("utf8")
+
+    return xml_text + "\n"
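To make the new API surface concrete, here is a minimal usage sketch of Token and convert_to_readalong as documented above (the timings and the output file name are invented for illustration):

    from readalongs.api import Token, convert_to_readalong

    sentences = [
        [Token("Bonjour", 0.2, 1.0), Token("!")],   # words carry time and dur
        [Token("Comment", 1.2, 0.4), Token(" "), Token("ça", 1.6, 0.2), Token("?")],
        [],                                         # one empty list: paragraph break
        [Token("Suite", 2.0, 0.5), Token(".")],
        [],                                         # two empty lists in a row:
        [],                                         # page break
        [Token("Page2", 3.0, 0.5), Token(".")],
    ]

    # Returns the XML as a string; unlike align/make_xml, no (status, exception, log).
    xml_string = convert_to_readalong(sentences, language=("fra",))
    with open("demo.readalong", "w", encoding="utf8") as f:
        f.write(xml_string)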
diff --git a/readalongs/text/tokenize_xml.py b/readalongs/text/tokenize_xml.py
index b53a9227..9de19ccb 100644
--- a/readalongs/text/tokenize_xml.py
+++ b/readalongs/text/tokenize_xml.py
@@ -32,7 +32,7 @@ from lxml import etree
 
 from readalongs.log import LOGGER
-from readalongs.text.util import get_lang_attrib, is_do_not_align, unicode_normalize_xml
+from readalongs.text.util import get_lang_attrib, is_do_not_align
 
 
 def tokenize_xml_in_place(xml):
@@ -115,8 +115,6 @@ def add_word_children(element):
 def tokenize_xml(xml):
     """Returns a deep copy of xml with all words wrapped in a "w" XML element"""
     xml = deepcopy(xml)
-    # FIXME: different langs have different normalizations, is this necessary?
-    unicode_normalize_xml(xml)
     words = xml.xpath(".//w")
     if words:
         LOGGER.info("Words (<w>) already present; skipping tokenization")
diff --git a/readalongs/text/util.py b/readalongs/text/util.py
index 68cf2980..3c3c6780 100644
--- a/readalongs/text/util.py
+++ b/readalongs/text/util.py
@@ -16,7 +16,6 @@ from io import TextIOWrapper
 from pathlib import Path
 from typing import IO, Union
-from unicodedata import normalize
 
 from lxml import etree
@@ -275,15 +274,6 @@ def save_minimal_index_html(
     )
 
 
-def unicode_normalize_xml(element):
-    if element.text:
-        element.text = normalize("NFD", unicode(element.text))
-    for child in element.getchildren():
-        unicode_normalize_xml(child)
-        if child.tail:
-            child.tail = normalize("NFD", unicode(child.tail))
-
-
 def parse_time(time_string: str) -> int:
     """Parse a time stamp in h/m/s(default)/ms or any combination of these units.
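The removed unicode_normalize_xml helper forced every XML text node into NFD (decomposed) form; with it gone, text is left in the form it arrived in, normally NFC. This is why the expected strings in the test hunks below change without any visible difference: the old references were NFD, the new ones NFC. A standalone illustration:

    from unicodedata import normalize

    nfc = "ça va"                        # "ç" is one precomposed code point
    nfd = normalize("NFD", nfc)          # "c" followed by U+0327 COMBINING CEDILLA
    print(nfc == nfd)                    # False: identical on screen, different strings
    print(len(nfc), len(nfd))            # 5 6
    print(normalize("NFC", nfd) == nfc)  # True: normalization reconciles them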
diff --git a/test/test_api.py b/test/test_api.py
index 34aa0ae4..7499de13 100755
--- a/test/test_api.py
+++ b/test/test_api.py
@@ -5,6 +5,7 @@
 """
 
 import os
+import re
 from contextlib import redirect_stderr
 from io import StringIO
 from unittest import main
@@ -13,7 +14,7 @@
 from basic_test_case import BasicTestCase
 from sound_swallower_stub import SoundSwallowerStub
 
-import readalongs.api as api
+from readalongs import api
 from readalongs.log import LOGGER
 
@@ -96,6 +97,61 @@ def test_deprecated_prepare(self):
             api.prepare(self.data_dir / "ej-fra.txt", os.devnull, ("fra",))
         self.assertIn("deprecated", "\n".join(cm.output))
 
+    def test_convert_to_readalong(self):
+        sentences = [
+            [
+                api.Token("Bonjöûr,", 0.2, 1.0),
+                api.Token(" "),
+                api.Token("hello", 1.0, 0.2),
+                api.Token("!"),
+            ],
+            [api.Token("Sentence2", 4.2, 0.2), api.Token("!")],
+            [],
+            [api.Token("Paragraph2", 4.2, 0.2), api.Token(".")],
+            [],
+            [],
+            [
+                api.Token("("),
+                api.Token('"'),
+                api.Token("Page2", 5.2, 0.2),
+                api.Token("."),
+                api.Token('"'),
+                api.Token(")"),
+            ],
+        ]
+
+        readalong = api.convert_to_readalong(sentences)
+        # print(readalong)
+
+        # Make the reference by calling align with the same text and adjusting
+        # things we expect to be different.
+        sentences_as_text = "\n".join(
+            "".join(token.text for token in sentence) for sentence in sentences
+        )
+        with open(self.tempdir / "sentences.txt", "w", encoding="utf8") as f:
+            f.write(sentences_as_text)
+        with redirect_stderr(StringIO()):
+            result = api.align(
+                self.tempdir / "sentences.txt",
+                self.data_dir / "noise.mp3",
+                self.tempdir / "output",
+                ("und",),
+            )
+        if result[0] != 0:
+            print("align error:", result)
+        with open(self.tempdir / "output/www/output.readalong", encoding="utf8") as f:
+            align_result = f.read()
+
+        align_result = re.sub(r" ARPABET=\".*?\"", "", align_result)
+        align_result = re.sub(
+            r' 
-Bonjour! Comment ça va?
-Voici une deuxième phrase.
+Bonjour! Comment ça va?
+Voici une deuxième phrase.
 """
         # print('as_txt="' + as_txt +'"')
         # print('ref="' + ref +'"')
@@ -42,8 +42,8 @@ def test_tok_all_words(self):
         ids_as_txt = etree.tounicode(with_ids)
         # print('with ids="' + ids_as_txt + '"')
         ref_with_ids = """
-Bonjour! Comment ça va?
-Voici une deuxième phrase.
+Bonjour! Comment ça va?
+Voici une deuxième phrase.
 """
         self.assertEqual(ids_as_txt, ref_with_ids)
@@ -63,10 +63,10 @@ def test_tok_some_words(self):
         # print('as_txt="' + as_txt +'"')
         ref = """
-Bonjour! Comment ça va?
-Bonjour! Comment ça va?
-Voici une deuxième phrase.
-Un mot ou deux à exclure.
+Bonjour! Comment ça va?
+Bonjour! Comment ça va?
+Voici une deuxième phrase.
+Un mot ou deux à exclure.
 """
         self.assertEqual(as_txt, ref)
@@ -74,10 +74,10 @@
         ids_as_txt = etree.tounicode(with_ids)
         # print('with ids="' + ids_as_txt + '"')
         ref_with_ids = """
-Bonjour! Comment ça va?
-Bonjour! Comment ça va?
-Voici une deuxième phrase.
-Un mot ou deux à exclure.
+Bonjour! Comment ça va?
+Bonjour! Comment ça va?
+Voici une deuxième phrase.
+Un mot ou deux à exclure.
 """
         self.assertEqual(ids_as_txt, ref_with_ids)
diff --git a/test/test_tokenize_xml.py b/test/test_tokenize_xml.py
index 744ce499..782488c4 100755
--- a/test/test_tokenize_xml.py
+++ b/test/test_tokenize_xml.py
@@ -39,7 +39,7 @@ def test_mixed_lang(self):
 """
         ref = """
 Kwei! Tan e ici matisihin?
-Bonjour! Comment ça va?
+Bonjour! Comment ça va?
 """
         xml = parse_xml(txt)
         with redirect_stderr(StringIO()):
diff --git a/test/test_web_api.py b/test/test_web_api.py
index bcb39720..ab829d7a 100755
--- a/test/test_web_api.py
+++ b/test/test_web_api.py
@@ -188,7 +188,7 @@ def test_logs(self):
         response = self.API_CLIENT.post("/api/v1/assemble", json=request)
         content = response.json()
         # print("Content", content)
-        self.assertIn('Could not g2p "ña" as French', content["log"])
+        self.assertIn('Could not g2p "ña" as French', content["log"])
 
     def test_debug(self):
         # Test the assemble endpoint with debug mode on