Skip to content

Commit

Permalink
Merge pull request #253 from ReadAlongs/dev.ej/convert_api
Browse files Browse the repository at this point in the history
Add a convert_to_readalong() function for use by other projects, e.g., EveryVoice
  • Loading branch information
joanise authored Dec 10, 2024
2 parents 2afc316 + eb1e7b3 commit c6d0f64
Show file tree
Hide file tree
Showing 9 changed files with 186 additions and 40 deletions.
2 changes: 1 addition & 1 deletion readalongs/_version.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
VERSION = "1.1.0"
VERSION = "1.2.0"

READALONG_FILE_FORMAT_VERSION = "1.2"
2 changes: 1 addition & 1 deletion readalongs/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -1191,7 +1191,7 @@ def convert_to_xhtml(tokenized_xml, title="Book"):
"""


def create_ras_from_text(lines: Iterable[str], text_languages=Sequence[str]) -> str:
def create_ras_from_text(lines: Iterable[str], text_languages: Sequence[str]) -> str:
"""Create input xml in ReadAlong XML format (see static/read-along-1.2.dtd)
Uses the line sequence to infer paragraph and sentence structure from plain text:
Assumes a double blank line marks a page break, and a single blank line
Expand Down
122 changes: 112 additions & 10 deletions readalongs/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,29 +31,48 @@ class like pathlib.Path. Warning: don't just use "/some/path/config.json"
                you come across such an exception and you believe the
problem is not in your own code.
- log: any logging messages issued during execution
Additional API function:
convert_to_readalong(sentences: Sequence[Sequence[Token]], language: Sequence[str]) -> str:
convert a list of sentences into a readalong XML string ready to print to file.
Just like align and make_xml, this function expects a black line (empty list) to
make a paragraph break, and two consecutive blank lines to make a page break.
Unlike the other functions here, this function is not a wrapper around the CLI and
it just returns the string, non status.
"""

import io
import logging
from typing import Optional, Tuple
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Sequence, Tuple, Union

import click

from readalongs import cli
from readalongs.align import create_ras_from_text
from readalongs.log import LOGGER
from readalongs.text.add_ids_to_xml import add_ids
from readalongs.text.util import parse_xml
from readalongs.util import JoinerCallbackForClick, get_langs_deferred


def align(
textfile, audiofile, output_base, language=(), output_formats=(), **kwargs
textfile: Union[str, Path],
audiofile: Union[str, Path],
output_base: Union[str, Path],
language: Sequence[str] = (),
output_formats: Sequence[str] = (),
**kwargs
) -> Tuple[int, Optional[Exception], str]:
"""Run the "readalongs align" command from within a Python script.
Args:
textfile (str | Path): input text file (XML or plain text)
audiofile (str | Path): input audio file (format supported by ffmpeg)
output_base (str | Path): basename for output files
language (List[str]): Specify only of textfile is plain text;
textfile: input text file (XML or plain text)
audiofile: input audio file (format supported by ffmpeg)
output_base: basename for output files
language: Specify only if textfile is plain text;
list of languages for g2p and g2p cascade
save_temps (bool): Optional; whether to save temporary files
Expand Down Expand Up @@ -100,14 +119,17 @@ def align(


def make_xml(
plaintextfile, xmlfile, language, **kwargs
plaintextfile: Union[str, Path],
xmlfile: Union[str, Path],
language: Sequence[str],
**kwargs
) -> Tuple[int, Optional[Exception], str]:
"""Run the "readalongs make-xml" command from within a Python script.
Args:
plaintextfile (str | Path): input plain text file
xmlfile (str | Path): output XML file
language (List[str]): list of languages for g2p and g2p cascade
plaintextfile: input plain text file
xmlfile: output XML file
language: list of languages for g2p and g2p cascade
Run "readalongs make-xml -h" or consult
https://readalong-studio.readthedocs.io/en/latest/cli-ref.html#readalongs-make-xml
Expand All @@ -116,11 +138,13 @@ def make_xml(
Returns: (status, exception, log_text)
"""
# plaintextfile is not a file object if passed from click

plaintextfile = (
plaintextfile.name
if isinstance(plaintextfile, click.utils.LazyFile)
else plaintextfile
)
xmlfile = str(xmlfile) if isinstance(xmlfile, Path) else xmlfile
logging_stream = io.StringIO()
logging_handler = logging.StreamHandler(logging_stream)
try:
Expand Down Expand Up @@ -157,3 +181,81 @@ def prepare(*args, **kwargs):
"readalongs.api.prepare() is deprecated. Please use make_xml() instead."
)
return make_xml(*args, **kwargs)


@dataclass
class Token:
    """A token in a readalong: a word has a time and dur, a non-word does not.

    Create a word token:
        t = Token("asdf", time=1.3, dur=.34) or t = Token("asdf", 1.3, .34)
    Create a non-word token (e.g., punctuation, spacing):
        t = Token(", ")
    """

    text: str  # the token's text content
    time: Optional[float] = None  # start time in seconds (word tokens only)
    dur: Optional[float] = None  # duration in seconds (word tokens only)
    # True for word tokens; when not given, inferred from the presence of `time`.
    is_word: Optional[bool] = None

    def __post_init__(self):
        # A token created with a start time is assumed to be a word unless the
        # caller explicitly said otherwise; keeps the generated __init__
        # signature identical to the previous hand-written one.
        if self.is_word is None:
            self.is_word = self.time is not None


def convert_to_readalong(
    sentences: Sequence[Sequence[Token]],
    language: Sequence[str] = ("und",),
) -> str:
    """Convert a list of sentences/paragraphs/pages of tokens into a readalong XML string.

    Args:
        sentences: a list of sentences, each of which is a list of Token objects
            Paragraph breaks are marked by an empty sentence (i.e., an empty list)
            Page breaks are marked by two empty sentences in a row
        language: list of languages to declare at the top of the readalong
            (has no functional effect since g2p is not applied, it's only metadata)

    Returns:
        str: the readalong XML string, ready to print to a .readalong file
    """
    from lxml import etree

    # Build the skeleton document from the plain text of each sentence; empty
    # sentences become the paragraph/page breaks described above.
    raw_xml = create_ras_from_text(
        ["".join(tok.text for tok in sent) for sent in sentences],
        language,
    )
    root = parse_xml(raw_xml)

    # The skeleton only has <s> elements for non-empty sentences, so pair them up.
    non_empty = [sent for sent in sentences if sent]
    for tokens, s_el in zip(non_empty, root.findall(".//s")):
        s_el.text = ""
        for tok in tokens:
            if not tok.is_word:
                # Non-word text hangs off the tail of the last <w>, or off the
                # sentence's own text when no <w> has been emitted yet.
                if len(s_el):  # if it has children
                    last_child = s_el[-1]
                    if not last_child.tail:
                        last_child.tail = ""
                    last_child.tail += tok.text
                else:
                    s_el.text += tok.text
            else:
                w_el = etree.Element("w")
                w_el.text = tok.text
                w_el.attrib["time"] = str(tok.time)
                w_el.attrib["dur"] = str(tok.dur)
                s_el.append(w_el)

    root = add_ids(root)
    serialized = etree.tostring(
        root,
        encoding="utf-8",
        xml_declaration=True,
    ).decode("utf8")

    return serialized + "\n"
4 changes: 1 addition & 3 deletions readalongs/text/tokenize_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
from lxml import etree

from readalongs.log import LOGGER
from readalongs.text.util import get_lang_attrib, is_do_not_align, unicode_normalize_xml
from readalongs.text.util import get_lang_attrib, is_do_not_align


def tokenize_xml_in_place(xml):
Expand Down Expand Up @@ -115,8 +115,6 @@ def add_word_children(element):
def tokenize_xml(xml):
"""Returns a deep copy of xml with all words wrapped in a "w" XML element"""
xml = deepcopy(xml)
# FIXME: different langs have different normalizations, is this necessary?
unicode_normalize_xml(xml)
words = xml.xpath(".//w")
if words:
LOGGER.info("Words (<w>) already present; skipping tokenization")
Expand Down
10 changes: 0 additions & 10 deletions readalongs/text/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
from io import TextIOWrapper
from pathlib import Path
from typing import IO, Union
from unicodedata import normalize

from lxml import etree

Expand Down Expand Up @@ -275,15 +274,6 @@ def save_minimal_index_html(
)


def unicode_normalize_xml(element):
    """Recursively normalize all text in an XML tree to Unicode NFD form.

    Mutates the tree in place: element.text and every descendant's .text and
    .tail are replaced by their NFD (decomposed) equivalents.

    Args:
        element: an lxml/ElementTree element (root of the subtree to normalize)
    """
    if element.text:
        # Fix: the original called the Python-2-only builtin unicode(), which
        # raises NameError on Python 3; .text is already str under lxml.
        element.text = normalize("NFD", element.text)
    # Iterate children directly: getchildren() is deprecated/removed in
    # modern ElementTree and deprecated in lxml.
    for child in element:
        unicode_normalize_xml(child)
        if child.tail:
            child.tail = normalize("NFD", child.tail)


def parse_time(time_string: str) -> int:
"""Parse a time stamp in h/m/s(default)/ms or any combination of these units.
Expand Down
58 changes: 57 additions & 1 deletion test/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"""

import os
import re
from contextlib import redirect_stderr
from io import StringIO
from unittest import main
Expand All @@ -13,7 +14,7 @@
from basic_test_case import BasicTestCase
from sound_swallower_stub import SoundSwallowerStub

import readalongs.api as api
from readalongs import api
from readalongs.log import LOGGER


Expand Down Expand Up @@ -96,6 +97,61 @@ def test_deprecated_prepare(self):
api.prepare(self.data_dir / "ej-fra.txt", os.devnull, ("fra",))
self.assertIn("deprecated", "\n".join(cm.output))

    def test_convert_to_readalong(self):
        """convert_to_readalong() should produce the same XML structure as a
        full align() run on the same text, modulo ids, times, and ARPABET."""
        # Two sentences on page 1 / paragraph 1, one empty list for a
        # paragraph break, two empty lists for a page break.
        sentences = [
            [
                api.Token("Bonjöûr,", 0.2, 1.0),
                api.Token(" "),
                api.Token("hello", 1.0, 0.2),
                api.Token("!"),
            ],
            [api.Token("Sentence2", 4.2, 0.2), api.Token("!")],
            [],
            [api.Token("Paragraph2", 4.2, 0.2), api.Token(".")],
            [],
            [],
            [
                api.Token("("),
                api.Token('"'),
                api.Token("Page2", 5.2, 0.2),
                api.Token("."),
                api.Token('"'),
                api.Token(")"),
            ],
        ]

        readalong = api.convert_to_readalong(sentences)
        # print(readalong)

        # Make the reference by calling align with the same text and adjusting
        # things we expect to be different.
        sentences_as_text = "\n".join(
            "".join(token.text for token in sentence) for sentence in sentences
        )
        with open(self.tempdir / "sentences.txt", "w", encoding="utf8") as f:
            f.write(sentences_as_text)
        with redirect_stderr(StringIO()):
            result = api.align(
                self.tempdir / "sentences.txt",
                self.data_dir / "noise.mp3",
                self.tempdir / "output",
                ("und",),
            )
        if result[0] != 0:
            print("align error:", result)
        with open(self.tempdir / "output/www/output.readalong", encoding="utf8") as f:
            align_result = f.read()

        # align() emits ARPABET attributes that convert_to_readalong() does not.
        align_result = re.sub(r" ARPABET=\".*?\"", "", align_result)
        # Normalize attribute order and values: align's <w> has id first with
        # real times; rewrite to the time/dur-first form with placeholders.
        align_result = re.sub(
            r'<w (id=".*?") time=".*?" dur=".*?"',
            r'<w time="ttt" dur="ddd" \1',
            align_result,
        )
        # Blank out the hand-specified times in the converted output too, so
        # only structure and ids are compared.
        readalong = re.sub(r"time=\".*?\"", 'time="ttt"', readalong)
        readalong = re.sub(r"dur=\".*?\"", 'dur="ddd"', readalong)
        self.assertEqual(readalong, align_result)


if __name__ == "__main__":
main()
24 changes: 12 additions & 12 deletions test/test_dna_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ def test_tok_all_words(self):
# print(etree.tounicode(tokenized))

ref = """<document xml:lang="fra">
<s><w>Bonjour</w>! <w>Comment</w> <w>ça</w> <w>va</w>?</s>
<s><w>Voici</w> <w>une</w> <w>deuxième</w> <w>phrase</w>.</s>
<s><w>Bonjour</w>! <w>Comment</w> <w>ça</w> <w>va</w>?</s>
<s><w>Voici</w> <w>une</w> <w>deuxième</w> <w>phrase</w>.</s>
</document>"""
# print('as_txt="' + as_txt +'"')
# print('ref="' + ref +'"')
Expand All @@ -42,8 +42,8 @@ def test_tok_all_words(self):
ids_as_txt = etree.tounicode(with_ids)
# print('with ids="' + ids_as_txt + '"')
ref_with_ids = """<document xml:lang="fra">
<s id="s0"><w id="s0w0">Bonjour</w>! <w id="s0w1">Comment</w> <w id="s0w2">ça</w> <w id="s0w3">va</w>?</s>
<s id="s1"><w id="s1w0">Voici</w> <w id="s1w1">une</w> <w id="s1w2">deuxième</w> <w id="s1w3">phrase</w>.</s>
<s id="s0"><w id="s0w0">Bonjour</w>! <w id="s0w1">Comment</w> <w id="s0w2">ça</w> <w id="s0w3">va</w>?</s>
<s id="s1"><w id="s1w0">Voici</w> <w id="s1w1">une</w> <w id="s1w2">deuxième</w> <w id="s1w3">phrase</w>.</s>
</document>"""
self.assertEqual(ids_as_txt, ref_with_ids)

Expand All @@ -63,21 +63,21 @@ def test_tok_some_words(self):
# print('as_txt="' + as_txt +'"')

ref = """<document xml:lang="fra">
<p><s><w>Bonjour</w>! <w>Comment</w> <w>ça</w> <w>va</w>?</s></p>
<p do-not-align="true"><s>Bonjour! Comment ça va?</s></p>
<s do-not-align="TRUE">Voici une deuxième phrase.</s>
<s><w>Un</w> <foo do-not-align="1">mot ou deux</foo> <w></w> <w>exclure</w>.</s>
<p><s><w>Bonjour</w>! <w>Comment</w> <w>ça</w> <w>va</w>?</s></p>
<p do-not-align="true"><s>Bonjour! Comment ça va?</s></p>
<s do-not-align="TRUE">Voici une deuxième phrase.</s>
<s><w>Un</w> <foo do-not-align="1">mot ou deux</foo> <w>à</w> <w>exclure</w>.</s>
</document>"""
self.assertEqual(as_txt, ref)

with_ids = add_ids(tokenized)
ids_as_txt = etree.tounicode(with_ids)
# print('with ids="' + ids_as_txt + '"')
ref_with_ids = """<document xml:lang="fra">
<p id="p0"><s id="p0s0"><w id="p0s0w0">Bonjour</w>! <w id="p0s0w1">Comment</w> <w id="p0s0w2">ça</w> <w id="p0s0w3">va</w>?</s></p>
<p do-not-align="true"><s>Bonjour! Comment ça va?</s></p>
<s do-not-align="TRUE">Voici une deuxième phrase.</s>
<s id="s0"><w id="s0w0">Un</w> <foo do-not-align="1">mot ou deux</foo> <w id="s0w1"></w> <w id="s0w2">exclure</w>.</s>
<p id="p0"><s id="p0s0"><w id="p0s0w0">Bonjour</w>! <w id="p0s0w1">Comment</w> <w id="p0s0w2">ça</w> <w id="p0s0w3">va</w>?</s></p>
<p do-not-align="true"><s>Bonjour! Comment ça va?</s></p>
<s do-not-align="TRUE">Voici une deuxième phrase.</s>
<s id="s0"><w id="s0w0">Un</w> <foo do-not-align="1">mot ou deux</foo> <w id="s0w1">à</w> <w id="s0w2">exclure</w>.</s>
</document>"""
self.assertEqual(ids_as_txt, ref_with_ids)

Expand Down
2 changes: 1 addition & 1 deletion test/test_tokenize_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def test_mixed_lang(self):
"""
ref = """<document>
<s xml:lang="atj"><w>Kwei</w>! <w>Tan</w> <w>e</w> <w>ici</w> <w>matisihin</w>?</s>
<s xml:lang="fra"><w>Bonjour</w>! <w>Comment</w> <w>ça</w> <w>va</w>?</s>
<s xml:lang="fra"><w>Bonjour</w>! <w>Comment</w> <w>ça</w> <w>va</w>?</s>
</document>"""
xml = parse_xml(txt)
with redirect_stderr(StringIO()):
Expand Down
2 changes: 1 addition & 1 deletion test/test_web_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ def test_logs(self):
response = self.API_CLIENT.post("/api/v1/assemble", json=request)
content = response.json()
# print("Content", content)
self.assertIn('Could not g2p "ña" as French', content["log"])
self.assertIn('Could not g2p "ña" as French', content["log"])

def test_debug(self):
# Test the assemble endpoint with debug mode on
Expand Down

0 comments on commit c6d0f64

Please sign in to comment.