Skip to content

Commit

Permalink
feat: add convert_to_offline_html() to api.py
Browse files Browse the repository at this point in the history
  • Loading branch information
joanise committed Dec 13, 2024
1 parent b3f83fd commit 975f3c4
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 27 deletions.
42 changes: 39 additions & 3 deletions readalongs/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,18 @@ class like pathlib.Path. Warning: don't just use "/some/path/config.json"
import io
import logging
import os
import tempfile
from dataclasses import dataclass
from typing import Optional, Sequence, Tuple, Union

import click
from lxml import etree

from readalongs import cli
from readalongs.align import create_ras_from_text
from readalongs.log import LOGGER
from readalongs.text.add_ids_to_xml import add_ids
from readalongs.text.make_package import create_web_component_html
from readalongs.text.util import parse_xml
from readalongs.util import JoinerCallbackForClick, get_langs_deferred

Expand Down Expand Up @@ -222,12 +225,11 @@ def convert_to_readalong(
Page breaks are marked by two empty sentences in a row
language: list of languages to declare at the top of the readalong
(has no functional effect since g2p is not applied, it's only metadata)
offline_html: if True, return the full offline HTML instead of just the .readlong XML
Returns:
str: the readalong XML string, ready to print to a .readalong file
str: the readalong XML or HTML file contents, ready to print to .readalong or .html
"""
from lxml import etree

xml_text = create_ras_from_text(
["".join(token.text for token in sentence) for sentence in sentences],
language,
Expand Down Expand Up @@ -259,3 +261,37 @@ def convert_to_readalong(
).decode("utf8")

return xml_text + "\n"


def convert_to_offline_html(
    sentences: Sequence[Sequence[Token]],
    audio_file_name: Union[str, os.PathLike],
    language: Sequence[str] = ("und",),
) -> str:
    """Convert a list of sentences/paragraphs/pages of tokens, with corresponding audio,
    into a readalong Offline HTML

    Args:
        sentences: a list of sentences, each of which is a list of Token objects
            Paragraph breaks are marked by an empty sentence (i.e., an empty list)
            Page breaks are marked by two empty sentences in a row
        audio_file_name: the name of the audio file to be used in the offline HTML
        language: list of languages to declare at the top of the readalong
            (has no functional effect since g2p is not applied, it's only metadata)

    Returns:
        str: the readalong Offline HTML file contents, ready to print to a .html file
    """
    readalong_xml = convert_to_readalong(sentences, language)

    # create_web_component_html() is handed a file path, so the XML has to go
    # through a temporary file. delete=False lets us close it and have it
    # reopened by name; we remove it ourselves below.
    # Created outside the try so the cleanup never references an unbound name
    # if the creation itself fails.
    readalong_file = tempfile.NamedTemporaryFile(
        "w", encoding="utf8", delete=False, suffix=".readalong"
    )
    try:
        # The with-block guarantees the handle is closed even if write() fails,
        # so the unlink below is not attempted on a still-open file.
        with readalong_file:
            readalong_file.write(readalong_xml)
        return create_web_component_html(readalong_file.name, audio_file_name)
    finally:
        os.unlink(readalong_file.name)
65 changes: 41 additions & 24 deletions test/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,36 +97,38 @@ def test_deprecated_prepare(self):
api.prepare(self.data_dir / "ej-fra.txt", os.devnull, ("fra",))
self.assertIn("deprecated", "\n".join(cm.output))

# Shared input for the conversion tests (test_convert_to_readalong and
# test_convert_to_offline_html): a list of sentences, each a list of tokens.
# Per the api docstrings, a single empty list marks a paragraph break and two
# empty lists in a row mark a page break.
# NOTE(review): the numeric Token arguments are presumably timing values
# (start time, duration) for spoken words, while text-only tokens are
# unaligned punctuation/spacing -- confirm against api.Token.
sentences_to_convert = [
[
api.Token("Bonjöûr,", 0.2, 1.0),
api.Token(" "),
api.Token("hello", 1.0, 0.2),
api.Token("!"),
],
[api.Token("Sentence2", 4.2, 0.2), api.Token("!")],
[],
[api.Token("Paragraph2", 4.2, 0.2), api.Token(".")],
[],
[],
[
api.Token("("),
api.Token('"'),
api.Token("Page2", 5.2, 0.2),
api.Token("."),
api.Token('"'),
api.Token(")"),
],
]

def test_convert_to_readalong(self):
sentences = [
[
api.Token("Bonjöûr,", 0.2, 1.0),
api.Token(" "),
api.Token("hello", 1.0, 0.2),
api.Token("!"),
],
[api.Token("Sentence2", 4.2, 0.2), api.Token("!")],
[],
[api.Token("Paragraph2", 4.2, 0.2), api.Token(".")],
[],
[],
[
api.Token("("),
api.Token('"'),
api.Token("Page2", 5.2, 0.2),
api.Token("."),
api.Token('"'),
api.Token(")"),
],
]

readalong = api.convert_to_readalong(sentences)

readalong = api.convert_to_readalong(self.sentences_to_convert)
# print(readalong)

# Make the reference by calling align with the same text and adjusting
# things we expect to be different.
sentences_as_text = "\n".join(
"".join(token.text for token in sentence) for sentence in sentences
"".join(token.text for token in sentence)
for sentence in self.sentences_to_convert
)
with open(self.tempdir / "sentences.txt", "w", encoding="utf8") as f:
f.write(sentences_as_text)
Expand All @@ -152,6 +154,21 @@ def test_convert_to_readalong(self):
readalong = re.sub(r"dur=\".*?\"", 'dur="ddd"', readalong)
self.assertEqual(readalong, align_result)

def test_convert_to_offline_html(self):
    """Check that convert_to_offline_html() produces a self-contained HTML page.

    Only looks for key markers in the output: the HTML skeleton, the generator
    meta tag, the base64-embedded readalong and audio, and the header slots.
    """
    # Nothing is written to disk here: dumping the HTML into the current
    # working directory would leave a stray file behind after every test run.
    html = api.convert_to_offline_html(
        self.sentences_to_convert, str(self.data_dir / "noise.mp3")
    )
    self.assertIn("<html", html)
    self.assertIn("<body", html)
    self.assertIn('<meta name="generator" content="@readalongs/studio (cli)', html)
    self.assertIn('<read-along href="data:application/readalong+xml;base64', html)
    self.assertIn('audio="data:audio/', html)
    self.assertIn("<span slot='read-along-header'>", html)
    self.assertIn("<span slot='read-along-subheader'>", html)


if __name__ == "__main__":
main()

0 comments on commit 975f3c4

Please sign in to comment.