Skip to content

Commit

Permalink
Merge pull request #256 from ReadAlongs/dev.ej/convert-to-offline-html
Browse files Browse the repository at this point in the history
Dev.ej/convert to offline html
  • Loading branch information
joanise authored Dec 17, 2024
2 parents d28e070 + 25f2dad commit 75d7e27
Show file tree
Hide file tree
Showing 4 changed files with 119 additions and 47 deletions.
85 changes: 70 additions & 15 deletions readalongs/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,37 +34,50 @@ class like pathlib.Path. Warning: don't just use "/some/path/config.json"
Additional API function:
convert_to_readalong(sentences: Sequence[Sequence[Token]], language: Sequence[str]) -> str:
convert_prealigned_text_to_readalong():
convert a list of sentences into a readalong XML string ready to print to file.
Just like align and make_xml, this function expects a blank line (empty list) to
make a paragraph break, and two consecutive blank lines to make a page break.
Unlike the other functions here, this function is not a wrapper around the CLI and
it just returns the string, non status.
it just returns the string, with no status.
convert_prealigned_text_to_offline_html():
same as convert_prealigned_text_to_readalong, but also creates an offline HTML file.
See their respective docstrings for more details.
"""

import io
import logging
import os
import tempfile
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Sequence, Tuple, Union

import click
from lxml import etree

from readalongs import cli
from readalongs.align import create_ras_from_text
from readalongs.log import LOGGER
from readalongs.text.add_ids_to_xml import add_ids
from readalongs.text.make_package import (
DEFAULT_HEADER,
DEFAULT_SUBHEADER,
DEFAULT_TITLE,
create_web_component_html,
)
from readalongs.text.util import parse_xml
from readalongs.util import JoinerCallbackForClick, get_langs_deferred


def align(
textfile: Union[str, Path],
audiofile: Union[str, Path],
output_base: Union[str, Path],
textfile: Union[str, os.PathLike],
audiofile: Union[str, os.PathLike],
output_base: Union[str, os.PathLike],
language: Sequence[str] = (),
output_formats: Sequence[str] = (),
**kwargs
**kwargs,
) -> Tuple[int, Optional[Exception], str]:
"""Run the "readalongs align" command from within a Python script.
Expand Down Expand Up @@ -119,10 +132,10 @@ def align(


def make_xml(
plaintextfile: Union[str, Path],
xmlfile: Union[str, Path],
plaintextfile: Union[str, os.PathLike],
xmlfile: Union[str, os.PathLike],
language: Sequence[str],
**kwargs
**kwargs,
) -> Tuple[int, Optional[Exception], str]:
"""Run the "readalongs make-xml" command from within a Python script.
Expand All @@ -144,7 +157,7 @@ def make_xml(
if isinstance(plaintextfile, click.utils.LazyFile)
else plaintextfile
)
xmlfile = str(xmlfile) if isinstance(xmlfile, Path) else xmlfile
xmlfile = str(xmlfile) if isinstance(xmlfile, os.PathLike) else xmlfile
logging_stream = io.StringIO()
logging_handler = logging.StreamHandler(logging_stream)
try:
Expand Down Expand Up @@ -210,7 +223,7 @@ def __init__(
self.is_word = is_word if is_word is not None else bool(time is not None)


def convert_to_readalong(
def convert_prealigned_text_to_readalong(
sentences: Sequence[Sequence[Token]],
language: Sequence[str] = ("und",),
) -> str:
Expand All @@ -224,10 +237,8 @@ def convert_to_readalong(
(has no functional effect since g2p is not applied, it's only metadata)
Returns:
str: the readalong XML string, ready to print to a .readalong file
str: the readalong XML file contents, ready to print to .readalong
"""
from lxml import etree

xml_text = create_ras_from_text(
["".join(token.text for token in sentence) for sentence in sentences],
language,
Expand Down Expand Up @@ -259,3 +270,47 @@ def convert_to_readalong(
).decode("utf8")

return xml_text + "\n"


def convert_prealigned_text_to_offline_html(
    sentences: Sequence[Sequence[Token]],
    audio_file_name: Union[str, os.PathLike],
    language: Sequence[str] = ("und",),
    title: str = DEFAULT_TITLE,
    header: str = DEFAULT_HEADER,
    subheader: str = DEFAULT_SUBHEADER,
) -> Tuple[str, str]:
    """Convert a list of sentences/paragraphs/pages of tokens, with corresponding audio,
    into a readalong Offline HTML file.

    Args:
        sentences: a list of sentences, each of which is a list of Token objects
            Paragraph breaks are marked by an empty sentence (i.e., an empty list)
            Page breaks are marked by two empty sentences in a row
        audio_file_name: the name of the audio file to be used in the offline HTML
        language: list of languages to declare at the top of the readalong
            (has no functional effect since g2p is not applied, it's only metadata)
        title: optional title, will fill the HTML <title> tag
        header: optional header, will fill the readalong <span slot='read-along-header'>
        subheader: optional subheader, will fill the readalong <span slot='read-along-subheader'>

    Returns:
        (html_contents, readalong_contents):
         - the readalong Offline HTML file contents, ready to print to .html
         - the readalong XML file contents, ready to print to .readalong
    """

    readalong_xml = convert_prealigned_text_to_readalong(sentences, language)

    # create_web_component_html() takes a path, so the XML has to go through a
    # real file. Create it *before* the try block: if creation itself fails there
    # is nothing to clean up, and the finally clause must not reference an
    # unbound name (which would mask the real error with a NameError).
    # delete=False lets us close the file and hand its name to another reader.
    readalong_file = tempfile.NamedTemporaryFile(
        "w", encoding="utf8", delete=False, suffix=".readalong"
    )
    try:
        # The with block guarantees the handle is closed even if write() raises,
        # so the unlink below never runs against a still-open file.
        with readalong_file:
            readalong_file.write(readalong_xml)
        offline_html = create_web_component_html(
            readalong_file.name, audio_file_name, title, header, subheader
        )
        return offline_html, readalong_xml
    finally:
        os.unlink(readalong_file.name)
11 changes: 5 additions & 6 deletions readalongs/text/make_package.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import os
from base64 import b64encode
from mimetypes import guess_type
from typing import Any
from typing import Any, Union

from lxml import etree

Expand All @@ -36,7 +36,6 @@
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0, minimum-scale=1.0, maximum-scale=5.0">
<meta name="application-name" content="read along">
<meta name="viewport" content="width=device-width, initial-scale=1.0, minimum-scale=1.0, maximum-scale=5.0">
<meta name="generator" content="@readalongs/studio (cli) {studio_version}">
<title>{title}</title>
<script>{js}</script>
Expand All @@ -57,11 +56,11 @@
DEFAULT_SUBHEADER = "Your read-along subtitle goes here"


def encode_from_path(path: str) -> str:
def encode_from_path(path: Union[str, os.PathLike]) -> str:
"""Encode file from bytes to b64 string with data and mime signature
Args:
path (str): path to file
path: path to file
Returns:
str: base64 string with data and mime signature
Expand Down Expand Up @@ -118,8 +117,8 @@ def encode_from_path(path: str) -> str:


def create_web_component_html(
ras_path: str,
audio_path: str,
ras_path: Union[str, os.PathLike],
audio_path: Union[str, os.PathLike],
title=DEFAULT_TITLE,
header=DEFAULT_HEADER,
subheader=DEFAULT_SUBHEADER,
Expand Down
3 changes: 1 addition & 2 deletions readalongs/text/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
from collections import OrderedDict
from datetime import datetime
from io import TextIOWrapper
from pathlib import Path
from typing import IO, Union

from lxml import etree
Expand Down Expand Up @@ -104,7 +103,7 @@ def is_do_not_align(element):
return dna in ("true", "True", "TRUE", "1")


def load_xml(input_path: Union[str, Path, IO]) -> etree.ElementTree:
def load_xml(input_path: Union[str, os.PathLike, IO]) -> etree.ElementTree:
"""Safely load an XML file with etree.parse to respect encoding
Return: the root of the XML etree
Expand Down
67 changes: 43 additions & 24 deletions test/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,36 +97,38 @@ def test_deprecated_prepare(self):
api.prepare(self.data_dir / "ej-fra.txt", os.devnull, ("fra",))
self.assertIn("deprecated", "\n".join(cm.output))

# Shared input for the conversion tests: two sentences, then a paragraph break
# (one empty list), one sentence, then a page break (two empty lists in a row),
# and a final sentence.
# NOTE(review): the numeric Token arguments look like (time, dur); tokens built
# without them are presumably non-word text such as spaces and punctuation.
# Confirm against api.Token.
sentences_to_convert = [
[
api.Token("Bonjöûr,", 0.2, 1.0),
api.Token(" "),
api.Token("hello", 1.0, 0.2),
api.Token("!"),
],
[api.Token("Sentence2", 4.2, 0.2), api.Token("!")],
[],
[api.Token("Paragraph2", 4.2, 0.2), api.Token(".")],
[],
[],
[
api.Token("("),
api.Token('"'),
api.Token("Page2", 5.2, 0.2),
api.Token("."),
api.Token('"'),
api.Token(")"),
],
]

def test_convert_to_readalong(self):
sentences = [
[
api.Token("Bonjöûr,", 0.2, 1.0),
api.Token(" "),
api.Token("hello", 1.0, 0.2),
api.Token("!"),
],
[api.Token("Sentence2", 4.2, 0.2), api.Token("!")],
[],
[api.Token("Paragraph2", 4.2, 0.2), api.Token(".")],
[],
[],
[
api.Token("("),
api.Token('"'),
api.Token("Page2", 5.2, 0.2),
api.Token("."),
api.Token('"'),
api.Token(")"),
],
]

readalong = api.convert_to_readalong(sentences)

readalong = api.convert_prealigned_text_to_readalong(self.sentences_to_convert)
# print(readalong)

# Make the reference by calling align with the same text and adjusting
# things we expect to be different.
sentences_as_text = "\n".join(
"".join(token.text for token in sentence) for sentence in sentences
"".join(token.text for token in sentence)
for sentence in self.sentences_to_convert
)
with open(self.tempdir / "sentences.txt", "w", encoding="utf8") as f:
f.write(sentences_as_text)
Expand All @@ -152,6 +154,23 @@ def test_convert_to_readalong(self):
readalong = re.sub(r"dur=\".*?\"", 'dur="ddd"', readalong)
self.assertEqual(readalong, align_result)

def test_convert_to_offline_html(self):
    # Build the offline HTML from the shared sentences and a sample audio file,
    # then check that each expected piece of markup is present in the output.
    audio_path = str(self.data_dir / "noise.mp3")
    offline_html, _readalong_xml = api.convert_prealigned_text_to_offline_html(
        self.sentences_to_convert, audio_path, subheader="by Jove!"
    )
    expected_fragments = (
        "<html",
        "<body",
        '<meta name="generator" content="@readalongs/studio (cli)',
        '<read-along href="data:application/readalong+xml;base64',
        'audio="data:audio/',
        "<span slot='read-along-header'>",
        "<span slot='read-along-subheader'>by Jove!</span>",
    )
    # Checked in the same order as before; the first missing fragment fails the test.
    for fragment in expected_fragments:
        self.assertIn(fragment, offline_html)


if __name__ == "__main__":
main()

0 comments on commit 75d7e27

Please sign in to comment.