diff --git a/readalongs/api.py b/readalongs/api.py index 8ca72b6d..4a7f2b30 100644 --- a/readalongs/api.py +++ b/readalongs/api.py @@ -34,37 +34,50 @@ class like pathlib.Path. Warning: don't just use "/some/path/config.json" Additional API function: -convert_to_readalong(sentences: Sequence[Sequence[Token]], language: Sequence[str]) -> str: +convert_prealigned_text_to_readalong(): convert a list of sentences into a readalong XML string ready to print to file. Just like align and make_xml, this function expects a black line (empty list) to make a paragraph break, and two consecutive blank lines to make a page break. Unlike the other functions here, this function is not a wrapper around the CLI and - it just returns the string, non status. + it just returns the string, with no status. + +convert_prealigned_text_to_offline_html(): + same as convert_prealigned_text_to_readalong, but also creates an offline HTML file. + +See their respective docstrings for more details. """ import io import logging +import os +import tempfile from dataclasses import dataclass -from pathlib import Path from typing import Optional, Sequence, Tuple, Union import click +from lxml import etree from readalongs import cli from readalongs.align import create_ras_from_text from readalongs.log import LOGGER from readalongs.text.add_ids_to_xml import add_ids +from readalongs.text.make_package import ( + DEFAULT_HEADER, + DEFAULT_SUBHEADER, + DEFAULT_TITLE, + create_web_component_html, +) from readalongs.text.util import parse_xml from readalongs.util import JoinerCallbackForClick, get_langs_deferred def align( - textfile: Union[str, Path], - audiofile: Union[str, Path], - output_base: Union[str, Path], + textfile: Union[str, os.PathLike], + audiofile: Union[str, os.PathLike], + output_base: Union[str, os.PathLike], language: Sequence[str] = (), output_formats: Sequence[str] = (), - **kwargs + **kwargs, ) -> Tuple[int, Optional[Exception], str]: """Run the "readalongs align" command from within a Python script. @@ -119,10 +132,10 @@ def align( def make_xml( - plaintextfile: Union[str, Path], - xmlfile: Union[str, Path], + plaintextfile: Union[str, os.PathLike], + xmlfile: Union[str, os.PathLike], language: Sequence[str], - **kwargs + **kwargs, ) -> Tuple[int, Optional[Exception], str]: """Run the "readalongs make-xml" command from within a Python script. @@ -144,7 +157,7 @@ def make_xml( if isinstance(plaintextfile, click.utils.LazyFile) else plaintextfile ) - xmlfile = str(xmlfile) if isinstance(xmlfile, Path) else xmlfile + xmlfile = str(xmlfile) if isinstance(xmlfile, os.PathLike) else xmlfile logging_stream = io.StringIO() logging_handler = logging.StreamHandler(logging_stream) try: @@ -210,7 +223,7 @@ def __init__( self.is_word = is_word if is_word is not None else bool(time is not None) -def convert_to_readalong( +def convert_prealigned_text_to_readalong( sentences: Sequence[Sequence[Token]], language: Sequence[str] = ("und",), ) -> str: @@ -224,10 +237,8 @@ def convert_to_readalong( (has no functional effect since g2p is not applied, it's only metadata) Returns: - str: the readalong XML string, ready to print to a .readalong file + str: the readalong XML file contents, ready to print to .readalong """ - from lxml import etree - xml_text = create_ras_from_text( ["".join(token.text for token in sentence) for sentence in sentences], language, @@ -259,3 +270,47 @@ def convert_to_readalong( ).decode("utf8") return xml_text + "\n" + + +def convert_prealigned_text_to_offline_html( + sentences: Sequence[Sequence[Token]], + audio_file_name: Union[str, os.PathLike], + language: Sequence[str] = ("und",), + title: str = DEFAULT_TITLE, + header: str = DEFAULT_HEADER, + subheader: str = DEFAULT_SUBHEADER, +) -> Tuple[str, str]: + """Convert a list of sentences/paragraphs/pages of tokens, with corresponding audio, + into a readalong Offline HTML + + Args: + sentences: a list of sentences, each of which is a list of Token objects + Paragraph breaks are marked by a empty sentence (i.e., an empty list) + Page breaks are marked by two empty sentences in a row + audio_file_name: the name of the audio file to be used in the offline HTML + language: list of languages to declare at the top of the readalong + (has no functional effect since g2p is not applied, it's only metadata) + title: optional title, will fill the HTML tag + header: optional header, will fill the readalong <span slot='read-along-header'> + subheader: optional subheader, will fill the readalong <span slot='read-along-subheader'> + + Returns: + (html_contents, readalong_contents): + - the readalong Offline HTML file contents, ready to print to .html + - the readalong XML file contents, ready to print to .readalong + """ + + readalong_xml = convert_prealigned_text_to_readalong(sentences, language) + try: + readalong_file = tempfile.NamedTemporaryFile( + "w", encoding="utf8", delete=False, suffix=".readalong" + ) + readalong_file.write(readalong_xml) + readalong_file.close() + # print(readalong_file.name) + offline_html = create_web_component_html( + readalong_file.name, audio_file_name, title, header, subheader + ) + return offline_html, readalong_xml + finally: + os.unlink(readalong_file.name) diff --git a/readalongs/text/make_package.py b/readalongs/text/make_package.py index 8f821f31..3f657423 100644 --- a/readalongs/text/make_package.py +++ b/readalongs/text/make_package.py @@ -16,7 +16,7 @@ import os from base64 import b64encode from mimetypes import guess_type -from typing import Any +from typing import Any, Union from lxml import etree @@ -36,7 +36,6 @@ <meta charset="utf-8"> <meta name="viewport" content="width=device-width, initial-scale=1.0, minimum-scale=1.0, maximum-scale=5.0"> <meta name="application-name" content="read along"> - <meta name="viewport" content="width=device-width, initial-scale=1.0, minimum-scale=1.0, maximum-scale=5.0"> <meta name="generator" content="@readalongs/studio (cli) {studio_version}"> <title>{title} @@ -57,11 +56,11 @@ DEFAULT_SUBHEADER = "Your read-along subtitle goes here" -def encode_from_path(path: str) -> str: +def encode_from_path(path: Union[str, os.PathLike]) -> str: """Encode file from bytes to b64 string with data and mime signature Args: - path (str): path to file + path: path to file Returns: str: base64 string with data and mime signature @@ -118,8 +117,8 @@ def encode_from_path(path: str) -> str: def create_web_component_html( - ras_path: str, - audio_path: str, + ras_path: Union[str, os.PathLike], + audio_path: Union[str, os.PathLike], title=DEFAULT_TITLE, header=DEFAULT_HEADER, subheader=DEFAULT_SUBHEADER, diff --git a/readalongs/text/util.py b/readalongs/text/util.py index 3c3c6780..874f6150 100644 --- a/readalongs/text/util.py +++ b/readalongs/text/util.py @@ -14,7 +14,6 @@ from collections import OrderedDict from datetime import datetime from io import TextIOWrapper -from pathlib import Path from typing import IO, Union from lxml import etree @@ -104,7 +103,7 @@ def is_do_not_align(element): return dna in ("true", "True", "TRUE", "1") -def load_xml(input_path: Union[str, Path, IO]) -> etree.ElementTree: +def load_xml(input_path: Union[str, os.PathLike, IO]) -> etree.ElementTree: """Safely load an XML file with etree.parse to respect encoding Return: the root of the XML etree diff --git a/test/test_api.py b/test/test_api.py index 7499de13..70a1bfdc 100755 --- a/test/test_api.py +++ b/test/test_api.py @@ -97,36 +97,38 @@ def test_deprecated_prepare(self): api.prepare(self.data_dir / "ej-fra.txt", os.devnull, ("fra",)) self.assertIn("deprecated", "\n".join(cm.output)) + sentences_to_convert = [ + [ + api.Token("Bonjöûr,", 0.2, 1.0), + api.Token(" "), + api.Token("hello", 1.0, 0.2), + api.Token("!"), + ], + [api.Token("Sentence2", 4.2, 0.2), api.Token("!")], + [], + [api.Token("Paragraph2", 4.2, 0.2), api.Token(".")], + [], + [], + [ + api.Token("("), + api.Token('"'), + api.Token("Page2", 5.2, 0.2), + api.Token("."), + api.Token('"'), + api.Token(")"), + ], + ] + def test_convert_to_readalong(self): - sentences = [ - [ - api.Token("Bonjöûr,", 0.2, 1.0), - api.Token(" "), - api.Token("hello", 1.0, 0.2), - api.Token("!"), - ], - [api.Token("Sentence2", 4.2, 0.2), api.Token("!")], - [], - [api.Token("Paragraph2", 4.2, 0.2), api.Token(".")], - [], - [], - [ - api.Token("("), - api.Token('"'), - api.Token("Page2", 5.2, 0.2), - api.Token("."), - api.Token('"'), - api.Token(")"), - ], - ] - - readalong = api.convert_to_readalong(sentences) + + readalong = api.convert_prealigned_text_to_readalong(self.sentences_to_convert) # print(readalong) # Make the reference by calling align with the same text and adjusting # things we expect to be different. sentences_as_text = "\n".join( - "".join(token.text for token in sentence) for sentence in sentences + "".join(token.text for token in sentence) + for sentence in self.sentences_to_convert ) with open(self.tempdir / "sentences.txt", "w", encoding="utf8") as f: f.write(sentences_as_text) @@ -152,6 +154,23 @@ def test_convert_to_readalong(self): readalong = re.sub(r"dur=\".*?\"", 'dur="ddd"', readalong) self.assertEqual(readalong, align_result) + def test_convert_to_offline_html(self): + html, _ = api.convert_prealigned_text_to_offline_html( + self.sentences_to_convert, + str(self.data_dir / "noise.mp3"), + subheader="by Jove!", + ) + # with open("test.html", "w", encoding="utf8") as f: + # f.write(html) + # print(html) + self.assertIn("", html) + self.assertIn("by Jove!", html) + if __name__ == "__main__": main()