Merge pull request #256 from ReadAlongs/dev.ej/convert-to-offline-html

Dev.ej/convert to offline html
ReadAlongs · Dec 17, 2024 · 75d7e27 · 75d7e27
2 parents d28e070 + 25f2dad
commit 75d7e27
Show file tree

Hide file tree

Showing 4 changed files with 119 additions and 47 deletions.
diff --git a/readalongs/api.py b/readalongs/api.py
@@ -34,37 +34,50 @@ class like pathlib.Path. Warning: don't just use "/some/path/config.json"
 
 Additional API function:
 
-convert_to_readalong(sentences: Sequence[Sequence[Token]], language: Sequence[str]) -> str:
+convert_prealigned_text_to_readalong():
     convert a list of sentences into a readalong XML string ready to print to file.
     Just like align and make_xml, this function expects a blank line (empty list) to
     make a paragraph break, and two consecutive blank lines to make a page break.
     Unlike the other functions here, this function is not a wrapper around the CLI and
-    it just returns the string, non status.
+    it just returns the string, with no status.
+
+convert_prealigned_text_to_offline_html():
+    same as convert_prealigned_text_to_readalong, but also creates an offline HTML file.
+
+See their respective docstrings for more details.
 """
 
 import io
 import logging
+import os
+import tempfile
 from dataclasses import dataclass
-from pathlib import Path
 from typing import Optional, Sequence, Tuple, Union
 
 import click
+from lxml import etree
 
 from readalongs import cli
 from readalongs.align import create_ras_from_text
 from readalongs.log import LOGGER
 from readalongs.text.add_ids_to_xml import add_ids
+from readalongs.text.make_package import (
+    DEFAULT_HEADER,
+    DEFAULT_SUBHEADER,
+    DEFAULT_TITLE,
+    create_web_component_html,
+)
 from readalongs.text.util import parse_xml
 from readalongs.util import JoinerCallbackForClick, get_langs_deferred
 
 
 def align(
-    textfile: Union[str, Path],
-    audiofile: Union[str, Path],
-    output_base: Union[str, Path],
+    textfile: Union[str, os.PathLike],
+    audiofile: Union[str, os.PathLike],
+    output_base: Union[str, os.PathLike],
     language: Sequence[str] = (),
     output_formats: Sequence[str] = (),
-    **kwargs
+    **kwargs,
 ) -> Tuple[int, Optional[Exception], str]:
     """Run the "readalongs align" command from within a Python script.
 
@@ -119,10 +132,10 @@ def align(
 
 
 def make_xml(
-    plaintextfile: Union[str, Path],
-    xmlfile: Union[str, Path],
+    plaintextfile: Union[str, os.PathLike],
+    xmlfile: Union[str, os.PathLike],
     language: Sequence[str],
-    **kwargs
+    **kwargs,
 ) -> Tuple[int, Optional[Exception], str]:
     """Run the "readalongs make-xml" command from within a Python script.
 
@@ -144,7 +157,7 @@ def make_xml(
         if isinstance(plaintextfile, click.utils.LazyFile)
         else plaintextfile
     )
-    xmlfile = str(xmlfile) if isinstance(xmlfile, Path) else xmlfile
+    xmlfile = str(xmlfile) if isinstance(xmlfile, os.PathLike) else xmlfile
     logging_stream = io.StringIO()
     logging_handler = logging.StreamHandler(logging_stream)
     try:
@@ -210,7 +223,7 @@ def __init__(
         self.is_word = is_word if is_word is not None else bool(time is not None)
 
 
-def convert_to_readalong(
+def convert_prealigned_text_to_readalong(
     sentences: Sequence[Sequence[Token]],
     language: Sequence[str] = ("und",),
 ) -> str:
@@ -224,10 +237,8 @@ def convert_to_readalong(
             (has no functional effect since g2p is not applied, it's only metadata)
 
     Returns:
-        str: the readalong XML string, ready to print to a .readalong file
+        str: the readalong XML file contents, ready to print to .readalong
     """
-    from lxml import etree
-
     xml_text = create_ras_from_text(
         ["".join(token.text for token in sentence) for sentence in sentences],
         language,
@@ -259,3 +270,47 @@ def convert_to_readalong(
     ).decode("utf8")
 
     return xml_text + "\n"
+
+
+def convert_prealigned_text_to_offline_html(
+    sentences: Sequence[Sequence[Token]],
+    audio_file_name: Union[str, os.PathLike],
+    language: Sequence[str] = ("und",),
+    title: str = DEFAULT_TITLE,
+    header: str = DEFAULT_HEADER,
+    subheader: str = DEFAULT_SUBHEADER,
+) -> Tuple[str, str]:
+    """Convert a list of sentences/paragraphs/pages of tokens, with corresponding audio,
+    into a readalong Offline HTML
+
+    The readalong XML is written to a temporary .readalong file only so that it
+    can be passed by name to create_web_component_html(); that file is removed
+    before returning, whether or not the conversion succeeds.
+
+    Args:
+        sentences: a list of sentences, each of which is a list of Token objects
+            Paragraph breaks are marked by an empty sentence (i.e., an empty list)
+            Page breaks are marked by two empty sentences in a row
+        audio_file_name: the name of the audio file to be used in the offline HTML
+        language: list of languages to declare at the top of the readalong
+            (has no functional effect since g2p is not applied, it's only metadata)
+        title: optional title, will fill the HTML <title> tag
+        header: optional header, will fill the readalong <span slot='read-along-header'>
+        subheader: optional subheader, will fill the readalong <span slot='read-along-subheader'>
+
+    Returns:
+        (html_contents, readalong_contents):
+         - the readalong Offline HTML file contents, ready to print to .html
+         - the readalong XML file contents, ready to print to .readalong
+    """
+
+    readalong_xml = convert_prealigned_text_to_readalong(sentences, language)
+    # Create the temporary file *before* entering the try block: if creation
+    # itself fails there is nothing to clean up, and the finally clause must
+    # not trip over an unbound name and hide the real error.
+    # delete=False because the file is handed over by name below, so it has to
+    # outlive our own handle; we remove it ourselves in the finally clause.
+    readalong_file = tempfile.NamedTemporaryFile(
+        "w", encoding="utf8", delete=False, suffix=".readalong"
+    )
+    try:
+        # The with block guarantees the handle is closed (and flushed) even if
+        # write() raises, so the unlink below never runs on an open file.
+        with readalong_file:
+            readalong_file.write(readalong_xml)
+        offline_html = create_web_component_html(
+            readalong_file.name, audio_file_name, title, header, subheader
+        )
+        return offline_html, readalong_xml
+    finally:
+        os.unlink(readalong_file.name)
diff --git a/readalongs/text/make_package.py b/readalongs/text/make_package.py
@@ -16,7 +16,7 @@
 import os
 from base64 import b64encode
 from mimetypes import guess_type
-from typing import Any
+from typing import Any, Union
 
 from lxml import etree
 
@@ -36,7 +36,6 @@
   <meta charset="utf-8">
   <meta name="viewport" content="width=device-width, initial-scale=1.0, minimum-scale=1.0, maximum-scale=5.0">
   <meta name="application-name" content="read along">
-  <meta name="viewport" content="width=device-width, initial-scale=1.0, minimum-scale=1.0, maximum-scale=5.0">
   <meta name="generator" content="@readalongs/studio (cli) {studio_version}">
   <title>{title}</title>
   <script>{js}</script>
@@ -57,11 +56,11 @@
 DEFAULT_SUBHEADER = "Your read-along subtitle goes here"
 
 
-def encode_from_path(path: str) -> str:
+def encode_from_path(path: Union[str, os.PathLike]) -> str:
     """Encode file from bytes to b64 string with data and mime signature
 
     Args:
-        path (str): path to file
+        path: path to file
 
     Returns:
         str: base64 string with data and mime signature
@@ -118,8 +117,8 @@ def encode_from_path(path: str) -> str:
 
 
 def create_web_component_html(
-    ras_path: str,
-    audio_path: str,
+    ras_path: Union[str, os.PathLike],
+    audio_path: Union[str, os.PathLike],
     title=DEFAULT_TITLE,
     header=DEFAULT_HEADER,
     subheader=DEFAULT_SUBHEADER,

diff --git a/readalongs/text/util.py b/readalongs/text/util.py
@@ -14,7 +14,6 @@
 from collections import OrderedDict
 from datetime import datetime
 from io import TextIOWrapper
-from pathlib import Path
 from typing import IO, Union
 
 from lxml import etree
@@ -104,7 +103,7 @@ def is_do_not_align(element):
     return dna in ("true", "True", "TRUE", "1")
 
 
-def load_xml(input_path: Union[str, Path, IO]) -> etree.ElementTree:
+def load_xml(input_path: Union[str, os.PathLike, IO]) -> etree.ElementTree:
     """Safely load an XML file with etree.parse to respect encoding
 
     Return: the root of the XML etree

diff --git a/test/test_api.py b/test/test_api.py
@@ -97,36 +97,38 @@ def test_deprecated_prepare(self):
             api.prepare(self.data_dir / "ej-fra.txt", os.devnull, ("fra",))
         self.assertIn("deprecated", "\n".join(cm.output))
 
+    # Shared input for the convert_prealigned_text_to_*() tests: a list of
+    # sentences, each a list of api.Token. An empty list marks a paragraph
+    # break and two empty lists in a row mark a page break, so this fixture
+    # holds two pages, the first of which has two paragraphs.
+    # NOTE(review): the two numeric Token arguments look like (time, dur)
+    # -- confirm against api.Token.__init__.
+    sentences_to_convert = [
+        [
+            api.Token("Bonjöûr,", 0.2, 1.0),
+            api.Token(" "),
+            api.Token("hello", 1.0, 0.2),
+            api.Token("!"),
+        ],
+        [api.Token("Sentence2", 4.2, 0.2), api.Token("!")],
+        [],
+        [api.Token("Paragraph2", 4.2, 0.2), api.Token(".")],
+        [],
+        [],
+        [
+            api.Token("("),
+            api.Token('"'),
+            api.Token("Page2", 5.2, 0.2),
+            api.Token("."),
+            api.Token('"'),
+            api.Token(")"),
+        ],
+    ]
+
     def test_convert_to_readalong(self):
-        sentences = [
-            [
-                api.Token("Bonjöûr,", 0.2, 1.0),
-                api.Token(" "),
-                api.Token("hello", 1.0, 0.2),
-                api.Token("!"),
-            ],
-            [api.Token("Sentence2", 4.2, 0.2), api.Token("!")],
-            [],
-            [api.Token("Paragraph2", 4.2, 0.2), api.Token(".")],
-            [],
-            [],
-            [
-                api.Token("("),
-                api.Token('"'),
-                api.Token("Page2", 5.2, 0.2),
-                api.Token("."),
-                api.Token('"'),
-                api.Token(")"),
-            ],
-        ]
-
-        readalong = api.convert_to_readalong(sentences)
+
+        readalong = api.convert_prealigned_text_to_readalong(self.sentences_to_convert)
         # print(readalong)
 
         # Make the reference by calling align with the same text and adjusting
         # things we expect to be different.
         sentences_as_text = "\n".join(
-            "".join(token.text for token in sentence) for sentence in sentences
+            "".join(token.text for token in sentence)
+            for sentence in self.sentences_to_convert
         )
         with open(self.tempdir / "sentences.txt", "w", encoding="utf8") as f:
             f.write(sentences_as_text)
@@ -152,6 +154,23 @@ def test_convert_to_readalong(self):
         readalong = re.sub(r"dur=\".*?\"", 'dur="ddd"', readalong)
         self.assertEqual(readalong, align_result)
 
+    def test_convert_to_offline_html(self):
+        """Check the offline HTML for its page skeleton, generator tag,
+        embedded readalong and audio data URIs, and header/subheader slots."""
+        audio_path = str(self.data_dir / "noise.mp3")
+        offline_html, _readalong_xml = api.convert_prealigned_text_to_offline_html(
+            self.sentences_to_convert, audio_path, subheader="by Jove!"
+        )
+        # Checked in order; the first missing fragment fails the test.
+        expected_fragments = (
+            "<html",
+            "<body",
+            '<meta name="generator" content="@readalongs/studio (cli)',
+            '<read-along href="data:application/readalong+xml;base64',
+            'audio="data:audio/',
+            "<span slot='read-along-header'>",
+            "<span slot='read-along-subheader'>by Jove!</span>",
+        )
+        for fragment in expected_fragments:
+            self.assertIn(fragment, offline_html)
+
 
 if __name__ == "__main__":
     main()