From 29ec752088a0ccdc930cc490c65c4fadca295328 Mon Sep 17 00:00:00 2001 From: Aidan Pine Date: Fri, 12 Jul 2024 10:50:56 -0700 Subject: [PATCH] feat: including generator meta tag in .readalong files (#226) * feat: including generator meta tag in .readalong files test: fixing ci conflict resolution: VERSION is now only in readalongs/_version.py TODO: __version__ is no longer defined, and we need a FORMAT_VERSION * fix: resolve the version variable conflicts between format and software * docs: updated DTD version --------- Co-authored-by: Delasie Torkornoo Co-authored-by: Eric Joanis --- .github/workflows/tests.yml | 2 +- .gitignore | 4 + docs/cli-guide.md | 2 +- readalongs/_version.py | 2 + readalongs/align.py | 14 +- readalongs/static/read-along-1.1.dtd | 103 ++ readalongs/text/make_package.py | 2 +- readalongs/text/util.py | 2 +- readalongs/web_api.py | 7 +- test/data/cs-ref.readalong | 2 +- test/data/ej-fra-anchors.readalong | 3 +- test/data/ej-fra-anchors2.readalong | 3 +- test/data/ej-fra-dna.readalong | 3 +- test/data/ej-fra-package.readalong | 3 +- test/data/ej-fra-silence-bad.readalong | 3 +- test/data/ej-fra-silence.readalong | 3 +- test/data/ej-fra-subword.readalong | 3 +- test/data/ej-fra-translated.readalong | 3 +- test/data/ej-fra.readalong | 3 +- test/data/fra-prepared.readalong | 3 +- test/data/fra-tokenized.readalong | 3 +- test/data/mixed-langs.readalong | 3 +- test/data/patrickxtlan.readalong | 3 +- test/test_align_cli.py | 1260 ++++++++++++------------ test/test_dtd.py | 2 +- test/test_g2p_cli.py | 7 +- test/test_make_xml_cli.py | 16 +- test/test_misc.py | 11 +- test/test_web_api.py | 22 +- 29 files changed, 830 insertions(+), 667 deletions(-) create mode 100644 readalongs/static/read-along-1.1.dtd diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 62165446..1fccc7dd 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -121,5 +121,5 @@ jobs: # This test will fail if the output encoding is cp1252 # Warning: the diff line below is PowerShell syntax, not bash! run: | - echo ćś | readalongs make-xml -l fra - - > cs.readalong + echo ćś | readalongs make-xml -l fra - - | findstr /v meta > cs.readalong if (diff (cat cs.readalong) (cat test/data/cs-ref.readalong)) { throw "Output did not match reference" } diff --git a/.gitignore b/.gitignore index 60c0f607..cb3111ba 100644 --- a/.gitignore +++ b/.gitignore @@ -208,3 +208,7 @@ $RECYCLE.BIN/ *.lnk # End of https://www.gitignore.io/api/linux,macos,python,windows,visualstudiocode + +#temporary file +.tmp +.conda diff --git a/docs/cli-guide.md b/docs/cli-guide.md index 5c0f1583..b683431d 100644 --- a/docs/cli-guide.md +++ b/docs/cli-guide.md @@ -67,7 +67,7 @@ The format of the generated XML is based on [TEI Lite](https://tei-c.org/guidelines/customization/lite/) but is considerably simplified. The DTD (document type definition) can be found in the ReadAlong Studio source code under -`readalongs/static/read-along-1.0.dtd`. +`readalongs/static/read-along-1.1.dtd`. (dna)= diff --git a/readalongs/_version.py b/readalongs/_version.py index 2a3eb2f3..a70b1685 100644 --- a/readalongs/_version.py +++ b/readalongs/_version.py @@ -1 +1,3 @@ VERSION = "1.1.0" + +READALONG_FILE_FORMAT_VERSION = "1.1" diff --git a/readalongs/align.py b/readalongs/align.py index 5e2259aa..60237483 100644 --- a/readalongs/align.py +++ b/readalongs/align.py @@ -18,6 +18,7 @@ from pympi.Praat import TextGrid from webvtt import Caption, WebVTT +from readalongs._version import READALONG_FILE_FORMAT_VERSION, VERSION from readalongs.audio_utils import ( extract_section, mute_section, @@ -175,7 +176,7 @@ def parse_and_make_xml( """Parse XML input and run tokenization and G2P. Args: - xml_path (str): Path to input in ReadAlong XML format (see static/read-along-1.0.dtd) + xml_path (str): Path to input in ReadAlong XML format (see static/read-along-1.1.dtd) config (dict): Optional; ReadAlong-Studio configuration to use save_temps (str): Optional; Save temporary files, by default None verbose_g2p_warnings (boolean): Optional; display all g2p errors and warnings @@ -568,7 +569,7 @@ def align_audio( """Align an XML input file to an audio file. Args: - xml_path (str): Path to input file in ReadAlong XML format (see static/read-along-1.0.dtd) + xml_path (str): Path to input file in ReadAlong XML format (see static/read-along-1.1.dtd) audio_path (str): Path to audio input. Must be in a format supported by ffmpeg unit (str): Optional; Element to create alignments for, by default 'w' bare (boolean): Optional; @@ -1156,7 +1157,8 @@ def convert_to_xhtml(tokenized_xml, title="Book"): # TODO: add this to template RAS_TEMPLATE = """ - + + {{#pages}} @@ -1177,7 +1179,7 @@ def convert_to_xhtml(tokenized_xml, title="Book"): def create_ras_from_text(lines: Iterable[str], text_languages=Sequence[str]) -> str: - """Create input xml in ReadAlong XML format (see static/read-along-1.0.dtd) + """Create input xml in ReadAlong XML format (see static/read-along-1.1.dtd) Uses the line sequence to infer paragraph and sentence structure from plain text: Assumes a double blank line marks a page break, and a single blank line marks a paragraph break. @@ -1194,6 +1196,8 @@ def create_ras_from_text(lines: Iterable[str], text_languages=Sequence[str]) -> kwargs = { "main_lang": text_languages[0], "fallback_langs": ",".join(text_languages[1:]), + "studio_version": VERSION, + "format_version": READALONG_FILE_FORMAT_VERSION, } pages: List[dict] = [] paragraphs: List[dict] = [] @@ -1223,7 +1227,7 @@ def create_ras_from_text(lines: Iterable[str], text_languages=Sequence[str]) -> def create_input_ras(**kwargs): - """Create input xml in ReadAlong XML format (see static/read-along-1.0.dtd) + """Create input xml in ReadAlong XML format (see static/read-along-1.1.dtd) Uses readlines to infer paragraph and sentence structure from plain text. Assumes a double blank line marks a page break, and a single blank line marks a paragraph break. diff --git a/readalongs/static/read-along-1.1.dtd b/readalongs/static/read-along-1.1.dtd new file mode 100644 index 00000000..8202e169 --- /dev/null +++ b/readalongs/static/read-along-1.1.dtd @@ -0,0 +1,103 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/readalongs/text/make_package.py b/readalongs/text/make_package.py index 7482ad80..312f8ba4 100644 --- a/readalongs/text/make_package.py +++ b/readalongs/text/make_package.py @@ -37,7 +37,7 @@ - + {title} diff --git a/readalongs/text/util.py b/readalongs/text/util.py index f283693a..6243f4f6 100644 --- a/readalongs/text/util.py +++ b/readalongs/text/util.py @@ -230,7 +230,7 @@ def copy_file_to_zip(zip_path, origin_path, destination_path): - + {title} diff --git a/readalongs/web_api.py b/readalongs/web_api.py index 9e30bc63..29b2e5e8 100644 --- a/readalongs/web_api.py +++ b/readalongs/web_api.py @@ -41,6 +41,7 @@ from pydantic import BaseModel, Field from starlette.background import BackgroundTask +from readalongs._version import READALONG_FILE_FORMAT_VERSION, VERSION from readalongs.align import create_ras_from_text, save_label_files, save_subtitles from readalongs.log import LOGGER, capture_logs from readalongs.text.add_ids_to_xml import add_ids @@ -77,7 +78,7 @@ # Call get_langs() when the server loads to load the languages into memory LANGS = get_langs() # Get the DTD -DTDPATH = os.path.join(os.path.dirname(__file__), "static", "read-along-1.0.dtd") +DTDPATH = os.path.join(os.path.dirname(__file__), "static", "read-along-1.1.dtd") with open(DTDPATH) as dtdfh: DTD = etree.DTD(dtdfh) @@ -323,7 +324,8 @@ class ConvertRequest(BaseModel): dedent( """\ - + +
@@ -337,6 +339,7 @@ class ConvertRequest(BaseModel): """ + % (READALONG_FILE_FORMAT_VERSION, VERSION) ) ], ) diff --git a/test/data/cs-ref.readalong b/test/data/cs-ref.readalong index 71f3d659..b5d09e82 100644 --- a/test/data/cs-ref.readalong +++ b/test/data/cs-ref.readalong @@ -1,5 +1,5 @@ - +
diff --git a/test/data/ej-fra-anchors.readalong b/test/data/ej-fra-anchors.readalong index bef3896c..8f593076 100644 --- a/test/data/ej-fra-anchors.readalong +++ b/test/data/ej-fra-anchors.readalong @@ -1,5 +1,6 @@ - + +
diff --git a/test/data/ej-fra-anchors2.readalong b/test/data/ej-fra-anchors2.readalong index b929b7e9..f2595346 100644 --- a/test/data/ej-fra-anchors2.readalong +++ b/test/data/ej-fra-anchors2.readalong @@ -1,5 +1,6 @@ - + + diff --git a/test/data/ej-fra-dna.readalong b/test/data/ej-fra-dna.readalong index bf885889..73bdbb93 100644 --- a/test/data/ej-fra-dna.readalong +++ b/test/data/ej-fra-dna.readalong @@ -1,5 +1,6 @@ - + +
diff --git a/test/data/ej-fra-package.readalong b/test/data/ej-fra-package.readalong index ae1f9f70..1ecda7e9 100644 --- a/test/data/ej-fra-package.readalong +++ b/test/data/ej-fra-package.readalong @@ -1,5 +1,6 @@ - + +
diff --git a/test/data/ej-fra-silence-bad.readalong b/test/data/ej-fra-silence-bad.readalong index f40b4a80..d8bbdd5c 100644 --- a/test/data/ej-fra-silence-bad.readalong +++ b/test/data/ej-fra-silence-bad.readalong @@ -1,5 +1,6 @@ - + +
diff --git a/test/data/ej-fra-silence.readalong b/test/data/ej-fra-silence.readalong index a24dd15b..d9de6e72 100644 --- a/test/data/ej-fra-silence.readalong +++ b/test/data/ej-fra-silence.readalong @@ -1,5 +1,6 @@ - + +
diff --git a/test/data/ej-fra-subword.readalong b/test/data/ej-fra-subword.readalong index 173811aa..070ceb13 100644 --- a/test/data/ej-fra-subword.readalong +++ b/test/data/ej-fra-subword.readalong @@ -1,5 +1,6 @@ - + + diff --git a/test/data/ej-fra-translated.readalong b/test/data/ej-fra-translated.readalong index 63ffb0ab..7f8bca50 100644 --- a/test/data/ej-fra-translated.readalong +++ b/test/data/ej-fra-translated.readalong @@ -1,5 +1,6 @@ - + + diff --git a/test/data/ej-fra.readalong b/test/data/ej-fra.readalong index 5c78c992..7531c352 100644 --- a/test/data/ej-fra.readalong +++ b/test/data/ej-fra.readalong @@ -1,5 +1,6 @@ - + +
diff --git a/test/data/fra-prepared.readalong b/test/data/fra-prepared.readalong index a7b8bc6f..898f01a4 100644 --- a/test/data/fra-prepared.readalong +++ b/test/data/fra-prepared.readalong @@ -1,5 +1,6 @@ - + +
diff --git a/test/data/fra-tokenized.readalong b/test/data/fra-tokenized.readalong index 2c5d597a..058b883e 100644 --- a/test/data/fra-tokenized.readalong +++ b/test/data/fra-tokenized.readalong @@ -1,5 +1,6 @@ - + +
diff --git a/test/data/mixed-langs.readalong b/test/data/mixed-langs.readalong index 17c6554e..af345dea 100644 --- a/test/data/mixed-langs.readalong +++ b/test/data/mixed-langs.readalong @@ -1,5 +1,6 @@ - + +
diff --git a/test/data/patrickxtlan.readalong b/test/data/patrickxtlan.readalong index 75ebf312..4b8a8560 100644 --- a/test/data/patrickxtlan.readalong +++ b/test/data/patrickxtlan.readalong @@ -1,5 +1,6 @@ - + +

diff --git a/test/test_align_cli.py b/test/test_align_cli.py index c10cb640..868d7c85 100755 --- a/test/test_align_cli.py +++ b/test/test_align_cli.py @@ -1,629 +1,633 @@ -#!/usr/bin/env python - -""" -Unit test suite for the readalongs align CLI command -""" - -import os -import pathlib -import tempfile -from os.path import exists, join -from unittest import main - -from basic_test_case import BasicTestCase -from lxml.html import fromstring -from sound_swallower_stub import SoundSwallowerStub - -from readalongs.cli import align, langs - - -def write_file(filename: str, file_contents: str) -> str: - """Write file_contents to file filename, and return its name (filename)""" - with open(filename, mode="w", encoding="utf8") as f: - f.write(file_contents) - return filename - - -class TestAlignCli(BasicTestCase): - """Unit test suite for the readalongs align CLI command""" - - def test_invoke_align(self): - """Basic readalongs align invocation and some variants""" - output = join(self.tempdir, "output") - with open("image-for-page1.jpg", "wb"): - pass - # Run align from plain text - results = self.runner.invoke( - align, - [ - "-s", - "-o", - "vtt", - "-o", # tests that we can use -o more than once - "srt:TextGrid,eaf", # tests that we can give -o multiple values, separated by : or , - "-l", - "fra", - "--align-mode", - "auto", - "--config", - join(self.data_dir, "sample-config.json"), - self.add_bom(join(self.data_dir, "ej-fra.txt")), - join(self.data_dir, "ej-fra.m4a"), - output, - ], - ) - # print(results.output) - self.assertEqual(results.exit_code, 0) - expected_output_files = [ - "output.readalong", - "output.m4a", - "index.html", - "output.TextGrid", - "output.eaf", - "output_sentences.srt", - "output_sentences.vtt", - "output_words.srt", - "output_words.vtt", - "readme.txt", - ] - for f in expected_output_files: - self.assertTrue( - exists(join(output, f)), f"successful alignment should have created {f}" - ) - with open(join(output, "index.html"), encoding="utf8") as f: - self.assertIn( - '', - 'This ', - 'is ', - 'some ', - 'text ', - 'that ', - 'we ', - 'will ', - 'run ', - 'through ', - 'the ', - 'English ', - 'lexicon ', - 'grapheme ', - 'to ', - 'morpheme ', - 'approach', - ".", - ) - ) - - tokenized_file = join( - self.tempdir, "eng-output", "tempfiles", "eng-output.g2p.readalong" - ) - with open(tokenized_file, "r", encoding="utf8") as f: - tok_output = f.read() - - self.assertIn(g2p_ref, tok_output) - - def test_invalid_config(self): - """unit testing for invalid config specifications""" - - # --config parameter needs to be .json, text with .txt instead - result = self.runner.invoke( - align, - [ - "--config", - join(self.data_dir, "fra.txt"), - join(self.data_dir, "fra.txt"), - join(self.data_dir, "noise.mp3"), - join(self.tempdir, "out-invalid-config-1"), - ], - ) - self.assertIn("must be in JSON format", result.stdout) - - # --config parameters needs to contain valid json, test with garbage - config_file = join(self.tempdir, "bad-config.json") - with open(config_file, "w", encoding="utf8") as f: - print("not valid json", file=f) - result = self.runner.invoke( - align, - [ - "--config", - config_file, - join(self.data_dir, "fra.txt"), - join(self.data_dir, "noise.mp3"), - join(self.tempdir, "out-invalid-config-2"), - ], - ) - self.assertIn("is not in valid JSON format", result.stdout) - - def test_bad_anchors(self): - """Make sure invalid anchors yield appropriate errors""" - - xml_text = """ -

- Bonjour. -

- """ - xml_file = join(self.tempdir, "bad-anchor.readalong") - with open(xml_file, "w", encoding="utf8") as f: - print(xml_text, file=f) - bad_anchors_result = self.runner.invoke( - align, - [ - xml_file, - join(self.data_dir, "noise.mp3"), - join(self.tempdir, "out-bad-anchors"), - ], - ) - for msg in [ - 'missing "time" attribute', - 'invalid "time" attribute "invalid"', - "Could not parse all anchors", - "Aborting.", - ]: - self.assertIn(msg, bad_anchors_result.stdout) - - def test_misc_align_errors(self): - """Test calling readalongs align with misc CLI errors""" - results = self.runner.invoke( - align, - [ - join(self.data_dir, "ej-fra.txt"), - join(self.data_dir, "ej-fra.m4a"), - join(self.tempdir, "out-missing-l"), - ], - ) - self.assertNotEqual(results.exit_code, 0) - self.assertIn("No input language specified", results.output) - - with SoundSwallowerStub("[NOISE]:0:1"): - results = self.runner.invoke( - align, - [ - join(self.data_dir, "fra-prepared.readalong"), - join(self.data_dir, "noise.mp3"), - join(self.tempdir, "noise-only"), - ], - ) - self.assertNotEqual(results.exit_code, 0) - self.assertIn("produced 0 segments", results.output) - - with SoundSwallowerStub( - "[NOISE]:0:1", "w0:1:1000", ":1000:1100", "w1:1100:2000" - ): - results = self.runner.invoke( - align, - [ - join(self.data_dir, "ej-fra.readalong"), - join(self.data_dir, "ej-fra.m4a"), - join(self.tempdir, "two-words"), - ], - ) - # print(results.output) - # We don't check results.exit_code since that's a soft warning, not a hard error - self.assertIn("produced 2 segments", results.output) - self.assertIn( - "Alignment produced a different number of segments and tokens than were in the input.", - results.output, - ) - - def test_infer_plain_text_or_xml(self): - """align -i is obsolete, now we infer plain text vs XML; test that!""" - - # plain text with guess by contents - infile1 = write_file(join(self.tempdir, "infile1"), "some plain text") - with SoundSwallowerStub("word:0:1"): - results = self.runner.invoke( - align, - [ - infile1, - join(self.data_dir, "noise.mp3"), - join(self.tempdir, "outdir1"), - ], - ) - self.assertNotEqual(results.exit_code, 0) - # This error message confirms it's being processed as plain text - self.assertIn("No input language specified for plain text", results.output) - - # plain text by extension - infile2 = write_file(join(self.tempdir, "infile2.txt"), "blah blah", - ) - ) - with SoundSwallowerStub("word:0:1"): - results = self.runner.invoke( - align, - [ - infile3, - join(self.data_dir, "noise.mp3"), - join(self.tempdir, "outdir3"), - ], - ) - self.assertEqual(results.exit_code, 0) - - # XML with guess by contents, but with content error - infile4 = write_file( - join(self.tempdir, "infile4"), - "blah blah", - ) - with SoundSwallowerStub("word:0:1"): - results = self.runner.invoke( - align, - [ - infile4, - join(self.data_dir, "noise.mp3"), - join(self.tempdir, "outdir4"), - ], - ) - self.assertNotEqual(results.exit_code, 0) - self.assertIn("Error parsing XML", results.output) - - # XML by file extension - infile5 = write_file(join(self.tempdir, "infile5.readalong"), "Not XML!") - with SoundSwallowerStub("word:0:1"): - results = self.runner.invoke( - align, - [ - infile5, - join(self.data_dir, "noise.mp3"), - join(self.tempdir, "outdir5"), - ], - ) - self.assertNotEqual(results.exit_code, 0) - self.assertIn("Error parsing XML", results.output) - - def test_obsolete_switches(self): - # Giving -i switch generates an obsolete-switch error message - with SoundSwallowerStub("word:0:1"): - results = self.runner.invoke( - align, - [ - "-i", - join(self.data_dir, "fra.txt"), - join(self.data_dir, "noise.mp3"), - join(self.tempdir, "outdir6"), - ], - ) - self.assertNotEqual(results.exit_code, 0) - self.assertIn("is obsolete.", results.output) - - # Giving --g2p-verbose switch generates an obsolete-switch error message - with SoundSwallowerStub("word:0:1"): - results = self.runner.invoke( - align, - [ - "--g2p-verbose", - join(self.data_dir, "fra.txt"), - join(self.data_dir, "noise.mp3"), - join(self.tempdir, "outdir7"), - ], - ) - self.assertNotEqual(results.exit_code, 0) - self.assertIn("is obsolete.", results.output) - - # Giving --g2p-fallback switch generates an obsolete-switch error message - with SoundSwallowerStub("word:0:1"): - results = self.runner.invoke( - align, - [ - "--g2p-fallback", - "fra:end:und", - join(self.data_dir, "fra.txt"), - join(self.data_dir, "noise.mp3"), - join(self.tempdir, "outdir8"), - ], - ) - self.assertNotEqual(results.exit_code, 0) - self.assertIn("is obsolete.", results.output) - - def test_oo_option(self): - """Exercise the hidden -oo / --output-orth option""" - with SoundSwallowerStub("word:0:1"): - results = self.runner.invoke( - align, - [ - "-oo", - "eng-arpabet", - join(self.data_dir, "ej-fra.readalong"), - join(self.data_dir, "noise.mp3"), - join(self.tempdir, "outdir9"), - ], - ) - self.assertEqual(results.exit_code, 0) - - with SoundSwallowerStub("word:0:1"): - results = self.runner.invoke( - align, - [ - "-oo", - "not-an-alphabet", - join(self.data_dir, "ej-fra.readalong"), - join(self.data_dir, "noise.mp3"), - join(self.tempdir, "outdir10"), - ], - ) - self.assertNotEqual(results.exit_code, 0) - self.assertIn("Could not g2p", results.output) - self.assertIn("not-an-alphabet", results.output) - - with SoundSwallowerStub("word:0:1"): - results = self.runner.invoke( - align, - [ - "-oo", - "dan-ipa", - join(self.data_dir, "ej-fra.readalong"), - join(self.data_dir, "noise.mp3"), - join(self.tempdir, "outdir11"), - ], - ) - self.assertNotEqual(results.exit_code, 0) - self.assertIn("Could not g2p", results.output) - self.assertIn("no path", results.output) - - with SoundSwallowerStub("word:0:1"): - results = self.runner.invoke( - align, - [ - "-oo", - "dan-ipa", - "-l", - "eng", - join(self.data_dir, "fra.txt"), - join(self.data_dir, "noise.mp3"), - join(self.tempdir, "outdir12"), - ], - ) - self.assertNotEqual(results.exit_code, 0) - self.assertIn("Could not g2p", results.output) - self.assertIn('no path from "eng" to ', results.output) - - def add_bom(self, filename): - """Create a temporary copy of filename with the a BOM in it, in self.tempdir""" - # We pepper calls to add_bom() around the test suite, to make sure all - # different kinds of input files are accepted with and without a BOM - output_file = tempfile.NamedTemporaryFile( - mode="wb", - dir=self.tempdir, - delete=False, - prefix="bom_", - suffix=os.path.basename(filename), - ) - output_file.write(b"\xef\xbb\xbf") - with open(filename, "rb") as file_binary: - output_file.write(file_binary.read()) - output_file.close() - return output_file.name - - def test_add_bom(self): - """Make sure add_bom does what we mean it to, i.e., test the test harness.""" - - def slurp_bin(filename): - with open(filename, "rb") as f: - return f.read() - - def slurp_text(filename, encoding): - with open(filename, "r", encoding=encoding) as f: - return f.read() - - base_file = write_file(self.tempdir / "add-bom-input.txt", "Random Text été") - bom_file = self.add_bom(base_file) - self.assertEqual( - slurp_text(base_file, "utf-8"), slurp_text(bom_file, "utf-8-sig") - ) - self.assertEqual( - slurp_text(bom_file, "utf-8"), "\ufeff" + slurp_text(base_file, "utf-8") - ) - self.assertNotEqual(slurp_bin(base_file), slurp_bin(bom_file)) - self.assertEqual(b"\xef\xbb\xbf" + slurp_bin(base_file), slurp_bin(bom_file)) - - bom_file_pathlib = self.add_bom(pathlib.Path(base_file)) - self.assertEqual( - slurp_text(base_file, "utf-8"), slurp_text(bom_file_pathlib, "utf-8-sig") - ) - - -if __name__ == "__main__": +#!/usr/bin/env python + +""" +Unit test suite for the readalongs align CLI command +""" + +import os +import pathlib +import tempfile +from os.path import exists, join +from unittest import main + +from basic_test_case import BasicTestCase +from lxml.html import fromstring +from sound_swallower_stub import SoundSwallowerStub + +from readalongs._version import READALONG_FILE_FORMAT_VERSION, VERSION +from readalongs.cli import align, langs + + +def write_file(filename: str, file_contents: str) -> str: + """Write file_contents to file filename, and return its name (filename)""" + with open(filename, mode="w", encoding="utf8") as f: + f.write(file_contents) + return filename + + +class TestAlignCli(BasicTestCase): + """Unit test suite for the readalongs align CLI command""" + + def test_invoke_align(self): + """Basic readalongs align invocation and some variants""" + output = join(self.tempdir, "output") + with open("image-for-page1.jpg", "wb"): + pass + # Run align from plain text + results = self.runner.invoke( + align, + [ + "-s", + "-o", + "vtt", + "-o", # tests that we can use -o more than once + "srt:TextGrid,eaf", # tests that we can give -o multiple values, separated by : or , + "-l", + "fra", + "--align-mode", + "auto", + "--config", + join(self.data_dir, "sample-config.json"), + self.add_bom(join(self.data_dir, "ej-fra.txt")), + join(self.data_dir, "ej-fra.m4a"), + output, + ], + ) + # print(results.output) + self.assertEqual(results.exit_code, 0) + expected_output_files = [ + "output.readalong", + "output.m4a", + "index.html", + "output.TextGrid", + "output.eaf", + "output_sentences.srt", + "output_sentences.vtt", + "output_words.srt", + "output_words.vtt", + "readme.txt", + ] + for f in expected_output_files: + self.assertTrue( + exists(join(output, f)), f"successful alignment should have created {f}" + ) + with open(join(output, "index.html"), encoding="utf8") as f: + self.assertIn( + '', + 'This ', + 'is ', + 'some ', + 'text ', + 'that ', + 'we ', + 'will ', + 'run ', + 'through ', + 'the ', + 'English ', + 'lexicon ', + 'grapheme ', + 'to ', + 'morpheme ', + 'approach', + ".", + ) + ) + + tokenized_file = join( + self.tempdir, "eng-output", "tempfiles", "eng-output.g2p.readalong" + ) + with open(tokenized_file, "r", encoding="utf8") as f: + tok_output = f.read() + + self.assertIn(g2p_ref, tok_output) + + def test_invalid_config(self): + """unit testing for invalid config specifications""" + + # --config parameter needs to be .json, text with .txt instead + result = self.runner.invoke( + align, + [ + "--config", + join(self.data_dir, "fra.txt"), + join(self.data_dir, "fra.txt"), + join(self.data_dir, "noise.mp3"), + join(self.tempdir, "out-invalid-config-1"), + ], + ) + self.assertIn("must be in JSON format", result.stdout) + + # --config parameters needs to contain valid json, test with garbage + config_file = join(self.tempdir, "bad-config.json") + with open(config_file, "w", encoding="utf8") as f: + print("not valid json", file=f) + result = self.runner.invoke( + align, + [ + "--config", + config_file, + join(self.data_dir, "fra.txt"), + join(self.data_dir, "noise.mp3"), + join(self.tempdir, "out-invalid-config-2"), + ], + ) + self.assertIn("is not in valid JSON format", result.stdout) + + def test_bad_anchors(self): + """Make sure invalid anchors yield appropriate errors""" + + xml_text = """ +

+ Bonjour. +

+ """ % ( + READALONG_FILE_FORMAT_VERSION, + VERSION, + ) + xml_file = join(self.tempdir, "bad-anchor.readalong") + with open(xml_file, "w", encoding="utf8") as f: + print(xml_text, file=f) + bad_anchors_result = self.runner.invoke( + align, + [ + xml_file, + join(self.data_dir, "noise.mp3"), + join(self.tempdir, "out-bad-anchors"), + ], + ) + for msg in [ + 'missing "time" attribute', + 'invalid "time" attribute "invalid"', + "Could not parse all anchors", + "Aborting.", + ]: + self.assertIn(msg, bad_anchors_result.stdout) + + def test_misc_align_errors(self): + """Test calling readalongs align with misc CLI errors""" + results = self.runner.invoke( + align, + [ + join(self.data_dir, "ej-fra.txt"), + join(self.data_dir, "ej-fra.m4a"), + join(self.tempdir, "out-missing-l"), + ], + ) + self.assertNotEqual(results.exit_code, 0) + self.assertIn("No input language specified", results.output) + + with SoundSwallowerStub("[NOISE]:0:1"): + results = self.runner.invoke( + align, + [ + join(self.data_dir, "fra-prepared.readalong"), + join(self.data_dir, "noise.mp3"), + join(self.tempdir, "noise-only"), + ], + ) + self.assertNotEqual(results.exit_code, 0) + self.assertIn("produced 0 segments", results.output) + + with SoundSwallowerStub( + "[NOISE]:0:1", "w0:1:1000", ":1000:1100", "w1:1100:2000" + ): + results = self.runner.invoke( + align, + [ + join(self.data_dir, "ej-fra.readalong"), + join(self.data_dir, "ej-fra.m4a"), + join(self.tempdir, "two-words"), + ], + ) + # print(results.output) + # We don't check results.exit_code since that's a soft warning, not a hard error + self.assertIn("produced 2 segments", results.output) + self.assertIn( + "Alignment produced a different number of segments and tokens than were in the input.", + results.output, + ) + + def test_infer_plain_text_or_xml(self): + """align -i is obsolete, now we infer plain text vs XML; test that!""" + + # plain text with guess by contents + infile1 = write_file(join(self.tempdir, "infile1"), "some plain text") + with SoundSwallowerStub("word:0:1"): + results = self.runner.invoke( + align, + [ + infile1, + join(self.data_dir, "noise.mp3"), + join(self.tempdir, "outdir1"), + ], + ) + self.assertNotEqual(results.exit_code, 0) + # This error message confirms it's being processed as plain text + self.assertIn("No input language specified for plain text", results.output) + + # plain text by extension + infile2 = write_file(join(self.tempdir, "infile2.txt"), "blah blah", + ) + ) + with SoundSwallowerStub("word:0:1"): + results = self.runner.invoke( + align, + [ + infile3, + join(self.data_dir, "noise.mp3"), + join(self.tempdir, "outdir3"), + ], + ) + self.assertEqual(results.exit_code, 0) + + # XML with guess by contents, but with content error + infile4 = write_file( + join(self.tempdir, "infile4"), + "blah blah", + ) + with SoundSwallowerStub("word:0:1"): + results = self.runner.invoke( + align, + [ + infile4, + join(self.data_dir, "noise.mp3"), + join(self.tempdir, "outdir4"), + ], + ) + self.assertNotEqual(results.exit_code, 0) + self.assertIn("Error parsing XML", results.output) + + # XML by file extension + infile5 = write_file(join(self.tempdir, "infile5.readalong"), "Not XML!") + with SoundSwallowerStub("word:0:1"): + results = self.runner.invoke( + align, + [ + infile5, + join(self.data_dir, "noise.mp3"), + join(self.tempdir, "outdir5"), + ], + ) + self.assertNotEqual(results.exit_code, 0) + self.assertIn("Error parsing XML", results.output) + + def test_obsolete_switches(self): + # Giving -i switch generates an obsolete-switch error message + with SoundSwallowerStub("word:0:1"): + results = self.runner.invoke( + align, + [ + "-i", + join(self.data_dir, "fra.txt"), + join(self.data_dir, "noise.mp3"), + join(self.tempdir, "outdir6"), + ], + ) + self.assertNotEqual(results.exit_code, 0) + self.assertIn("is obsolete.", results.output) + + # Giving --g2p-verbose switch generates an obsolete-switch error message + with SoundSwallowerStub("word:0:1"): + results = self.runner.invoke( + align, + [ + "--g2p-verbose", + join(self.data_dir, "fra.txt"), + join(self.data_dir, "noise.mp3"), + join(self.tempdir, "outdir7"), + ], + ) + self.assertNotEqual(results.exit_code, 0) + self.assertIn("is obsolete.", results.output) + + # Giving --g2p-fallback switch generates an obsolete-switch error message + with SoundSwallowerStub("word:0:1"): + results = self.runner.invoke( + align, + [ + "--g2p-fallback", + "fra:end:und", + join(self.data_dir, "fra.txt"), + join(self.data_dir, "noise.mp3"), + join(self.tempdir, "outdir8"), + ], + ) + self.assertNotEqual(results.exit_code, 0) + self.assertIn("is obsolete.", results.output) + + def test_oo_option(self): + """Exercise the hidden -oo / --output-orth option""" + with SoundSwallowerStub("word:0:1"): + results = self.runner.invoke( + align, + [ + "-oo", + "eng-arpabet", + join(self.data_dir, "ej-fra.readalong"), + join(self.data_dir, "noise.mp3"), + join(self.tempdir, "outdir9"), + ], + ) + self.assertEqual(results.exit_code, 0) + + with SoundSwallowerStub("word:0:1"): + results = self.runner.invoke( + align, + [ + "-oo", + "not-an-alphabet", + join(self.data_dir, "ej-fra.readalong"), + join(self.data_dir, "noise.mp3"), + join(self.tempdir, "outdir10"), + ], + ) + self.assertNotEqual(results.exit_code, 0) + self.assertIn("Could not g2p", results.output) + self.assertIn("not-an-alphabet", results.output) + + with SoundSwallowerStub("word:0:1"): + results = self.runner.invoke( + align, + [ + "-oo", + "dan-ipa", + join(self.data_dir, "ej-fra.readalong"), + join(self.data_dir, "noise.mp3"), + join(self.tempdir, "outdir11"), + ], + ) + self.assertNotEqual(results.exit_code, 0) + self.assertIn("Could not g2p", results.output) + self.assertIn("no path", results.output) + + with SoundSwallowerStub("word:0:1"): + results = self.runner.invoke( + align, + [ + "-oo", + "dan-ipa", + "-l", + "eng", + join(self.data_dir, "fra.txt"), + join(self.data_dir, "noise.mp3"), + join(self.tempdir, "outdir12"), + ], + ) + self.assertNotEqual(results.exit_code, 0) + self.assertIn("Could not g2p", results.output) + self.assertIn('no path from "eng" to ', results.output) + + def add_bom(self, filename): + """Create a temporary copy of filename with the a BOM in it, in self.tempdir""" + # We pepper calls to add_bom() around the test suite, to make sure all + # different kinds of input files are accepted with and without a BOM + output_file = tempfile.NamedTemporaryFile( + mode="wb", + dir=self.tempdir, + delete=False, + prefix="bom_", + suffix=os.path.basename(filename), + ) + output_file.write(b"\xef\xbb\xbf") + with open(filename, "rb") as file_binary: + output_file.write(file_binary.read()) + output_file.close() + return output_file.name + + def test_add_bom(self): + """Make sure add_bom does what we mean it to, i.e., test the test harness.""" + + def slurp_bin(filename): + with open(filename, "rb") as f: + return f.read() + + def slurp_text(filename, encoding): + with open(filename, "r", encoding=encoding) as f: + return f.read() + + base_file = write_file(self.tempdir / "add-bom-input.txt", "Random Text été") + bom_file = self.add_bom(base_file) + self.assertEqual( + slurp_text(base_file, "utf-8"), slurp_text(bom_file, "utf-8-sig") + ) + self.assertEqual( + slurp_text(bom_file, "utf-8"), "\ufeff" + slurp_text(base_file, "utf-8") + ) + self.assertNotEqual(slurp_bin(base_file), slurp_bin(bom_file)) + self.assertEqual(b"\xef\xbb\xbf" + slurp_bin(base_file), slurp_bin(bom_file)) + + bom_file_pathlib = self.add_bom(pathlib.Path(base_file)) + self.assertEqual( + slurp_text(base_file, "utf-8"), slurp_text(bom_file_pathlib, "utf-8-sig") + ) + + +if __name__ == "__main__": main() diff --git a/test/test_dtd.py b/test/test_dtd.py index fde55957..3e36e749 100644 --- a/test/test_dtd.py +++ b/test/test_dtd.py @@ -9,7 +9,7 @@ from lxml import etree DTDPATH = os.path.join( - dirname(__file__), "..", "readalongs", "static", "read-along-1.0.dtd" + dirname(__file__), "..", "readalongs", "static", "read-along-1.1.dtd" ) VALID_RAS = """ diff --git a/test/test_g2p_cli.py b/test/test_g2p_cli.py index 953c1a09..830ce549 100755 --- a/test/test_g2p_cli.py +++ b/test/test_g2p_cli.py @@ -9,6 +9,7 @@ from basic_test_case import BasicTestCase from lxml import etree from sound_swallower_stub import SoundSwallowerStub +from test_make_xml_cli import updateFormatVersion, updateStudioVersion from readalongs.align import align_audio from readalongs.cli import align, g2p, make_xml, tokenize @@ -76,9 +77,13 @@ def test_mixed_langs(self): ref_file, encoding="utf8" ) as ref_f: self.maxDiff = None + # update version info + ref_list = list(ref_f) + ref_list[1] = updateFormatVersion(ref_list[1]) + ref_list[2] = updateStudioVersion(ref_list[2]) self.assertListEqual( list(output_f), - list(ref_f), + ref_list, f"output {g2p_file} and reference {ref_file} differ.", ) diff --git a/test/test_make_xml_cli.py b/test/test_make_xml_cli.py index a36a23db..9a448532 100755 --- a/test/test_make_xml_cli.py +++ b/test/test_make_xml_cli.py @@ -10,10 +10,18 @@ from basic_test_case import BasicTestCase +# from readalongs.log import LOGGER +from readalongs._version import READALONG_FILE_FORMAT_VERSION, VERSION from readalongs.align import create_input_ras, create_ras_from_text from readalongs.cli import align, make_xml -# from readalongs.log import LOGGER + +def updateFormatVersion(input): + return input.replace("{{format_version}}", READALONG_FILE_FORMAT_VERSION) + + +def updateStudioVersion(input): + return input.replace("{{studio_version}}", VERSION) class TestMakeXMLCli(BasicTestCase): @@ -90,9 +98,13 @@ def test_output_correct(self): ref_file, encoding="utf8" ) as ref_f: self.maxDiff = None + # update version info + ref_list = list(ref_f) + ref_list[1] = updateFormatVersion(ref_list[1]) + ref_list[2] = updateStudioVersion(ref_list[2]) self.assertListEqual( list(output_f), - list(ref_f), + ref_list, f"output {xml_file} and reference {ref_file} differ.", ) diff --git a/test/test_misc.py b/test/test_misc.py index 340ea948..9ce219ec 100755 --- a/test/test_misc.py +++ b/test/test_misc.py @@ -13,7 +13,7 @@ from pep440 import is_canonical from test_dna_utils import segments_from_pairs -from readalongs._version import VERSION +from readalongs._version import READALONG_FILE_FORMAT_VERSION, VERSION from readalongs.align import split_silences from readalongs.log import LOGGER, capture_logs from readalongs.text.util import ( @@ -91,7 +91,8 @@ def test_split_silences(self): self.assertEqual(words, ref) def test_get_attrib_recursive(self): - raw_xml = """ + raw_xml = """ +

stuffnonsense

stuffnonsense

@@ -103,8 +104,12 @@ def test_get_attrib_recursive(self):

stuffnonsense!

- """ + """ % ( + READALONG_FILE_FORMAT_VERSION, + VERSION, + ) xml = parse_xml(raw_xml) + for i, s, lang in zip( itertools.count(), xml.xpath(".//s"), diff --git a/test/test_web_api.py b/test/test_web_api.py index dab9ba4f..9d92e3f1 100755 --- a/test/test_web_api.py +++ b/test/test_web_api.py @@ -7,6 +7,7 @@ from basic_test_case import BasicTestCase +from readalongs._version import READALONG_FILE_FORMAT_VERSION, VERSION from readalongs.log import LOGGER from readalongs.text.add_ids_to_xml import add_ids from readalongs.text.convert_xml import convert_xml @@ -31,7 +32,12 @@ def API_CLIENT(self): def slurp_data_file(self, filename: str) -> str: """Convenience function to slurp a whole file in self.data_dir""" with open(os.path.join(self.data_dir, filename), encoding="utf8") as f: - return f.read().strip() + return ( + f.read() + .strip() + .replace("{{format_version}}", READALONG_FILE_FORMAT_VERSION) + .replace("{{studio_version}}", VERSION) + ) def test_assemble_from_plain_text(self): # Test the assemble endpoint with plain text @@ -198,9 +204,9 @@ def test_debug(self): self.assertIsNone(content["g2ped"]) hej_verden_xml = dedent( - """\ - - + """ + +
@@ -215,6 +221,7 @@ def test_debug(self): """ + % (READALONG_FILE_FORMAT_VERSION, VERSION) ) def test_convert_to_TextGrid(self): @@ -431,9 +438,9 @@ def test_cleanup_even_if_error(self): # that exception in a sane way, with a 422 status code, while # also making sure the temporary directory gets deleted. overlap_xml = dedent( - """\ - - + """ + +
@@ -448,6 +455,7 @@ def test_cleanup_even_if_error(self): """ + % (READALONG_FILE_FORMAT_VERSION, VERSION) ) request = { "dur": 83.1,