From 7f650f6160d44ae6270c6ce6d7bf10f685d51bb2 Mon Sep 17 00:00:00 2001 From: Delasie Torkornoo Date: Thu, 10 Oct 2024 09:36:04 -0400 Subject: [PATCH 1/2] feat: bump RAS format to 1.3 --- docs/cli-guide.md | 2 +- readalongs/_version.py | 5 +- readalongs/align.py | 8 +- readalongs/static/read-along-1.3.dtd | 108 +++++++++++++++++++++++++++ readalongs/text/make_package.py | 10 ++- readalongs/text/util.py | 4 +- readalongs/web_api.py | 2 +- test/data/ras-dtd-1.1.readalong | 16 ++-- test/data/ras-dtd-1.3.readalong | 35 +++++++++ test/test_dtd.py | 3 +- 10 files changed, 174 insertions(+), 19 deletions(-) create mode 100644 readalongs/static/read-along-1.3.dtd create mode 100644 test/data/ras-dtd-1.3.readalong diff --git a/docs/cli-guide.md b/docs/cli-guide.md index c7e27e40..198119e6 100644 --- a/docs/cli-guide.md +++ b/docs/cli-guide.md @@ -67,7 +67,7 @@ The format of the generated XML is based on [TEI Lite](https://tei-c.org/guidelines/customization/lite/) but is considerably simplified. The DTD (document type definition) can be found in the ReadAlong Studio source code under -`readalongs/static/read-along-1.2.dtd`. +`readalongs/static/read-along-1.3.dtd`. (dna)= diff --git a/readalongs/_version.py b/readalongs/_version.py index a17ef292..e364245b 100644 --- a/readalongs/_version.py +++ b/readalongs/_version.py @@ -1,3 +1,6 @@ VERSION = "1.1.0" -READALONG_FILE_FORMAT_VERSION = "1.2" +READALONG_FILE_FORMAT_VERSION = "1.3" + + +CURRENT_WEB_APP_VERSION = "1.5.x" diff --git a/readalongs/align.py b/readalongs/align.py index 88df880a..99da749b 100644 --- a/readalongs/align.py +++ b/readalongs/align.py @@ -181,7 +181,7 @@ def parse_and_make_xml( """Parse XML input and run tokenization and G2P. Args: - xml_path (str): Path to input in ReadAlong XML format (see static/read-along-1.2.dtd) + xml_path (str): Path to input in ReadAlong XML format (see static/read-along-1.3.dtd) config (dict): Optional; ReadAlong-Studio configuration to use save_temps (str): Optional; Save temporary files, by default None verbose_g2p_warnings (boolean): Optional; display all g2p errors and warnings @@ -574,7 +574,7 @@ def align_audio( """Align an XML input file to an audio file. Args: - xml_path (str): Path to input file in ReadAlong XML format (see static/read-along-1.2.dtd) + xml_path (str): Path to input file in ReadAlong XML format (see static/read-along-1.3.dtd) audio_path (str): Path to audio input. Must be in a format supported by ffmpeg unit (str): Optional; Element to create alignments for, by default 'w' bare (boolean): Optional; @@ -1192,7 +1192,7 @@ def convert_to_xhtml(tokenized_xml, title="Book"): def create_ras_from_text(lines: Iterable[str], text_languages=Sequence[str]) -> str: - """Create input xml in ReadAlong XML format (see static/read-along-1.2.dtd) + """Create input xml in ReadAlong XML format (see static/read-along-1.3.dtd) Uses the line sequence to infer paragraph and sentence structure from plain text: Assumes a double blank line marks a page break, and a single blank line marks a paragraph break. @@ -1240,7 +1240,7 @@ def create_ras_from_text(lines: Iterable[str], text_languages=Sequence[str]) -> def create_input_ras(**kwargs): - """Create input xml in ReadAlong XML format (see static/read-along-1.2.dtd) + """Create input xml in ReadAlong XML format (see static/read-along-1.3.dtd) Uses readlines to infer paragraph and sentence structure from plain text. Assumes a double blank line marks a page break, and a single blank line marks a paragraph break. diff --git a/readalongs/static/read-along-1.3.dtd b/readalongs/static/read-along-1.3.dtd new file mode 100644 index 00000000..749fb0d0 --- /dev/null +++ b/readalongs/static/read-along-1.3.dtd @@ -0,0 +1,108 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/readalongs/text/make_package.py b/readalongs/text/make_package.py index 8819a136..bbb20f97 100644 --- a/readalongs/text/make_package.py +++ b/readalongs/text/make_package.py @@ -20,13 +20,17 @@ from lxml import etree -from readalongs._version import VERSION +from readalongs._version import CURRENT_WEB_APP_VERSION, VERSION from readalongs.log import LOGGER from readalongs.text.util import parse_xml -JS_BUNDLE_URL = "https://unpkg.com/@readalongs/web-component@^1.4.0/dist/bundle.js" +JS_BUNDLE_URL = ( + "https://unpkg.com/@readalongs/web-component@%s/dist/bundle.js" + % CURRENT_WEB_APP_VERSION +) FONTS_BUNDLE_URL = ( - "https://unpkg.com/@readalongs/web-component@^1.4.0/dist/fonts.b64.css" + "https://unpkg.com/@readalongs/web-component@%s/dist/fonts.b64.css" + % CURRENT_WEB_APP_VERSION ) BASIC_HTML = """ diff --git a/readalongs/text/util.py b/readalongs/text/util.py index 68cf2980..add2b31a 100644 --- a/readalongs/text/util.py +++ b/readalongs/text/util.py @@ -20,13 +20,11 @@ from lxml import etree -from readalongs._version import VERSION +from readalongs._version import CURRENT_WEB_APP_VERSION, VERSION # removed "try: unicode() except" block (was for Python 2), but this file uses unicode() # too many times, so define it anyway. unicode = str -# todo: sync with web component major and minor releases -CURRENT_WEB_APP_VERSION = "1.4.x" def ensure_dirs(path): diff --git a/readalongs/web_api.py b/readalongs/web_api.py index 30ff8126..8f4e858f 100644 --- a/readalongs/web_api.py +++ b/readalongs/web_api.py @@ -78,7 +78,7 @@ # Call get_langs() when the server loads to load the languages into memory LANGS = get_langs() # Get the DTD -DTDPATH = os.path.join(os.path.dirname(__file__), "static", "read-along-1.2.dtd") +DTDPATH = os.path.join(os.path.dirname(__file__), "static", "read-along-1.3.dtd") with open(DTDPATH) as dtdfh: DTD = etree.DTD(dtdfh) diff --git a/test/data/ras-dtd-1.1.readalong b/test/data/ras-dtd-1.1.readalong index 8e4b9c2a..2a5c3115 100644 --- a/test/data/ras-dtd-1.1.readalong +++ b/test/data/ras-dtd-1.1.readalong @@ -7,17 +7,23 @@

Bonjour. Je m'appelle Éric Joanis. - Je suis programmeur au sein de l'équipe des technologies pour les langues autochtones au CNRC. + Je suis programmeur au sein de l'équipe des technologies pour les langues + autochtones au CNRC.

- J'ai fait une bonne partie de ma carrière en traduction automatique statistique, mais maintenant cette approche est déclassée par l'apprentissage profond. - En ce moment je travaille à l'alignement du hansard du Nunavut pour produire un corpus bilingue anglais-inuktitut. - Ce corpus permettra d'entraîner la TA, neuronale ou statistique, ainsi que d'autres applications de traitement du langage naturel. + J'ai fait une bonne partie de ma carrière en traduction automatique + statistique, mais maintenant cette approche est déclassée par + l'apprentissage profond. + En ce moment je travaille à l'alignement du hansard du Nunavut pour produire + un corpus bilingue anglais-inuktitut. + Ce corpus permettra d'entraîner la TA, neuronale ou statistique, ainsi que + d'autres applications de traitement du langage naturel.

- En parallèle, j'aide à écrire des tests pour rendre le ReadAlong-Studio plus robuste. + En parallèle, j'aide à écrire des tests pour rendre le ReadAlong-Studio plus + robuste.

diff --git a/test/data/ras-dtd-1.3.readalong b/test/data/ras-dtd-1.3.readalong new file mode 100644 index 00000000..5f305718 --- /dev/null +++ b/test/data/ras-dtd-1.3.readalong @@ -0,0 +1,35 @@ + + + + + + + + + +
+ +

+ Bonjour. + + Kwei. + + Hello. + Je m'appelle Éric Joanis. + Éric + Joanis nindijinikàz. + My + name is Éric Joanis. +

+
+ + +
+
diff --git a/test/test_dtd.py b/test/test_dtd.py index 78483c77..aeffaffe 100644 --- a/test/test_dtd.py +++ b/test/test_dtd.py @@ -11,7 +11,7 @@ from readalongs.text.util import load_xml DTDPATH = os.path.join( - dirname(__file__), "..", "readalongs", "static", "read-along-1.2.dtd" + dirname(__file__), "..", "readalongs", "static", "read-along-1.3.dtd" ) VALID_RAS = """ @@ -75,6 +75,7 @@ def test_backwards_compatibility(self): "ras-dtd-1.0.readalong", "ras-dtd-1.1.readalong", "ras-dtd-1.2.readalong", + "ras-dtd-1.3.readalong", ] for name in versions: path = os.path.join(dirname(__file__), "data", name.strip()) From 4cf9841469e52c10bddb85e808bf405a1aad54f8 Mon Sep 17 00:00:00 2001 From: Delasie Torkornoo Date: Thu, 10 Oct 2024 10:45:53 -0400 Subject: [PATCH 2/2] fix: xmlns added to s tag ( RAS format to 1.3) --- readalongs/static/read-along-1.3.dtd | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/readalongs/static/read-along-1.3.dtd b/readalongs/static/read-along-1.3.dtd index 749fb0d0..818caef6 100644 --- a/readalongs/static/read-along-1.3.dtd +++ b/readalongs/static/read-along-1.3.dtd @@ -76,7 +76,8 @@ time CDATA #IMPLIED dur CDATA #IMPLIED annotation-id CDATA #IMPLIED - sentence-id CDATA #IMPLIED> + sentence-id CDATA #IMPLIED + xmlns CDATA #IMPLIED>