ReadAlongs · deltork · Oct 10, 2024 · Oct 10, 2024 · joanise · Oct 10, 2024
diff --git a/docs/cli-guide.md b/docs/cli-guide.md
@@ -67,7 +67,7 @@ The format of the generated XML is based on [TEI
 Lite](https://tei-c.org/guidelines/customization/lite/) but is
 considerably simplified.  The DTD (document type definition) can be
 found in the ReadAlong Studio source code under
-`readalongs/static/read-along-1.2.dtd`.
+`readalongs/static/read-along-1.3.dtd`.
 
 (dna)=
 

diff --git a/readalongs/_version.py b/readalongs/_version.py
@@ -1,3 +1,6 @@
 VERSION = "1.1.0"
 
-READALONG_FILE_FORMAT_VERSION = "1.2"
+READALONG_FILE_FORMAT_VERSION = "1.3"
+
+
+CURRENT_WEB_APP_VERSION = "1.5.x"
diff --git a/readalongs/align.py b/readalongs/align.py
@@ -181,7 +181,7 @@ def parse_and_make_xml(
     """Parse XML input and run tokenization and G2P.
 
     Args:
-        xml_path (str): Path to input in ReadAlong XML format (see static/read-along-1.2.dtd)
+        xml_path (str): Path to input in ReadAlong XML format (see static/read-along-1.3.dtd)
         config (dict): Optional; ReadAlong-Studio configuration to use
         save_temps (str): Optional; Save temporary files, by default None
         verbose_g2p_warnings (boolean): Optional; display all g2p errors and warnings
@@ -574,7 +574,7 @@ def align_audio(
     """Align an XML input file to an audio file.
 
     Args:
-        xml_path (str): Path to input file in ReadAlong XML format (see static/read-along-1.2.dtd)
+        xml_path (str): Path to input file in ReadAlong XML format (see static/read-along-1.3.dtd)
         audio_path (str): Path to audio input. Must be in a format supported by ffmpeg
         unit (str): Optional; Element to create alignments for, by default 'w'
         bare (boolean): Optional;
@@ -1192,7 +1192,7 @@ def convert_to_xhtml(tokenized_xml, title="Book"):
 
 
 def create_ras_from_text(lines: Iterable[str], text_languages=Sequence[str]) -> str:
-    """Create input xml in ReadAlong XML format (see static/read-along-1.2.dtd)
+    """Create input xml in ReadAlong XML format (see static/read-along-1.3.dtd)
         Uses the line sequence to infer paragraph and sentence structure from plain text:
         Assumes a double blank line marks a page break, and a single blank line
         marks a paragraph break.
@@ -1240,7 +1240,7 @@ def create_ras_from_text(lines: Iterable[str], text_languages=Sequence[str]) ->
 
 
 def create_input_ras(**kwargs):
-    """Create input xml in ReadAlong XML format (see static/read-along-1.2.dtd)
+    """Create input xml in ReadAlong XML format (see static/read-along-1.3.dtd)
         Uses readlines to infer paragraph and sentence structure from plain text.
         Assumes a double blank line marks a page break, and a single blank line
         marks a paragraph break.

diff --git a/readalongs/static/read-along-1.3.dtd b/readalongs/static/read-along-1.3.dtd
@@ -0,0 +1,109 @@
+<!-- VERSION: 1.3 -->
+<!ELEMENT read-along (meta|text|body|div|span|anchor|silence|graphic|p|s|w)*>
+<!ATTLIST read-along
+  use-assets-folder CDATA #IMPLIED
+  href CDATA #IMPLIED
+  audio CDATA #IMPLIED
+  xml:lang CDATA #IMPLIED
+  language CDATA #IMPLIED
+  lang CDATA #IMPLIED
+  version CDATA #IMPLIED
+  xmlns CDATA #IMPLIED>
+
+<!ELEMENT text (body|div|span|anchor|silence|graphic|p|s|w)*>
+<!ATTLIST text
+  xml:lang CDATA #IMPLIED
+  lang CDATA #IMPLIED
+  fallback-langs CDATA #IMPLIED
+  id CDATA #IMPLIED>
+
+<!ELEMENT body (div|span|anchor|silence|graphic|p|s|w)*>
+<!ATTLIST body
+  xml:lang CDATA #IMPLIED
+  lang CDATA #IMPLIED
+  id CDATA #IMPLIED>
+
+<!ELEMENT anchor EMPTY>
+<!ATTLIST anchor time CDATA #REQUIRED>
+
+<!ELEMENT silence EMPTY>
+<!ATTLIST silence dur CDATA #REQUIRED>
+
+<!ELEMENT graphic EMPTY>
+<!ATTLIST graphic
+  url CDATA #REQUIRED
+  id CDATA #IMPLIED>
+
+<!ELEMENT div (#PCDATA|div|span|anchor|silence|graphic|p|s|w)*>
+<!ATTLIST div
+  xml:lang CDATA #IMPLIED
+  lang CDATA #IMPLIED
+  id CDATA #IMPLIED
+  class CDATA #IMPLIED
+  type CDATA #IMPLIED
+  do-not-align CDATA #IMPLIED
+  time CDATA #IMPLIED
+  dur CDATA #IMPLIED>
+
+<!ELEMENT span (#PCDATA|div|span|anchor|silence|graphic|p|s|w)*>
+<!ATTLIST span
+  xml:lang CDATA #IMPLIED
+  lang CDATA #IMPLIED
+  id CDATA #IMPLIED
+  class CDATA #IMPLIED
+  type CDATA #IMPLIED
+  do-not-align CDATA #IMPLIED
+  time CDATA #IMPLIED
+  dur CDATA #IMPLIED>
+
+<!ELEMENT p (#PCDATA|span|anchor|silence|s|w)*>
+<!ATTLIST p
+  xml:lang CDATA #IMPLIED
+  lang CDATA #IMPLIED
+  id CDATA #IMPLIED
+  class CDATA #IMPLIED
+  do-not-align CDATA #IMPLIED
+  time CDATA #IMPLIED
+  dur CDATA #IMPLIED>
+
+<!ELEMENT s (#PCDATA|span|anchor|silence|w)*>
+<!ATTLIST s
+  xml:lang CDATA #IMPLIED
+  lang CDATA #IMPLIED
+  id CDATA #IMPLIED
+  class CDATA #IMPLIED
+  do-not-align CDATA #IMPLIED
+  time CDATA #IMPLIED
+  dur CDATA #IMPLIED
+  annotation-id CDATA #IMPLIED
+  sentence-id CDATA #IMPLIED
+  xmlns CDATA #IMPLIED>
+
+<!ELEMENT w (#PCDATA|span|syl)*>
+<!ATTLIST w
+  xml:lang CDATA #IMPLIED
+  effective-g2p-lang CDATA #IMPLIED
+  lang CDATA #IMPLIED
+  id CDATA #IMPLIED
+  class CDATA #IMPLIED
+  do-not-align CDATA #IMPLIED
+  ARPABET CDATA #IMPLIED
+  time CDATA #IMPLIED
+  dur CDATA #IMPLIED>
+
+<!ELEMENT syl (#PCDATA|span)*>
+<!ATTLIST syl
+  xml:lang CDATA #IMPLIED
+  lang CDATA #IMPLIED
+  id CDATA #IMPLIED
+  class CDATA #IMPLIED
+  do-not-align CDATA #IMPLIED
+  ARPABET CDATA #IMPLIED
+  time CDATA #IMPLIED
+  dur CDATA #IMPLIED>
+
+<!ELEMENT meta EMPTY>
+<!ATTLIST meta
+  name CDATA #REQUIRED
+  content CDATA #REQUIRED
+  id CDATA #IMPLIED>
diff --git a/readalongs/text/make_package.py b/readalongs/text/make_package.py
@@ -20,13 +20,17 @@
 
 from lxml import etree
 
-from readalongs._version import VERSION
+from readalongs._version import CURRENT_WEB_APP_VERSION, VERSION
 from readalongs.log import LOGGER
 from readalongs.text.util import parse_xml
 
-JS_BUNDLE_URL = "https://unpkg.com/@readalongs/web-component@^1.4.0/dist/bundle.js"
+JS_BUNDLE_URL = (
+    "https://unpkg.com/@readalongs/web-component@%s/dist/bundle.js"
+    % CURRENT_WEB_APP_VERSION
+)
 FONTS_BUNDLE_URL = (
-    "https://unpkg.com/@readalongs/web-component@^1.4.0/dist/fonts.b64.css"
+    "https://unpkg.com/@readalongs/web-component@%s/dist/fonts.b64.css"
+    % CURRENT_WEB_APP_VERSION
 )
 
 BASIC_HTML = """

diff --git a/readalongs/text/util.py b/readalongs/text/util.py
@@ -20,13 +20,11 @@
 
 from lxml import etree
 
-from readalongs._version import VERSION
+from readalongs._version import CURRENT_WEB_APP_VERSION, VERSION
 
 # removed "try: unicode() except" block (was for Python 2), but this file uses unicode()
 # too many times, so define it anyway.
 unicode = str
-# todo: sync with web component major and minor releases
-CURRENT_WEB_APP_VERSION = "1.4.x"
 
 
 def ensure_dirs(path):

diff --git a/readalongs/web_api.py b/readalongs/web_api.py
@@ -78,7 +78,7 @@
 # Call get_langs() when the server loads to load the languages into memory
 LANGS = get_langs()
 # Get the DTD
-DTDPATH = os.path.join(os.path.dirname(__file__), "static", "read-along-1.2.dtd")
+DTDPATH = os.path.join(os.path.dirname(__file__), "static", "read-along-1.3.dtd")
 with open(DTDPATH) as dtdfh:
     DTD = etree.DTD(dtdfh)
 

diff --git a/test/data/ras-dtd-1.1.readalong b/test/data/ras-dtd-1.1.readalong
@@ -7,17 +7,23 @@
                 <p>
                     <s>Bonjour.</s>
                     <s>Je m'appelle Éric Joanis.</s>
-                    <s>Je suis programmeur au sein de l'équipe des technologies pour les langues autochtones au CNRC.</s>
+                    <s>Je suis programmeur au sein de l'équipe des technologies pour les langues
+                        autochtones au CNRC.</s>
                 </p>
             </div>
             <div type="page">
                 <p>
-                    <s>J'ai fait une bonne partie de ma carrière en traduction automatique statistique, mais maintenant cette approche est déclassée par l'apprentissage profond.</s>
-                    <s>En ce moment je travaille à l'alignement du hansard du Nunavut pour produire un corpus bilingue anglais-inuktitut.</s>
-                    <s>Ce corpus permettra d'entraîner la TA, neuronale ou statistique, ainsi que d'autres applications de traitement du langage naturel.</s>
+                    <s>J'ai fait une bonne partie de ma carrière en traduction automatique
+                        statistique, mais maintenant cette approche est déclassée par
+                        l'apprentissage profond.</s>
+                    <s>En ce moment je travaille à l'alignement du hansard du Nunavut pour produire
+                        un corpus bilingue anglais-inuktitut.</s>
+                    <s>Ce corpus permettra d'entraîner la TA, neuronale ou statistique, ainsi que
+                        d'autres applications de traitement du langage naturel.</s>
                 </p>
                 <p>
-                    <s>En parallèle, j'aide à écrire des tests pour rendre le ReadAlong-Studio plus robuste.</s>
+                    <s>En parallèle, j'aide à écrire des tests pour rendre le ReadAlong-Studio plus
+                        robuste.</s>
                 </p>
             </div>
         </body>

diff --git a/test/data/ras-dtd-1.3.readalong b/test/data/ras-dtd-1.3.readalong
@@ -0,0 +1,35 @@
+<?xml version='1.0' encoding='utf-8'?>
+<read-along version="1.3" xmlns="">
+  <meta name="generator" content="human made" id="meta0" />
+  <meta name="annotations-ids" content="translation1, translation2" id="meta1" />
+  <meta name="annotations-labels" content="Algonquin, English" id="meta2" />
+  <meta name="annotations-labels-eng" content="Algonquin, English" id="meta3" />
+  <meta name="annotations-labels-fra" content="algonquin, anglais" id="meta4" />
+  <text xml:lang="fra" id="t0">
+    <body id="t0b0">
+      <div type="page" id="t0b0d0" class="two-column-layout-page">
+        <graphic url="avatar.png" id="t0b0d0graphic0" />
+        <p id="t0b0d0p0">
+          <s id="t0b0d0p0s0"><w id="t0b0d0p0s0w0" time="0.455" dur="1.165">Bonjour</w>.</s>
+          <s do-not-align="true" xml:lang="otw" id="t0b0d0p0s0an01" annotation-id="translation1"
+            sentence-id="t0b0d0p0s0">
+            Kwei.</s>
+          <s do-not-align="true" xml:lang="eng" id="t0b0d0p0s0an02" annotation-id="translation2"
+            sentence-id="t0b0d0p0s0">
+            Hello.</s>
+          <s id="t0b0d0p0s1"><w id="t0b0d0p0s1w0" time="1.620" dur="0.070">Je</w> <w
+              id="t0b0d0p0s1w1" time="1.690" dur="0.070">m</w>'<w id="t0b0d0p0s1w2" time="1.760"
+              dur="0.240">appelle</w> <w id="t0b0d0p0s1w3" time="2.000" dur="1.705">Éric</w> <w
+              id="t0b0d0p0s1w4" time="3.705" dur="1.905">Joanis</w>.</s>
+          <s do-not-align="true" xml:lang="otw" id="t0b0d0p0s1an01" annotation-id="translation1"
+            sentence-id="t0b0d0p0s1">Éric
+            Joanis nindijinikàz.</s>
+          <s do-not-align="true" xml:lang="eng" id="t0b0d0p0s1an02" annotation-id="translation2"
+            sentence-id="t0b0d0p0s1">My
+            name is Éric Joanis.</s>
+        </p>
+      </div>
+
+    </body>
+  </text>
+</read-along>
diff --git a/test/test_dtd.py b/test/test_dtd.py
@@ -11,7 +11,7 @@
 from readalongs.text.util import load_xml
 
 DTDPATH = os.path.join(
-    dirname(__file__), "..", "readalongs", "static", "read-along-1.2.dtd"
+    dirname(__file__), "..", "readalongs", "static", "read-along-1.3.dtd"
 )
 
 VALID_RAS = """
@@ -75,6 +75,7 @@ def test_backwards_compatibility(self):
             "ras-dtd-1.0.readalong",
             "ras-dtd-1.1.readalong",
             "ras-dtd-1.2.readalong",
+            "ras-dtd-1.3.readalong",
         ]
         for name in versions:
             path = os.path.join(dirname(__file__), "data", name.strip())