Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: bump RAS format to 1.3 #247

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/cli-guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ The format of the generated XML is based on [TEI
Lite](https://tei-c.org/guidelines/customization/lite/) but is
considerably simplified. The DTD (document type definition) can be
found in the ReadAlong Studio source code under
`readalongs/static/read-along-1.2.dtd`.
`readalongs/static/read-along-1.3.dtd`.

(dna)=

Expand Down
5 changes: 4 additions & 1 deletion readalongs/_version.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
VERSION = "1.1.0"

READALONG_FILE_FORMAT_VERSION = "1.2"
READALONG_FILE_FORMAT_VERSION = "1.3"


CURRENT_WEB_APP_VERSION = "1.5.x"
8 changes: 4 additions & 4 deletions readalongs/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ def parse_and_make_xml(
"""Parse XML input and run tokenization and G2P.

Args:
xml_path (str): Path to input in ReadAlong XML format (see static/read-along-1.2.dtd)
xml_path (str): Path to input in ReadAlong XML format (see static/read-along-1.3.dtd)
config (dict): Optional; ReadAlong-Studio configuration to use
save_temps (str): Optional; Save temporary files, by default None
verbose_g2p_warnings (boolean): Optional; display all g2p errors and warnings
Expand Down Expand Up @@ -574,7 +574,7 @@ def align_audio(
"""Align an XML input file to an audio file.

Args:
xml_path (str): Path to input file in ReadAlong XML format (see static/read-along-1.2.dtd)
xml_path (str): Path to input file in ReadAlong XML format (see static/read-along-1.3.dtd)
audio_path (str): Path to audio input. Must be in a format supported by ffmpeg
unit (str): Optional; Element to create alignments for, by default 'w'
bare (boolean): Optional;
Expand Down Expand Up @@ -1192,7 +1192,7 @@ def convert_to_xhtml(tokenized_xml, title="Book"):


def create_ras_from_text(lines: Iterable[str], text_languages=Sequence[str]) -> str:
"""Create input xml in ReadAlong XML format (see static/read-along-1.2.dtd)
"""Create input xml in ReadAlong XML format (see static/read-along-1.3.dtd)
Uses the line sequence to infer paragraph and sentence structure from plain text:
Assumes a double blank line marks a page break, and a single blank line
marks a paragraph break.
Expand Down Expand Up @@ -1240,7 +1240,7 @@ def create_ras_from_text(lines: Iterable[str], text_languages=Sequence[str]) ->


def create_input_ras(**kwargs):
"""Create input xml in ReadAlong XML format (see static/read-along-1.2.dtd)
"""Create input xml in ReadAlong XML format (see static/read-along-1.3.dtd)
Uses readlines to infer paragraph and sentence structure from plain text.
Assumes a double blank line marks a page break, and a single blank line
marks a paragraph break.
Expand Down
109 changes: 109 additions & 0 deletions readalongs/static/read-along-1.3.dtd
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
<!-- VERSION: 1.3 -->
deltork marked this conversation as resolved.
Show resolved Hide resolved
<!ELEMENT read-along (meta|text|body|div|span|anchor|silence|graphic|p|s|w)*>
<!ATTLIST read-along
use-assets-folder CDATA #IMPLIED
href CDATA #IMPLIED
audio CDATA #IMPLIED
xml:lang CDATA #IMPLIED
language CDATA #IMPLIED
lang CDATA #IMPLIED
version CDATA #IMPLIED
xmlns CDATA #IMPLIED>

<!ELEMENT text (body|div|span|anchor|silence|graphic|p|s|w)*>
<!ATTLIST text
xml:lang CDATA #IMPLIED
lang CDATA #IMPLIED
fallback-langs CDATA #IMPLIED
id CDATA #IMPLIED>

<!ELEMENT body (div|span|anchor|silence|graphic|p|s|w)*>
<!ATTLIST body
xml:lang CDATA #IMPLIED
lang CDATA #IMPLIED
id CDATA #IMPLIED>

<!ELEMENT anchor EMPTY>
<!ATTLIST anchor time CDATA #REQUIRED>

<!ELEMENT silence EMPTY>
<!ATTLIST silence dur CDATA #REQUIRED>

<!ELEMENT graphic EMPTY>
<!ATTLIST graphic
url CDATA #REQUIRED
id CDATA #IMPLIED>

<!ELEMENT div (#PCDATA|div|span|anchor|silence|graphic|p|s|w)*>
<!ATTLIST div
xml:lang CDATA #IMPLIED
lang CDATA #IMPLIED
id CDATA #IMPLIED
class CDATA #IMPLIED
type CDATA #IMPLIED
do-not-align CDATA #IMPLIED
time CDATA #IMPLIED
dur CDATA #IMPLIED>

<!ELEMENT span (#PCDATA|div|span|anchor|silence|graphic|p|s|w)*>
<!ATTLIST span
xml:lang CDATA #IMPLIED
lang CDATA #IMPLIED
id CDATA #IMPLIED
class CDATA #IMPLIED
type CDATA #IMPLIED
do-not-align CDATA #IMPLIED
time CDATA #IMPLIED
dur CDATA #IMPLIED>

<!ELEMENT p (#PCDATA|span|anchor|silence|s|w)*>
<!ATTLIST p
xml:lang CDATA #IMPLIED
lang CDATA #IMPLIED
id CDATA #IMPLIED
class CDATA #IMPLIED
do-not-align CDATA #IMPLIED
time CDATA #IMPLIED
dur CDATA #IMPLIED>

<!ELEMENT s (#PCDATA|span|anchor|silence|w)*>
<!ATTLIST s
xml:lang CDATA #IMPLIED
lang CDATA #IMPLIED
id CDATA #IMPLIED
class CDATA #IMPLIED
do-not-align CDATA #IMPLIED
time CDATA #IMPLIED
dur CDATA #IMPLIED
annotation-id CDATA #IMPLIED
sentence-id CDATA #IMPLIED
xmlns CDATA #IMPLIED>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure why we need xmlns here. I get it for the top-level read-along element, and I would get it if every element needed to allow it, but I don't get allowing it on only one additional element.


<!ELEMENT w (#PCDATA|span|syl)*>
<!ATTLIST w
xml:lang CDATA #IMPLIED
effective-g2p-lang CDATA #IMPLIED
lang CDATA #IMPLIED
id CDATA #IMPLIED
class CDATA #IMPLIED
do-not-align CDATA #IMPLIED
ARPABET CDATA #IMPLIED
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we make this case insensitive?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was thinking the same. However, I am worried about backwards compatibility. Attributes in HTML are case insensitive but are case sensitive in XML (.readalong is a subset of XML). We need to take a good look at the whole pipeline before we make the switch. We use both XML and HTML parsers in the various parts of the pipeline. As far as I know, only the CLIs consume the ARPABET attribute.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe an option is to put both ARPABET and arpabet in the DTD?

time CDATA #IMPLIED
dur CDATA #IMPLIED>

<!ELEMENT syl (#PCDATA|span)*>
<!ATTLIST syl
xml:lang CDATA #IMPLIED
lang CDATA #IMPLIED
id CDATA #IMPLIED
class CDATA #IMPLIED
do-not-align CDATA #IMPLIED
ARPABET CDATA #IMPLIED
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ditto?

time CDATA #IMPLIED
dur CDATA #IMPLIED>

<!ELEMENT meta EMPTY>
<!ATTLIST meta
name CDATA #REQUIRED
content CDATA #REQUIRED
id CDATA #IMPLIED>
10 changes: 7 additions & 3 deletions readalongs/text/make_package.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,17 @@

from lxml import etree

from readalongs._version import VERSION
from readalongs._version import CURRENT_WEB_APP_VERSION, VERSION
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice idea, moving CURRENT_WEB_APP_VERSION to _version.py and reusing it here.

from readalongs.log import LOGGER
from readalongs.text.util import parse_xml

JS_BUNDLE_URL = "https://unpkg.com/@readalongs/web-component@^1.4.0/dist/bundle.js"
JS_BUNDLE_URL = (
"https://unpkg.com/@readalongs/web-component@%s/dist/bundle.js"
% CURRENT_WEB_APP_VERSION
)
FONTS_BUNDLE_URL = (
"https://unpkg.com/@readalongs/web-component@^1.4.0/dist/fonts.b64.css"
"https://unpkg.com/@readalongs/web-component@%s/dist/fonts.b64.css"
% CURRENT_WEB_APP_VERSION
)

BASIC_HTML = """
Expand Down
4 changes: 1 addition & 3 deletions readalongs/text/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,11 @@

from lxml import etree

from readalongs._version import VERSION
from readalongs._version import CURRENT_WEB_APP_VERSION, VERSION

# removed "try: unicode() except" block (was for Python 2), but this file uses unicode()
# too many times, so define it anyway.
unicode = str
# todo: sync with web component major and minor releases
CURRENT_WEB_APP_VERSION = "1.4.x"


def ensure_dirs(path):
Expand Down
2 changes: 1 addition & 1 deletion readalongs/web_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@
# Call get_langs() when the server loads to load the languages into memory
LANGS = get_langs()
# Get the DTD
DTDPATH = os.path.join(os.path.dirname(__file__), "static", "read-along-1.2.dtd")
DTDPATH = os.path.join(os.path.dirname(__file__), "static", "read-along-1.3.dtd")
with open(DTDPATH) as dtdfh:
DTD = etree.DTD(dtdfh)

Expand Down
16 changes: 11 additions & 5 deletions test/data/ras-dtd-1.1.readalong
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,23 @@
<p>
<s>Bonjour.</s>
<s>Je m'appelle Éric Joanis.</s>
<s>Je suis programmeur au sein de l'équipe des technologies pour les langues autochtones au CNRC.</s>
<s>Je suis programmeur au sein de l'équipe des technologies pour les langues
autochtones au CNRC.</s>
</p>
</div>
<div type="page">
<p>
<s>J'ai fait une bonne partie de ma carrière en traduction automatique statistique, mais maintenant cette approche est déclassée par l'apprentissage profond.</s>
<s>En ce moment je travaille à l'alignement du hansard du Nunavut pour produire un corpus bilingue anglais-inuktitut.</s>
<s>Ce corpus permettra d'entraîner la TA, neuronale ou statistique, ainsi que d'autres applications de traitement du langage naturel.</s>
<s>J'ai fait une bonne partie de ma carrière en traduction automatique
statistique, mais maintenant cette approche est déclassée par
l'apprentissage profond.</s>
<s>En ce moment je travaille à l'alignement du hansard du Nunavut pour produire
un corpus bilingue anglais-inuktitut.</s>
<s>Ce corpus permettra d'entraîner la TA, neuronale ou statistique, ainsi que
d'autres applications de traitement du langage naturel.</s>
</p>
<p>
<s>En parallèle, j'aide à écrire des tests pour rendre le ReadAlong-Studio plus robuste.</s>
<s>En parallèle, j'aide à écrire des tests pour rendre le ReadAlong-Studio plus
robuste.</s>
</p>
</div>
</body>
Expand Down
35 changes: 35 additions & 0 deletions test/data/ras-dtd-1.3.readalong
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
<?xml version='1.0' encoding='utf-8'?>
<read-along version="1.3" xmlns="">
<meta name="generator" content="human made" id="meta0" />
<meta name="annotations-ids" content="translation1, translation2" id="meta1" />
<meta name="annotations-labels" content="Algonquin, English" id="meta2" />
<meta name="annotations-labels-eng" content="Algonquin, English" id="meta3" />
<meta name="annotations-labels-fra" content="algonquin, anglais" id="meta4" />
<text xml:lang="fra" id="t0">
<body id="t0b0">
<div type="page" id="t0b0d0" class="two-column-layout-page">
<graphic url="avatar.png" id="t0b0d0graphic0" />
<p id="t0b0d0p0">
<s id="t0b0d0p0s0"><w id="t0b0d0p0s0w0" time="0.455" dur="1.165">Bonjour</w>.</s>
<s do-not-align="true" xml:lang="otw" id="t0b0d0p0s0an01" annotation-id="translation1"
sentence-id="t0b0d0p0s0">
Kwei.</s>
<s do-not-align="true" xml:lang="eng" id="t0b0d0p0s0an02" annotation-id="translation2"
sentence-id="t0b0d0p0s0">
Hello.</s>
<s id="t0b0d0p0s1"><w id="t0b0d0p0s1w0" time="1.620" dur="0.070">Je</w> <w
id="t0b0d0p0s1w1" time="1.690" dur="0.070">m</w>'<w id="t0b0d0p0s1w2" time="1.760"
dur="0.240">appelle</w> <w id="t0b0d0p0s1w3" time="2.000" dur="1.705">Éric</w> <w
id="t0b0d0p0s1w4" time="3.705" dur="1.905">Joanis</w>.</s>
<s do-not-align="true" xml:lang="otw" id="t0b0d0p0s1an01" annotation-id="translation1"
sentence-id="t0b0d0p0s1">Éric
Joanis nindijinikàz.</s>
<s do-not-align="true" xml:lang="eng" id="t0b0d0p0s1an02" annotation-id="translation2"
sentence-id="t0b0d0p0s1">My
name is Éric Joanis.</s>
</p>
</div>

</body>
</text>
</read-along>
3 changes: 2 additions & 1 deletion test/test_dtd.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from readalongs.text.util import load_xml

DTDPATH = os.path.join(
dirname(__file__), "..", "readalongs", "static", "read-along-1.2.dtd"
dirname(__file__), "..", "readalongs", "static", "read-along-1.3.dtd"
)

VALID_RAS = """
Expand Down Expand Up @@ -75,6 +75,7 @@ def test_backwards_compatibility(self):
"ras-dtd-1.0.readalong",
"ras-dtd-1.1.readalong",
"ras-dtd-1.2.readalong",
"ras-dtd-1.3.readalong",
]
for name in versions:
path = os.path.join(dirname(__file__), "data", name.strip())
Expand Down
Loading