Skip to content

Commit

Permalink
test: added backwards compatibility test for DTD
Browse files Browse the repository at this point in the history
  • Loading branch information
deltork committed Jul 16, 2024
1 parent 73d2455 commit 5d78d27
Show file tree
Hide file tree
Showing 3 changed files with 101 additions and 6 deletions.
24 changes: 24 additions & 0 deletions test/data/ras-dtd-1.0.readalong
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
<?xml version='1.0' encoding='utf-8'?>
<read-along version="1.0">
<text xml:lang="fra">
<body>
<div type="page">
<p>
<s>Bonjour.</s>
<s>Je m'appelle Éric Joanis.</s>
<s>Je suis programmeur au sein de l'équipe des technologies pour les langues autochtones au CNRC.</s>
</p>
</div>
<div type="page">
<p>
<s>J'ai fait une bonne partie de ma carrière en traduction automatique statistique, mais maintenant cette approche est déclassée par l'apprentissage profond.</s>
<s>En ce moment je travaille à l'alignement du hansard du Nunavut pour produire un corpus bilingue anglais-inuktitut.</s>
<s>Ce corpus permettra d'entraîner la TA, neuronale ou statistique, ainsi que d'autres applications de traitement du langage naturel.</s>
</p>
<p>
<s>En parallèle, j'aide à écrire des tests pour rendre le ReadAlong-Studio plus robuste.</s>
</p>
</div>
</body>
</text>
</read-along>
25 changes: 25 additions & 0 deletions test/data/ras-dtd-1.1.readalong
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
<?xml version='1.0' encoding='utf-8'?>
<read-along version="1.1">
<meta name="generator" content="human made" id="meta0" />
<text xml:lang="fra">
<body>
<div type="page">
<p>
<s>Bonjour.</s>
<s>Je m'appelle Éric Joanis.</s>
<s>Je suis programmeur au sein de l'équipe des technologies pour les langues autochtones au CNRC.</s>
</p>
</div>
<div type="page">
<p>
<s>J'ai fait une bonne partie de ma carrière en traduction automatique statistique, mais maintenant cette approche est déclassée par l'apprentissage profond.</s>
<s>En ce moment je travaille à l'alignement du hansard du Nunavut pour produire un corpus bilingue anglais-inuktitut.</s>
<s>Ce corpus permettra d'entraîner la TA, neuronale ou statistique, ainsi que d'autres applications de traitement du langage naturel.</s>
</p>
<p>
<s>En parallèle, j'aide à écrire des tests pour rendre le ReadAlong-Studio plus robuste.</s>
</p>
</div>
</body>
</text>
</read-along>
58 changes: 52 additions & 6 deletions test/test_dtd.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

from lxml import etree

from readalongs.text.util import load_xml

DTDPATH = os.path.join(
dirname(__file__), "..", "readalongs", "static", "read-along-1.1.dtd"
)
Expand Down Expand Up @@ -46,17 +48,61 @@ def test_valid_inputs(self):
path = os.path.join(dirname(__file__), "data", name)
# DTD is text, XML is binary... okay
with open(path, "rb") as infh:
parsed = etree.parse(infh)
self.assertTrue(self.dtd.validate(parsed), f"{name} does not validate")
try:
parsed = load_xml(infh)
self.assertTrue(
self.dtd.validate(parsed), f"{name} does not validate"
)
except etree.ParseError as e:
self.fail("Error parsing XML input file %s: %s." % (path, e))

def test_invalid_inputs(self):
for name in INVALID_RAS:
path = os.path.join(dirname(__file__), "data", name)
with open(path, "rb") as infh:
parsed = etree.parse(infh)
self.assertFalse(
self.dtd.validate(parsed), f"{name} validates but shouldn't"
)
try:
parsed = load_xml(infh)
self.assertFalse(
self.dtd.validate(parsed), f"{name} validates but shouldn't"
)
except etree.ParseError as e:
self.fail("Error parsing XML input file %s: %s." % (path, e))

def test_backwards_compatibility(self):
    """Check that the DTD stays compatible across minor format versions.

    Two things are verified:

    1. Every sample file listed in ``versions`` (format 1.0 and 1.1)
       validates against ``self.dtd`` (presumably the current DTD loaded
       from ``DTDPATH`` -- confirm in the fixture setup).
    2. The newest sample file (format 1.1) does NOT validate against the
       older ``read-along-1.0.dtd``.

    An XML parse error in any input is reported as a test failure that
    names the offending file path.
    """
    # The DTD needs to be backwards compatible as long as the major version
    # does not change.
    versions = ["ras-dtd-1.0.readalong", "ras-dtd-1.1.readalong"]
    data_dir = os.path.join(dirname(__file__), "data")

    for name in versions:
        path = os.path.join(data_dir, name)
        # DTD is text, XML is binary... okay
        with open(path, "rb") as infh:
            # Only the parse can raise ParseError; keep the assertion out of
            # the try body so its failures are never confused with it.
            try:
                parsed = load_xml(infh)
            except etree.ParseError as e:
                self.fail("Error parsing XML input file %s: %s." % (path, e))
            self.assertTrue(self.dtd.validate(parsed), f"{name} does not validate")

    # Test that the previous DTD rejects the current format version:
    # DTD 1.0 against a format 1.1 document.
    old_dtd_path = os.path.join(
        dirname(__file__), "..", "readalongs", "static", "read-along-1.0.dtd"
    )
    with open(old_dtd_path, "rt") as dtd_file:
        old_dtd = etree.DTD(dtd_file)

    newest_path = os.path.join(data_dir, versions[1])
    with open(newest_path, "rb") as ras_file:
        try:
            parsed = load_xml(ras_file)
        except etree.ParseError as e:
            # Report the path, not the file object, so the message is usable.
            self.fail("Error parsing XML input file %s: %s." % (newest_path, e))
        self.assertFalse(
            old_dtd.validate(parsed),
            f"{versions[1]} validates with 1.0 but shouldn't",
        )


if __name__ == "__main__":
Expand Down

0 comments on commit 5d78d27

Please sign in to comment.