Skip to content

Commit

Permalink
Improved tests
Browse files Browse the repository at this point in the history
  • Loading branch information
AlessioNar committed Nov 10, 2024
1 parent 04895ab commit a678b9e
Show file tree
Hide file tree
Showing 2 changed files with 87 additions and 143 deletions.
42 changes: 23 additions & 19 deletions op_cellar/parsers/akomantoso.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from .parser import Parser
import re
#import xml.etree.ElementTree as ET
from lxml import etree

class AkomaNtosoParser(Parser):
Expand All @@ -25,8 +24,10 @@ def __init__(self):
# Define the namespace mapping
self.namespaces = {
'akn': 'http://docs.oasis-open.org/legaldocml/ns/akn/3.0',
'an': 'http://docs.oasis-open.org/legaldocml/ns/akn/3.0',
'fmx': 'http://formex.publications.europa.eu/schema/formex-05.56-20160701.xd'


}

def remove_node(self, tree, node):
Expand All @@ -45,24 +46,25 @@ def remove_node(self, tree, node):
lxml.etree._Element
The modified XML tree with specified nodes removed.
"""
for item in tree.findall(node, namespaces=self.namespaces):
text = ' '.join(item.itertext()).strip()

# Find the parent and remove the <node> element
parent = item.getparent()
tail_text = item.tail
if parent is not None:
parent.remove(item)

# Preserve tail text if present
if tail_text:
if parent.getchildren():
# If there's a previous sibling, add the tail to the last child
previous_sibling = parent.getchildren()[-1]
previous_sibling.tail = (previous_sibling.tail or '') + tail_text
else:
# If no siblings, add the tail text to the parent's text
parent.text = (parent.text or '') + tail_text
if tree.findall(node, namespaces=self.namespaces) is not None:
for item in tree.findall(node, namespaces=self.namespaces):
text = ' '.join(item.itertext()).strip()

# Find the parent and remove the <node> element
parent = item.getparent()
tail_text = item.tail
if parent is not None:
parent.remove(item)

# Preserve tail text if present
if tail_text:
if parent.getchildren():
# If there's a previous sibling, add the tail to the last child
previous_sibling = parent.getchildren()[-1]
previous_sibling.tail = (previous_sibling.tail or '') + tail_text
else:
# If no siblings, add the tail text to the parent's text
parent.text = (parent.text or '') + tail_text

return tree

Expand Down Expand Up @@ -544,5 +546,7 @@ def parse(self, file: str) -> list[dict]:
"""
self.get_root(file)
self.get_body()
self.get_chapters()
self.get_articles()


188 changes: 64 additions & 124 deletions tests/parsers/test_akomantoso.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,191 +3,131 @@
import os
import lxml.etree as etree


# Define constants for file paths and directories
DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
file_path = os.path.join(DATA_DIR, "32014L0092.akn")

class TestAkomaNtosoParser(unittest.TestCase):
# Set maxDiff to None to ensure full diff is displayed for assertion failures
maxDiff = None

def setUp(self):
"""Initialize the AkomaNtosoParser before each test."""
self.parser = AkomaNtosoParser()
self.parser.get_root(file_path)

def test_get_root(self):
def tearDown(self):
"""Cleanup after each test."""
self.parser = None

# Verify file exists
def test_get_root(self):
"""Test parsing and root element retrieval from the Akoma Ntoso file."""
self.assertTrue(os.path.exists(file_path), f"Test file not found at {file_path}")

# Get and verify root
self.parser.get_root(file_path)
self.assertIsNotNone(self.parser.root, "Root element should not be None")

def test_get_meta_identification(self):
self.parser.get_root(file_path)
"""Test extraction of meta-identification from the root."""
meta_identification = self.parser.get_meta_identification()

self.assertIsNotNone(meta_identification)
self.assertIn('work', meta_identification)
self.assertEqual(meta_identification['work']['FRBRalias'], "32014L0092")

def test_get_meta_references(self):
self.parser.get_root(file_path)
"""Test extraction of meta references data from the file."""
references = self.parser.get_meta_references()
self.assertIsNotNone(references)
self.assertEqual(references['eId'], "cirsfid")
self.assertEqual(references.get('eId'), "cirsfid")

def test_get_meta_proprietary(self):
self.parser.get_root(file_path)
"""Test extraction of proprietary metadata."""
proprietary = self.parser.get_meta_proprietary()
self.assertIsNotNone(proprietary)
self.assertEqual(proprietary['file'], "L_2014257EN.01021401.doc.xml")
self.assertEqual(proprietary.get('file'), "L_2014257EN.01021401.doc.xml")

def test_get_preface(self):

self.parser.get_root(file_path)
"""Test the content extracted from the preface section."""
preface_text = self.parser.get_preface()
self.assertIsNotNone(preface_text, "Preface element not found")

# Validate the content of each paragraph
self.assertEqual("Directive 2014/92/EU of the European Parliament and of the Council", preface_text[0],
"First paragraph text does not match expected content.")
self.assertEqual("of 23 July 2014", preface_text[1], "Second paragraph text does not match expected content.")
self.assertEqual("on the comparability of fees related to payment accounts, payment account switching and access to payment accounts with basic features", preface_text[2],
"Third paragraph text does not match expected content.")
self.assertEqual("(Text with EEA relevance)", preface_text[3], "Fourth paragraph text does not match expected content.")

def test_get_preamble(self):
self.parser.get_root(file_path)
expected_preface = [
"Directive 2014/92/EU of the European Parliament and of the Council",
"of 23 July 2014",
"on the comparability of fees related to payment accounts, payment account switching and access to payment accounts with basic features",
"(Text with EEA relevance)"
]
for i, expected in enumerate(expected_preface):
with self.subTest(paragraph=i):
self.assertEqual(preface_text[i], expected)

def test_get_preamble(self):
"""Test retrieval of preamble data from the XML file."""
preamble_data = self.parser.get_preamble()
self.assertIsNotNone(preamble_data, "Preamble data not found")

def test_get_preamble_formula(self):
self.parser.get_root(file_path)
"""Test extraction of formula text within the preamble."""
formula_data = self.parser.get_preamble_formula()
self.assertIn("THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION", formula_data)

# Verify formula text
self.assertIn("THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION", formula_data,
"Formula text in preamble does not match expected content")

def test_get_preamble_citations(self):
self.parser.get_root(file_path)
"""Test citation extraction in the preamble section."""
citations_data = self.parser.get_preamble_citations()

# Verify first citation content
self.assertGreater(len(citations_data), 0, "No citations found in preamble")

first_citation = citations_data[0]
self.assertIn("Having regard to the Treaty on the Functioning of the European Union, and in particular Article 114", first_citation['citation_text'],
"First citation text does not match expected content")

# Verify authorial note in fourth citation
fourth_citation = citations_data[3]
self.assertIn("OJ C 51, 22.2.2014, p. 3", fourth_citation['authorial_notes'][0],
"Expected authorial note text not found in fourth citation")

# Verify date in the last citation's authorial note
last_citation = citations_data[-1]
self.assertIn("Position of the European Parliament of 15 April 2014", last_citation['authorial_notes'][0],
"Expected text in authorial note of last citation not found")

def test_get_preamble_recitals(self):
self.parser.get_root(file_path)
first_citation = citations_data[0]
expected_text = "Having regard to the Treaty on the Functioning of the European Union, and in particular Article 114"
self.assertIn(expected_text, first_citation['citation_text'])

def test_get_preamble_recitals(self):
"""Test retrieval and content verification of recitals in the preamble."""
recitals_data = self.parser.get_preamble_recitals()
self.assertIsNotNone(recitals_data, "Recitals section not found in <preamble>")

# Check the number of recitals extracted
self.assertEqual(len(recitals_data), 59, "Incorrect number of recitals extracted")

# Test first recital content
intro_recital = recitals_data[0]
self.assertEqual(intro_recital['eId'], "recs_1__intro_1", "Intro does not match expected value")
self.assertEqual("Whereas:",
intro_recital['recital_text'], "Intro recitals text does not match expected content")

# Test second recital content with authorial note
second_recital = recitals_data[2]
self.assertEqual(second_recital['eId'], "recs_1__rec_(2)", "Second recital eId does not match expected value")
self.assertEqual("In this respect, Directive 2007/64/EC of the European Parliament and of the Council established basic transparency requirements for fees charged by payment service providers in relation to services offered on payment accounts. This has substantially facilitated the activity of payment service providers, creating uniform rules with respect to the provision of payment services and the information to be provided, reduced the administrative burden and generated cost savings for payment service providers.",
second_recital['recital_text'], "Second recital text does not match expected content")

# Test third recital content
third_recital = recitals_data[3]
self.assertEqual(third_recital['eId'], "recs_1__rec_(3)", "Third recital eId does not match expected value")
self.assertEqual("The smooth functioning of the internal market and the development of a modern, socially inclusive economy increasingly depends on the universal provision of payment services. Any new legislation in this regard must be part of a smart economic strategy for the Union, which must effectively take into account the needs of more vulnerable consumers.",
third_recital['recital_text'], "Third recital text does not match expected content")

# Test sixteenth recital content
other_recital = recitals_data[16]
self.assertEqual(other_recital['eId'], "recs_1__rec_(16)", "Sixteenth recital eId does not match expected value")
self.assertEqual("Consumers would benefit most from information that is concise, standardised and easy to compare between different payment service providers. The tools made available to consumers to compare payment account offers would not have a positive impact if the time invested in going through lengthy lists of fees for different offers outweighed the benefit of choosing the offer that represents the best value. Those tools should be multifold and consumer testing should be conducted. At this stage, fee terminology should only be standardised for the most representative terms and definitions within Member States in order to avoid the risk of excessive information and to facilitate swift implementation.",
other_recital['recital_text'], "Sixteenth recital text does not match expected content")
expected_recitals = {
0: {'eId': "recs_1__intro_1", 'text': "Whereas:"},
2: {'eId': "recs_1__rec_(2)", 'text': "In this respect, Directive 2007/64/EC of the European Parliament and of the Council established basic transparency requirements for fees charged by payment service providers in relation to services offered on payment accounts. This has substantially facilitated the activity of payment service providers, creating uniform rules with respect to the provision of payment services and the information to be provided, reduced the administrative burden and generated cost savings for payment service providers."},
3: {'eId': "recs_1__rec_(3)", 'text': "The smooth functioning of the internal market and the development of a modern, socially inclusive economy increasingly depends on the universal provision of payment services. Any new legislation in this regard must be part of a smart economic strategy for the Union, which must effectively take into account the needs of more vulnerable consumers."},
16: {'eId': "recs_1__rec_(16)", 'text': "Consumers would benefit most from information that is concise, standardised and easy to compare between different payment service providers. The tools made available to consumers to compare payment account offers would not have a positive impact if the time invested in going through lengthy lists of fees for different offers outweighed the benefit of choosing the offer that represents the best value. Those tools should be multifold and consumer testing should be conducted. At this stage, fee terminology should only be standardised for the most representative terms and definitions within Member States in order to avoid the risk of excessive information and to facilitate swift implementation."}
}
# Iterate over the selected recitals to verify content and ID
for index, expected_values in expected_recitals.items():
with self.subTest(recital=index):
self.assertEqual(recitals_data[index]['eId'], expected_values['eId'],
f"Recital {index} ID does not match expected value")
self.assertIn(expected_values['text'], recitals_data[index]['recital_text'],
f"Recital {index} text does not match expected content")

def test_get_act(self):

# Get and verify root
self.parser.get_root(file_path)
# Run get_act to set the `act` attribute
"""Test retrieval of the act element."""
self.parser.get_act()

# Verify that `act` is an instance of etree._Element
self.assertEqual(type(self.parser.act), etree._Element, "Act element should be an lxml.etree._Element")

# Additional debug information if `act` is None
if self.parser.act is None:
print("No act element found. Available elements at root level:")
for child in self.parser.root:
print(f"- {child.tag}")
self.assertIsInstance(self.parser.act, etree._Element, "Act element should be an lxml.etree._Element")

def test_get_body(self):
self.parser.get_root(file_path)
# Test the get_body method
"""Test retrieval of the body element."""
self.parser.get_body()

# Check if body is not None
self.assertIsNotNone(self.parser.body, "Body element not found in the XML")
# Check if `body` is an instance of etree._Element
self.assertIsInstance(self.parser.body, etree._Element, "Body element should be an etree._Element")

# More detailed assertion
if self.parser.body is None:
# Print available elements for debugging
print("Available elements at root level:")
for child in self.parser.root:
print(f"- {child.tag}")

def test_get_chapters(self):
self.parser.get_root(file_path)
"""Test retrieval and content of chapter headings."""
self.parser.get_body()

# Call get_chapters to populate self.chapters
self.parser.get_chapters()

# Expected chapters data
expected_chapters = [
{'eId': 'chp_I', 'chapter_num': 'CHAPTER I', 'chapter_heading': 'SUBJECT MATTER, SCOPE AND DEFINITIONS'},
{'eId': 'chp_II', 'chapter_num': 'CHAPTER II', 'chapter_heading': 'COMPARABILITY OF FEES CONNECTED WITH PAYMENT ACCOUNTS'},
{'eId': 'chp_III', 'chapter_num': 'CHAPTER III', 'chapter_heading': 'SWITCHING'},
{'eId': 'chp_IV', 'chapter_num': 'CHAPTER IV', 'chapter_heading': 'ACCESS TO PAYMENT ACCOUNTS'},
{'eId': 'chp_V', 'chapter_num': 'CHAPTER V', 'chapter_heading': 'COMPETENT AUTHORITIES AND ALTERNATIVE DISPUTE RESOLUTION'},
{'eId': 'chp_VI', 'chapter_num': 'CHAPTER VI', 'chapter_heading': 'SANCTIONS'},
{'eId': 'chp_I', 'chapter_num': 'CHAPTER I', 'chapter_heading': 'SUBJECT MATTER, SCOPE AND DEFINITIONS'},
{'eId': 'chp_II', 'chapter_num': 'CHAPTER II', 'chapter_heading': 'COMPARABILITY OF FEES CONNECTED WITH PAYMENT ACCOUNTS'},
{'eId': 'chp_III', 'chapter_num': 'CHAPTER III', 'chapter_heading': 'SWITCHING'},
{'eId': 'chp_IV', 'chapter_num': 'CHAPTER IV', 'chapter_heading': 'ACCESS TO PAYMENT ACCOUNTS'},
{'eId': 'chp_V', 'chapter_num': 'CHAPTER V', 'chapter_heading': 'COMPETENT AUTHORITIES AND ALTERNATIVE DISPUTE RESOLUTION'},
{'eId': 'chp_VI', 'chapter_num': 'CHAPTER VI', 'chapter_heading': 'SANCTIONS'},
{'eId': 'chp_VII', 'chapter_num': 'CHAPTER VII', 'chapter_heading': 'FINAL PROVISIONS'}
]

# Assert that self.chapters matches expected output
self.assertEqual(self.parser.chapters, expected_chapters, "Chapters data does not match expected content")

def test_get_articles(self):

self.parser.get_root(file_path)
def test_get_articles(self):
"""Test retrieval of articles within the body."""
self.parser.get_body()

# Call get_articles to populate self.articles
self.parser.get_articles()
#pprint(self.parser.articles)
# Assert that self.articles matches expected output
self.assertEqual(self.parser.articles, False, "Chapters data does not match expected content")

self.assertEqual(len(self.parser.articles), 31, "Incorrect number of articles extracted")

if __name__ == '__main__':
unittest.main()
unittest.main()

0 comments on commit a678b9e

Please sign in to comment.