Skip to content

Commit

Permalink
Initialised the parametrisation of the AKN and Formex classes
Browse files (browse the repository at this point in the history)
  • Loading branch information
AlessioNar committed Dec 23, 2024
1 parent aefc994 commit abaa652
Show file tree
Hide file tree
Showing 5 changed files with 152 additions and 209 deletions.
22 changes: 11 additions & 11 deletions tests/parsers/test_akomantoso.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import lxml.etree as etree

# Define constants for file paths and directories
file_path = os.path.join(os.path.dirname(__file__), '..\data\\akn\eu', '32014L0092.akn')
file_path = os.path.join(os.path.dirname(__file__), '..\\data\\akn\\eu', '32014L0092.akn')

class TestAkomaNtosoParser(unittest.TestCase):
maxDiff = None
Expand Down Expand Up @@ -44,7 +44,7 @@ def test_get_meta_proprietary(self):

def test_get_preface(self):
"""Test the content extracted from the preface section."""
self.parser.get_preface()
self.parser.get_preface(preface_xpath='.//akn:preface', paragraph_xpath='.//akn:p')
self.assertIsNotNone(self.parser.preface, "Preface element not found")

expected_preface = "Directive 2014/92/EU of the European Parliament and of the Council of 23 July 2014 on the comparability of fees related to payment accounts, payment account switching and access to payment accounts with basic features (Text with EEA relevance)"
Expand All @@ -59,23 +59,23 @@ def test_get_preamble(self):
self.assertIsNotNone(self.parser.recitals, "Recitals data not found")


def test_get_preamble_formula(self):
def test_get_formula(self):
"""Test extraction of formula text within the preamble."""
formula_data = self.parser.get_preamble_formula()
formula_data = self.parser.get_formula()
self.assertIn("THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION", formula_data)

def test_get_preamble_citations(self):
def test_get_citations(self):
"""Test citation extraction in the preamble section."""
citations_data = self.parser.get_preamble_citations()
citations_data = self.parser.get_citations()
self.assertGreater(len(citations_data), 0, "No citations found in preamble")

first_citation = citations_data[0]
expected_text = "Having regard to the Treaty on the Functioning of the European Union, and in particular Article 114"
self.assertIn(expected_text, first_citation['citation_text'])

def test_get_preamble_recitals(self):
def test_get_recitals(self):
"""Test retrieval and content verification of recitals in the preamble."""
recitals_data = self.parser.get_preamble_recitals()
recitals_data = self.parser.get_recitals()
self.assertIsNotNone(recitals_data, "Recitals section not found in <preamble>")
self.assertEqual(len(recitals_data), 59, "Incorrect number of recitals extracted")
expected_recitals = {
Expand All @@ -99,12 +99,12 @@ def test_get_act(self):

def test_get_body(self):
"""Test retrieval of the body element."""
self.parser.get_body()
self.parser.get_body(body_xpath='.//akn:body')
self.assertIsInstance(self.parser.body, etree._Element, "Body element should be an etree._Element")

def test_get_chapters(self):
"""Test retrieval and content of chapter headings."""
self.parser.get_body()
self.parser.get_body(body_xpath='.//akn:body')
self.parser.get_chapters()

expected_chapters = [
Expand All @@ -120,7 +120,7 @@ def test_get_chapters(self):

def test_get_articles(self):
"""Test retrieval of articles within the body."""
self.parser.get_body()
self.parser.get_body(body_xpath='.//akn:body')
self.parser.get_articles()

self.assertEqual(len(self.parser.articles), 31, "Incorrect number of articles extracted")
Expand Down
8 changes: 4 additions & 4 deletions tests/parsers/test_formex.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def test_get_preface(self):
self.maxDiff = None # Allow full diff if needed


result = self.parser.get_preface()
self.parser.get_preface(preface_xpath='.//TITLE', paragraph_xpath='.//P')
expected = (
"Commission Implementing Regulation (EU) No 1319/2011 of 15 December 2011 "
"fixing representative prices in the poultrymeat and egg sectors and for egg "
Expand Down Expand Up @@ -111,14 +111,14 @@ def test_get_preamble_recitals(self):
self.assertEqual(self.parser.recitals, recitals)

def test_get_body(self):
self.parser.get_body()
self.parser.get_body(body_xpath='.//ENACTING.TERMS')
self.assertIsNotNone(self.parser.body, "Body element should not be None")

def test_get_chapters(self):
"""Test retrieval and content of chapter headings."""
self.parser = Formex4Parser()
self.parser.get_root(iopa)
self.parser.get_body()
self.parser.get_body(body_xpath='.//ENACTING.TERMS')
self.parser.get_chapters()

expected_chapters = [
Expand All @@ -133,7 +133,7 @@ def test_get_chapters(self):
self.assertEqual(self.parser.chapters, expected_chapters, "Chapters data does not match expected content")

def test_get_articles(self):
self.parser.get_body()
self.parser.get_body(body_xpath='.//ENACTING.TERMS')
self.parser.get_articles()

# Expected articles based on sample data in XML file
Expand Down
100 changes: 8 additions & 92 deletions tulit/parsers/akomantoso.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,8 @@ def __init__(self):

self.act = None

self.schema = None

self.debug_info = {}
self.valid = False
self.validation_errors = None


# Define the namespace mapping
Expand Down Expand Up @@ -223,30 +221,7 @@ def get_meta_proprietary(self):

return meta_proprietary

### Preface
def get_preface(self) -> None:
"""
Extracts paragraphs from the preface section of the document.
Returns
-------
list or None
List of strings containing the text content of each paragraph
in the preface. Returns None if no preface is found.
"""
preface = self.root.find('.//akn:preface', namespaces=self.namespaces)
if preface is None:
return None

paragraphs = []
for p in preface.findall('akn:p', namespaces=self.namespaces):
# Join all text parts in <p>, removing any inner tags
paragraph_text = ''.join(p.itertext()).strip()
paragraphs.append(paragraph_text)

self.preface = ' '.join(paragraphs)

### Preamble block
def get_preamble(self):
"""
Extracts complete preamble data from the document.
Expand Down Expand Up @@ -310,6 +285,8 @@ def get_citations(self):

return citations



def get_recitals(self):
"""
Extracts recitals from the preamble.
Expand Down Expand Up @@ -371,23 +348,7 @@ def get_act(self) -> None:
if self.act is None:
# Fallback: try without namespace
self.act = self.root.find('.//act')

### Enacting terms block
def get_body(self) -> None:
"""
Extracts the body element from the document.
Returns
-------
None
Updates the instance's body attribute with the found body element.
"""
# Use the namespace-aware find
self.body = self.root.find('.//akn:body', namespaces=self.namespaces)
if self.body is None:
# Fallback: try without namespace
self.body = self.root.find('.//body')


def get_chapters(self) -> None:
"""
Extracts chapter information from the document.
Expand Down Expand Up @@ -548,51 +509,6 @@ def get_conclusions(self):
'date': signature_date,
'signatures': signatures
}

def load_schema(self):
"""
Loads the XSD schema for XML validation using an absolute path.
"""
try:
# Resolve the absolute path to the XSD file
base_dir = os.path.dirname(os.path.abspath(__file__))
schema_path = os.path.join(base_dir, 'assets', 'akomantoso30.xsd')

# Parse the schema
with open(schema_path, 'r') as f:
schema_doc = etree.parse(f)
self.schema = etree.XMLSchema(schema_doc)
print("Schema loaded successfully.")
except Exception as e:
print(f"Error loading schema: {e}")

def validate(self, file: str) -> bool:
"""
Validates an XML file against the loaded XSD schema.
Args:
file (str): Path to the XML file to validate.
Returns:
bool: True if the XML file is valid, False otherwise.
"""
if not self.schema:
print("No schema loaded. Please load an XSD schema first.")
return False

try:
with open(file, 'r', encoding='utf-8') as f:
xml_doc = etree.parse(f)
self.schema.assertValid(xml_doc)
print(f"{file} is a valid Akoma Ntoso file.")
self.valid = True
except etree.DocumentInvalid as e:
print(f"{file} is not a valid Akoma Ntoso file. Validation errors: {e}")
self.valid = False
self.validation_errors = e.error_log
except Exception as e:
print(f"An error occurred during validation: {e}")
self.valid = False

def parse(self, file: str) -> list[dict]:
"""
Expand All @@ -610,8 +526,8 @@ def parse(self, file: str) -> list[dict]:
"""
debug_info = {}
try:
self.load_schema()
self.validate(file)
self.load_schema('akomantoso30.xsd')
self.validate(file, format='Akoma Ntoso')
if self.valid == True:
try:
self.get_root(file)
Expand All @@ -627,7 +543,7 @@ def parse(self, file: str) -> list[dict]:
print(f"Error in get_meta: {e}")

try:
self.get_preface()
self.get_preface(preface_xpath='.//akn:preface', paragraph_xpath='akn:p')
debug_info['preface'] = self.preface if hasattr(self, 'preface') else 0
print(f"Preface parsed successfully.")
except Exception as e:
Expand All @@ -640,7 +556,7 @@ def parse(self, file: str) -> list[dict]:
print(f"Error in get_preamble: {e}")

try:
self.get_body()
self.get_body(body_xpath='.//akn:body')
print("Body parsed successfully.")
except Exception as e:
print(f"Error in get_body: {e}")
Expand Down
Loading

0 comments on commit abaa652

Please sign in to comment.