Initialised the parametrisation of the AKN and Formex classes

AlessioNar · Dec 23, 2024 · abaa652 · abaa652
1 parent aefc994
commit abaa652
Show file tree

Hide file tree

Showing 5 changed files with 152 additions and 209 deletions.
diff --git a/tests/parsers/test_akomantoso.py b/tests/parsers/test_akomantoso.py
@@ -4,7 +4,7 @@
 import lxml.etree as etree
 
 # Define constants for file paths and directories
-file_path = os.path.join(os.path.dirname(__file__), '..\data\\akn\eu', '32014L0092.akn')
+file_path = os.path.join(os.path.dirname(__file__), '..\\data\\akn\\eu', '32014L0092.akn')
 
 class TestAkomaNtosoParser(unittest.TestCase):
     maxDiff = None
@@ -44,7 +44,7 @@ def test_get_meta_proprietary(self):
 
     def test_get_preface(self):
         """Test the content extracted from the preface section."""
-        self.parser.get_preface()
+        self.parser.get_preface(preface_xpath='.//akn:preface', paragraph_xpath='.//akn:p')
         self.assertIsNotNone(self.parser.preface, "Preface element not found")
 
         expected_preface = "Directive 2014/92/EU of the European Parliament and of the Council of 23 July 2014 on the comparability of fees related to payment accounts, payment account switching and access to payment accounts with basic features (Text with EEA relevance)"
@@ -59,23 +59,23 @@ def test_get_preamble(self):
         self.assertIsNotNone(self.parser.recitals, "Recitals data not found")
 
 
-    def test_get_preamble_formula(self):
+    def test_get_formula(self):
         """Test extraction of formula text within the preamble."""
-        formula_data = self.parser.get_preamble_formula()
+        formula_data = self.parser.get_formula()
         self.assertIn("THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION", formula_data)
 
-    def test_get_preamble_citations(self):
+    def test_get_citations(self):
         """Test citation extraction in the preamble section."""
-        citations_data = self.parser.get_preamble_citations()
+        citations_data = self.parser.get_citations()
         self.assertGreater(len(citations_data), 0, "No citations found in preamble")
 
         first_citation = citations_data[0]
         expected_text = "Having regard to the Treaty on the Functioning of the European Union, and in particular Article 114"
         self.assertIn(expected_text, first_citation['citation_text'])
 
-    def test_get_preamble_recitals(self):
+    def test_get_recitals(self):
         """Test retrieval and content verification of recitals in the preamble."""
-        recitals_data = self.parser.get_preamble_recitals()
+        recitals_data = self.parser.get_recitals()
         self.assertIsNotNone(recitals_data, "Recitals section not found in <preamble>")
         self.assertEqual(len(recitals_data), 59, "Incorrect number of recitals extracted")
         expected_recitals = {
@@ -99,12 +99,12 @@ def test_get_act(self):
 
     def test_get_body(self):
         """Test retrieval of the body element."""
-        self.parser.get_body()
+        self.parser.get_body(body_xpath='.//akn:body')
         self.assertIsInstance(self.parser.body, etree._Element, "Body element should be an etree._Element")
 
     def test_get_chapters(self):
         """Test retrieval and content of chapter headings."""
-        self.parser.get_body()
+        self.parser.get_body(body_xpath='.//akn:body')
         self.parser.get_chapters()
 
         expected_chapters = [
@@ -120,7 +120,7 @@ def test_get_chapters(self):
 
     def test_get_articles(self):
         """Test retrieval of articles within the body."""
-        self.parser.get_body()
+        self.parser.get_body(body_xpath='.//akn:body')
         self.parser.get_articles()
 
         self.assertEqual(len(self.parser.articles), 31, "Incorrect number of articles extracted")

diff --git a/tests/parsers/test_formex.py b/tests/parsers/test_formex.py
@@ -48,7 +48,7 @@ def test_get_preface(self):
         self.maxDiff = None  # Allow full diff if needed
 
 
-        result = self.parser.get_preface()
+        self.parser.get_preface(preface_xpath='.//TITLE', paragraph_xpath='.//P')
         expected = (
             "Commission Implementing Regulation (EU) No 1319/2011 of 15 December 2011 "
             "fixing representative prices in the poultrymeat and egg sectors and for egg "
@@ -111,14 +111,14 @@ def test_get_preamble_recitals(self):
         self.assertEqual(self.parser.recitals, recitals)      
 
     def test_get_body(self):
-        self.parser.get_body()
+        self.parser.get_body(body_xpath='.//ENACTING.TERMS')
         self.assertIsNotNone(self.parser.body, "Body element should not be None")    
 
     def test_get_chapters(self):
         """Test retrieval and content of chapter headings."""
         self.parser = Formex4Parser()
         self.parser.get_root(iopa)
-        self.parser.get_body()
+        self.parser.get_body(body_xpath='.//ENACTING.TERMS')
         self.parser.get_chapters()
 
         expected_chapters = [
@@ -133,7 +133,7 @@ def test_get_chapters(self):
         self.assertEqual(self.parser.chapters, expected_chapters, "Chapters data does not match expected content")
 
     def test_get_articles(self):
-        self.parser.get_body()
+        self.parser.get_body(body_xpath='.//ENACTING.TERMS')
         self.parser.get_articles()
 
         # Expected articles based on sample data in XML file

diff --git a/tulit/parsers/akomantoso.py b/tulit/parsers/akomantoso.py
@@ -29,10 +29,8 @@ def __init__(self):
 
         self.act = None
 
-        self.schema = None
+
         self.debug_info = {}
-        self.valid = False
-        self.validation_errors = None
 
 
         # Define the namespace mapping
@@ -223,30 +221,7 @@ def get_meta_proprietary(self):
 
         return meta_proprietary
 
-    ### Preface
-    def get_preface(self) -> None:
-        """
-        Extracts paragraphs from the preface section of the document.
-
-        Returns
-        -------
-        list or None
-            List of strings containing the text content of each paragraph
-            in the preface. Returns None if no preface is found.
-        """
-        preface = self.root.find('.//akn:preface', namespaces=self.namespaces)
-        if preface is None:
-            return None
-
-        paragraphs = []
-        for p in preface.findall('akn:p', namespaces=self.namespaces):
-            # Join all text parts in <p>, removing any inner tags
-            paragraph_text = ''.join(p.itertext()).strip()
-            paragraphs.append(paragraph_text)
-
-        self.preface = ' '.join(paragraphs)
 
-    ### Preamble block
     def get_preamble(self):
         """
         Extracts complete preamble data from the document.
@@ -310,6 +285,8 @@ def get_citations(self):
 
         return citations
 
+
+
     def get_recitals(self):
         """
         Extracts recitals from the preamble.
@@ -371,23 +348,7 @@ def get_act(self) -> None:
         if self.act is None:
             # Fallback: try without namespace
             self.act = self.root.find('.//act')
-
-    ### Enacting terms block
-    def get_body(self) -> None:
-        """
-        Extracts the body element from the document.
-
-        Returns
-        -------
-        None
-            Updates the instance's body attribute with the found body element.
-        """
-        # Use the namespace-aware find
-        self.body = self.root.find('.//akn:body', namespaces=self.namespaces)
-        if self.body is None:
-            # Fallback: try without namespace
-            self.body = self.root.find('.//body')
-
+
     def get_chapters(self) -> None:        
         """
         Extracts chapter information from the document.
@@ -548,51 +509,6 @@ def get_conclusions(self):
             'date': signature_date,
             'signatures': signatures
         }
-
-    def load_schema(self):
-        """
-        Loads the XSD schema for XML validation using an absolute path.
-        """
-        try:
-            # Resolve the absolute path to the XSD file
-            base_dir = os.path.dirname(os.path.abspath(__file__))
-            schema_path = os.path.join(base_dir, 'assets', 'akomantoso30.xsd')
-
-            # Parse the schema
-            with open(schema_path, 'r') as f:
-                schema_doc = etree.parse(f)
-                self.schema = etree.XMLSchema(schema_doc)
-            print("Schema loaded successfully.")
-        except Exception as e:
-            print(f"Error loading schema: {e}")
-
-    def validate(self, file: str) -> bool:
-        """
-        Validates an XML file against the loaded XSD schema.
-
-        Args:
-            file (str): Path to the XML file to validate.
-
-        Returns:
-            bool: True if the XML file is valid, False otherwise.
-        """
-        if not self.schema:
-            print("No schema loaded. Please load an XSD schema first.")
-            return False
-
-        try:
-            with open(file, 'r', encoding='utf-8') as f:
-                xml_doc = etree.parse(f)
-                self.schema.assertValid(xml_doc)
-            print(f"{file} is a valid Akoma Ntoso file.")
-            self.valid = True
-        except etree.DocumentInvalid as e:
-            print(f"{file} is not a valid Akoma Ntoso file. Validation errors: {e}")
-            self.valid = False
-            self.validation_errors = e.error_log
-        except Exception as e:
-            print(f"An error occurred during validation: {e}")
-            self.valid = False
 
     def parse(self, file: str) -> list[dict]:
         """
@@ -610,8 +526,8 @@ def parse(self, file: str) -> list[dict]:
         """
         debug_info = {}
         try:
-            self.load_schema()
-            self.validate(file)
+            self.load_schema('akomantoso30.xsd')
+            self.validate(file, format='Akoma Ntoso')
             if self.valid == True:
                 try:
                     self.get_root(file)
@@ -627,7 +543,7 @@ def parse(self, file: str) -> list[dict]:
                     print(f"Error in get_meta: {e}")
 
                 try:
-                    self.get_preface()
+                    self.get_preface(preface_xpath='.//akn:preface', paragraph_xpath='akn:p')
                     debug_info['preface'] = self.preface if hasattr(self, 'preface') else 0
                     print(f"Preface parsed successfully.")
                 except Exception as e:
@@ -640,7 +556,7 @@ def parse(self, file: str) -> list[dict]:
                     print(f"Error in get_preamble: {e}")
 
                 try:
-                    self.get_body()
+                    self.get_body(body_xpath='.//akn:body')
                     print("Body parsed successfully.")
                 except Exception as e:
                     print(f"Error in get_body: {e}")