Implemented get_recitals in generic XMLParser class with specific implementations for formex and akn
AlessioNar · Dec 27, 2024 · 537887d · 537887d
1 parent 6250040
commit 537887d
Show file tree

Hide file tree

Showing 7 changed files with 155 additions and 116 deletions.
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -33,6 +33,7 @@
 
 # -- Options for HTML output
 autodocs_mock_imports = ['tulit']
+autodoc_member_order = 'bysource'
 
 html_theme = 'sphinx_rtd_theme'
 

diff --git a/tests/parsers/test_akomantoso.py b/tests/parsers/test_akomantoso.py
@@ -91,7 +91,7 @@ def test_get_recitals(self):
             with self.subTest(recital=index):
                 self.assertEqual(self.parser.recitals[index]['eId'], expected_values['eId'], 
                                  f"Recital {index} ID does not match expected value")
-                self.assertIn(expected_values['text'], self.parser.recitals[index]['recital_text'], 
+                self.assertIn(expected_values['text'], self.parser.recitals[index]['text'], 
                               f"Recital {index} text does not match expected content")
 
     def test_get_act(self):

diff --git a/tests/parsers/test_formex.py b/tests/parsers/test_formex.py
@@ -93,11 +93,11 @@ def test_get_recitals(self):
         self.parser.get_recitals()
 
         recitals = [
-                {"eId": "rec_0", "recital_text": "Whereas:"},
-                {"eId": "(1)", "recital_text": "Commission Regulation (EC) No 1484/95 lays down detailed rules for implementing the system of additional import duties and fixes representative prices for poultrymeat and egg products and for egg albumin."}, 
-                {"eId": "(2)", "recital_text": "Regular monitoring of the data used to determine representative prices for poultrymeat and egg products and for egg albumin shows that the representative import prices for certain products should be amended to take account of variations in price according to origin. The representative prices should therefore be published."},
-                {"eId": "(3)", "recital_text": "In view of the situation on the market, this amendment should be applied as soon as possible."},
-                {"eId": "(4)", "recital_text": "The measures provided for in this Regulation are in accordance with the opinion of the Management Committee for the Common Organisation of Agricultural Markets,"},
+                {"eId": "rec_0", "text": "Whereas:"},
+                {"eId": "(1)", "text": "Commission Regulation (EC) No 1484/95 lays down detailed rules for implementing the system of additional import duties and fixes representative prices for poultrymeat and egg products and for egg albumin."}, 
+                {"eId": "(2)", "text": "Regular monitoring of the data used to determine representative prices for poultrymeat and egg products and for egg albumin shows that the representative import prices for certain products should be amended to take account of variations in price according to origin. The representative prices should therefore be published."},
+                {"eId": "(3)", "text": "In view of the situation on the market, this amendment should be applied as soon as possible."},
+                {"eId": "(4)", "text": "The measures provided for in this Regulation are in accordance with the opinion of the Management Committee for the Common Organisation of Agricultural Markets,"},
         ]
 
         preamble_final = {

diff --git a/tulit/parsers/akomantoso.py b/tulit/parsers/akomantoso.py
@@ -269,42 +269,26 @@ def get_recitals(self):
             List of dictionaries containing recital text and eId for each
             recital. Returns None if no recitals are found.
         """
-        recitals_section = self.preamble.find('.//akn:recitals', namespaces=self.namespaces)
-        if recitals_section is None:
-            return None
-
-        recitals = []
-
-        # Intro
-        recitals_intro = recitals_section.find('.//akn:intro', namespaces=self.namespaces)
-        recitals_intro_eId = recitals_intro.get('eId')
-        recitals_intro_text = ' '.join(p.text.strip() for p in recitals_intro.findall('.//akn:p', namespaces=self.namespaces) if p.text)
-        recitals.append({
-            'recital_text': recitals_intro_text,
-            'eId': recitals_intro_eId
-        })
-
-        # Removing all authorialNote nodes
-        recitals_section = self.remove_node(recitals_section, './/akn:authorialNote')
-
-        # Step 2: Process each <recital> element in the recitals_section without the <authorialNote> elements
-        for recital in recitals_section.findall('.//akn:recital', namespaces=self.namespaces):
-            eId = str(recital.get('eId'))
-
-            # Extract text from remaining <akn:p> elements
-            recital_text = ' '.join(' '.join(p.itertext()).strip() for p in recital.findall('.//akn:p', namespaces=self.namespaces))
-
-            # Remove any double spaces in the concatenated recital text
-            recital_text = re.sub(r'\s+', ' ', recital_text)
-
-            # Append the cleaned recital text and eId to the list
-            recitals.append({
-                'recital_text': recital_text,
-                'eId': eId
-            })
-
-        self.recitals = recitals
-
+
+        def extract_intro(recitals_section):
+            # Intro - different implementation
+            recitals_intro = recitals_section.find('.//akn:intro', namespaces=self.namespaces)
+            intro_eId = recitals_intro.get('eId')
+            intro_text = ''.join(p.text.strip() for p in recitals_intro.findall('.//akn:p', namespaces=self.namespaces) if p.text)
+            return intro_eId, intro_text
+
+        def extract_eId(recital):
+            return str(recital.get('eId'))
+
+        return super().get_recitals(
+            recitals_xpath='.//akn:recitals', 
+            recital_xpath='.//akn:recital',
+            text_xpath='.//akn:p',
+            extract_intro=extract_intro,
+            extract_eId=extract_eId,
+
+        )
+
     ### Act block
     def get_act(self) -> None:
         """

diff --git a/tulit/parsers/formex.py b/tulit/parsers/formex.py
@@ -103,34 +103,34 @@ def extract_eId(citation, index):
             extract_eId=extract_eId
         )
 
-    def get_recitals(self, recitals_xpath='.//GR.CONSID', recital_xpath='.//CONSID') -> None:
+    def get_recitals(self) -> None:
         """
         Extracts recitals from the preamble.
 
         Returns
         -------
-        list
-            List of dictionaries containing recital text and eId for each recital.
-        """
-
-        recitals = []
-        recitals.append({
-            "eId": 'rec_0',
-            "recital_text": self.preamble.findtext('.//GR.CONSID/GR.CONSID.INIT')
-            })
-
-        for recital in self.preamble.findall(recital_xpath):
-            recital_num = recital.findtext('.//NO.P')
-            recital_text = "".join(recital.find('.//TXT').itertext()).strip()
-            recitals.append({
-                    "eId": recital_num, 
-                    "recital_text": recital_text
-                })
-        #preamble_data["preamble_final"] = self.preamble.findtext('PREAMBLE.FINAL')
-
-        self.recitals = recitals
+        list or None
+            List of dictionaries containing recital text and eId for each
+            recital. Returns None if no recitals are found.
+        """
+
+        def extract_intro(recitals_section):        
+            # Intro - different implementation
+            intro_eId = 'rec_0'
+            intro_text = self.preamble.findtext('.//GR.CONSID.INIT')
+
+            return intro_eId, intro_text
 
-
+        def extract_eId(recital):
+            return recital.findtext('.//NO.P')
+
+        return super().get_recitals(
+            recitals_xpath='.//GR.CONSID', 
+            recital_xpath='.//CONSID',
+            text_xpath='.//TXT',
+            extract_intro=extract_intro,
+            extract_eId=extract_eId
+        )
 
     def get_chapters(self) -> None:
         """

diff --git a/tulit/parsers/html.py b/tulit/parsers/html.py
@@ -1,6 +1,7 @@
 from bs4 import BeautifulSoup
+from .parser import Parser
 
-class HTMLParser():
+class HTMLParser(Parser):
     def __init__(self):
         """
         Initializes the HTML parser and sets up the BeautifulSoup instance.

diff --git a/tulit/parsers/parser.py b/tulit/parsers/parser.py
@@ -3,34 +3,26 @@
 import os
 import re
 
-class XMLParser(ABC):
+class Parser(ABC):
     """
-    Abstract base class for XML parsers.
+    Abstract base class for parsers
     
     Attributes
     ----------
-    schema : lxml.etree.XMLSchema or None
-        The XML schema used for validation.
-    valid : bool or None
-        Indicates whether the XML file is valid against the schema.
-    validation_errors : lxml.etree._LogEntry or None
-        Validation errors if the XML file is invalid.
-    root : lxml.etree._Element
-        Root element of the XML document.
-    namespaces : dict
-        Dictionary containing XML namespaces.
+    root : lxml.etree._Element or bs4.BeautifulSoup
+        Root element of the XML or HTML document.
     preface : str or None
-        Extracted preface text from the XML document.
-    preamble : lxml.etree.Element or None
-        The preamble section of the XML document.
+        Extracted preface text from the document.
+    preamble : lxml.etree.Element or bs4.Tag or None
+        The preamble section of the document.
     formula : str or None
         The formula element extracted from the preamble.
     citations : list or None
         List of extracted citations from the preamble.
     recitals : list or None
         List of extracted recitals from the preamble.
-    body : lxml.etree.Element or None
-        The body section of the XML document.
+    body : lxml.etree.Element or bs4.Tag or None
+        The body section of the document.
     chapters : list or None
         List of extracted chapters from the body.
     articles : list or None
@@ -49,12 +41,8 @@ def __init__(self):
         ----------
         None
         """
-        self.schema = None
-        self.valid = None
-        self.validation_errors = None
-        self.root = None
-        self.namespaces = {}
-
+
+        self.root = None 
         self.preface = None
 
         self.preamble = None
@@ -68,13 +56,38 @@ def __init__(self):
         self.conclusions = None
 
         self.articles_text = []
-
-    @abstractmethod
-    def parse(self):
+
+class XMLParser(Parser):
+    """
+    Base class for XML parsers.
+    
+    Attributes
+    ----------
+    schema : lxml.etree.XMLSchema or None
+        The XML schema used for validation.
+    valid : bool or None
+        Indicates whether the XML file is valid against the schema.
+    validation_errors : lxml.etree._LogEntry or None
+        Validation errors if the XML file is invalid.
+    namespaces : dict
+        Dictionary containing XML namespaces.
+    """
+
+    def __init__(self):
         """
-        Abstract method to parse the data. This method must be implemented by the subclass.
+        Initializes the Parser object.
+
+        Parameters
+        ----------
+        None
         """
-        pass
+        super().__init__()
+
+        self.schema = None
+        self.valid = None
+        self.validation_errors = None
+
+        self.namespaces = {}
 
     def load_schema(self, schema):
         """
@@ -135,25 +148,6 @@ def validate(self, format, file: str) -> bool:
         except Exception as e:
             print(f"An error occurred during validation: {e}")
             self.valid = False
-
-    def get_root(self, file: str):
-        """
-        Parses an XML file and returns its root element.
-
-        Parameters
-        ----------
-        file : str
-            Path to the XML file.
-
-            
-        Returns
-        -------
-        None
-        """
-        with open(file, 'r', encoding='utf-8') as f:
-            tree = etree.parse(f)
-            self.root = tree.getroot()
-
 
     def remove_node(self, tree, node):
         """
@@ -193,6 +187,24 @@ def remove_node(self, tree, node):
 
         return tree
 
+    def get_root(self, file: str):
+        """
+        Parses an XML file and returns its root element.
+
+        Parameters
+        ----------
+        file : str
+            Path to the XML file.
+
+        Returns
+        -------
+        None
+        """
+        with open(file, 'r', encoding='utf-8') as f:
+            tree = etree.parse(f)
+            self.root = tree.getroot()
+
+
     def get_preface(self, preface_xpath, paragraph_xpath) -> None:
         """
         Extracts paragraphs from the preface section of the document.
@@ -241,9 +253,9 @@ def get_preamble(self, preamble_xpath, notes_xpath) -> None:
         if self.preamble is not None:            
             self.preamble = self.remove_node(self.preamble, notes_xpath)
             self.formula = self.get_formula()
-
-            #self.recitals = self.get_recitals()
-
+            #preamble_data["preamble_final"] = self.preamble.findtext('PREAMBLE.FINAL')
+
+        
     def get_citations(self, citations_xpath, citation_xpath, extract_eId=None):
         """
         Extracts citations from the preamble.
@@ -284,6 +296,40 @@ def get_citations(self, citations_xpath, citation_xpath, extract_eId=None):
 
         self.citations = citations
 
+    def get_recitals(self, recitals_xpath, recital_xpath, text_xpath, extract_intro=None, extract_eId=None):
+        """
+        Extracts recitals from the preamble.
+
+        Returns
+        -------
+        list or None
+            List of dictionaries containing recital text and eId for each
+            recital. Returns None if no recitals are found.
+        """
+        recitals_section = self.preamble.find(recitals_xpath, namespaces=self.namespaces)
+        if recitals_section is None:
+            return None
+
+        recitals = []
+        # Get the eId and text of the recitals intro, depending on the XML format
+        intro_eId, intro_text = extract_intro(recitals_section) if extract_intro else (None, None)
+
+        recitals.append({
+            "eId": intro_eId,
+            "text": intro_text
+            })
+
+
+        for recital in recitals_section.findall(recital_xpath, namespaces=self.namespaces):
+            eId = extract_eId(recital) if extract_eId else None
+            text = ''.join(''.join(p.itertext()).strip() for p in recital.findall(text_xpath, namespaces=self.namespaces))
+            recitals.append({
+                    "eId": eId, 
+                    "text": text
+                })
+
+        self.recitals = recitals
+
     ### Enacting terms block
     def get_body(self, body_xpath) -> None:
         """
@@ -303,4 +349,11 @@ def get_body(self, body_xpath) -> None:
         self.body = self.root.find(body_xpath, namespaces=self.namespaces)
         if self.body is None:
             # Fallback: try without namespace
-            self.body = self.root.find(body_xpath)
+            self.body = self.root.find(body_xpath)
+
+    @abstractmethod
+    def parse(self):
+        """
+        Abstract method to parse the data. This method must be implemented by the subclass.
+        """
+        pass