Initiated refactoring of the get_chapters method

AlessioNar · Dec 27, 2024 · 6223fc9 · 6223fc9
1 parent 537887d
commit 6223fc9
Show file tree

Hide file tree

Showing 4 changed files with 51 additions and 26 deletions.
diff --git a/tests/parsers/test_akomantoso.py b/tests/parsers/test_akomantoso.py
@@ -60,6 +60,8 @@ def test_get_preamble(self):
 
     def test_get_formula(self):
         """Test extraction of formula text within the preamble."""
+        self.parser.get_preamble(preamble_xpath='.//akn:preamble', notes_xpath='.//akn:authorialNote')
+
         formula_data = self.parser.get_formula()
         self.assertIn("THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION", formula_data)
 
@@ -107,7 +109,7 @@ def test_get_body(self):
     def test_get_chapters(self):
         """Test retrieval and content of chapter headings."""
         self.parser.get_body(body_xpath='.//akn:body')
-        self.parser.get_chapters(chapter_xpath='.//akn:chapter', num_xpath='.//akn:num', heading_xpath='.//akn:heading')
+        self.parser.get_chapters()
 
         expected_chapters = [
             {'eId': 'chp_I', 'chapter_num': 'CHAPTER I', 'chapter_heading': 'SUBJECT MATTER, SCOPE AND DEFINITIONS'},

diff --git a/tulit/parsers/akomantoso.py b/tulit/parsers/akomantoso.py
@@ -231,7 +231,7 @@ def get_formula(self):
             Concatenated text from all paragraphs within the formula element.
             Returns None if no formula is found.
         """
-        formula = self.root.find('.//akn:preamble/akn:formula', namespaces=self.namespaces)
+        formula = self.preamble.find('.//akn:formula', namespaces=self.namespaces)
         if formula is None:
             return None
 
@@ -305,18 +305,9 @@ def get_act(self) -> None:
             # Fallback: try without namespace
             self.act = self.root.find('.//act')
 
-    def get_chapters(self, chapter_xpath, num_xpath, heading_xpath) -> None:        
+    def get_chapters(self) -> None:
         """
         Extracts chapter information from the document.
-        
-        Parameters
-        ----------
-        chapter_xpath : str
-            XPath expression to locate the chapter elements.
-        num_xpath : str
-            XPath expression to locate the chapter number within each chapter element.
-        heading_xpath : str
-            XPath expression to locate the chapter heading within each chapter element.
 
         Returns
         -------
@@ -325,19 +316,16 @@ def get_chapters(self, chapter_xpath, num_xpath, heading_xpath) -> None:
             - 'eId': Chapter identifier
             - 'chapter_num': Chapter number
             - 'chapter_heading': Chapter heading text
-        """        
-        # Find all <chapter> elements in the body
-        for chapter in self.body.findall(chapter_xpath, namespaces=self.namespaces):
-            eId = chapter.get('eId')
-            chapter_num = chapter.find(num_xpath, namespaces=self.namespaces)
-            chapter_heading = chapter.find(heading_xpath, namespaces=self.namespaces)
-
-            # Add chapter data to chapters list
-            self.chapters.append({
-                'eId': eId,
-                'chapter_num': chapter_num.text if chapter_num is not None else None,
-                'chapter_heading': ''.join(chapter_heading.itertext()).strip() if chapter_heading is not None else None
-            })
+        """
+        def extract_eId(chapter, index):
+            return chapter.get('eId')
+
+        return super().get_chapters(
+            chapter_xpath='.//akn:chapter',
+            num_xpath='.//akn:num',
+            heading_xpath='.//akn:heading',
+            extract_eId=extract_eId
+        )
 
 
     def get_articles(self) -> None:

diff --git a/tulit/parsers/formex.py b/tulit/parsers/formex.py
@@ -153,7 +153,6 @@ def get_chapters(self) -> None:
                 if len(chapter.findall('.//HT')) > 1:      
                     chapter_heading = chapter.findall('.//HT')[1]
                     self.chapters.append({
-
                         "eId": index,
                         "chapter_num" : "".join(chapter_num.itertext()).strip(),
                         "chapter_heading": "".join(chapter_heading.itertext()).strip()

diff --git a/tulit/parsers/parser.py b/tulit/parsers/parser.py
@@ -351,6 +351,42 @@ def get_body(self, body_xpath) -> None:
             # Fallback: try without namespace
             self.body = self.root.find(body_xpath)
 
+    def get_chapters(self, chapter_xpath: str, num_xpath: str, heading_xpath: str, extract_eId=None) -> None:
+        """
+        Extracts chapter information from ``self.body`` and stores it in ``self.chapters``.
+
+        Parameters
+        ----------
+        chapter_xpath : str
+            XPath expression to locate the chapter elements.
+        num_xpath : str
+            XPath expression to locate the chapter number within each chapter element.
+        heading_xpath : str
+            XPath expression to locate the chapter heading within each chapter element.
+        extract_eId : callable, optional
+            Called as ``extract_eId(chapter, index)`` to produce the eId; if omitted, the index is used.
+
+        Returns
+        -------
+        None
+            ``self.chapters`` is replaced by a list of dictionaries with keys:
+            - 'eId': Chapter identifier
+            - 'chapter_num': Chapter number text, or None if no match
+            - 'chapter_heading': Chapter heading text, or None if no match
+        """
+        self.chapters = []  # start fresh so repeated calls do not accumulate entries
+        chapters = self.body.findall(chapter_xpath, namespaces=self.namespaces)
+        for index, chapter in enumerate(chapters):
+            eId = extract_eId(chapter, index) if extract_eId else index  # default eId is the zero-based position
+            chapter_num = chapter.find(num_xpath, namespaces=self.namespaces)
+            chapter_heading = chapter.find(heading_xpath, namespaces=self.namespaces)
+
+            self.chapters.append({
+                'eId': eId,
+                'chapter_num': chapter_num.text if chapter_num is not None else None,
+                'chapter_heading': ''.join(chapter_heading.itertext()).strip() if chapter_heading is not None else None
+            })
+
     @abstractmethod
     def parse(self):
         """