Abstracted the get_citations method into the base XMLParser class

AlessioNar · Dec 24, 2024 · 72ca8cf
1 parent 1f32ff1
commit 72ca8cf
Show file tree

Hide file tree

Showing 5 changed files with 63 additions and 40 deletions.
diff --git a/tests/parsers/test_akomantoso.py b/tests/parsers/test_akomantoso.py
@@ -66,7 +66,7 @@ def test_get_formula(self):
     def test_get_citations(self):
         """Test citation extraction in the preamble section."""
         self.parser.get_preamble(preamble_xpath='.//akn:preamble', notes_xpath='.//akn:authorialNote')
-        self.parser.get_citations(citations_xpath='.//akn:citations', citation_xpath='.//akn:citation')
+        self.parser.get_citations()
         self.assertIsNotNone(self.parser.citations, "Citations data not found")
 
         first_citation = self.parser.citations[0]

diff --git a/tests/parsers/test_formex.py b/tests/parsers/test_formex.py
@@ -74,7 +74,7 @@ def test_get_citations(self):
         self.maxDiff = None  # Allow full diff if needed
         self.parser.get_preamble(preamble_xpath='.//PREAMBLE', notes_xpath='.//NOTE')
 
-        self.parser.get_citations(citations_xpath='.//GR.VISA', citation_xpath='.//VISA')
+        self.parser.get_citations()
 
 
         citations =  [

diff --git a/tulit/parsers/akomantoso.py b/tulit/parsers/akomantoso.py
@@ -239,31 +239,24 @@ def get_formula(self):
         formula_text = ' '.join(p.text.strip() for p in formula.findall('akn:p', namespaces=self.namespaces) if p.text)
         return formula_text
 
-    def get_citations(self, citations_xpath, citation_xpath):
+    def get_citations(self) -> list:
         """
         Extracts citations from the preamble.
 
         Returns
         -------
-        list or None
+        list
             List of dictionaries containing citation text without the associated
-            authorial notes. Returns None if no citations are found.
+            authorial notes.
         """
-        citations_section = self.preamble.find(citations_xpath, namespaces=self.namespaces)
-        if citations_section is None:
-            return None
+        def extract_eId(citation, index):
+            return citation.get('eId')
 
-        citations = []
-        for citation in citations_section.findall(citation_xpath, namespaces=self.namespaces):
-            citation_text = "".join(citation.itertext()).strip()
-            citation_eId = citation.get('eId')
-
-            citations.append({
-                'eId' : citation_eId,
-                'citation_text': citation_text,
-            })
-
-        self.citations = citations
+        return super().get_citations(
+            citations_xpath='.//akn:citations',
+            citation_xpath='.//akn:citation',
+            extract_eId=extract_eId
+        )
 
     def get_recitals(self):
         """

diff --git a/tulit/parsers/formex.py b/tulit/parsers/formex.py
@@ -83,7 +83,7 @@ def get_formula(self):
 
         return self.formula
 
-    def get_citations(self, citations_xpath, citation_xpath):
+    def get_citations(self):
         """
         Extracts citations from the preamble.
 
@@ -92,25 +92,16 @@ def get_citations(self, citations_xpath, citation_xpath):
         list
             List of dictionaries containing citation text.
         """
-        citations_section = self.preamble.find(citations_xpath, namespaces=self.namespaces)
-        if citations_section is None:
-            return None
+        def extract_eId(citation, index):
+            return index
 
-        citations = []
-        for index, citation in enumerate(self.preamble.findall(citation_xpath)):
-            citation_text = "".join(citation.itertext()).strip()  # Using itertext() to get all nested text
-            citation_text = citation_text.replace('\n', '').replace('\t', '').replace('\r', '')  # remove newline and tab characters
-            citation_text = re.sub(' +', ' ', citation_text)  # replace multiple spaces with a single space
-
-            citations.append({
-                'eId': index,
-                'citation_text': citation_text
-            })
-
-        self.citations = citations
-
+        return super().get_citations(
+            citations_xpath='.//GR.VISA',
+            citation_xpath='.//VISA',
+            extract_eId=extract_eId
+        )
 
-    def get_recitals(self):
+    def get_recitals(self, recitals_xpath='.//GR.CONSID', recital_xpath='.//CONSID') -> None:
         """
         Extracts recitals from the preamble.
 
@@ -119,21 +110,22 @@ def get_recitals(self):
         list
             List of dictionaries containing recital text and eId for each recital.
         """
-        #preamble_data["preamble_final"] = self.preamble.findtext('PREAMBLE.FINAL')
 
         recitals = []
         recitals.append({
             "eId": 'rec_0',
             "recital_text": self.preamble.findtext('.//GR.CONSID/GR.CONSID.INIT')
             })
 
-        for recital in self.preamble.findall('.//CONSID'):
+        for recital in self.preamble.findall(recital_xpath):
             recital_num = recital.findtext('.//NO.P')
             recital_text = "".join(recital.find('.//TXT').itertext()).strip()
             recitals.append({
                     "eId": recital_num, 
                     "recital_text": recital_text
                 })
+        #preamble_data["preamble_final"] = self.preamble.findtext('PREAMBLE.FINAL')
+
         self.recitals = recitals
 
 

diff --git a/tulit/parsers/parser.py b/tulit/parsers/parser.py
@@ -1,6 +1,7 @@
 from abc import ABC, abstractmethod
 from lxml import etree
 import os
+import re
 
 class XMLParser(ABC):
     """
@@ -240,8 +241,45 @@ def get_preamble(self, preamble_xpath, notes_xpath) -> None:
         if self.preamble is not None:            
             self.preamble = self.remove_node(self.preamble, notes_xpath)
             self.formula = self.get_formula()
-            #self.citations = self.get_citations()
+
             #self.recitals = self.get_recitals()
+
+    def get_citations(self, citations_xpath, citation_xpath, extract_eId=None):
+        """
+        Extracts citations from the preamble.
+
+        Parameters
+        ----------
+        citations_xpath : str
+            XPath to locate the citations section.
+        citation_xpath : str
+            XPath to locate individual citations.
+        extract_eId : function, optional
+            Function to handle the extraction or generation of eId.
+
+        Returns
+        -------
+        list
+            List of dictionaries containing citation text.
+        """
+        citations_section = self.preamble.find(citations_xpath, namespaces=self.namespaces)
+        if citations_section is None:
+            return None
+
+        citations = []
+        for index, citation in enumerate(citations_section.findall(citation_xpath, namespaces=self.namespaces)):
+            citation_text = "".join(citation.itertext()).strip()
+            citation_text = citation_text.replace('\n', '').replace('\t', '').replace('\r', '')  # remove newline and tab characters
+            citation_text = re.sub(' +', ' ', citation_text)  # replace multiple spaces with a single space
+
+            eId = extract_eId(citation, index) if extract_eId else index
+            # Up until here, the code is the same as for Formex
+            citations.append({
+                'eId' : eId,
+                'citation_text': citation_text,
+            })
+
+        self.citations = citations
 
     ### Enacting terms block
     def get_body(self, body_xpath) -> None: