Skip to content

Commit

Permalink
Abstracted the get_citation method
Browse files Browse the repository at this point in the history
  • Loading branch information
AlessioNar committed Dec 24, 2024
1 parent 1f32ff1 commit 72ca8cf
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 40 deletions.
2 changes: 1 addition & 1 deletion tests/parsers/test_akomantoso.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def test_get_formula(self):
def test_get_citations(self):
"""Test citation extraction in the preamble section."""
self.parser.get_preamble(preamble_xpath='.//akn:preamble', notes_xpath='.//akn:authorialNote')
self.parser.get_citations(citations_xpath='.//akn:citations', citation_xpath='.//akn:citation')
self.parser.get_citations()
self.assertIsNotNone(self.parser.citations, "Citations data not found")

first_citation = self.parser.citations[0]
Expand Down
2 changes: 1 addition & 1 deletion tests/parsers/test_formex.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def test_get_citations(self):
self.maxDiff = None # Allow full diff if needed
self.parser.get_preamble(preamble_xpath='.//PREAMBLE', notes_xpath='.//NOTE')

self.parser.get_citations(citations_xpath='.//GR.VISA', citation_xpath='.//VISA')
self.parser.get_citations()


citations = [
Expand Down
27 changes: 10 additions & 17 deletions tulit/parsers/akomantoso.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,31 +239,24 @@ def get_formula(self):
formula_text = ' '.join(p.text.strip() for p in formula.findall('akn:p', namespaces=self.namespaces) if p.text)
return formula_text

def get_citations(self, citations_xpath, citation_xpath):
def get_citations(self) -> list:
"""
Extracts citations from the preamble.
Returns
-------
list or None
list
List of dictionaries containing citation text without the associated
authorial notes. Returns None if no citations are found.
authorial notes.
"""
citations_section = self.preamble.find(citations_xpath, namespaces=self.namespaces)
if citations_section is None:
return None
def extract_eId(citation, index):
return citation.get('eId')

citations = []
for citation in citations_section.findall(citation_xpath, namespaces=self.namespaces):
citation_text = "".join(citation.itertext()).strip()
citation_eId = citation.get('eId')

citations.append({
'eId' : citation_eId,
'citation_text': citation_text,
})

self.citations = citations
return super().get_citations(
citations_xpath='.//akn:citations',
citation_xpath='.//akn:citation',
extract_eId=extract_eId
)

def get_recitals(self):
"""
Expand Down
32 changes: 12 additions & 20 deletions tulit/parsers/formex.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def get_formula(self):

return self.formula

def get_citations(self, citations_xpath, citation_xpath):
def get_citations(self):
"""
Extracts citations from the preamble.
Expand All @@ -92,25 +92,16 @@ def get_citations(self, citations_xpath, citation_xpath):
list
List of dictionaries containing citation text.
"""
citations_section = self.preamble.find(citations_xpath, namespaces=self.namespaces)
if citations_section is None:
return None
def extract_eId(citation, index):
return index

citations = []
for index, citation in enumerate(self.preamble.findall(citation_xpath)):
citation_text = "".join(citation.itertext()).strip() # Using itertext() to get all nested text
citation_text = citation_text.replace('\n', '').replace('\t', '').replace('\r', '') # remove newline and tab characters
citation_text = re.sub(' +', ' ', citation_text) # replace multiple spaces with a single space

citations.append({
'eId': index,
'citation_text': citation_text
})

self.citations = citations

return super().get_citations(
citations_xpath='.//GR.VISA',
citation_xpath='.//VISA',
extract_eId=extract_eId
)

def get_recitals(self):
def get_recitals(self, recitals_xpath='.//GR.CONSID', recital_xpath='.//CONSID') -> None:
"""
Extracts recitals from the preamble.
Expand All @@ -119,21 +110,22 @@ def get_recitals(self):
list
List of dictionaries containing recital text and eId for each recital.
"""
#preamble_data["preamble_final"] = self.preamble.findtext('PREAMBLE.FINAL')

recitals = []
recitals.append({
"eId": 'rec_0',
"recital_text": self.preamble.findtext('.//GR.CONSID/GR.CONSID.INIT')
})

for recital in self.preamble.findall('.//CONSID'):
for recital in self.preamble.findall(recital_xpath):
recital_num = recital.findtext('.//NO.P')
recital_text = "".join(recital.find('.//TXT').itertext()).strip()
recitals.append({
"eId": recital_num,
"recital_text": recital_text
})
#preamble_data["preamble_final"] = self.preamble.findtext('PREAMBLE.FINAL')

self.recitals = recitals


Expand Down
40 changes: 39 additions & 1 deletion tulit/parsers/parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from abc import ABC, abstractmethod
from lxml import etree
import os
import re

class XMLParser(ABC):
"""
Expand Down Expand Up @@ -240,8 +241,45 @@ def get_preamble(self, preamble_xpath, notes_xpath) -> None:
if self.preamble is not None:
self.preamble = self.remove_node(self.preamble, notes_xpath)
self.formula = self.get_formula()
#self.citations = self.get_citations()

#self.recitals = self.get_recitals()

def get_citations(self, citations_xpath, citation_xpath, extract_eId=None):
    """
    Extracts citations from the preamble.

    The extracted list is stored in ``self.citations`` and also returned,
    so that subclasses can simply ``return super().get_citations(...)``.

    Parameters
    ----------
    citations_xpath : str
        XPath to locate the citations section inside ``self.preamble``.
    citation_xpath : str
        XPath to locate individual citations inside the citations section.
    extract_eId : function, optional
        Callable invoked as ``extract_eId(citation, index)`` to extract or
        generate the eId of each citation. When omitted, the zero-based
        position of the citation is used as its eId.

    Returns
    -------
    list or None
        List of dictionaries with the keys ``'eId'`` and ``'citation_text'``.
        None is returned (and ``self.citations`` is left untouched) when the
        citations section cannot be found.
    """
    citations_section = self.preamble.find(citations_xpath, namespaces=self.namespaces)
    if citations_section is None:
        return None

    citations = []
    for index, citation in enumerate(citations_section.findall(citation_xpath, namespaces=self.namespaces)):
        # itertext() also collects the text of nested elements
        citation_text = "".join(citation.itertext()).strip()
        # Remove newline, tab and carriage-return characters
        citation_text = citation_text.replace('\n', '').replace('\t', '').replace('\r', '')
        # Collapse runs of spaces into a single space
        citation_text = re.sub(' +', ' ', citation_text)

        eId = extract_eId(citation, index) if extract_eId is not None else index
        citations.append({
            'eId' : eId,
            'citation_text': citation_text,
        })

    self.citations = citations
    # Return the list as documented; previously the method fell off the end
    # and implicitly returned None even when citations had been extracted.
    return citations

### Enacting terms block
def get_body(self, body_xpath) -> None:
Expand Down

0 comments on commit 72ca8cf

Please sign in to comment.