Skip to content

Commit

Permalink
initiated refactoring of get_chapter class
Browse files Browse the repository at this point in the history
  • Loading branch information
AlessioNar committed Dec 27, 2024
1 parent 537887d commit 6223fc9
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 26 deletions.
4 changes: 3 additions & 1 deletion tests/parsers/test_akomantoso.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ def test_get_preamble(self):

def test_get_formula(self):
"""Test extraction of formula text within the preamble."""
self.parser.get_preamble(preamble_xpath='.//akn:preamble', notes_xpath='.//akn:authorialNote')

formula_data = self.parser.get_formula()
self.assertIn("THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION", formula_data)

Expand Down Expand Up @@ -107,7 +109,7 @@ def test_get_body(self):
def test_get_chapters(self):
"""Test retrieval and content of chapter headings."""
self.parser.get_body(body_xpath='.//akn:body')
self.parser.get_chapters(chapter_xpath='.//akn:chapter', num_xpath='.//akn:num', heading_xpath='.//akn:heading')
self.parser.get_chapters()

expected_chapters = [
{'eId': 'chp_I', 'chapter_num': 'CHAPTER I', 'chapter_heading': 'SUBJECT MATTER, SCOPE AND DEFINITIONS'},
Expand Down
36 changes: 12 additions & 24 deletions tulit/parsers/akomantoso.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ def get_formula(self):
Concatenated text from all paragraphs within the formula element.
Returns None if no formula is found.
"""
formula = self.root.find('.//akn:preamble/akn:formula', namespaces=self.namespaces)
formula = self.preamble.find('.//akn:formula', namespaces=self.namespaces)
if formula is None:
return None

Expand Down Expand Up @@ -305,18 +305,9 @@ def get_act(self) -> None:
# Fallback: try without namespace
self.act = self.root.find('.//act')

def get_chapters(self, chapter_xpath, num_xpath, heading_xpath) -> None:
def get_chapters(self) -> None:
"""
Extracts chapter information from the document.
Parameters
----------
chapter_xpath : str
XPath expression to locate the chapter elements.
num_xpath : str
XPath expression to locate the chapter number within each chapter element.
heading_xpath : str
XPath expression to locate the chapter heading within each chapter element.
Returns
-------
Expand All @@ -325,19 +316,16 @@ def get_chapters(self, chapter_xpath, num_xpath, heading_xpath) -> None:
- 'eId': Chapter identifier
- 'chapter_num': Chapter number
- 'chapter_heading': Chapter heading text
"""
# Find all <chapter> elements in the body
for chapter in self.body.findall(chapter_xpath, namespaces=self.namespaces):
eId = chapter.get('eId')
chapter_num = chapter.find(num_xpath, namespaces=self.namespaces)
chapter_heading = chapter.find(heading_xpath, namespaces=self.namespaces)

# Add chapter data to chapters list
self.chapters.append({
'eId': eId,
'chapter_num': chapter_num.text if chapter_num is not None else None,
'chapter_heading': ''.join(chapter_heading.itertext()).strip() if chapter_heading is not None else None
})
"""
def extract_eId(chapter, index):
return chapter.get('eId')

return super().get_chapters(
chapter_xpath='.//akn:chapter',
num_xpath='.//akn:num',
heading_xpath='.//akn:heading',
extract_eId=extract_eId
)


def get_articles(self) -> None:
Expand Down
1 change: 0 additions & 1 deletion tulit/parsers/formex.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,6 @@ def get_chapters(self) -> None:
if len(chapter.findall('.//HT')) > 1:
chapter_heading = chapter.findall('.//HT')[1]
self.chapters.append({

"eId": index,
"chapter_num" : "".join(chapter_num.itertext()).strip(),
"chapter_heading": "".join(chapter_heading.itertext()).strip()
Expand Down
36 changes: 36 additions & 0 deletions tulit/parsers/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,42 @@ def get_body(self, body_xpath) -> None:
# Fallback: try without namespace
self.body = self.root.find(body_xpath)

def get_chapters(self, chapter_xpath: str, num_xpath: str, heading_xpath: str, extract_eId=None) -> None:
    """
    Extract chapter information from the document body into ``self.chapters``.

    Parameters
    ----------
    chapter_xpath : str
        XPath expression to locate the chapter elements within ``self.body``.
    num_xpath : str
        XPath expression to locate the chapter number within each chapter element.
    heading_xpath : str
        XPath expression to locate the chapter heading within each chapter element.
    extract_eId : callable, optional
        Called as ``extract_eId(chapter, index)`` to extract or generate the
        chapter identifier. When omitted, the zero-based position of the
        chapter among the matches is used as its identifier.

    Returns
    -------
    None
        The result is stored in ``self.chapters`` as a list of dictionaries
        with keys:
        - 'eId': Chapter identifier
        - 'chapter_num': Chapter number (``None`` if no number element is found)
        - 'chapter_heading': Chapter heading text (``None`` if no heading
          element is found)
    """
    # Always start from a clean list so repeated calls do not accumulate duplicates.
    self.chapters = []

    body = getattr(self, 'body', None)
    if body is None:
        # No body element is available (not extracted yet, or absent from the
        # document): leave the chapter list empty rather than failing on
        # ``None.findall``.
        return

    for index, chapter in enumerate(body.findall(chapter_xpath, namespaces=self.namespaces)):
        eId = extract_eId(chapter, index) if extract_eId is not None else index
        chapter_num = chapter.find(num_xpath, namespaces=self.namespaces)
        chapter_heading = chapter.find(heading_xpath, namespaces=self.namespaces)

        self.chapters.append({
            'eId': eId,
            'chapter_num': chapter_num.text if chapter_num is not None else None,
            # Headings may contain inline markup, so gather all nested text.
            'chapter_heading': (
                ''.join(chapter_heading.itertext()).strip()
                if chapter_heading is not None else None
            ),
        })

@abstractmethod
def parse(self):
"""
Expand Down

0 comments on commit 6223fc9

Please sign in to comment.