Skip to content

Commit

Permalink
Moved body_xpath selection to subclass
Browse files: browse the repository at this point in the history
  • Loading branch information
AlessioNar committed Dec 27, 2024
1 parent c618eb0 commit 88a5a5c
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 24 deletions.
6 changes: 3 additions & 3 deletions tests/parsers/test_akomantoso.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,12 +84,12 @@ def test_get_act(self):

def test_get_body(self):
"""Test retrieval of the body element."""
self.parser.get_body(body_xpath='.//akn:body')
self.parser.get_body()
self.assertIsInstance(self.parser.body, etree._Element, "Body element should be an etree._Element")

def test_get_chapters(self):
"""Test retrieval and content of chapter headings."""
self.parser.get_body(body_xpath='.//akn:body')
self.parser.get_body()
self.parser.get_chapters()

expected_chapters = [
Expand All @@ -105,7 +105,7 @@ def test_get_chapters(self):

def test_get_articles(self):
"""Test retrieval of articles within the body."""
self.parser.get_body(body_xpath='.//akn:body')
self.parser.get_body()
self.parser.get_articles()

self.assertEqual(len(self.parser.articles), 31, "Incorrect number of articles extracted")
Expand Down
6 changes: 3 additions & 3 deletions tests/parsers/test_formex.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,14 +83,14 @@ def test_get_recitals(self):
self.assertEqual(self.parser.recitals, recitals)

def test_get_body(self):
self.parser.get_body(body_xpath='.//ENACTING.TERMS')
self.parser.get_body()
self.assertIsNotNone(self.parser.body, "Body element should not be None")

def test_get_chapters(self):
"""Test retrieval and content of chapter headings."""
self.parser = Formex4Parser()
self.parser.get_root(iopa)
self.parser.get_body(body_xpath='.//ENACTING.TERMS')
self.parser.get_body()
self.parser.get_chapters()

expected_chapters = [
Expand All @@ -105,7 +105,7 @@ def test_get_chapters(self):
self.assertEqual(self.parser.chapters, expected_chapters, "Chapters data does not match expected content")

def test_get_articles(self):
self.parser.get_body(body_xpath='.//ENACTING.TERMS')
self.parser.get_body()
self.parser.get_articles()

# Expected articles based on sample data in XML file
Expand Down
24 changes: 8 additions & 16 deletions tulit/parsers/akomantoso.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,6 @@ def __init__(self):
super().__init__()

self.act = None
self.debug_info = {}


# Define the namespace mapping
self.namespaces = {
Expand Down Expand Up @@ -84,7 +82,6 @@ def get_recitals(self):
"""

def extract_intro(recitals_section):
# Intro - different implementation
recitals_intro = recitals_section.find('.//akn:intro', namespaces=self.namespaces)
intro_eId = recitals_intro.get('eId')
intro_text = ''.join(p.text.strip() for p in recitals_intro.findall('.//akn:p', namespaces=self.namespaces) if p.text)
Expand Down Expand Up @@ -117,6 +114,9 @@ def get_act(self) -> None:
if self.act is None:
# Fallback: try without namespace
self.act = self.root.find('.//act')

def get_body(self, body_xpath='.//akn:body') -> None:
    """
    Extracts the body element from the Akoma Ntoso document.

    Thin override that supplies the Akoma Ntoso body XPath as a default and
    delegates the actual extraction to the base-class ``get_body``.

    Args:
        body_xpath (str): XPath expression used to locate the body element.
            Defaults to ``'.//akn:body'``. It is kept as an optional
            argument so that the override remains call-compatible with the
            base-class signature ``get_body(self, body_xpath)`` and callers
            that still pass ``body_xpath=...`` do not raise ``TypeError``.

    Returns:
        None: The base-class method is annotated ``-> None``; whatever it
        returns is passed through unchanged.
    """
    # The format-specific default lives here so callers can use get_body().
    return super().get_body(body_xpath)

def get_chapters(self) -> None:
"""
Expand Down Expand Up @@ -273,15 +273,12 @@ def parse(self, file: str) -> list[dict]:
This method sequentially calls various parsing functions to extract metadata,
preface, preamble, body, chapters, articles, and conclusions from the XML file.
It logs errors encountered during parsing and provides debug information about
the structure of the document.
Args:
file (str): The path to the Akoma Ntoso XML file.
"""
debug_info = {}
try:
self.load_schema('akomantoso30.xsd')
self.validate(file, format='Akoma Ntoso')
Expand All @@ -294,14 +291,12 @@ def parse(self, file: str) -> list[dict]:

try:
self.get_metadata()
debug_info['meta'] = self.metadata if hasattr(self, 'metadata') else "Meta not parsed."
print("Metadata parsed successfully.")
except Exception as e:
print(f"Error in get_meta: {e}")

try:
self.get_preface(preface_xpath='.//akn:preface', paragraph_xpath='akn:p')
debug_info['preface'] = self.preface if hasattr(self, 'preface') else 0
print(f"Preface parsed successfully.")
except Exception as e:
print(f"Error in get_preface: {e}")
Expand All @@ -316,31 +311,28 @@ def parse(self, file: str) -> list[dict]:
except Exception as e:
print(f"Error in get_citations: {e}")
try:
self.get_body(body_xpath='.//akn:body')
self.get_body()
print("Body parsed successfully.")
except Exception as e:
print(f"Error in get_body: {e}")

try:
self.get_chapters(chapter_xpath='.//akn:chapter', num_xpath='.//akn:num', heading_xpath='.//akn:heading')
debug_info['chapters'] = len(self.chapters) if hasattr(self, 'chapters') else 0
print(f"Chapters parsed successfully. Number of chapters: {debug_info['chapters']}")
print(f"Chapters parsed successfully. Number of chapters: {len(self.chapters)}")
except Exception as e:
print(f"Error in get_chapters: {e}")

try:
self.get_articles()
debug_info['articles'] = len(self.articles) if hasattr(self, 'articles') else 0
print(f"Articles parsed successfully. Number of articles: {debug_info['articles']}")
print(f"Articles parsed successfully. Number of articles: {len(self.articles)}")
except Exception as e:
print(f"Error in get_articles: {e}")

try:
self.get_conclusions()
debug_info['conclusions'] = self.conclusions if hasattr(self, 'conclusions') else "Conclusions not parsed."
self.get_conclusions()
print(f"Conclusions parsed successfully. ")
except Exception as e:
print(f"Error in get_conclusions: {e}")

except Exception as e:
print(f'Invalid Akoma Ntoso file: parsing may not work or work only partially: {e}')
print(f'Invalid {self.format} file: parsing may not work or work only partially: {e}')
5 changes: 4 additions & 1 deletion tulit/parsers/formex.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,9 @@ def extract_eId(recital):
extract_eId=extract_eId
)

def get_body(self, body_xpath='.//ENACTING.TERMS') -> None:
    """
    Extracts the body (enacting terms) element from the Formex document.

    Thin override that supplies the Formex body XPath as a default and
    delegates the actual extraction to the base-class ``get_body``.

    Args:
        body_xpath (str): XPath expression used to locate the body element.
            Defaults to ``'.//ENACTING.TERMS'``. It is kept as an optional
            argument so that the override remains call-compatible with the
            base-class signature ``get_body(self, body_xpath)`` and callers
            that still pass ``body_xpath=...`` do not raise ``TypeError``.

    Returns:
        None: The base-class method is annotated ``-> None``; whatever it
        returns is passed through unchanged.
    """
    # The format-specific default lives here so callers can use get_body().
    return super().get_body(body_xpath)

def get_chapters(self) -> None:
"""
Extracts chapter information from the document.
Expand Down Expand Up @@ -160,6 +163,6 @@ def parse(self, file):
self.get_metadata()
self.get_preface(preface_xpath='.//TITLE', paragraph_xpath='.//P')
self.get_preamble(preamble_xpath='.//PREAMBLE', notes_xpath='.//NOTE')
self.get_body(body_xpath='.//ENACTING.TERMS')
self.get_body()
self.get_chapters()
self.get_articles()
1 change: 0 additions & 1 deletion tulit/parsers/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,6 @@ def get_recitals(self, recitals_xpath, recital_xpath, text_xpath, extract_intro=

self.recitals = recitals

### Enacting terms block
def get_body(self, body_xpath) -> None:
"""
Extracts the body element from the document.
Expand Down

0 comments on commit 88a5a5c

Please sign in to comment.