Skip to content

Commit

Permalink
Fixed formula and preamble final
Browse files Browse the repository at this point in the history
  • Loading branch information
AlessioNar committed Dec 28, 2024
1 parent bcc7c2f commit d2a0a0c
Show file tree
Hide file tree
Showing 5 changed files with 111 additions and 51 deletions.
18 changes: 10 additions & 8 deletions tests/parsers/test_akomantoso.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,14 @@ def test_get_preamble(self):
"""Test retrieval of preamble data from the XML file."""
self.parser.get_preamble()
self.assertIsNotNone(self.parser.preamble, "Preamble element not found")
self.assertIsNotNone(self.parser.formula, "Formula not found")


def test_get_formula(self):
"""Test extraction of formula text within the preamble."""
self.parser.get_preamble()

formula_data = self.parser.get_formula()
self.assertIn("THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION", formula_data)
self.parser.get_formula()
self.assertEqual(self.parser.formula, "THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION,")

def test_get_citations(self):
"""Test citation extraction in the preamble section."""
Expand Down Expand Up @@ -77,11 +76,14 @@ def test_get_recitals(self):
self.assertIn(expected_values['text'], self.parser.recitals[index]['text'],
f"Recital {index} text does not match expected content")

def test_get_act(self):
"""Test retrieval of the act element."""
self.parser.get_act()
self.assertIsInstance(self.parser.act, etree._Element, "Act element should be an lxml.etree._Element")

def test_get_preamble_final(self):
"""Test extraction of the final preamble text."""
self.parser.get_preamble()
self.parser.get_preamble_final()
preamble_final = "HAVE ADOPTED THIS DIRECTIVE:"
self.assertEqual(self.parser.preamble_final, preamble_final, "Final preamble text does not match expected content")


def test_get_body(self):
"""Test retrieval of the body element."""
self.parser.get_body()
Expand Down
38 changes: 19 additions & 19 deletions tests/parsers/test_formex.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,24 +23,22 @@ def test_get_root(self):
def test_get_preface(self):

self.parser.get_preface()
expected = (
"Commission Implementing Regulation (EU) No 1319/2011 of 15 December 2011 "
"fixing representative prices in the poultrymeat and egg sectors and for egg "
"albumin, and amending Regulation (EC) No 1484/95"
)
self.assertEqual(self.parser.preface, expected)
preface = "Commission Implementing Regulation (EU) No 1319/2011 of 15 December 2011 fixing representative prices in the poultrymeat and egg sectors and for egg albumin, and amending Regulation (EC) No 1484/95"
self.assertEqual(self.parser.preface, preface)

def test_get_preamble(self):
"""Test parsing the preamble section with quotations and numbered considerations in Formex4Parser."""

def test_get_preamble(self):
self.parser.get_preamble()
self.assertIsNotNone(self.parser.preamble)

def test_get_formula(self):
initial_statement = {
"initial_statement": "THE EUROPEAN COMMISSION,",
}
pass

self.parser.get_preamble()
self.parser.get_formula()

formula = "THE EUROPEAN COMMISSION,"

self.assertEqual(self.parser.formula, formula)


def test_get_citations(self):

Expand All @@ -57,7 +55,6 @@ def test_get_citations(self):


def test_get_recitals(self):
"""Test parsing the preamble section with quotations and numbered considerations in Formex4Parser."""

self.parser.get_preamble()
self.parser.get_recitals()
Expand All @@ -70,18 +67,21 @@ def test_get_recitals(self):
{"eId": "(4)", "text": "The measures provided for in this Regulation are in accordance with the opinion of the Management Committee for the Common Organisation of Agricultural Markets,"},
]

preamble_final = {
"preamble_final": "HAS ADOPTED THIS REGULATION:"
}

self.assertEqual(self.parser.recitals, recitals)

def test_get_preamble_final(self):
self.parser.get_preamble()

self.parser.get_preamble_final()
preamble_final = "HAS ADOPTED THIS REGULATION:"

self.assertEqual(self.parser.preamble_final, preamble_final)

def test_get_body(self):
self.parser.get_body()
self.assertIsNotNone(self.parser.body, "Body element should not be None")

def test_get_chapters(self):
"""Test retrieval and content of chapter headings."""
self.parser = Formex4Parser()
self.parser.get_root(iopa)
self.parser.get_body()
Expand Down
26 changes: 15 additions & 11 deletions tulit/parsers/akomantoso.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,15 +47,7 @@ def get_formula(self):
Concatenated text from all paragraphs within the formula element.
Returns None if no formula is found.
"""
formula = self.preamble.find('.//akn:formula', namespaces=self.namespaces)
# Extract text from <p> within <formula>
formula_text = ' '.join(p.text.strip() for p in formula.findall('akn:p', namespaces=self.namespaces) if p.text)
if formula is None:
return None

self.formula = formula_text
return self.formula

return super().get_formula(formula_xpath='.//akn:formula', paragraph_xpath='akn:p')

def get_citations(self) -> list:
"""
Expand Down Expand Up @@ -107,8 +99,20 @@ def extract_eId(recital):
)

def get_preamble_final(self):
    """
    Extracts the final preamble text from the document.

    Returns
    -------
    str or None
        Text of the first ``akn:block`` element found under the preamble.
        Returns None if no such element is found.
    """
    block = self.preamble.find(".//akn:block", namespaces=self.namespaces)
    # find() returns None when nothing matches; guard before reading .text
    # so a document without a final block yields None instead of raising
    # AttributeError.
    self.preamble_final = block.text if block is not None else None
    return self.preamble_final

def get_body(self):
return super().get_body('.//akn:body')
Expand Down
22 changes: 14 additions & 8 deletions tulit/parsers/formex.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,14 +47,9 @@ def get_formula(self):
str
Formula text from the preamble.
"""
formula = self.preamble.findtext('PREAMBLE.INIT')

if formula is None:
return None

self.formula = formula

self.formula = self.preamble.findtext('PREAMBLE.INIT')
return self.formula


def get_citations(self):
"""
Expand Down Expand Up @@ -105,7 +100,18 @@ def extract_eId(recital):
)

def get_preamble_final(self):
return super().get_preamble_final()
"""
Extracts the final preamble text from the document.
Returns
-------
str or None
Concatenated text from the final preamble element.
Returns None if no final preamble is found.
"""
self.preamble_final = self.preamble.findtext('PREAMBLE.FINAL')

return self.preamble_final

def get_body(self):
return super().get_body('.//ENACTING.TERMS')
Expand Down
58 changes: 53 additions & 5 deletions tulit/parsers/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ def __init__(self):
self.formula = None
self.citations = None
self.recitals = None
self.preamble_final = None

self.body = None
self.chapters = []
Expand Down Expand Up @@ -256,10 +257,32 @@ def get_preamble(self, preamble_xpath, notes_xpath) -> None:

if self.preamble is not None:
self.preamble = self.remove_node(self.preamble, notes_xpath)
#preamble_data["preamble_final"] = self.preamble.findtext('PREAMBLE.FINAL')

def get_formula(self):
pass
def get_formula(self, formula_xpath: str, paragraph_xpath: str) -> "str | None":
    """
    Extracts formula text from the preamble.

    Parameters
    ----------
    formula_xpath : str
        XPath expression to locate the formula element.
    paragraph_xpath : str
        XPath expression to locate the paragraphs within the formula.

    Returns
    -------
    str or None
        Text of the formula's paragraphs, each stripped and joined with a
        single space. Returns None, leaving ``self.formula`` unchanged, if
        no formula element is found.
    """
    formula = self.preamble.find(formula_xpath, namespaces=self.namespaces)
    if formula is None:
        return None

    paragraphs = formula.findall(paragraph_xpath, namespaces=self.namespaces)
    # Strip first, then drop empties: a whitespace-only paragraph is truthy
    # before stripping and would otherwise add a stray space to the result.
    stripped = (p.text.strip() for p in paragraphs if p.text)
    self.formula = ' '.join(text for text in stripped if text)
    return self.formula

def get_citations(self, citations_xpath, citation_xpath, extract_eId=None):
"""
Expand Down Expand Up @@ -333,8 +356,27 @@ def get_recitals(self, recitals_xpath, recital_xpath, text_xpath, extract_intro=

self.recitals = recitals

def get_preamble_final(self):
pass
def get_preamble_final(self, preamble_final_xpath: str = './/block') -> "str | None":
    """
    Extracts the final preamble text from the document.

    Parameters
    ----------
    preamble_final_xpath : str, optional
        XPath expression to locate the final preamble element.
        Defaults to ``'.//block'``.

    Returns
    -------
    str or None
        Text of the first element matching the XPath under the preamble.
        Returns None, leaving ``self.preamble_final`` unchanged, if no
        final preamble is found.
    """
    preamble_final = self.preamble.findtext(preamble_final_xpath, namespaces=self.namespaces)
    if preamble_final is None:
        return None

    self.preamble_final = preamble_final
    return self.preamble_final

def get_body(self, body_xpath) -> None:
"""
Expand Down Expand Up @@ -486,6 +528,12 @@ def parse(self, file: str, schema, format) -> None:
except Exception as e:
print(f"Error in get_recitals: {e}")

try:
self.get_preamble_final()
print(f"Preamble final parsed successfully.")
except Exception as e:
print(f"Error in get_preamble_final: {e}")

try:
self.get_body()
print("Body element found.")
Expand Down

0 comments on commit d2a0a0c

Please sign in to comment.