Skip to content

Commit

Permalink
Implemented get_recitals in generic XMLParser class with specific imp…
Browse files Browse the repository at this point in the history
…lementations for formex and akn
  • Loading branch information
AlessioNar committed Dec 27, 2024
1 parent 6250040 commit 537887d
Show file tree
Hide file tree
Showing 7 changed files with 155 additions and 116 deletions.
1 change: 1 addition & 0 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@

# -- Options for HTML output
# NOTE(review): Sphinx autodoc's mock-import setting is spelled
# `autodoc_mock_imports`. `autodocs_mock_imports` (extra "s") is not a name
# autodoc reads, so this assignment presumably has no effect. Confirm whether
# mocking 'tulit' is actually wanted before renaming it.
autodocs_mock_imports = ['tulit']
# List documented members in the order they appear in the source file.
autodoc_member_order = 'bysource'

html_theme = 'sphinx_rtd_theme'

Expand Down
2 changes: 1 addition & 1 deletion tests/parsers/test_akomantoso.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def test_get_recitals(self):
with self.subTest(recital=index):
self.assertEqual(self.parser.recitals[index]['eId'], expected_values['eId'],
f"Recital {index} ID does not match expected value")
self.assertIn(expected_values['text'], self.parser.recitals[index]['recital_text'],
self.assertIn(expected_values['text'], self.parser.recitals[index]['text'],
f"Recital {index} text does not match expected content")

def test_get_act(self):
Expand Down
10 changes: 5 additions & 5 deletions tests/parsers/test_formex.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,11 +93,11 @@ def test_get_recitals(self):
self.parser.get_recitals()

recitals = [
{"eId": "rec_0", "recital_text": "Whereas:"},
{"eId": "(1)", "recital_text": "Commission Regulation (EC) No 1484/95 lays down detailed rules for implementing the system of additional import duties and fixes representative prices for poultrymeat and egg products and for egg albumin."},
{"eId": "(2)", "recital_text": "Regular monitoring of the data used to determine representative prices for poultrymeat and egg products and for egg albumin shows that the representative import prices for certain products should be amended to take account of variations in price according to origin. The representative prices should therefore be published."},
{"eId": "(3)", "recital_text": "In view of the situation on the market, this amendment should be applied as soon as possible."},
{"eId": "(4)", "recital_text": "The measures provided for in this Regulation are in accordance with the opinion of the Management Committee for the Common Organisation of Agricultural Markets,"},
{"eId": "rec_0", "text": "Whereas:"},
{"eId": "(1)", "text": "Commission Regulation (EC) No 1484/95 lays down detailed rules for implementing the system of additional import duties and fixes representative prices for poultrymeat and egg products and for egg albumin."},
{"eId": "(2)", "text": "Regular monitoring of the data used to determine representative prices for poultrymeat and egg products and for egg albumin shows that the representative import prices for certain products should be amended to take account of variations in price according to origin. The representative prices should therefore be published."},
{"eId": "(3)", "text": "In view of the situation on the market, this amendment should be applied as soon as possible."},
{"eId": "(4)", "text": "The measures provided for in this Regulation are in accordance with the opinion of the Management Committee for the Common Organisation of Agricultural Markets,"},
]

preamble_final = {
Expand Down
56 changes: 20 additions & 36 deletions tulit/parsers/akomantoso.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,42 +269,26 @@ def get_recitals(self):
List of dictionaries containing recital text and eId for each
recital. Returns None if no recitals are found.
"""
recitals_section = self.preamble.find('.//akn:recitals', namespaces=self.namespaces)
if recitals_section is None:
return None

recitals = []

# Intro
recitals_intro = recitals_section.find('.//akn:intro', namespaces=self.namespaces)
recitals_intro_eId = recitals_intro.get('eId')
recitals_intro_text = ' '.join(p.text.strip() for p in recitals_intro.findall('.//akn:p', namespaces=self.namespaces) if p.text)
recitals.append({
'recital_text': recitals_intro_text,
'eId': recitals_intro_eId
})

# Removing all authorialNote nodes
recitals_section = self.remove_node(recitals_section, './/akn:authorialNote')

# Step 2: Process each <recital> element in the recitals_section without the <authorialNote> elements
for recital in recitals_section.findall('.//akn:recital', namespaces=self.namespaces):
eId = str(recital.get('eId'))

# Extract text from remaining <akn:p> elements
recital_text = ' '.join(' '.join(p.itertext()).strip() for p in recital.findall('.//akn:p', namespaces=self.namespaces))

# Remove any double spaces in the concatenated recital text
recital_text = re.sub(r'\s+', ' ', recital_text)

# Append the cleaned recital text and eId to the list
recitals.append({
'recital_text': recital_text,
'eId': eId
})

self.recitals = recitals


def extract_intro(recitals_section):
# Intro - different implementation
recitals_intro = recitals_section.find('.//akn:intro', namespaces=self.namespaces)
intro_eId = recitals_intro.get('eId')
intro_text = ''.join(p.text.strip() for p in recitals_intro.findall('.//akn:p', namespaces=self.namespaces) if p.text)
return intro_eId, intro_text

def extract_eId(recital):
return str(recital.get('eId'))

return super().get_recitals(
recitals_xpath='.//akn:recitals',
recital_xpath='.//akn:recital',
text_xpath='.//akn:p',
extract_intro=extract_intro,
extract_eId=extract_eId,

)

### Act block
def get_act(self) -> None:
"""
Expand Down
44 changes: 22 additions & 22 deletions tulit/parsers/formex.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,34 +103,34 @@ def extract_eId(citation, index):
extract_eId=extract_eId
)

def get_recitals(self, recitals_xpath='.//GR.CONSID', recital_xpath='.//CONSID') -> None:
def get_recitals(self) -> None:
"""
Extracts recitals from the preamble.
Returns
-------
list
List of dictionaries containing recital text and eId for each recital.
"""

recitals = []
recitals.append({
"eId": 'rec_0',
"recital_text": self.preamble.findtext('.//GR.CONSID/GR.CONSID.INIT')
})

for recital in self.preamble.findall(recital_xpath):
recital_num = recital.findtext('.//NO.P')
recital_text = "".join(recital.find('.//TXT').itertext()).strip()
recitals.append({
"eId": recital_num,
"recital_text": recital_text
})
#preamble_data["preamble_final"] = self.preamble.findtext('PREAMBLE.FINAL')

self.recitals = recitals
list or None
List of dictionaries containing recital text and eId for each
recital. Returns None if no recitals are found.
"""

def extract_intro(recitals_section):
# Intro - different implementation
intro_eId = 'rec_0'
intro_text = self.preamble.findtext('.//GR.CONSID.INIT')

return intro_eId, intro_text


def extract_eId(recital):
return recital.findtext('.//NO.P')

return super().get_recitals(
recitals_xpath='.//GR.CONSID',
recital_xpath='.//CONSID',
text_xpath='.//TXT',
extract_intro=extract_intro,
extract_eId=extract_eId
)

def get_chapters(self) -> None:
"""
Expand Down
3 changes: 2 additions & 1 deletion tulit/parsers/html.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from bs4 import BeautifulSoup
from .parser import Parser

class HTMLParser():
class HTMLParser(Parser):
def __init__(self):
"""
Initializes the HTML parser and sets up the BeautifulSoup instance.
Expand Down
155 changes: 104 additions & 51 deletions tulit/parsers/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,34 +3,26 @@
import os
import re

class XMLParser(ABC):
class Parser(ABC):
"""
Abstract base class for XML parsers.
Abstract base class for parsers
Attributes
----------
schema : lxml.etree.XMLSchema or None
The XML schema used for validation.
valid : bool or None
Indicates whether the XML file is valid against the schema.
validation_errors : lxml.etree._LogEntry or None
Validation errors if the XML file is invalid.
root : lxml.etree._Element
Root element of the XML document.
namespaces : dict
Dictionary containing XML namespaces.
root : lxml.etree._Element or bs4.BeautifulSoup
Root element of the XML or HTML document.
preface : str or None
Extracted preface text from the XML document.
preamble : lxml.etree.Element or None
The preamble section of the XML document.
Extracted preface text from the document.
preamble : lxml.etree.Element or bs4.Tag or None
The preamble section of the document.
formula : str or None
The formula element extracted from the preamble.
citations : list or None
List of extracted citations from the preamble.
recitals : list or None
List of extracted recitals from the preamble.
body : lxml.etree.Element or None
The body section of the XML document.
body : lxml.etree.Element or bs4.Tag or None
The body section of the document.
chapters : list or None
List of extracted chapters from the body.
articles : list or None
Expand All @@ -49,12 +41,8 @@ def __init__(self):
----------
None
"""
self.schema = None
self.valid = None
self.validation_errors = None
self.root = None
self.namespaces = {}


self.root = None
self.preface = None

self.preamble = None
Expand All @@ -68,13 +56,38 @@ def __init__(self):
self.conclusions = None

self.articles_text = []

@abstractmethod
def parse(self):

class XMLParser(Parser):
"""
Base class for XML parsers.
Attributes
----------
schema : lxml.etree.XMLSchema or None
The XML schema used for validation.
valid : bool or None
Indicates whether the XML file is valid against the schema.
validation_errors : lxml.etree._LogEntry or None
Validation errors if the XML file is invalid.
namespaces : dict
Dictionary containing XML namespaces.
"""

def __init__(self):
"""
Abstract method to parse the data. This method must be implemented by the subclass.
Initializes the Parser object.
Parameters
----------
None
"""
pass
super().__init__()

self.schema = None
self.valid = None
self.validation_errors = None

self.namespaces = {}

def load_schema(self, schema):
"""
Expand Down Expand Up @@ -135,25 +148,6 @@ def validate(self, format, file: str) -> bool:
except Exception as e:
print(f"An error occurred during validation: {e}")
self.valid = False

def get_root(self, file: str):
"""
Parses an XML file and returns its root element.
Parameters
----------
file : str
Path to the XML file.
Returns
-------
None
"""
with open(file, 'r', encoding='utf-8') as f:
tree = etree.parse(f)
self.root = tree.getroot()


def remove_node(self, tree, node):
"""
Expand Down Expand Up @@ -193,6 +187,24 @@ def remove_node(self, tree, node):

return tree

def get_root(self, file: str):
    """
    Parses an XML file and stores its root element in ``self.root``.

    Parameters
    ----------
    file : str
        Path to the XML file. The file is opened in text mode and decoded
        as UTF-8 before being handed to the XML parser.

    Returns
    -------
    None
        The root element is not returned; it is assigned to ``self.root``.
    """
    with open(file, 'r', encoding='utf-8') as xml_file:
        document = etree.parse(xml_file)
    self.root = document.getroot()


def get_preface(self, preface_xpath, paragraph_xpath) -> None:
"""
Extracts paragraphs from the preface section of the document.
Expand Down Expand Up @@ -241,9 +253,9 @@ def get_preamble(self, preamble_xpath, notes_xpath) -> None:
if self.preamble is not None:
self.preamble = self.remove_node(self.preamble, notes_xpath)
self.formula = self.get_formula()

#self.recitals = self.get_recitals()

#preamble_data["preamble_final"] = self.preamble.findtext('PREAMBLE.FINAL')

def get_citations(self, citations_xpath, citation_xpath, extract_eId=None):
"""
Extracts citations from the preamble.
Expand Down Expand Up @@ -284,6 +296,40 @@ def get_citations(self, citations_xpath, citation_xpath, extract_eId=None):

self.citations = citations

def get_recitals(self, recitals_xpath, recital_xpath, text_xpath, extract_intro=None, extract_eId=None):
    """
    Extracts recitals from the preamble and stores them in ``self.recitals``.

    Parameters
    ----------
    recitals_xpath : str
        Path, relative to the preamble, of the element wrapping all recitals.
    recital_xpath : str
        Path, relative to the recitals section, matching each recital.
    text_xpath : str
        Path, relative to a recital, matching the elements that hold its text.
    extract_intro : callable, optional
        Called with the recitals section; must return an ``(eId, text)``
        tuple describing the introductory entry. When omitted, the
        introductory entry is stored with both values set to None.
    extract_eId : callable, optional
        Called with each recital element; its return value is stored as the
        recital's eId. When omitted, the eId is None.

    Returns
    -------
    None
        The result is not returned. On success ``self.recitals`` is set to a
        list of dictionaries with the keys ``"eId"`` and ``"text"``; the
        first entry is the introduction, followed by one entry per recital.
        ``self.recitals`` is left untouched when the preamble is missing or
        contains no recitals section.
    """
    # The preamble is None when it has not been found in the document;
    # without this guard the lookup below would raise AttributeError.
    if self.preamble is None:
        return None

    recitals_section = self.preamble.find(recitals_xpath, namespaces=self.namespaces)
    if recitals_section is None:
        return None

    # The introductory entry is format-specific, so it is delegated to the
    # caller-supplied hook.
    intro_eId, intro_text = extract_intro(recitals_section) if extract_intro else (None, None)
    recitals = [{"eId": intro_eId, "text": intro_text}]

    for recital in recitals_section.findall(recital_xpath, namespaces=self.namespaces):
        eId = extract_eId(recital) if extract_eId else None
        # Flatten nested markup inside each text element, trim it, and
        # concatenate the pieces without a separator.
        text = ''.join(
            ''.join(fragment.itertext()).strip()
            for fragment in recital.findall(text_xpath, namespaces=self.namespaces)
        )
        recitals.append({"eId": eId, "text": text})

    self.recitals = recitals

### Enacting terms block
def get_body(self, body_xpath) -> None:
"""
Expand All @@ -303,4 +349,11 @@ def get_body(self, body_xpath) -> None:
self.body = self.root.find(body_xpath, namespaces=self.namespaces)
if self.body is None:
# Fallback: try without namespace
self.body = self.root.find(body_xpath)
self.body = self.root.find(body_xpath)

@abstractmethod
def parse(self):
    """
    Parse the data.

    Abstract hook: this method must be implemented by the subclass. The
    base implementation performs no work.
    """

0 comments on commit 537887d

Please sign in to comment.