Skip to content

Commit

Permalink
Initiated transition to common data model for formex
Browse files (browse the repository at this point in the history)
  • Loading branch information
AlessioNar committed Nov 24, 2024
1 parent cab4b68 commit 1dc3bfe
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 51 deletions.
64 changes: 31 additions & 33 deletions op_cellar/parsers/formex.py
Original file line number | Diff line number | Diff line change
@@ -1,33 +1,19 @@
from .parser import Parser
import re
import xml.etree.ElementTree as ET
from lxml import etree

class Formex4Parser(Parser):
def parse(self, file):
"""
Parses a FORMEX XML document to extract metadata, title, preamble, and enacting terms.
Args:
file (str): Path to the FORMEX XML file.
def __init__(self):
pass

Returns:
dict: Parsed data containing metadata, title, preamble, and articles.
def load_xml(self, file):
"""
"""
with open(file, 'r', encoding='utf-8') as f:
tree = ET.parse(f)
root = tree.getroot()


parsed_data = {
"metadata": self._parse_metadata(root),
"title": self._parse_title(root),
"preamble": self._parse_preamble(root),
"articles": self._parse_articles(root),
}

return parsed_data
tree = etree.parse(f)
self.root = tree.getroot()

def _parse_metadata(self, root):
def get_metadata(self):
"""
Extracts metadata information from the BIB.INSTANCE section.
Expand All @@ -38,7 +24,7 @@ def _parse_metadata(self, root):
dict: Extracted metadata.
"""
metadata = {}
bib_instance = root.find('BIB.INSTANCE')
bib_instance = self.root.find('BIB.INSTANCE')

if bib_instance is not None:
doc_ref = bib_instance.find('DOCUMENT.REF')
Expand All @@ -64,7 +50,7 @@ def _parse_metadata(self, root):

return metadata

def _parse_title(self, root):
def get_title(self, root):
"""
Extracts title information from the TITLE section.
Expand All @@ -84,7 +70,7 @@ def _parse_title(self, root):

return title_text.strip()

def _parse_preamble(self, root):
def get_preamble(self, root):
"""
Extracts the preamble section, including initial statements and considerations.
Expand Down Expand Up @@ -130,7 +116,7 @@ def _parse_preamble(self, root):

return preamble_data

def _parse_articles(self, root):
def get_articles(self):
"""
Extracts articles from the ENACTING.TERMS section.
Expand All @@ -140,16 +126,28 @@ def _parse_articles(self, root):
Returns:
list: Articles with identifier and content.
"""
articles = []
enacting_terms = root.find('ENACTING.TERMS')
self.articles = []
enacting_terms = self.root.find('ENACTING.TERMS')

if enacting_terms is not None:
for article in enacting_terms.findall('ARTICLE'):
article_data = {
"identifier": article.get("IDENTIFIER"),
"title": article.findtext('TI.ART'),
"content": " ".join("".join(alinea.itertext()).strip() for alinea in article.findall('ALINEA'))
"eId": article.get("IDENTIFIER"),
"article_num": article.findtext('TI.ART'),
"article_text": " ".join("".join(alinea.itertext()).strip() for alinea in article.findall('ALINEA'))
}
articles.append(article_data)
self.articles.append(article_data)

return articles

def parse(self, file):
"""
Parses a FORMEX XML document to extract metadata, title, preamble, and enacting terms.
Args:
file (str): Path to the FORMEX XML file.
Returns:
dict: Parsed data containing metadata, title, preamble, and articles.
"""
self.load_xml(file)
self.get_articles()
34 changes: 16 additions & 18 deletions tests/parsers/test_formex.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -13,10 +13,10 @@ def setUp(self):
def test_parse_metadata(self):
self.maxDiff = None # Allow the full diff to be displayed
file_path = os.path.join(DATA_DIR, "L_2011334EN.01002501.xml")
with open(file_path, 'r', encoding='utf-8') as f:
tree = ET.parse(f)
root = tree.getroot()
result = self.formex_parser._parse_metadata(root)

self.formex_parser.load_xml(file_path)

result = self.formex_parser.get_metadata()
expected = {
"file": "L_2011334EN.01002501.doc.xml",
"collection": "L",
Expand Down Expand Up @@ -44,7 +44,7 @@ def test_parse_title(self):
tree = ET.parse(f)
root = tree.getroot()

result = self.formex_parser._parse_title(root)
result = self.formex_parser.get_title(root)
expected = (
"Commission Implementing Regulation (EU) No 1319/2011 of 15 December 2011 "
"fixing representative prices in the poultrymeat and egg sectors and for egg "
Expand All @@ -62,7 +62,7 @@ def test_parse_preamble(self):
tree = ET.parse(f)
root = tree.getroot()

result = self.formex_parser._parse_preamble(root)
result = self.formex_parser.get_preamble(root)

# Expected preamble structure
# @todo - see main function
Expand Down Expand Up @@ -91,28 +91,26 @@ def test_parse_articles(self):
self.maxDiff = None # Allow full diff if needed
file_path = os.path.join(DATA_DIR, "L_2011334EN.01002501.xml")

# Parse the XML tree and pass the root to _parse_articles
with open(file_path, 'r', encoding='utf-8') as f:
tree = ET.parse(f)
root = tree.getroot()
self.formex_parser.load_xml(file_path)

self.formex_parser.get_articles()

result = self.formex_parser._parse_articles(root)

# Expected articles based on sample data in XML file
expected = [
{
"identifier": "001",
"title": "Article 1",
"content": "Annex I to Regulation (EC) No 1484/95 is replaced by the Annex to this Regulation."
"eId": "001",
"article_num": "Article 1",
"article_text": "Annex I to Regulation (EC) No 1484/95 is replaced by the Annex to this Regulation."
},
{
"identifier": "002",
"title": "Article 2",
"content": "This Regulation shall enter into force on the day of its publication in the Official Journal of the European Union."
"eId": "002",
"article_num": "Article 2",
"article_text": "This Regulation shall enter into force on the day of its publication in the Official Journal of the European Union."
}
]

self.assertEqual(result, expected)
self.assertEqual(self.formex_parser.articles, expected)

# Run the tests
if __name__ == "__main__":
Expand Down

0 comments on commit 1dc3bfe

Please sign in to comment.