From 1dc3bfecb24bacc9bd4174a5439750a09675d155 Mon Sep 17 00:00:00 2001 From: AlessioNar Date: Sun, 24 Nov 2024 17:52:05 +0100 Subject: [PATCH] Initiated transition to common data model for formex --- op_cellar/parsers/formex.py | 64 +++++++++++++++++------------------- tests/parsers/test_formex.py | 34 +++++++++---------- 2 files changed, 47 insertions(+), 51 deletions(-) diff --git a/op_cellar/parsers/formex.py b/op_cellar/parsers/formex.py index d0dee52..4eb7ba0 100644 --- a/op_cellar/parsers/formex.py +++ b/op_cellar/parsers/formex.py @@ -1,33 +1,19 @@ from .parser import Parser import re -import xml.etree.ElementTree as ET +from lxml import etree class Formex4Parser(Parser): - def parse(self, file): - """ - Parses a FORMEX XML document to extract metadata, title, preamble, and enacting terms. - - Args: - file (str): Path to the FORMEX XML file. + def __init__(self): + pass - Returns: - dict: Parsed data containing metadata, title, preamble, and articles. + def load_xml(self, file): + """ """ with open(file, 'r', encoding='utf-8') as f: - tree = ET.parse(f) - root = tree.getroot() - - - parsed_data = { - "metadata": self._parse_metadata(root), - "title": self._parse_title(root), - "preamble": self._parse_preamble(root), - "articles": self._parse_articles(root), - } - - return parsed_data + tree = etree.parse(f) + self.root = tree.getroot() - def _parse_metadata(self, root): + def get_metadata(self): """ Extracts metadata information from the BIB.INSTANCE section. @@ -38,7 +24,7 @@ def _parse_metadata(self, root): dict: Extracted metadata. """ metadata = {} - bib_instance = root.find('BIB.INSTANCE') + bib_instance = self.root.find('BIB.INSTANCE') if bib_instance is not None: doc_ref = bib_instance.find('DOCUMENT.REF') @@ -64,7 +50,7 @@ def _parse_metadata(self, root): return metadata - def _parse_title(self, root): + def get_title(self, root): """ Extracts title information from the TITLE section. @@ -84,7 +70,7 @@ def _parse_title(self, root): return title_text.strip() - def _parse_preamble(self, root): + def get_preamble(self, root): """ Extracts the preamble section, including initial statements and considerations. @@ -130,7 +116,7 @@ def _parse_preamble(self, root): return preamble_data - def _parse_articles(self, root): + def get_articles(self): """ Extracts articles from the ENACTING.TERMS section. @@ -140,16 +126,28 @@ def _parse_articles(self, root): Returns: list: Articles with identifier and content. """ - articles = [] - enacting_terms = root.find('ENACTING.TERMS') + self.articles = [] + enacting_terms = self.root.find('ENACTING.TERMS') if enacting_terms is not None: for article in enacting_terms.findall('ARTICLE'): article_data = { - "identifier": article.get("IDENTIFIER"), - "title": article.findtext('TI.ART'), - "content": " ".join("".join(alinea.itertext()).strip() for alinea in article.findall('ALINEA')) + "eId": article.get("IDENTIFIER"), + "article_num": article.findtext('TI.ART'), + "article_text": " ".join("".join(alinea.itertext()).strip() for alinea in article.findall('ALINEA')) } - articles.append(article_data) + self.articles.append(article_data) - return articles + + def parse(self, file): + """ + Parses a FORMEX XML document to extract metadata, title, preamble, and enacting terms. + + Args: + file (str): Path to the FORMEX XML file. + + Returns: + dict: Parsed data containing metadata, title, preamble, and articles. + """ + self.load_xml(file) + self.get_articles() \ No newline at end of file diff --git a/tests/parsers/test_formex.py b/tests/parsers/test_formex.py index 0c6a291..2e1bd85 100644 --- a/tests/parsers/test_formex.py +++ b/tests/parsers/test_formex.py @@ -13,10 +13,10 @@ def setUp(self): def test_parse_metadata(self): self.maxDiff = None # Allow the full diff to be displayed file_path = os.path.join(DATA_DIR, "L_2011334EN.01002501.xml") - with open(file_path, 'r', encoding='utf-8') as f: - tree = ET.parse(f) - root = tree.getroot() - result = self.formex_parser._parse_metadata(root) + + self.formex_parser.load_xml(file_path) + + result = self.formex_parser.get_metadata() expected = { "file": "L_2011334EN.01002501.doc.xml", "collection": "L", @@ -44,7 +44,7 @@ def test_parse_title(self): tree = ET.parse(f) root = tree.getroot() - result = self.formex_parser._parse_title(root) + result = self.formex_parser.get_title(root) expected = ( "Commission Implementing Regulation (EU) No 1319/2011 of 15 December 2011 " "fixing representative prices in the poultrymeat and egg sectors and for egg " @@ -62,7 +62,7 @@ def test_parse_preamble(self): tree = ET.parse(f) root = tree.getroot() - result = self.formex_parser._parse_preamble(root) + result = self.formex_parser.get_preamble(root) # Expected preamble structure # @todo - see main function @@ -91,28 +91,26 @@ def test_parse_articles(self): self.maxDiff = None # Allow full diff if needed file_path = os.path.join(DATA_DIR, "L_2011334EN.01002501.xml") - # Parse the XML tree and pass the root to _parse_articles - with open(file_path, 'r', encoding='utf-8') as f: - tree = ET.parse(f) - root = tree.getroot() + self.formex_parser.load_xml(file_path) + + self.formex_parser.get_articles() - result = self.formex_parser._parse_articles(root) # Expected articles based on sample data in XML file expected = [ { - "identifier": "001", - "title": "Article 1", - "content": "Annex I to Regulation (EC) No 1484/95 is replaced by the Annex to this Regulation." + "eId": "001", + "article_num": "Article 1", + "article_text": "Annex I to Regulation (EC) No 1484/95 is replaced by the Annex to this Regulation." }, { - "identifier": "002", - "title": "Article 2", - "content": "This Regulation shall enter into force on the day of its publication in the Official Journal of the European Union." + "eId": "002", + "article_num": "Article 2", + "article_text": "This Regulation shall enter into force on the day of its publication in the Official Journal of the European Union." } ] - self.assertEqual(result, expected) + self.assertEqual(self.formex_parser.articles, expected) # Run the tests if __name__ == "__main__":