Initiated transition to common data model for formex

AlessioNar · Nov 24, 2024 · 1dc3bfe
1 parent cab4b68
commit 1dc3bfe
Show file tree

Hide file tree

Showing 2 changed files with 47 additions and 51 deletions.
diff --git a/op_cellar/parsers/formex.py b/op_cellar/parsers/formex.py
@@ -1,33 +1,19 @@
 from .parser import Parser
 import re
-import xml.etree.ElementTree as ET
+from lxml import etree
 
 class Formex4Parser(Parser):
-    def parse(self, file):
-        """
-        Parses a FORMEX XML document to extract metadata, title, preamble, and enacting terms.
-
-        Args:
-        file (str): Path to the FORMEX XML file.
+    def __init__(self):
+        pass
 
-        Returns:
-        dict: Parsed data containing metadata, title, preamble, and articles.
+    def load_xml(self, file):
+        """
         """
         with open(file, 'r', encoding='utf-8') as f:
-            tree = ET.parse(f)
-            root = tree.getroot()
-
-
-        parsed_data = {
-            "metadata": self._parse_metadata(root),
-            "title": self._parse_title(root),
-            "preamble": self._parse_preamble(root),
-            "articles": self._parse_articles(root),
-        }
-
-        return parsed_data
+            tree = etree.parse(f)
+            self.root = tree.getroot()
 
-    def _parse_metadata(self, root):
+    def get_metadata(self):
         """
         Extracts metadata information from the BIB.INSTANCE section.
 
@@ -38,7 +24,7 @@ def _parse_metadata(self, root):
         dict: Extracted metadata.
         """
         metadata = {}
-        bib_instance = root.find('BIB.INSTANCE')
+        bib_instance = self.root.find('BIB.INSTANCE')
 
         if bib_instance is not None:
             doc_ref = bib_instance.find('DOCUMENT.REF')
@@ -64,7 +50,7 @@ def _parse_metadata(self, root):
 
         return metadata
 
-    def _parse_title(self, root):
+    def get_title(self, root):
         """
         Extracts title information from the TITLE section.
 
@@ -84,7 +70,7 @@ def _parse_title(self, root):
 
         return title_text.strip()
 
-    def _parse_preamble(self, root):
+    def get_preamble(self, root):
         """
         Extracts the preamble section, including initial statements and considerations.
 
@@ -130,7 +116,7 @@ def _parse_preamble(self, root):
 
         return preamble_data
 
-    def _parse_articles(self, root):
+    def get_articles(self):
         """
         Extracts articles from the ENACTING.TERMS section.
 
@@ -140,16 +126,28 @@ def _parse_articles(self, root):
         Returns:
         list: Articles with identifier and content.
         """
-        articles = []
-        enacting_terms = root.find('ENACTING.TERMS')
+        self.articles = []
+        enacting_terms = self.root.find('ENACTING.TERMS')
 
         if enacting_terms is not None:
             for article in enacting_terms.findall('ARTICLE'):
                 article_data = {
-                    "identifier": article.get("IDENTIFIER"),
-                    "title": article.findtext('TI.ART'),
-                    "content": " ".join("".join(alinea.itertext()).strip() for alinea in article.findall('ALINEA'))
+                    "eId": article.get("IDENTIFIER"),
+                    "article_num": article.findtext('TI.ART'),
+                    "article_text": " ".join("".join(alinea.itertext()).strip() for alinea in article.findall('ALINEA'))
                 }
-                articles.append(article_data)
+                self.articles.append(article_data)
 
-        return articles
+
+    def parse(self, file):
+        """
+        Parses a FORMEX XML document to extract metadata, title, preamble, and enacting terms.
+
+        Args:
+        file (str): Path to the FORMEX XML file.
+
+        Returns:
+        dict: Parsed data containing metadata, title, preamble, and articles.
+        """
+        self.load_xml(file)
+        self.get_articles()
diff --git a/tests/parsers/test_formex.py b/tests/parsers/test_formex.py
@@ -13,10 +13,10 @@ def setUp(self):
     def test_parse_metadata(self):
         self.maxDiff = None  # Allow the full diff to be displayed
         file_path = os.path.join(DATA_DIR, "L_2011334EN.01002501.xml")
-        with open(file_path, 'r', encoding='utf-8') as f:
-            tree = ET.parse(f)
-            root = tree.getroot()
-        result = self.formex_parser._parse_metadata(root)
+
+        self.formex_parser.load_xml(file_path)
+
+        result = self.formex_parser.get_metadata()
         expected = {
             "file": "L_2011334EN.01002501.doc.xml",
             "collection": "L",
@@ -44,7 +44,7 @@ def test_parse_title(self):
             tree = ET.parse(f)
             root = tree.getroot()
 
-        result = self.formex_parser._parse_title(root)
+        result = self.formex_parser.get_title(root)
         expected = (
             "Commission Implementing Regulation (EU) No 1319/2011 of 15 December 2011 "
             "fixing representative prices in the poultrymeat and egg sectors and for egg "
@@ -62,7 +62,7 @@ def test_parse_preamble(self):
             tree = ET.parse(f)
             root = tree.getroot()
 
-        result = self.formex_parser._parse_preamble(root)
+        result = self.formex_parser.get_preamble(root)
 
         # Expected preamble structure
         # @todo - see main function
@@ -91,28 +91,26 @@ def test_parse_articles(self):
         self.maxDiff = None  # Allow full diff if needed
         file_path = os.path.join(DATA_DIR, "L_2011334EN.01002501.xml")
 
-        # Parse the XML tree and pass the root to _parse_articles
-        with open(file_path, 'r', encoding='utf-8') as f:
-            tree = ET.parse(f)
-            root = tree.getroot()
+        self.formex_parser.load_xml(file_path)
+
+        self.formex_parser.get_articles()
 
-        result = self.formex_parser._parse_articles(root)
 
         # Expected articles based on sample data in XML file
         expected = [
             {
-                "identifier": "001",
-                "title": "Article 1",
-                "content": "Annex I to Regulation (EC) No 1484/95 is replaced by the Annex to this Regulation."
+                "eId": "001",
+                "article_num": "Article 1",
+                "article_text": "Annex I to Regulation (EC) No 1484/95 is replaced by the Annex to this Regulation."
             },
             {
-                "identifier": "002",
-                "title": "Article 2",
-                "content": "This Regulation shall enter into force on the day of its publication in the Official Journal of the European Union."
+                "eId": "002",
+                "article_num": "Article 2",
+                "article_text": "This Regulation shall enter into force on the day of its publication in the Official Journal of the European Union."
             }
         ]
 
-        self.assertEqual(result, expected)
+        self.assertEqual(self.formex_parser.articles, expected)
 
 # Run the tests
 if __name__ == "__main__":