From 1dc3bfecb24bacc9bd4174a5439750a09675d155 Mon Sep 17 00:00:00 2001
From: AlessioNar <alessio.nardin@gmail.com>
Date: Sun, 24 Nov 2024 17:52:05 +0100
Subject: [PATCH] Initiated transition to common data model for formex

---
 op_cellar/parsers/formex.py  | 64 +++++++++++++++++-------------------
 tests/parsers/test_formex.py | 34 +++++++++----------
 2 files changed, 47 insertions(+), 51 deletions(-)

diff --git a/op_cellar/parsers/formex.py b/op_cellar/parsers/formex.py
index d0dee52..4eb7ba0 100644
--- a/op_cellar/parsers/formex.py
+++ b/op_cellar/parsers/formex.py
@@ -1,33 +1,19 @@
 from .parser import Parser
 import re
-import xml.etree.ElementTree as ET
+from lxml import etree
 
 class Formex4Parser(Parser):
-    def parse(self, file):
-        """
-        Parses a FORMEX XML document to extract metadata, title, preamble, and enacting terms.
-
-        Args:
-        file (str): Path to the FORMEX XML file.
+    def __init__(self):
+        pass
 
-        Returns:
-        dict: Parsed data containing metadata, title, preamble, and articles.
+    def load_xml(self, file):
+        """Parse the XML file at ``file`` and store its root element in ``self.root``.
         """
         with open(file, 'r', encoding='utf-8') as f:
-            tree = ET.parse(f)
-            root = tree.getroot()
-            
-                        
-        parsed_data = {
-            "metadata": self._parse_metadata(root),
-            "title": self._parse_title(root),
-            "preamble": self._parse_preamble(root),
-            "articles": self._parse_articles(root),
-        }
-
-        return parsed_data
+            tree = etree.parse(f)
+            self.root = tree.getroot()
 
-    def _parse_metadata(self, root):
+    def get_metadata(self):
         """
         Extracts metadata information from the BIB.INSTANCE section.
 
@@ -38,7 +24,7 @@ def _parse_metadata(self, root):
         dict: Extracted metadata.
         """
         metadata = {}
-        bib_instance = root.find('BIB.INSTANCE')
+        bib_instance = self.root.find('BIB.INSTANCE')
         
         if bib_instance is not None:
             doc_ref = bib_instance.find('DOCUMENT.REF')
@@ -64,7 +50,7 @@ def _parse_metadata(self, root):
         
         return metadata
 
-    def _parse_title(self, root):
+    def get_title(self, root):
         """
         Extracts title information from the TITLE section.
 
@@ -84,7 +70,7 @@ def _parse_title(self, root):
         
         return title_text.strip()
         
-    def _parse_preamble(self, root):
+    def get_preamble(self, root):
         """
         Extracts the preamble section, including initial statements and considerations.
 
@@ -130,7 +116,7 @@ def _parse_preamble(self, root):
         
         return preamble_data
 
-    def _parse_articles(self, root):
+    def get_articles(self):
         """
         Extracts articles from the ENACTING.TERMS section.
 
@@ -140,16 +126,28 @@ def _parse_articles(self, root):
         Returns:
         list: Articles with identifier and content.
         """
-        articles = []
-        enacting_terms = root.find('ENACTING.TERMS')
+        self.articles = []
+        enacting_terms = self.root.find('ENACTING.TERMS')
         
         if enacting_terms is not None:
             for article in enacting_terms.findall('ARTICLE'):
                 article_data = {
-                    "identifier": article.get("IDENTIFIER"),
-                    "title": article.findtext('TI.ART'),
-                    "content": " ".join("".join(alinea.itertext()).strip() for alinea in article.findall('ALINEA'))
+                    "eId": article.get("IDENTIFIER"),
+                    "article_num": article.findtext('TI.ART'),
+                    "article_text": " ".join("".join(alinea.itertext()).strip() for alinea in article.findall('ALINEA'))
                 }
-                articles.append(article_data)
+                self.articles.append(article_data)
         
-        return articles
+
+    def parse(self, file):
+        """
+        Loads a FORMEX XML document and extracts its articles (enacting terms).
+
+        Args:
+        file (str): Path to the FORMEX XML file.
+
+        Returns:
+        None: The parsed root is stored in self.root and the articles in self.articles.
+        """
+        self.load_xml(file)
+        self.get_articles()
\ No newline at end of file
diff --git a/tests/parsers/test_formex.py b/tests/parsers/test_formex.py
index 0c6a291..2e1bd85 100644
--- a/tests/parsers/test_formex.py
+++ b/tests/parsers/test_formex.py
@@ -13,10 +13,10 @@ def setUp(self):
     def test_parse_metadata(self):
         self.maxDiff = None  # Allow the full diff to be displayed
         file_path = os.path.join(DATA_DIR, "L_2011334EN.01002501.xml")
-        with open(file_path, 'r', encoding='utf-8') as f:
-            tree = ET.parse(f)
-            root = tree.getroot()
-        result = self.formex_parser._parse_metadata(root)
+        
+        self.formex_parser.load_xml(file_path)
+
+        result = self.formex_parser.get_metadata()
         expected = {
             "file": "L_2011334EN.01002501.doc.xml",
             "collection": "L",
@@ -44,7 +44,7 @@ def test_parse_title(self):
             tree = ET.parse(f)
             root = tree.getroot()
         
-        result = self.formex_parser._parse_title(root)
+        result = self.formex_parser.get_title(root)
         expected = (
             "Commission Implementing Regulation (EU) No 1319/2011 of 15 December 2011 "
             "fixing representative prices in the poultrymeat and egg sectors and for egg "
@@ -62,7 +62,7 @@ def test_parse_preamble(self):
             tree = ET.parse(f)
             root = tree.getroot()
         
-        result = self.formex_parser._parse_preamble(root)
+        result = self.formex_parser.get_preamble(root)
         
         # Expected preamble structure
         # @todo - see main function
@@ -91,28 +91,26 @@ def test_parse_articles(self):
         self.maxDiff = None  # Allow full diff if needed
         file_path = os.path.join(DATA_DIR, "L_2011334EN.01002501.xml")
         
-        # Parse the XML tree and pass the root to _parse_articles
-        with open(file_path, 'r', encoding='utf-8') as f:
-            tree = ET.parse(f)
-            root = tree.getroot()
+        self.formex_parser.load_xml(file_path)
+        
+        self.formex_parser.get_articles()
         
-        result = self.formex_parser._parse_articles(root)
         
         # Expected articles based on sample data in XML file
         expected = [
             {
-                "identifier": "001",
-                "title": "Article 1",
-                "content": "Annex I to Regulation (EC) No 1484/95 is replaced by the Annex to this Regulation."
+                "eId": "001",
+                "article_num": "Article 1",
+                "article_text": "Annex I to Regulation (EC) No 1484/95 is replaced by the Annex to this Regulation."
             },
             {
-                "identifier": "002",
-                "title": "Article 2",
-                "content": "This Regulation shall enter into force on the day of its publication in the Official Journal of the European Union."
+                "eId": "002",
+                "article_num": "Article 2",
+                "article_text": "This Regulation shall enter into force on the day of its publication in the Official Journal of the European Union."
             }
         ]
         
-        self.assertEqual(result, expected)
+        self.assertEqual(self.formex_parser.articles, expected)
 
 # Run the tests
 if __name__ == "__main__":