Initiated completing the HTMLParser to the data model

AlessioNar · Dec 28, 2024 · df3e23d · df3e23d
1 parent 07d6bde
commit df3e23d
Show file tree

Hide file tree

Showing 5 changed files with 469 additions and 361 deletions.
diff --git a/tests/parsers/html/test_cellar.py b/tests/parsers/html/test_cellar.py
@@ -0,0 +1,117 @@
+import unittest
+import os
+from tulit.parsers.html.cellar import CellarHTMLParser
+import json
+
+DATA_DIR = os.path.join(os.path.dirname(__file__), "..\\..\\data\\html")
+file_path = os.path.join(DATA_DIR, "c008bcb6-e7ec-11ee-9ea8-01aa75ed71a1.0006.03\\DOC_1.html")
+
+
+class TestCellarHTMLParser(unittest.TestCase):
+    def setUp(self):
+        self.maxDiff = None  # Allow full diff if needed
+        self.parser = CellarHTMLParser()
+
+        # Ensure test file exists
+        if not os.path.exists(file_path):
+            raise FileNotFoundError(f"Test file not found at {file_path}")
+        self.parser.get_root(file_path)
+
+    def test_get_root(self):
+        """Test parsing and root element retrieval from the Akoma Ntoso file."""
+        self.assertTrue(os.path.exists(file_path), f"Test file not found at {file_path}")
+        self.assertIsNotNone(self.parser.root, "Root element should not be None")      
+
+    def test_get_body(self):
+        self.parser.get_body()
+        self.assertIsNotNone(self.parser.body, "Body element should not be None")      
+
+    def test_get_preface(self):
+        self.parser.get_preface()
+        self.assertIsNotNone(self.parser.preface, "Preface element should not be None")
+
+    def test_get_preamble(self):
+        self.parser.get_preamble()
+        self.assertIsNotNone(self.parser.preamble, "Preamble element should not be None")      
+
+    def test_get_formula(self):
+        self.parser.get_preamble()
+        self.parser.get_formula()
+        formula = "THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION,"
+        self.assertEqual(self.parser.formula, formula, "Formula should match expected value")
+
+    def test_get_citations(self):
+        self.parser.get_preamble()
+        self.parser.get_citations()
+        citations =  [
+            {
+                "eId": "cit_1",
+                "citation_text": "Having regard to the Treaty on the Functioning of the European Union, and in particular Article 172 thereof,"
+            },
+            {
+                "eId": "cit_2",
+                "citation_text": "Having regard to the proposal from the European Commission,"
+            },
+            {
+                "eId": "cit_3",
+                "citation_text": "After transmission of the draft legislative act to the national parliaments,"
+            },
+            {
+                "eId": "cit_4",
+                "citation_text": "Having regard to the opinion of the European Economic and Social Committee(1),"
+            },
+            {
+                "eId": "cit_5",
+                "citation_text": "Having regard to the opinion of the Committee of the Regions(2),"
+            },
+            {
+                "eId": "cit_6",
+                "citation_text": "Acting in accordance with the ordinary legislative procedure(3),"
+            }
+        ]
+        self.assertEqual(self.parser.citations, citations, "Citations should match expected values")
+
+    def test_get_recitals(self):
+        self.parser.get_preamble()
+        self.parser.get_recitals()
+
+        self.assertIsNotNone(self.parser.recitals, "Recitals element should not be None")
+
+
+    def test_get_preamble_final(self):
+        self.parser.get_preamble()
+        self.parser.get_preamble_final()
+        preamble_final = "HAVE ADOPTED THIS REGULATION:"
+        self.assertEqual(self.parser.preamble_final, preamble_final, "Preamble final should match expected value")
+
+    def test_get_chapters(self):
+        self.parser.get_body()
+        self.parser.get_chapters()
+        self.assertIsNotNone(self.parser.chapters, "Chapters element should not be None")        
+
+
+    def test_get_articles(self):
+        """Test parsing articles from an HTML file."""
+        # Parse the body and articles using the parser
+        self.parser.get_body()
+        self.parser.get_articles()
+
+        # Save output file to directory
+        #with open(os.path.join(DATA_DIR, 'json', 'articles_html.json'), 'w+', encoding='utf-8') as f:
+        #    json.dump(self.parser.articles, f)
+        #    
+        ## Load the expected structure of parsed articles
+        #with open(os.path.join(DATA_DIR, 'json', 'articles_html.json'), encoding='utf-8') as f:
+        #    expected = json.load(f)
+
+        # Assert the parsed articles match the expected structure
+        #self.assertEqual(self.parser.articles, expected)
+
+
+    def test_get_conclusions(self):
+        self.parser.get_conclusions()
+        self.assertIsNotNone(self.parser.conclusions, "Conclusions element should not be None")      
+
+# Run the tests
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/parsers/html/test_html.py b/tests/parsers/html/test_html.py
@@ -21,58 +21,6 @@ def test_get_root(self):
         """Test parsing and root element retrieval from the Akoma Ntoso file."""
         self.assertTrue(os.path.exists(file_path), f"Test file not found at {file_path}")
         self.assertIsNotNone(self.parser.root, "Root element should not be None")      
-
-    def test_get_body(self):
-        self.parser.get_body()
-        self.assertIsNotNone(self.parser.body, "Body element should not be None")      
-
-    def test_get_metadata(self):
-        pass
-
-    def test_get_preface(self):
-        self.parser.get_preface()
-        self.assertIsNotNone(self.parser.preface, "Preface element should not be None")
-
-    def test_get_preamble(self):
-        self.parser.get_preamble()
-        self.assertIsNotNone(self.parser.preamble, "Preamble element should not be None")      
-
-    def test_get_preamble_formula(self):
-        pass
-
-    def test_get_preamble_citations(self):
-        pass
-
-    def test_get_preamble_recitals(self):
-        pass      
-
-
-    def test_get_chapters(self):
-        pass
-
-    def test_get_articles(self):
-        """Test parsing articles from an HTML file."""
-        # Parse the body and articles using the parser
-        self.parser.get_body()
-        self.parser.get_articles()
-
-        # Save output file to directory
-        #with open(os.path.join(DATA_DIR, 'json', 'articles_html.json'), 'w+', encoding='utf-8') as f:
-        #    json.dump(self.parser.articles, f)
-
-        # Load the expected structure of parsed articles
-        with open(os.path.join(DATA_DIR, 'json', 'articles_html.json'), encoding='utf-8') as f:
-            expected = json.load(f)
-
-        # Assert the parsed articles match the expected structure
-        self.assertEqual(self.parser.articles, expected)
-
-
-    def test_get_conclusions(self):
-        self.parser.get_conclusions()
-        self.assertIsNotNone(self.parser.conclusions, "Conclusions element should not be None")      
-
-
 
 # Run the tests
 if __name__ == "__main__":