-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Initiated completing the HTMLParser to the data model
- Loading branch information
1 parent
07d6bde
commit df3e23d
Showing
5 changed files
with
469 additions
and
361 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
import unittest | ||
import os | ||
from tulit.parsers.html.cellar import CellarHTMLParser | ||
import json | ||
|
||
DATA_DIR = os.path.join(os.path.dirname(__file__), "..\\..\\data\\html") | ||
file_path = os.path.join(DATA_DIR, "c008bcb6-e7ec-11ee-9ea8-01aa75ed71a1.0006.03\\DOC_1.html") | ||
|
||
|
||
class TestCellarHTMLParser(unittest.TestCase): | ||
def setUp(self): | ||
self.maxDiff = None # Allow full diff if needed | ||
self.parser = CellarHTMLParser() | ||
|
||
# Ensure test file exists | ||
if not os.path.exists(file_path): | ||
raise FileNotFoundError(f"Test file not found at {file_path}") | ||
self.parser.get_root(file_path) | ||
|
||
def test_get_root(self): | ||
"""Test parsing and root element retrieval from the Akoma Ntoso file.""" | ||
self.assertTrue(os.path.exists(file_path), f"Test file not found at {file_path}") | ||
self.assertIsNotNone(self.parser.root, "Root element should not be None") | ||
|
||
def test_get_body(self): | ||
self.parser.get_body() | ||
self.assertIsNotNone(self.parser.body, "Body element should not be None") | ||
|
||
def test_get_preface(self): | ||
self.parser.get_preface() | ||
self.assertIsNotNone(self.parser.preface, "Preface element should not be None") | ||
|
||
def test_get_preamble(self): | ||
self.parser.get_preamble() | ||
self.assertIsNotNone(self.parser.preamble, "Preamble element should not be None") | ||
|
||
def test_get_formula(self): | ||
self.parser.get_preamble() | ||
self.parser.get_formula() | ||
formula = "THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION," | ||
self.assertEqual(self.parser.formula, formula, "Formula should match expected value") | ||
|
||
def test_get_citations(self): | ||
self.parser.get_preamble() | ||
self.parser.get_citations() | ||
citations = [ | ||
{ | ||
"eId": "cit_1", | ||
"citation_text": "Having regard to the Treaty on the Functioning of the European Union, and in particular Article 172 thereof," | ||
}, | ||
{ | ||
"eId": "cit_2", | ||
"citation_text": "Having regard to the proposal from the European Commission," | ||
}, | ||
{ | ||
"eId": "cit_3", | ||
"citation_text": "After transmission of the draft legislative act to the national parliaments," | ||
}, | ||
{ | ||
"eId": "cit_4", | ||
"citation_text": "Having regard to the opinion of the European Economic and Social Committee(1)," | ||
}, | ||
{ | ||
"eId": "cit_5", | ||
"citation_text": "Having regard to the opinion of the Committee of the Regions(2)," | ||
}, | ||
{ | ||
"eId": "cit_6", | ||
"citation_text": "Acting in accordance with the ordinary legislative procedure(3)," | ||
} | ||
] | ||
self.assertEqual(self.parser.citations, citations, "Citations should match expected values") | ||
|
||
def test_get_recitals(self): | ||
self.parser.get_preamble() | ||
self.parser.get_recitals() | ||
|
||
self.assertIsNotNone(self.parser.recitals, "Recitals element should not be None") | ||
|
||
|
||
def test_get_preamble_final(self): | ||
self.parser.get_preamble() | ||
self.parser.get_preamble_final() | ||
preamble_final = "HAVE ADOPTED THIS REGULATION:" | ||
self.assertEqual(self.parser.preamble_final, preamble_final, "Preamble final should match expected value") | ||
|
||
def test_get_chapters(self): | ||
self.parser.get_body() | ||
self.parser.get_chapters() | ||
self.assertIsNotNone(self.parser.chapters, "Chapters element should not be None") | ||
|
||
|
||
def test_get_articles(self): | ||
"""Test parsing articles from an HTML file.""" | ||
# Parse the body and articles using the parser | ||
self.parser.get_body() | ||
self.parser.get_articles() | ||
|
||
# Save output file to directory | ||
#with open(os.path.join(DATA_DIR, 'json', 'articles_html.json'), 'w+', encoding='utf-8') as f: | ||
# json.dump(self.parser.articles, f) | ||
# | ||
## Load the expected structure of parsed articles | ||
#with open(os.path.join(DATA_DIR, 'json', 'articles_html.json'), encoding='utf-8') as f: | ||
# expected = json.load(f) | ||
|
||
# Assert the parsed articles match the expected structure | ||
#self.assertEqual(self.parser.articles, expected) | ||
|
||
|
||
def test_get_conclusions(self): | ||
self.parser.get_conclusions() | ||
self.assertIsNotNone(self.parser.conclusions, "Conclusions element should not be None") | ||
|
||
# Run the tests | ||
if __name__ == "__main__": | ||
unittest.main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.