diff --git a/tests/parsers/html/test_cellar.py b/tests/parsers/html/test_cellar.py
new file mode 100644
index 0000000..f860c78
--- /dev/null
+++ b/tests/parsers/html/test_cellar.py
@@ -0,0 +1,117 @@
+import unittest
+import os
+from tulit.parsers.html.cellar import CellarHTMLParser
+import json
+
+DATA_DIR = os.path.join(os.path.dirname(__file__), '..', '..', 'data', 'html')
+file_path = os.path.join(DATA_DIR, 'c008bcb6-e7ec-11ee-9ea8-01aa75ed71a1.0006.03', 'DOC_1.html')
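+
+# Illustrative usage: the suite can be run on its own with the standard
+# unittest CLI, e.g.
+#
+#   python -m unittest tests.parsers.html.test_cellar
+#
+# (hypothetical invocation; the exact module path depends on how the test
+# packages are laid out).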
+
+
+class TestCellarHTMLParser(unittest.TestCase):
+    def setUp(self):
+        self.maxDiff = None  # Allow full diff if needed
+        self.parser = CellarHTMLParser()
+
+        # Ensure test file exists
+        if not os.path.exists(file_path):
+            raise FileNotFoundError(f"Test file not found at {file_path}")
+        self.parser.get_root(file_path)
+
+    def test_get_root(self):
+        """Test parsing and root element retrieval from the HTML file."""
+        self.assertTrue(os.path.exists(file_path), f"Test file not found at {file_path}")
+        self.assertIsNotNone(self.parser.root, "Root element should not be None")
+
+    def test_get_body(self):
+        self.parser.get_body()
+        self.assertIsNotNone(self.parser.body, "Body element should not be None")
+
+    def test_get_preface(self):
+        self.parser.get_preface()
+        self.assertIsNotNone(self.parser.preface, "Preface element should not be None")
+
+    def test_get_preamble(self):
+        self.parser.get_preamble()
+        self.assertIsNotNone(self.parser.preamble, "Preamble element should not be None")
+
+    def test_get_formula(self):
+        self.parser.get_preamble()
+        self.parser.get_formula()
+        formula = "THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION,"
+        self.assertEqual(self.parser.formula, formula, "Formula should match expected value")
+
+    def test_get_citations(self):
+        self.parser.get_preamble()
+        self.parser.get_citations()
+        citations = [
+            {
+                "eId": "cit_1",
+                "citation_text": "Having regard to the Treaty on the Functioning of the European Union, and in particular Article 172 thereof,"
+            },
+            {
+                "eId": "cit_2",
+                "citation_text": "Having regard to the proposal from the European Commission,"
+            },
+            {
+                "eId": "cit_3",
+                "citation_text": "After transmission of the draft legislative act to the national parliaments,"
+            },
+            {
+                "eId": "cit_4",
+                "citation_text": "Having regard to the opinion of the European Economic and Social Committee(1),"
+            },
+            {
+                "eId": "cit_5",
+                "citation_text": "Having regard to the opinion of the Committee of the Regions(2),"
+            },
+            {
+                "eId": "cit_6",
+                "citation_text": "Acting in accordance with the ordinary legislative procedure(3),"
+            }
+        ]
+        self.assertEqual(self.parser.citations, citations, "Citations should match expected values")
+
+    def test_get_recitals(self):
+        self.parser.get_preamble()
+        self.parser.get_recitals()
+        self.assertIsNotNone(self.parser.recitals, "Recitals element should not be None")
+
+    def test_get_preamble_final(self):
+        self.parser.get_preamble()
+        self.parser.get_preamble_final()
+        preamble_final = "HAVE ADOPTED THIS REGULATION:"
+        self.assertEqual(self.parser.preamble_final, preamble_final, "Preamble final should match expected value")
+
+    def test_get_chapters(self):
+        self.parser.get_body()
+        self.parser.get_chapters()
+        self.assertIsNotNone(self.parser.chapters, "Chapters element should not be None")
+
+    def test_get_articles(self):
+        """Test parsing articles from an HTML file."""
+        # Parse the body and articles using the parser
+        self.parser.get_body()
+        self.parser.get_articles()
+
+        # Save output file to directory
+        #with open(os.path.join(DATA_DIR, 'json', 'articles_html.json'), 'w+', encoding='utf-8') as f:
+        #    json.dump(self.parser.articles, f)
+        #
+        ## Load the expected structure of parsed articles
+        #with open(os.path.join(DATA_DIR, 'json', 'articles_html.json'), encoding='utf-8') as f:
+        #    expected = json.load(f)
+
+        # Assert the parsed articles match the expected structure
+        #self.assertEqual(self.parser.articles, expected)
+
+    def test_get_conclusions(self):
+        self.parser.get_conclusions()
+        self.assertIsNotNone(self.parser.conclusions, "Conclusions element should not be None")
+
+
+# Run the tests
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/parsers/html/test_html.py b/tests/parsers/html/test_html.py
index a956144..7c93fa2 100644
--- a/tests/parsers/html/test_html.py
+++ b/tests/parsers/html/test_html.py
@@ -21,58 +21,6 @@ def test_get_root(self):
         """Test parsing and root element retrieval from the Akoma Ntoso file."""
         self.assertTrue(os.path.exists(file_path), f"Test file not found at {file_path}")
         self.assertIsNotNone(self.parser.root, "Root element should not be None")
-
-    def test_get_body(self):
-        self.parser.get_body()
-        self.assertIsNotNone(self.parser.body, "Body element should not be None")
-
-    def test_get_metadata(self):
-        pass
-
-    def test_get_preface(self):
-        self.parser.get_preface()
-        self.assertIsNotNone(self.parser.preface, "Preface element should not be None")
-
-    def test_get_preamble(self):
-        self.parser.get_preamble()
-        self.assertIsNotNone(self.parser.preamble, "Preamble element should not be None")
-
-    def test_get_preamble_formula(self):
-        pass
-
-    def test_get_preamble_citations(self):
-        pass
-
-    def test_get_preamble_recitals(self):
-        pass
-
-
-    def test_get_chapters(self):
-        pass
-
-    def test_get_articles(self):
-        """Test parsing articles from an HTML file."""
-        # Parse the body and articles using the parser
-        self.parser.get_body()
-        self.parser.get_articles()
-
-        # Save output file to directory
-        #with open(os.path.join(DATA_DIR, 'json', 'articles_html.json'), 'w+', encoding='utf-8') as f:
-        #    json.dump(self.parser.articles, f)
-
-        # Load the expected structure of parsed articles
-        with open(os.path.join(DATA_DIR, 'json', 'articles_html.json'), encoding='utf-8') as f:
-            expected = json.load(f)
-
-        # Assert the parsed articles match the expected structure
-        self.assertEqual(self.parser.articles, expected)
-
-
-    def test_get_conclusions(self):
-        self.parser.get_conclusions()
-        self.assertIsNotNone(self.parser.conclusions, "Conclusions element should not be None")
-
-
 # Run the tests
 if __name__ == "__main__":
diff --git a/tulit/parsers/html/cellar.py b/tulit/parsers/html/cellar.py
new file mode 100644
index 0000000..43b6ec9
--- /dev/null
+++ b/tulit/parsers/html/cellar.py
@@ -0,0 +1,296 @@
+from tulit.parsers.html.xhtml import HTMLParser
+import json
+
+class CellarHTMLParser(HTMLParser):
+    def __init__(self):
+        super().__init__()
+
+    def get_preface(self):
+        """
+        Extracts the preface text from the HTML, if available.
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        None
+            The extracted preface is stored in the 'preface' attribute.
+        """
+        try:
+            preface_element = self.root.find('div', class_='eli-main-title')
+            if preface_element:
+                self.preface = preface_element.get_text(strip=True)
+                print("Preface extracted successfully.")
+            else:
+                self.preface = None
+                print("No preface found.")
+        except Exception as e:
+            print(f"Error extracting preface: {e}")
+
+
+    def get_preamble(self):
+        """
+        Extracts the preamble text from the HTML, if available.
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        None
+            The extracted preamble is stored in the 'preamble' attribute.
+        """
+        self.preamble = self.root.find('div', class_='eli-subdivision', id='pbl_1')
+
+    def get_formula(self):
+        """
+        Extracts the formula from the HTML, if present.
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        None
+            The extracted formula is stored in the 'formula' attribute.
+        """
+        self.formula = self.preamble.find('p', class_='oj-normal').text
+
+    def get_citations(self):
+        """
+        Extracts citations from the HTML.
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        None
+            The extracted citations are stored in the 'citations' attribute.
+        """
+        citations = self.preamble.find_all('div', class_='eli-subdivision', id=lambda x: x and x.startswith('cit_'))
+        self.citations = []
+        for citation in citations:
+            citation_id = citation.get('id')
+            citation_text = citation.get_text(strip=True)
+            self.citations.append({
+                'eId': citation_id,
+                'citation_text': citation_text
+            })
+
+    def get_recitals(self):
+        """
+        Extracts recitals from the HTML.
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        None
+            The extracted recitals are stored in the 'recitals' attribute.
+        """
+        recitals = self.preamble.find_all('div', class_='eli-subdivision', id=lambda x: x and x.startswith('rct_'))
+        self.recitals = []
+        for recital in recitals:
+            recital_id = recital.get('id')
+            recital_text = recital.get_text(strip=True)
+            self.recitals.append({
+                'eId': recital_id,
+                'recital_text': recital_text
+            })
+
+    def get_preamble_final(self):
+        """
+        Extracts the final preamble text from the HTML, if available.
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        None
+            The extracted final preamble is stored in the 'preamble_final' attribute.
+        """
+        self.preamble_final = self.preamble.find_all('p', class_='oj-normal')[-1].get_text(strip=True)
+
+    def get_body(self):
+        """
+        Extracts the body content from the HTML.
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        None
+            The extracted body content is stored in the 'body' attribute.
+        """
+        self.body = self.root.find('div', id=lambda x: x and x.startswith('enc_'))
+
+    def get_chapters(self):
+        """
+        Extracts chapters from the HTML, grouping them by their IDs and headings.
+        """
+        chapters = self.body.find_all('div', id=lambda x: x and x.startswith('cpt_') and '.' not in x)
+        self.chapters = []
+        for chapter in chapters:
+            chapter_id = chapter.get('id')
+            chapter_num = chapter.find('p', class_="oj-ti-section-1").get_text(strip=True)
+            chapter_title = chapter.find('div', class_="eli-title").get_text(strip=True)
+            self.chapters.append({
+                'eId': chapter_id,
+                'chapter_num': chapter_num,
+                'chapter_heading': chapter_title
+            })
+
+    def get_lists(self, parent_id: str, container):
+        """
+        Parses HTML tables representing lists and generates Akoma Ntoso-style eIds.
+
+        Args:
+            parent_id (str): The eId of the parent element (e.g., article or subdivision).
+            container (BeautifulSoup Tag): The container holding the <table> elements.
+
+        Returns:
+            list[dict]: List of list elements with eIds and corresponding text content.
+        """
+        lists = []
+        list_counter = 0
+
+        # Find all <table> elements within the container
+        tables = container.find_all('table')
+
+        for table in tables:
+            list_counter += 1
+            list_eId = f"{parent_id}__list_{list_counter}"
+
+            # Process each row (<tr>) within the table
+            points = []
+            point_counter = 0
+
+            for row in table.find_all('tr'):
+                cols = row.find_all('td')
+                if len(cols) >= 2:
+                    # Extract point number (e.g., (a)) and content
+                    point_counter += 1
+                    point_eId = f"{list_eId}__point_{point_counter}"
+                    point_num = cols[0].get_text(strip=True)  # First column: point number
+                    point_text = cols[1].get_text(" ", strip=True)  # Second column: point text
+
+                    # Clean text
+                    point_text = self._clean_text(point_text)
+
+                    points.append({
+                        'eId': point_eId,
+                        'num': point_num,
+                        'text': point_text
+                    })
+
+            # Add the list with its points
+            lists.append({
+                'eId': list_eId,
+                'points': points
+            })
+
+        return lists
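+
+    # Illustrative sketch of get_lists() behaviour (hypothetical markup, not
+    # drawn from the test corpus): given a container holding
+    #
+    #   <table><tr><td>(a)</td><td>first point</td></tr>
+    #          <tr><td>(b)</td><td>second point</td></tr></table>
+    #
+    # get_lists('art_1', container) would be expected to return:
+    #
+    #   [{'eId': 'art_1__list_1', 'points': [
+    #       {'eId': 'art_1__list_1__point_1', 'num': '(a)', 'text': 'first point'},
+    #       {'eId': 'art_1__list_1__point_2', 'num': '(b)', 'text': 'second point'}]}]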
+
+    def get_articles(self):
+        """
+        Extracts articles from the HTML. Each <div> with an id starting with "art" is treated as an article (eId).
+        Subsequent subdivisions are processed based on the closest parent with an id.
+
+        Returns:
+            list[dict]: List of articles, each containing its eId and associated content.
+        """
+        articles = self.body.find_all('div', id=lambda x: x and x.startswith('art_') and '.' not in x)
+        self.articles = []
+        for article in articles:
+            eId = article.get('id')  # Treat the id as the eId
+            article_num = article.find('p', class_='oj-ti-art').get_text(strip=True)
+            article_title_element = article.find('p', class_='oj-sti-art')
+            if article_title_element is not None:
+                article_title = article_title_element.get_text(strip=True)
+            else:
+                article_title = None
+
+            # Group <p> tags by their closest parent with an id
+            content_map = {}
+            for p in article.find_all('p', class_='oj-normal'):  # Filter <p> with class 'oj-normal'
+                current_element = p
+                parent_eId = None
+                # Traverse upward to find the closest parent with an id
+                while current_element:
+                    parent_eId = current_element.get('id')
+                    if parent_eId:
+                        break
+                    current_element = current_element.parent
+                if parent_eId:
+                    # Add text from the <p> to the appropriate parent_eId group
+                    if parent_eId not in content_map:
+                        content_map[parent_eId] = []
+                    content_map[parent_eId].append(p.get_text(strip=True))
+
+            # Combine grouped content into structured output
+            subdivisions = []
+            for sub_eId, texts in content_map.items():
+                subdivisions.append({
+                    'eId': sub_eId,
+                    'text': ' '.join(texts)  # Combine all <p> texts for the subdivision
+                })
+
+            # Store the article with its eId and subdivisions
+            self.articles.append({
+                'eId': eId,
+                'article_num': article_num,
+                'article_title': article_title,
+                'article_text': subdivisions
+            })
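+
+    # For illustration, a parsed article is expected to take this shape
+    # (hypothetical eIds and text; the real values come from the document):
+    #
+    #   {'eId': 'art_1', 'article_num': 'Article 1', 'article_title': '...',
+    #    'article_text': [{'eId': '<closest parent id>', 'text': '...'}]}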
+
+    def get_conclusions(self):
+        """
+        Extracts conclusions from the HTML, if present.
+        """
+        conclusions_element = self.root.find('div', class_='oj-final')
+        self.conclusions = conclusions_element.get_text(strip=True)
+
+    def parse(self, file):
+        return super().parse(file)
+
+
+def main():
+    parser = CellarHTMLParser()
+    file_to_parse = 'tests/data/html/c008bcb6-e7ec-11ee-9ea8-01aa75ed71a1.0006.03/DOC_1.html'
+
+    output_file = 'tests/data/json/iopa_html.json'
+
+    parser.parse(file_to_parse)
+
+    with open(output_file, 'w', encoding='utf-8') as f:
+        # Get the parser's attributes as a dictionary
+        parser_dict = parser.__dict__
+
+        # Filter out non-serializable attributes
+        serializable_dict = {k: v for k, v in parser_dict.items() if isinstance(v, (str, int, float, bool, list, dict, type(None)))}
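+
+        # Note: attributes holding BeautifulSoup objects (e.g. parser.root,
+        # parser.body, parser.preamble) fail the isinstance check above and are
+        # deliberately dropped; plain values such as parser.citations or
+        # parser.articles pass through to the JSON output.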
+
+        # Write to a JSON file
+        json.dump(serializable_dict, f, ensure_ascii=False, indent=4)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/tulit/parsers/html/xhtml.py b/tulit/parsers/html/xhtml.py
index b95d2e3..8a12f97 100644
--- a/tulit/parsers/html/xhtml.py
+++ b/tulit/parsers/html/xhtml.py
@@ -8,8 +8,6 @@ def __init__(self):
         Initializes the HTML parser and sets up the BeautifulSoup instance.
         """
         super().__init__()
-        self.root = None
-        self.valid = True
 
     def get_root(self, file):
         """
@@ -32,335 +30,84 @@ def get_root(self, file):
             print("HTML loaded successfully.")
         except Exception as e:
             print(f"Error loading HTML: {e}")
+
-    def get_metadata(self):
+    def parse(self, file: str) -> Parser:
         """
-        Extracts metadata from the HTML.
+        Parses an HTML file and extracts the preface, preamble, formula, citations, recitals, preamble final, body, chapters, articles, and conclusions.
 
         Parameters
         ----------
-        None
+        file : str
+            Path to the HTML file to parse.
 
         Returns
         -------
-        None
-            The extracted metadata is stored in the 'meta' attribute.
+        Parser
+            The parser object with the parsed elements stored in the attributes.
         """
+
         try:
-            meta_elements = self.root.find_all('meta')
-            for meta in meta_elements:
-                name = meta.get('name')
-                content = meta.get('content')
-                if name and content:
-                    self.meta[name] = content
-            print(f"Metadata extracted: {len(self.meta)} entries.")
+            self.get_root(file)
+            print("Root element loaded successfully.")
         except Exception as e:
-            print(f"Error extracting metadata: {e}")
-
-    def get_preface(self):
-        """
-        Extracts the preface text from the HTML, if available.
-
-        Parameters
-        ----------
-        None
-
-        Returns
-        -------
-        None
-            The extracted preface is stored in the 'preface' attribute.
-        """
+            print(f"Error in get_root: {e}")
+
         try:
-            preface_element = self.root.find('div', class_='eli-main-title')
-            if preface_element:
-                self.preface = preface_element.get_text(strip=True)
-                print("Preface extracted successfully.")
-            else:
-                self.preface = None
-                print("No preface found.")
+            self.get_preface()
+            print(f"Preface parsed successfully. Preface: {self.preface}")
         except Exception as e:
-            print(f"Error extracting preface: {e}")
-
-
-    def get_preamble(self):
-        """
-        Extracts the preamble text from the HTML, if available.
-
-        Parameters
-        ----------
-        None
+            print(f"Error in get_preface: {e}")
 
-        Returns
-        -------
-        None
-            The extracted preamble is stored in the 'preamble' attribute.
-        """
-
-        self.preamble = self.root.find('div', class_='eli-subdivision', id='pbl_1')
-        if self.preamble:
+        try:
+            self.get_preamble()
+            print("Preamble element found.")
+        except Exception as e:
+            print(f"Error in get_preamble: {e}")
+        try:
+            self.get_formula()
+            print("Formula parsed successfully.")
+        except Exception as e:
+            print(f"Error in get_formula: {e}")
+        try:
             self.get_citations()
+            print(f"Citations parsed successfully. Number of citations: {len(self.citations)}")
+        except Exception as e:
+            print(f"Error in get_citations: {e}")
+        try:
             self.get_recitals()
-            print("Preamble extracted successfully.")
-        else:
-            self.preamble = None
-            print("No preamble found.")
-
-
-    def get_citations(self):
-        """
-        Extracts citations from the HTML.
-
-        Parameters
-        ----------
-        None
-
-        Returns
-        -------
-        None
-            The extracted citations are stored in the 'citations' attribute
-        """
-        citations = self.preamble.find_all('div', class_='eli-subdivision', id=lambda x: x and x.startswith('cit_'))
-        self.citations = []
-        for citation in citations:
-            citation_id = citation.get('id')
-            citation_text = citation.get_text(strip=True)
-            self.citations.append({
-                'eId' : citation_id,
-                'citation_text' : citation_text
-                }
-            )
-        print(f"Citations extracted: {len(self.citations)}")
-
-    def get_recitals(self):
-        """
-        Extracts recitals from the HTML.
-
-        Parameters
-        ----------
-        None
-
-        Returns
-        -------
-        None
-            The extracted recitals are stored in the 'recitals' attribute.
-        """
-        recitals = self.preamble.find_all('div', class_='eli-subdivision', id=lambda x: x and x.startswith('rct_'))
-        self.recitals = []
-        for recital in recitals:
-            recital_id = recital.get('id')
-            recital_text = recital.get_text(strip=True)
-            self.recitals.append({
-                'eId' : recital_id,
-                'recital_text' : recital_text
-                }
-            )
-        print(f"Recitals extracted: {len(self.recitals)}")
-
-    def get_body(self):
-        """
-        Extracts the body content from the HTML.
+            print(f"Recitals parsed successfully. Number of recitals: {len(self.recitals)}")
+        except Exception as e:
+            print(f"Error in get_recitals: {e}")
 
-        Parameters
-        ----------
-        None
+        try:
+            self.get_preamble_final()
+            print("Preamble final parsed successfully.")
+        except Exception as e:
+            print(f"Error in get_preamble_final: {e}")
 
-        Returns
-        -------
-        None
-            The extracted body content is stored in the 'body' attribute
-        """
         try:
-            body_element = self.root.find('div', id=lambda x: x and x.startswith('enc_'))
-            if body_element:
-                self.body = body_element
-                print("Body extracted successfully.")
-            else:
-                self.body = None
-                print("No body found.")
+            self.get_body()
+            print("Body element found.")
         except Exception as e:
-            print(f"Error extracting body: {e}")
-
-    def get_chapters(self):
-        """
-        Extracts chapters from the HTML, grouping them by their IDs and headings.
-        """
+            print(f"Error in get_body: {e}")
         try:
-            chapters = self.body.find_all('div', id=lambda x: x and x.startswith('cpt_') and '.' not in x)
-            self.chapters = []
-            for chapter in chapters:
-                chapter_id = chapter.get('id')
-                chapter_num = chapter.find('p', class_="oj-ti-section-1").get_text(strip=True)
-                chapter_title = chapter.find('div', class_="eli-title").get_text(strip=True)
-                self.chapters.append({
-                    'eId': chapter_id,
-                    'chapter_num': chapter_num,
-                    'chapter_heading': chapter_title
-                })
-            print(f"Chapters extracted: {len(self.chapters)}")
+            self.get_chapters()
+            print(f"Chapters parsed successfully. Number of chapters: {len(self.chapters)}")
         except Exception as e:
-            print(f"Error extracting chapters: {e}")
-
-    def get_lists(self, parent_id: str, container):
-        """
-        Parses HTML tables representing lists and generates Akoma Ntoso-style eIds.
-
-        Args:
-            parent_id (str): The eId of the parent element (e.g., article or subdivision).
-            container (BeautifulSoup Tag): The container holding the <table> elements.
-
-        Returns:
-            list[dict]: List of list elements with eIds and corresponding text content.
-        """
-        lists = []
-        list_counter = 0
-
-        # Find all <table> elements within the container
-        tables = container.find_all('table')
-
-        for table in tables:
-            list_counter += 1
-            list_eId = f"{parent_id}__list_{list_counter}"
-
-            # Process each row (<tr>) within the table
-            points = []
-            point_counter = 0
-
-            for row in table.find_all('tr'):
-                cols = row.find_all('td')
-                if len(cols) >= 2:
-                    # Extract point number (e.g., (a)) and content
-                    point_counter += 1
-                    point_eId = f"{list_eId}__point_{point_counter}"
-                    point_num = cols[0].get_text(strip=True) # First column: point number
-                    point_text = cols[1].get_text(" ", strip=True) # Second column: point text
-
-                    # Clean text
-                    point_text = self._clean_text(point_text)
-
-                    points.append({
-                        'eId': point_eId,
-                        'num': point_num,
-                        'text': point_text
-                    })
-
-            # Add the list with its points
-            lists.append({
-                'eId': list_eId,
-                'points': points
-            })
-
-        return lists
-
-
-    def get_articles(self):
-        """
-        Extracts articles from the HTML. Each <div> with an id starting with "art" is treated as an article (eId).
-        Subsequent subdivisions are processed based on the closest parent with an id.
-
-        Returns:
-            list[dict]: List of articles, each containing its eId and associated content.
-        """
+            print(f"Error in get_chapters: {e}")
         try:
-            articles = self.body.find_all('div', id=lambda x: x and x.startswith('art_') and '.' not in x)
-            self.articles = []
-
-            for article in articles:
-                eId = article.get('id') # Treat the id as the eId
-                article_num = article.find('p', class_='oj-ti-art').get_text(strip=True)
-                article_title_element = article.find('p', class_='oj-sti-art')
-                if article_title_element is not None:
-                    article_title = article_title_element.get_text(strip=True)
-                else:
-                    article_title = None
-
-                # Group <p> tags by their closest parent with an id
-                content_map = {}
-                for p in article.find_all('p', class_='oj-normal'): # Filter <p> with class 'oj-normal'
-                    current_element = p
-                    parent_eId = None
-
-                    # Traverse upward to find the closest parent with an id
-                    while current_element:
-                        parent_eId = current_element.get('id')
-                        if parent_eId:
-                            break
-                        current_element = current_element.parent
-
-                    if parent_eId:
-                        # Add text from the <p> to the appropriate parent_eId group
-                        if parent_eId not in content_map:
-                            content_map[parent_eId] = []
-                        content_map[parent_eId].append(p.get_text(strip=True))
-
-                # Combine grouped content into structured output
-                subdivisions = []
-                for sub_eId, texts in content_map.items():
-                    subdivisions.append({
-                        'eId': sub_eId,
-                        'text': ' '.join(texts) # Combine all <p> texts for the subdivision
-                    })
-
-                # Store the article with its eId and subdivisions
-                self.articles.append({
-                    'eId': eId,
-                    'article_num': article_num,
-                    'article_title': article_title,
-                    'article_text': subdivisions
-                })
-
-            print(f"Articles extracted: {len(self.articles)}")
+            self.get_articles()
+            print(f"Articles parsed successfully. Number of articles: {len(self.articles)}")
+            print(f"Total number of children in articles: {sum([len(list(article)) for article in self.articles])}")
+        except Exception as e:
-            print(f"Error extracting articles: {e}")
-
-
-    def get_conclusions(self):
-        """
-        Extracts conclusions from the HTML, if present.
-        """
+            print(f"Error in get_articles: {e}")
         try:
-            conclusions_element = self.root.find('div', class_='oj-final')
-            if conclusions_element:
-                self.conclusions = conclusions_element.get_text(strip=True)
-                print("Conclusions extracted successfully.")
-            else:
-                self.conclusions = None
-                print("No conclusions found.")
+            self.get_conclusions()
+            print("Conclusions parsed successfully.")
         except Exception as e:
-            print(f"Error extracting conclusions: {e}")
-
-    def parse(self, file: str):
-        """
-        Parses an HTML file and extracts all relevant sections.
-        """
-        self.get_root(file)
-        #self.get_meta()
-        self.get_preface()
-        self.get_preamble()
-        self.get_body()
-        self.get_chapters()
-        self.get_articles()
-        self.get_conclusions()
-
-
-def main():
-    parser = HTMLParser()
-    file_to_parse = 'tests/data/html/c008bcb6-e7ec-11ee-9ea8-01aa75ed71a1.0006.03/DOC_1.html'
-
-    output_file = 'tests/data/json/iopa_html.json'
-
-
-    parser.parse(file_to_parse)
-
-    with open(output_file, 'w', encoding='utf-8') as f:
-        # Get the parser's attributes as a dictionary
-        parser_dict = parser.__dict__
-
-        # Filter out non-serializable attributes
-        serializable_dict = {k: v for k, v in parser_dict.items() if isinstance(v, (str, int, float, bool, list, dict, type(None)))}
-
-        # Write to a JSON file
-        json.dump(serializable_dict, f, ensure_ascii=False, indent=4)
-
-if __name__ == "__main__":
-    main()
-
+            print(f"Error in get_conclusions: {e}")
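+
+        # Each stage above runs in its own try/except so that a failure in one
+        # section (e.g. a document without a preamble) does not abort the rest
+        # of the parse; the partially populated parser is still returned.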
+
+        return self
\ No newline at end of file
diff --git a/tulit/parsers/xml/xml.py b/tulit/parsers/xml/xml.py
index ab5e493..d390aef 100644
--- a/tulit/parsers/xml/xml.py
+++ b/tulit/parsers/xml/xml.py
@@ -411,7 +411,7 @@ def get_conclusions(self):
         pass
 
-    def parse(self, file: str, schema, format) -> None:
+    def parse(self, file: str, schema, format) -> Parser:
         """
         Parses an XML file and extracts relevant sections based on the format.