diff --git a/tests/parsers/html/test_cellar.py b/tests/parsers/html/test_cellar.py
new file mode 100644
index 0000000..f860c78
--- /dev/null
+++ b/tests/parsers/html/test_cellar.py
@@ -0,0 +1,117 @@
+import unittest
+import os
+from tulit.parsers.html.cellar import CellarHTMLParser
+import json
+
+DATA_DIR = os.path.join(os.path.dirname(__file__), "..", "..", "data", "html")
+file_path = os.path.join(DATA_DIR, "c008bcb6-e7ec-11ee-9ea8-01aa75ed71a1.0006.03", "DOC_1.html")
+
+
+class TestCellarHTMLParser(unittest.TestCase):
+ def setUp(self):
+ self.maxDiff = None # Allow full diff if needed
+ self.parser = CellarHTMLParser()
+
+ # Ensure test file exists
+ if not os.path.exists(file_path):
+ raise FileNotFoundError(f"Test file not found at {file_path}")
+ self.parser.get_root(file_path)
+
+ def test_get_root(self):
+ """Test parsing and root element retrieval from the Akoma Ntoso file."""
+ self.assertTrue(os.path.exists(file_path), f"Test file not found at {file_path}")
+ self.assertIsNotNone(self.parser.root, "Root element should not be None")
+
+ def test_get_body(self):
+ self.parser.get_body()
+ self.assertIsNotNone(self.parser.body, "Body element should not be None")
+
+ def test_get_preface(self):
+ self.parser.get_preface()
+ self.assertIsNotNone(self.parser.preface, "Preface element should not be None")
+
+ def test_get_preamble(self):
+ self.parser.get_preamble()
+ self.assertIsNotNone(self.parser.preamble, "Preamble element should not be None")
+
+ def test_get_formula(self):
+ self.parser.get_preamble()
+ self.parser.get_formula()
+ formula = "THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION,"
+ self.assertEqual(self.parser.formula, formula, "Formula should match expected value")
+
+ def test_get_citations(self):
+ self.parser.get_preamble()
+ self.parser.get_citations()
+ citations = [
+ {
+ "eId": "cit_1",
+ "citation_text": "Having regard to the Treaty on the Functioning of the European Union, and in particular Article 172 thereof,"
+ },
+ {
+ "eId": "cit_2",
+ "citation_text": "Having regard to the proposal from the European Commission,"
+ },
+ {
+ "eId": "cit_3",
+ "citation_text": "After transmission of the draft legislative act to the national parliaments,"
+ },
+ {
+ "eId": "cit_4",
+ "citation_text": "Having regard to the opinion of the European Economic and Social Committee(1),"
+ },
+ {
+ "eId": "cit_5",
+ "citation_text": "Having regard to the opinion of the Committee of the Regions(2),"
+ },
+ {
+ "eId": "cit_6",
+ "citation_text": "Acting in accordance with the ordinary legislative procedure(3),"
+ }
+ ]
+ self.assertEqual(self.parser.citations, citations, "Citations should match expected values")
+
+ def test_get_recitals(self):
+ self.parser.get_preamble()
+ self.parser.get_recitals()
+
+ self.assertIsNotNone(self.parser.recitals, "Recitals element should not be None")
+
+
+ def test_get_preamble_final(self):
+ self.parser.get_preamble()
+ self.parser.get_preamble_final()
+ preamble_final = "HAVE ADOPTED THIS REGULATION:"
+ self.assertEqual(self.parser.preamble_final, preamble_final, "Preamble final should match expected value")
+
+ def test_get_chapters(self):
+ self.parser.get_body()
+ self.parser.get_chapters()
+ self.assertIsNotNone(self.parser.chapters, "Chapters element should not be None")
+
+
+ def test_get_articles(self):
+ """Test parsing articles from an HTML file."""
+ # Parse the body and articles using the parser
+ self.parser.get_body()
+ self.parser.get_articles()
+
+        self.assertIsNotNone(self.parser.articles, "Articles element should not be None")
+
+        # To regenerate the expected fixture, dump the parsed articles:
+        # with open(os.path.join(DATA_DIR, 'json', 'articles_html.json'), 'w+', encoding='utf-8') as f:
+        #     json.dump(self.parser.articles, f)
+
+        # Once the fixture is committed, compare against it:
+        # with open(os.path.join(DATA_DIR, 'json', 'articles_html.json'), encoding='utf-8') as f:
+        #     expected = json.load(f)
+        # self.assertEqual(self.parser.articles, expected)
+
+
+ def test_get_conclusions(self):
+ self.parser.get_conclusions()
+ self.assertIsNotNone(self.parser.conclusions, "Conclusions element should not be None")
+
+# Run the tests
+if __name__ == "__main__":
+ unittest.main()
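A minimal usage sketch of the parser these tests exercise, assuming the fixture
above is present locally; get_root, get_preamble and get_citations are the
methods introduced in tulit/parsers/html/cellar.py below:

    from tulit.parsers.html.cellar import CellarHTMLParser

    # Hedged sketch: load the fixture and inspect the extracted citations.
    parser = CellarHTMLParser()
    parser.get_root("tests/data/html/c008bcb6-e7ec-11ee-9ea8-01aa75ed71a1.0006.03/DOC_1.html")
    parser.get_preamble()
    parser.get_citations()
    for citation in parser.citations:
        print(citation["eId"], citation["citation_text"])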
diff --git a/tests/parsers/html/test_html.py b/tests/parsers/html/test_html.py
index a956144..7c93fa2 100644
--- a/tests/parsers/html/test_html.py
+++ b/tests/parsers/html/test_html.py
@@ -21,58 +21,6 @@ def test_get_root(self):
"""Test parsing and root element retrieval from the Akoma Ntoso file."""
self.assertTrue(os.path.exists(file_path), f"Test file not found at {file_path}")
self.assertIsNotNone(self.parser.root, "Root element should not be None")
-
- def test_get_body(self):
- self.parser.get_body()
- self.assertIsNotNone(self.parser.body, "Body element should not be None")
-
- def test_get_metadata(self):
- pass
-
- def test_get_preface(self):
- self.parser.get_preface()
- self.assertIsNotNone(self.parser.preface, "Preface element should not be None")
-
- def test_get_preamble(self):
- self.parser.get_preamble()
- self.assertIsNotNone(self.parser.preamble, "Preamble element should not be None")
-
- def test_get_preamble_formula(self):
- pass
-
- def test_get_preamble_citations(self):
- pass
-
- def test_get_preamble_recitals(self):
- pass
-
-
- def test_get_chapters(self):
- pass
-
- def test_get_articles(self):
- """Test parsing articles from an HTML file."""
- # Parse the body and articles using the parser
- self.parser.get_body()
- self.parser.get_articles()
-
- # Save output file to directory
- #with open(os.path.join(DATA_DIR, 'json', 'articles_html.json'), 'w+', encoding='utf-8') as f:
- # json.dump(self.parser.articles, f)
-
- # Load the expected structure of parsed articles
- with open(os.path.join(DATA_DIR, 'json', 'articles_html.json'), encoding='utf-8') as f:
- expected = json.load(f)
-
- # Assert the parsed articles match the expected structure
- self.assertEqual(self.parser.articles, expected)
-
-
- def test_get_conclusions(self):
- self.parser.get_conclusions()
- self.assertIsNotNone(self.parser.conclusions, "Conclusions element should not be None")
-
-
# Run the tests
if __name__ == "__main__":
diff --git a/tulit/parsers/html/cellar.py b/tulit/parsers/html/cellar.py
new file mode 100644
index 0000000..43b6ec9
--- /dev/null
+++ b/tulit/parsers/html/cellar.py
@@ -0,0 +1,296 @@
+from tulit.parsers.html.xhtml import HTMLParser
+import json
+
+class CellarHTMLParser(HTMLParser):
+ def __init__(self):
+        super().__init__()
+
+ def get_preface(self):
+ """
+ Extracts the preface text from the HTML, if available.
+
+ Parameters
+ ----------
+ None
+
+ Returns
+ -------
+ None
+ The extracted preface is stored in the 'preface' attribute.
+ """
+ try:
+ preface_element = self.root.find('div', class_='eli-main-title')
+ if preface_element:
+ self.preface = preface_element.get_text(strip=True)
+ print("Preface extracted successfully.")
+ else:
+ self.preface = None
+ print("No preface found.")
+ except Exception as e:
+ print(f"Error extracting preface: {e}")
+
+
+ def get_preamble(self):
+ """
+ Extracts the preamble text from the HTML, if available.
+
+ Parameters
+ ----------
+ None
+
+ Returns
+ -------
+ None
+ The extracted preamble is stored in the 'preamble' attribute.
+ """
+
+ self.preamble = self.root.find('div', class_='eli-subdivision', id='pbl_1')
+
+ def get_formula(self):
+ """
+ Extracts the formula from the HTML, if present.
+
+ Parameters
+ ----------
+ None
+
+ Returns
+ -------
+ None
+ The extracted formula is stored in the 'formula' attribute.
+ """
+ self.formula = self.preamble.find('p', class_='oj-normal').text
+
+ def get_citations(self):
+ """
+ Extracts citations from the HTML.
+
+ Parameters
+ ----------
+ None
+
+ Returns
+ -------
+ None
+ The extracted citations are stored in the 'citations' attribute
+ """
+ citations = self.preamble.find_all('div', class_='eli-subdivision', id=lambda x: x and x.startswith('cit_'))
+ self.citations = []
+ for citation in citations:
+ citation_id = citation.get('id')
+ citation_text = citation.get_text(strip=True)
+ self.citations.append({
+ 'eId' : citation_id,
+ 'citation_text' : citation_text
+ }
+ )
+
+ def get_recitals(self):
+ """
+ Extracts recitals from the HTML.
+
+ Parameters
+ ----------
+ None
+
+ Returns
+ -------
+ None
+ The extracted recitals are stored in the 'recitals' attribute.
+ """
+ recitals = self.preamble.find_all('div', class_='eli-subdivision', id=lambda x: x and x.startswith('rct_'))
+ self.recitals = []
+ for recital in recitals:
+ recital_id = recital.get('id')
+ recital_text = recital.get_text(strip=True)
+ self.recitals.append({
+ 'eId' : recital_id,
+ 'recital_text' : recital_text
+ }
+ )
+
+    def get_preamble_final(self):
+ """
+ Extracts the final preamble text from the HTML, if available.
+
+ Parameters
+ ----------
+ None
+
+ Returns
+ -------
+ None
+ The extracted final preamble is stored in the 'preamble_final' attribute.
+ """
+ self.preamble_final = self.preamble.find_all('p', class_='oj-normal')[-1].get_text(strip=True)
+
+ def get_body(self):
+ """
+ Extracts the body content from the HTML.
+
+ Parameters
+ ----------
+ None
+
+ Returns
+ -------
+ None
+ The extracted body content is stored in the 'body' attribute
+ """
+
+ self.body = self.root.find('div', id=lambda x: x and x.startswith('enc_'))
+
+ def get_chapters(self):
+ """
+ Extracts chapters from the HTML, grouping them by their IDs and headings.
+ """
+
+ chapters = self.body.find_all('div', id=lambda x: x and x.startswith('cpt_') and '.' not in x)
+ self.chapters = []
+ for chapter in chapters:
+ chapter_id = chapter.get('id')
+ chapter_num = chapter.find('p', class_="oj-ti-section-1").get_text(strip=True)
+ chapter_title = chapter.find('div', class_="eli-title").get_text(strip=True)
+ self.chapters.append({
+ 'eId': chapter_id,
+ 'chapter_num': chapter_num,
+ 'chapter_heading': chapter_title
+ })
+
+
+ def get_lists(self, parent_id: str, container):
+ """
+ Parses HTML tables representing lists and generates Akoma Ntoso-style eIds.
+
+ Args:
+ parent_id (str): The eId of the parent element (e.g., article or subdivision).
+            container (BeautifulSoup Tag): The container holding the <table> elements.
+
+ Returns:
+ list[dict]: List of list elements with eIds and corresponding text content.
+ """
+ lists = []
+ list_counter = 0
+
+        # Find all <table> elements within the container
+ tables = container.find_all('table')
+
+ for table in tables:
+ list_counter += 1
+ list_eId = f"{parent_id}__list_{list_counter}"
+
+            # Process each row (<tr>) within the table
+ points = []
+ point_counter = 0
+
+ for row in table.find_all('tr'):
+ cols = row.find_all('td')
+ if len(cols) >= 2:
+ # Extract point number (e.g., (a)) and content
+ point_counter += 1
+ point_eId = f"{list_eId}__point_{point_counter}"
+ point_num = cols[0].get_text(strip=True) # First column: point number
+ point_text = cols[1].get_text(" ", strip=True) # Second column: point text
+
+ # Clean text
+ point_text = self._clean_text(point_text)
+
+ points.append({
+ 'eId': point_eId,
+ 'num': point_num,
+ 'text': point_text
+ })
+
+ # Add the list with its points
+ lists.append({
+ 'eId': list_eId,
+ 'points': points
+ })
+
+ return lists
+
+
+ def get_articles(self):
+ """
+        Extracts articles from the HTML. Each <div> with an id starting with "art_" is treated as an article (eId).
+ Subsequent subdivisions are processed based on the closest parent with an id.
+
+ Returns:
+ list[dict]: List of articles, each containing its eId and associated content.
+ """
+
+ articles = self.body.find_all('div', id=lambda x: x and x.startswith('art_') and '.' not in x)
+ self.articles = []
+ for article in articles:
+ eId = article.get('id') # Treat the id as the eId
+ article_num = article.find('p', class_='oj-ti-art').get_text(strip=True)
+ article_title_element = article.find('p', class_='oj-sti-art')
+ if article_title_element is not None:
+ article_title = article_title_element.get_text(strip=True)
+ else:
+ article_title = None
+            # Group <p> tags by their closest parent with an id
+ content_map = {}
+            for p in article.find_all('p', class_='oj-normal'): # Filter <p> tags with class 'oj-normal'
+ current_element = p
+ parent_eId = None
+ # Traverse upward to find the closest parent with an id
+ while current_element:
+ parent_eId = current_element.get('id')
+ if parent_eId:
+ break
+ current_element = current_element.parent
+ if parent_eId:
+                    # Add text from the <p> to the appropriate parent_eId group
+ if parent_eId not in content_map:
+ content_map[parent_eId] = []
+ content_map[parent_eId].append(p.get_text(strip=True))
+ # Combine grouped content into structured output
+ subdivisions = []
+ for sub_eId, texts in content_map.items():
+ subdivisions.append({
+ 'eId': sub_eId,
+                    'text': ' '.join(texts) # Combine all <p> texts for the subdivision
+ })
+ # Store the article with its eId and subdivisions
+ self.articles.append({
+ 'eId': eId,
+ 'article_num': article_num,
+ 'article_title': article_title,
+ 'article_text': subdivisions
+ })
+
+
+ def get_conclusions(self):
+ """
+ Extracts conclusions from the HTML, if present.
+ """
+ conclusions_element = self.root.find('div', class_='oj-final')
+        self.conclusions = conclusions_element.get_text(strip=True) if conclusions_element else None
+
+ def parse(self, file):
+ return super().parse(file)
+
+
+def main():
+ parser = CellarHTMLParser()
+ file_to_parse = 'tests/data/html/c008bcb6-e7ec-11ee-9ea8-01aa75ed71a1.0006.03/DOC_1.html'
+
+ output_file = 'tests/data/json/iopa_html.json'
+
+
+ parser.parse(file_to_parse)
+
+ with open(output_file, 'w', encoding='utf-8') as f:
+ # Get the parser's attributes as a dictionary
+ parser_dict = parser.__dict__
+
+ # Filter out non-serializable attributes
+ serializable_dict = {k: v for k, v in parser_dict.items() if isinstance(v, (str, int, float, bool, list, dict, type(None)))}
+
+ # Write to a JSON file
+ json.dump(serializable_dict, f, ensure_ascii=False, indent=4)
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
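The eIds that get_lists generates compose hierarchically from the parent
element's id. An illustration of the scheme, using a hypothetical stand-in for
the Official Journal table markup (expected output shown in comments, modulo
whatever normalisation _clean_text applies):

    from bs4 import BeautifulSoup

    html = """
    <div id="art_2">
      <table>
        <tr><td>(a)</td><td>first point</td></tr>
        <tr><td>(b)</td><td>second point</td></tr>
      </table>
    </div>
    """
    container = BeautifulSoup(html, "html.parser").find("div", id="art_2")
    # get_lists("art_2", container) would return:
    # [{'eId': 'art_2__list_1',
    #   'points': [{'eId': 'art_2__list_1__point_1', 'num': '(a)', 'text': 'first point'},
    #              {'eId': 'art_2__list_1__point_2', 'num': '(b)', 'text': 'second point'}]}]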
diff --git a/tulit/parsers/html/xhtml.py b/tulit/parsers/html/xhtml.py
index b95d2e3..8a12f97 100644
--- a/tulit/parsers/html/xhtml.py
+++ b/tulit/parsers/html/xhtml.py
@@ -8,8 +8,6 @@ def __init__(self):
Initializes the HTML parser and sets up the BeautifulSoup instance.
"""
super().__init__()
- self.root = None
- self.valid = True
def get_root(self, file):
"""
@@ -32,335 +30,84 @@ def get_root(self, file):
print("HTML loaded successfully.")
except Exception as e:
print(f"Error loading HTML: {e}")
+
- def get_metadata(self):
+ def parse(self, file: str) -> Parser:
"""
- Extracts metadata from the HTML.
+ Parses an HTML file and extracts the preface, preamble, formula, citations, recitals, preamble final, body, chapters, articles, and conclusions.
Parameters
----------
- None
+ file : str
+            Path to the HTML file to parse.
Returns
-------
- None
- The extracted metadata is stored in the 'meta' attribute.
+ Parser
+ The parser object with the parsed elements stored in the attributes.
"""
+
try:
- meta_elements = self.root.find_all('meta')
- for meta in meta_elements:
- name = meta.get('name')
- content = meta.get('content')
- if name and content:
- self.meta[name] = content
- print(f"Metadata extracted: {len(self.meta)} entries.")
+ self.get_root(file)
+ print("Root element loaded successfully.")
except Exception as e:
- print(f"Error extracting metadata: {e}")
-
- def get_preface(self):
- """
- Extracts the preface text from the HTML, if available.
-
- Parameters
- ----------
- None
-
- Returns
- -------
- None
- The extracted preface is stored in the 'preface' attribute.
- """
+ print(f"Error in get_root: {e}")
+
try:
- preface_element = self.root.find('div', class_='eli-main-title')
- if preface_element:
- self.preface = preface_element.get_text(strip=True)
- print("Preface extracted successfully.")
- else:
- self.preface = None
- print("No preface found.")
+ self.get_preface()
+ print(f"Preface parsed successfully. Preface: {self.preface}")
except Exception as e:
- print(f"Error extracting preface: {e}")
-
-
- def get_preamble(self):
- """
- Extracts the preamble text from the HTML, if available.
-
- Parameters
- ----------
- None
+ print(f"Error in get_preface: {e}")
- Returns
- -------
- None
- The extracted preamble is stored in the 'preamble' attribute.
- """
-
- self.preamble = self.root.find('div', class_='eli-subdivision', id='pbl_1')
- if self.preamble:
+ try:
+ self.get_preamble()
+ print(f"Preamble element found.")
+ except Exception as e:
+ print(f"Error in get_preamble: {e}")
+ try:
+ self.get_formula()
+ print(f"Formula parsed successfully.")
+ except Exception as e:
+ print(f"Error in get_formula: {e}")
+ try:
self.get_citations()
+ print(f"Citations parsed successfully. Number of citations: {len(self.citations)}")
+ except Exception as e:
+ print(f"Error in get_citations: {e}")
+ try:
self.get_recitals()
- print("Preamble extracted successfully.")
- else:
- self.preamble = None
- print("No preamble found.")
-
-
- def get_citations(self):
- """
- Extracts citations from the HTML.
-
- Parameters
- ----------
- None
-
- Returns
- -------
- None
- The extracted citations are stored in the 'citations' attribute
- """
- citations = self.preamble.find_all('div', class_='eli-subdivision', id=lambda x: x and x.startswith('cit_'))
- self.citations = []
- for citation in citations:
- citation_id = citation.get('id')
- citation_text = citation.get_text(strip=True)
- self.citations.append({
- 'eId' : citation_id,
- 'citation_text' : citation_text
- }
- )
- print(f"Citations extracted: {len(self.citations)}")
-
- def get_recitals(self):
- """
- Extracts recitals from the HTML.
-
- Parameters
- ----------
- None
-
- Returns
- -------
- None
- The extracted recitals are stored in the 'recitals' attribute.
- """
- recitals = self.preamble.find_all('div', class_='eli-subdivision', id=lambda x: x and x.startswith('rct_'))
- self.recitals = []
- for recital in recitals:
- recital_id = recital.get('id')
- recital_text = recital.get_text(strip=True)
- self.recitals.append({
- 'eId' : recital_id,
- 'recital_text' : recital_text
- }
- )
- print(f"Recitals extracted: {len(self.recitals)}")
-
- def get_body(self):
- """
- Extracts the body content from the HTML.
+ print(f"Recitals parsed successfully. Number of recitals: {len(self.recitals)}")
+ except Exception as e:
+ print(f"Error in get_recitals: {e}")
- Parameters
- ----------
- None
+ try:
+ self.get_preamble_final()
+ print(f"Preamble final parsed successfully.")
+ except Exception as e:
+ print(f"Error in get_preamble_final: {e}")
- Returns
- -------
- None
- The extracted body content is stored in the 'body' attribute
- """
try:
- body_element = self.root.find('div', id=lambda x: x and x.startswith('enc_'))
- if body_element:
- self.body = body_element
- print("Body extracted successfully.")
- else:
- self.body = None
- print("No body found.")
+ self.get_body()
+ print("Body element found.")
except Exception as e:
- print(f"Error extracting body: {e}")
-
- def get_chapters(self):
- """
- Extracts chapters from the HTML, grouping them by their IDs and headings.
- """
+ print(f"Error in get_body: {e}")
try:
- chapters = self.body.find_all('div', id=lambda x: x and x.startswith('cpt_') and '.' not in x)
- self.chapters = []
- for chapter in chapters:
- chapter_id = chapter.get('id')
- chapter_num = chapter.find('p', class_="oj-ti-section-1").get_text(strip=True)
- chapter_title = chapter.find('div', class_="eli-title").get_text(strip=True)
- self.chapters.append({
- 'eId': chapter_id,
- 'chapter_num': chapter_num,
- 'chapter_heading': chapter_title
- })
- print(f"Chapters extracted: {len(self.chapters)}")
+ self.get_chapters()
+ print(f"Chapters parsed successfully. Number of chapters: {len(self.chapters)}")
except Exception as e:
- print(f"Error extracting chapters: {e}")
-
- def get_lists(self, parent_id: str, container):
- """
- Parses HTML tables representing lists and generates Akoma Ntoso-style eIds.
-
- Args:
- parent_id (str): The eId of the parent element (e.g., article or subdivision).
-            container (BeautifulSoup Tag): The container holding the <table> elements.
-
- Returns:
- list[dict]: List of list elements with eIds and corresponding text content.
- """
- lists = []
- list_counter = 0
-
-        # Find all <table> elements within the container
- tables = container.find_all('table')
-
- for table in tables:
- list_counter += 1
- list_eId = f"{parent_id}__list_{list_counter}"
-
-            # Process each row (<tr>) within the table
- points = []
- point_counter = 0
-
- for row in table.find_all('tr'):
- cols = row.find_all('td')
- if len(cols) >= 2:
- # Extract point number (e.g., (a)) and content
- point_counter += 1
- point_eId = f"{list_eId}__point_{point_counter}"
- point_num = cols[0].get_text(strip=True) # First column: point number
- point_text = cols[1].get_text(" ", strip=True) # Second column: point text
-
- # Clean text
- point_text = self._clean_text(point_text)
-
- points.append({
- 'eId': point_eId,
- 'num': point_num,
- 'text': point_text
- })
-
- # Add the list with its points
- lists.append({
- 'eId': list_eId,
- 'points': points
- })
-
- return lists
-
-
- def get_articles(self):
- """
-        Extracts articles from the HTML. Each <div> with an id starting with "art_" is treated as an article (eId).
- Subsequent subdivisions are processed based on the closest parent with an id.
-
- Returns:
- list[dict]: List of articles, each containing its eId and associated content.
- """
+ print(f"Error in get_chapters: {e}")
try:
- articles = self.body.find_all('div', id=lambda x: x and x.startswith('art_') and '.' not in x)
- self.articles = []
-
- for article in articles:
- eId = article.get('id') # Treat the id as the eId
- article_num = article.find('p', class_='oj-ti-art').get_text(strip=True)
- article_title_element = article.find('p', class_='oj-sti-art')
- if article_title_element is not None:
- article_title = article_title_element.get_text(strip=True)
- else:
- article_title = None
-
-            # Group <p> tags by their closest parent with an id
- content_map = {}
-            for p in article.find_all('p', class_='oj-normal'): # Filter <p> tags with class 'oj-normal'
- current_element = p
- parent_eId = None
-
- # Traverse upward to find the closest parent with an id
- while current_element:
- parent_eId = current_element.get('id')
- if parent_eId:
- break
- current_element = current_element.parent
-
- if parent_eId:
-                    # Add text from the <p> to the appropriate parent_eId group
- if parent_eId not in content_map:
- content_map[parent_eId] = []
- content_map[parent_eId].append(p.get_text(strip=True))
-
- # Combine grouped content into structured output
- subdivisions = []
- for sub_eId, texts in content_map.items():
- subdivisions.append({
- 'eId': sub_eId,
-                    'text': ' '.join(texts) # Combine all <p> texts for the subdivision
- })
-
- # Store the article with its eId and subdivisions
- self.articles.append({
- 'eId': eId,
- 'article_num': article_num,
- 'article_title': article_title,
- 'article_text': subdivisions
- })
-
- print(f"Articles extracted: {len(self.articles)}")
+ self.get_articles()
+ print(f"Articles parsed successfully. Number of articles: {len(self.articles)}")
+ print(f"Total number of children in articles: {sum([len(list(article)) for article in self.articles])}")
+
except Exception as e:
- print(f"Error extracting articles: {e}")
-
-
- def get_conclusions(self):
- """
- Extracts conclusions from the HTML, if present.
- """
+ print(f"Error in get_articles: {e}")
try:
- conclusions_element = self.root.find('div', class_='oj-final')
- if conclusions_element:
- self.conclusions = conclusions_element.get_text(strip=True)
- print("Conclusions extracted successfully.")
- else:
- self.conclusions = None
- print("No conclusions found.")
+ self.get_conclusions()
+ print(f"Conclusions parsed successfully. ")
except Exception as e:
- print(f"Error extracting conclusions: {e}")
-
- def parse(self, file: str):
- """
- Parses an HTML file and extracts all relevant sections.
- """
- self.get_root(file)
- #self.get_meta()
- self.get_preface()
- self.get_preamble()
- self.get_body()
- self.get_chapters()
- self.get_articles()
- self.get_conclusions()
-
-
-def main():
- parser = HTMLParser()
- file_to_parse = 'tests/data/html/c008bcb6-e7ec-11ee-9ea8-01aa75ed71a1.0006.03/DOC_1.html'
-
- output_file = 'tests/data/json/iopa_html.json'
-
-
- parser.parse(file_to_parse)
-
- with open(output_file, 'w', encoding='utf-8') as f:
- # Get the parser's attributes as a dictionary
- parser_dict = parser.__dict__
-
- # Filter out non-serializable attributes
- serializable_dict = {k: v for k, v in parser_dict.items() if isinstance(v, (str, int, float, bool, list, dict, type(None)))}
-
- # Write to a JSON file
- json.dump(serializable_dict, f, ensure_ascii=False, indent=4)
-
-if __name__ == "__main__":
- main()
-
+ print(f"Error in get_conclusions: {e}")
+
+ return self
+
\ No newline at end of file
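The rewritten parse() applies the same step-isolation pattern to every
extraction: each getter runs inside its own try/except, so one malformed
section logs an error instead of aborting the whole parse. A sketch of the
pattern (run_step is a hypothetical helper, not part of this diff):

    def run_step(step, label):
        """Run one extraction step, logging failures instead of raising."""
        try:
            step()
            print(f"{label} completed successfully.")
        except Exception as e:
            print(f"Error in {label}: {e}")

    # parse() is then equivalent to a sequence of isolated steps:
    # run_step(self.get_preface, "get_preface")
    # run_step(self.get_preamble, "get_preamble")
    # ... and so on through get_conclusions.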
diff --git a/tulit/parsers/xml/xml.py b/tulit/parsers/xml/xml.py
index ab5e493..d390aef 100644
--- a/tulit/parsers/xml/xml.py
+++ b/tulit/parsers/xml/xml.py
@@ -411,7 +411,7 @@ def get_conclusions(self):
pass
- def parse(self, file: str, schema, format) -> None:
+ def parse(self, file: str, schema, format) -> Parser:
"""
Parses an XML file and extracts relevant sections based on the format.
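Annotating parse() as returning Parser mirrors the HTML side above, where
parse() now ends with return self; the practical effect is that construction
and parsing can be chained in one expression. For example, with the Cellar
parser from this diff:

    parser = CellarHTMLParser().parse(
        "tests/data/html/c008bcb6-e7ec-11ee-9ea8-01aa75ed71a1.0006.03/DOC_1.html"
    )
    print(len(parser.articles))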