Created all common functions needed for version 0.1.0 of the package

AlessioNar · Dec 28, 2024 · 3c2f862 · 3c2f862
1 parent 864ae70
commit 3c2f862
Show file tree

Hide file tree

Showing 3 changed files with 44 additions and 15 deletions.
diff --git a/tulit/parsers/akomantoso.py b/tulit/parsers/akomantoso.py
@@ -50,12 +50,14 @@ def get_formula(self):
             Returns None if no formula is found.
         """
         formula = self.preamble.find('.//akn:formula', namespaces=self.namespaces)
+        # Extract text from <p> within <formula>
+        formula_text = ' '.join(p.text.strip() for p in formula.findall('akn:p', namespaces=self.namespaces) if p.text)
         if formula is None:
             return None
 
-        # Extract text from <p> within <formula>
-        formula_text = ' '.join(p.text.strip() for p in formula.findall('akn:p', namespaces=self.namespaces) if p.text)
-        return formula_text
+        self.formula = formula_text
+        return self.formula
+
 
     def get_citations(self) -> list:
         """
@@ -105,6 +107,9 @@ def extract_eId(recital):
             extract_eId=extract_eId,
 
         )
+
+    def get_preamble_final(self):
+        return super().get_preamble_final()
 
     ### Act block
     def get_act(self) -> None:

diff --git a/tulit/parsers/formex.py b/tulit/parsers/formex.py
@@ -47,7 +47,12 @@ def get_formula(self):
         str
             Formula text from the preamble.
         """
-        self.formula = self.preamble.findtext('PREAMBLE.INIT')
+        formula = self.preamble.findtext('PREAMBLE.INIT')
+
+        if formula is None:
+            return None
+
+        self.formula = formula
 
         return self.formula
 
@@ -99,6 +104,9 @@ def extract_eId(recital):
             extract_eId=extract_eId
         )
 
+    def get_preamble_final(self):
+        return super().get_preamble_final()
+
     def get_body(self):
         return super().get_body('.//ENACTING.TERMS')
 
@@ -162,6 +170,9 @@ def get_articles(self):
                 self.articles.append(article_data)
         else:
             print('No enacting terms XML tag has been found')
+
+    def get_conclusions(self):
+        return super().get_conclusions()
 
 
     def parse(self, file):

diff --git a/tulit/parsers/parser.py b/tulit/parsers/parser.py
@@ -1,4 +1,4 @@
-from abc import ABC, abstractmethod
+from abc import ABC
 from lxml import etree
 import os
 import re
@@ -146,9 +146,9 @@ def validate(self, file: str,  format: str) -> bool:
             print(f"{file} is not a valid {format} file. Validation errors: {e}")
             self.valid = False
             self.validation_errors = e.error_log
-        #except Exception as e:
-        #    print(f"An error occurred during validation: {e}")
-        #    self.valid = False
+        except Exception as e:
+            print(f"An error occurred during validation: {e}")
+            self.valid = False
 
     def remove_node(self, tree, node):
         """
@@ -231,7 +231,10 @@ def get_preface(self, preface_xpath, paragraph_xpath) -> None:
                 paragraph_text = ''.join(p.itertext()).strip()
                 paragraphs.append(paragraph_text)
 
-        self.preface = ' '.join(paragraphs)
+        # Join all paragraphs into a single string and collapse each run of spaces, tabs or newlines into one space
+        self.preface = ' '.join(paragraphs)
+        self.preface = re.sub(r'[ \t\r\n]+', ' ', self.preface)
+
 
     def get_preamble(self, preamble_xpath, notes_xpath) -> None:
         """
@@ -253,7 +256,6 @@ def get_preamble(self, preamble_xpath, notes_xpath) -> None:
 
         if self.preamble is not None:            
             self.preamble = self.remove_node(self.preamble, notes_xpath)
-            self.formula = self.get_formula()
             #preamble_data["preamble_final"] = self.preamble.findtext('PREAMBLE.FINAL')
 
     def get_formula(self):
@@ -331,6 +333,9 @@ def get_recitals(self, recitals_xpath, recital_xpath, text_xpath, extract_intro=
 
         self.recitals = recitals
 
+    def get_preamble_final(self):
+        pass
+
     def get_body(self, body_xpath) -> None:
         """
         Extracts the body element from the document.
@@ -422,6 +427,9 @@ def get_articles(self, article_xpath, extract_eId=None) -> None:
     def get_subdivisions(self, subdivision_xpath, extract_eId=None) -> None:
         pass
 
+    def get_conclusions(self):
+        pass
+
     def parse(self, file: str, schema, format) -> None:
         """
         Parses an Akoma Ntoso file to extract provisions as individual sentences.
@@ -453,29 +461,34 @@ def parse(self, file: str, schema, format) -> None:
 
                 try:
                     self.get_preface()
-                    print(f"Preface parsed successfully.")
+                    print(f"Preface parsed successfully. Preface: {self.preface}")
                 except Exception as e:
                     print(f"Error in get_preface: {e}")
 
                 try:
                     self.get_preamble()
-                    print(f"Preamble parsed successfully.")
+                    print(f"Preamble element found.")
                 except Exception as e:
                     print(f"Error in get_preamble: {e}")
+                try:
+                    self.get_formula()
+                    print(f"Formula parsed successfully.")
+                except Exception as e:
+                    print(f"Error in get_formula: {e}")
                 try:
                     self.get_citations()
-                    print(f"Citations parsed successfully.")
+                    print(f"Citations parsed successfully. Number of citations: {len(self.citations)}")
                 except Exception as e:
                     print(f"Error in get_citations: {e}")
                 try:
                     self.get_recitals()
-                    print(f"Recitals parsed successfully.")
+                    print(f"Recitals parsed successfully. Number of recitals: {len(self.recitals)}")
                 except Exception as e:
                     print(f"Error in get_recitals: {e}")
 
                 try:
                     self.get_body()
-                    print("Body parsed successfully.")
+                    print("Body element found.")
                 except Exception as e:
                     print(f"Error in get_body: {e}")
                 try: