Created callable main functions

AlessioNar · Dec 28, 2024 · 864ae70 · 864ae70
1 parent 1c1974e
commit 864ae70
Show file tree

Hide file tree

Showing 3 changed files with 56 additions and 11 deletions.
diff --git a/tulit/parsers/akomantoso.py b/tulit/parsers/akomantoso.py
@@ -1,7 +1,8 @@
-from .parser import XMLParser
+from tulit.parsers.parser import XMLParser
 import re
 from lxml import etree
 import os
+import json
 
 class AkomaNtosoParser(XMLParser):
     """
@@ -283,4 +284,26 @@ def parse(self, file: str) -> None:
 
 
         """
-        return super.parse(file, schema = 'akomantoso30.xsd', format = 'Akoma Ntoso')
+        return super().parse(file, schema = 'akomantoso30.xsd', format = 'Akoma Ntoso')
+
+def main():
+    """Parse a hard-coded Akoma Ntoso test file and dump the parser state as JSON."""
+    parser = AkomaNtosoParser()
+    file_to_parse = 'tests/data/akn/eu/32014L0092.akn'
+    output_file = 'tests/data/json/akn.json'
+
+    parser.parse(file_to_parse)
+
+    # Keep only attributes whose top-level value is a JSON-native type.
+    json_types = (str, int, float, bool, list, dict, type(None))
+    serializable_dict = {k: v for k, v in vars(parser).items() if isinstance(v, json_types)}
+
+    # Serialize before opening the output: a value nested inside a list or
+    # dict can still raise TypeError, and that must not leave a truncated file.
+    payload = json.dumps(serializable_dict, ensure_ascii=False, indent=4)
+
+    with open(output_file, 'w', encoding='utf-8') as f:
+        f.write(payload)
+
+if __name__ == "__main__":
+    main()
diff --git a/tulit/parsers/formex.py b/tulit/parsers/formex.py
@@ -1,8 +1,9 @@
 import re
 import os
+import json
 
 from lxml import etree
-from .parser import XMLParser
+from tulit.parsers.parser import XMLParser
 
 class Formex4Parser(XMLParser):
     """
@@ -175,4 +176,26 @@ def parse(self, file):
         dict
             Parsed data containing metadata, title, preamble, and articles.
         """
-        super().parse(file, schema='formex4.xsd', format='Formex 4')
+        super().parse(file, schema='./formex4.xsd', format='Formex 4')
+
+def main():
+    """Parse a hard-coded Formex 4 test file and dump the parser state as JSON."""
+    parser = Formex4Parser()
+    file_to_parse = 'tests/data/formex/c008bcb6-e7ec-11ee-9ea8-01aa75ed71a1.0006.02/DOC_1/L_202400903EN.000101.fmx.xml'
+    output_file = 'tests/data/json/iopa.json'
+
+    parser.parse(file_to_parse)
+
+    # Keep only attributes whose top-level value is a JSON-native type.
+    json_types = (str, int, float, bool, list, dict, type(None))
+    serializable_dict = {k: v for k, v in vars(parser).items() if isinstance(v, json_types)}
+
+    # Serialize before opening the output so a TypeError raised by a nested
+    # non-serializable value cannot leave a truncated file behind.
+    payload = json.dumps(serializable_dict, ensure_ascii=False, indent=4)
+    with open(output_file, 'w', encoding='utf-8') as f:
+        f.write(payload)
+
+if __name__ == "__main__":
+    main()
+
diff --git a/tulit/parsers/parser.py b/tulit/parsers/parser.py
@@ -85,6 +85,7 @@ def __init__(self):
 
         self.schema = None
         self.valid = None
+        self.format = None
         self.validation_errors = None
 
         self.namespaces = {}
@@ -115,7 +116,7 @@ def load_schema(self, schema):
         except Exception as e:
             print(f"Error loading schema: {e}")
 
-    def validate(self, format, file: str) -> bool:
+    def validate(self, file: str,  format: str) -> bool:
         """
         Validates an XML file against the loaded XSD schema.
         
@@ -145,9 +146,9 @@ def validate(self, format, file: str) -> bool:
             print(f"{file} is not a valid {format} file. Validation errors: {e}")
             self.valid = False
             self.validation_errors = e.error_log
-        except Exception as e:
-            print(f"An error occurred during validation: {e}")
-            self.valid = False
+        #except Exception as e:
+        #    print(f"An error occurred during validation: {e}")
+        #    self.valid = False
 
     def remove_node(self, tree, node):
         """
@@ -288,7 +289,6 @@ def get_citations(self, citations_xpath, citation_xpath, extract_eId=None):
             text = text.replace('\n', '').replace('\t', '').replace('\r', '')  # remove newline and tab characters
             text = re.sub(' +', ' ', text)  # replace multiple spaces with a single space
 
-            # Get an eId for the citation, depending on the XML format
             eId = extract_eId(citation, index) if extract_eId else index
 
             citations.append({
@@ -313,7 +313,6 @@ def get_recitals(self, recitals_xpath, recital_xpath, text_xpath, extract_intro=
             return None
 
         recitals = []
-        # Get an eId for the citation, depending on the XML format
         intro_eId, intro_text = extract_intro(recitals_section) if extract_intro else (None, None)
 
         recitals.append({
@@ -444,7 +443,7 @@ def parse(self, file: str, schema, format) -> None:
         """
         try:
             self.load_schema(schema)
-            self.validate(file, format)
+            self.validate(file=file, format=format)
             if self.valid == True:
                 try:
                     self.get_root(file)