Skip to content

Commit

Permalink
Created callable main functions
Browse files Browse the repository at this point in the history
  • Loading branch information
AlessioNar committed Dec 28, 2024
1 parent 1c1974e commit 864ae70
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 11 deletions.
27 changes: 25 additions & 2 deletions tulit/parsers/akomantoso.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from .parser import XMLParser
from tulit.parsers.parser import XMLParser
import re
from lxml import etree
import os
import json

class AkomaNtosoParser(XMLParser):
"""
Expand Down Expand Up @@ -283,4 +284,26 @@ def parse(self, file: str) -> None:
"""
return super.parse(file, schema = 'akomantoso30.xsd', format = 'Akoma Ntoso')
return super().parse(file, schema = 'akomantoso30.xsd', format = 'Akoma Ntoso')

def main(file_to_parse='tests/data/akn/eu/32014L0092.akn',
         output_file='tests/data/json/akn.json'):
    """Parse an Akoma Ntoso file and dump the parser's state to a JSON file.

    Parameters
    ----------
    file_to_parse : str
        Path of the Akoma Ntoso XML document handed to
        ``AkomaNtosoParser.parse``. Defaults to the sample document that
        was previously hard-coded.
    output_file : str
        Path of the JSON file to write. Its parent directory is created
        when it does not exist yet. Defaults to the previously hard-coded
        location.

    Returns
    -------
    None
    """
    parser = AkomaNtosoParser()
    parser.parse(file_to_parse)

    # Keep only the parser attributes whose top-level type JSON can encode;
    # everything else stored on the instance is dropped from the dump.
    serializable_dict = {
        k: v
        for k, v in parser.__dict__.items()
        if isinstance(v, (str, int, float, bool, list, dict, type(None)))
    }

    # Serialize BEFORE opening the output file: the filter above is shallow,
    # so a non-serializable value nested in a list/dict can still make the
    # encoder raise. Doing it first avoids truncating an existing output file
    # (or leaving a half-written one) when that happens.
    payload = json.dumps(serializable_dict, ensure_ascii=False, indent=4)

    # Make sure the destination directory exists so open() does not fail
    # with FileNotFoundError on a fresh checkout.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(payload)


if __name__ == "__main__":
    main()
27 changes: 25 additions & 2 deletions tulit/parsers/formex.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import re
import os
import json

from lxml import etree
from .parser import XMLParser
from tulit.parsers.parser import XMLParser

class Formex4Parser(XMLParser):
"""
Expand Down Expand Up @@ -175,4 +176,26 @@ def parse(self, file):
dict
Parsed data containing metadata, title, preamble, and articles.
"""
super().parse(file, schema='formex4.xsd', format='Formex 4')
super().parse(file, schema='./formex4.xsd', format='Formex 4')

def main(file_to_parse='tests/data/formex/c008bcb6-e7ec-11ee-9ea8-01aa75ed71a1.0006.02/DOC_1/L_202400903EN.000101.fmx.xml',
         output_file='tests/data/json/iopa.json'):
    """Parse a Formex 4 file and dump the parser's state to a JSON file.

    Parameters
    ----------
    file_to_parse : str
        Path of the Formex 4 XML document handed to
        ``Formex4Parser.parse``. Defaults to the sample document that was
        previously hard-coded.
    output_file : str
        Path of the JSON file to write. Its parent directory is created
        when it does not exist yet. Defaults to the previously hard-coded
        location.

    Returns
    -------
    None
    """
    parser = Formex4Parser()
    parser.parse(file_to_parse)

    # Keep only the parser attributes whose top-level type JSON can encode;
    # everything else stored on the instance is dropped from the dump.
    serializable_dict = {
        k: v
        for k, v in parser.__dict__.items()
        if isinstance(v, (str, int, float, bool, list, dict, type(None)))
    }

    # Serialize BEFORE opening the output file: the filter above is shallow,
    # so a non-serializable value nested in a list/dict can still make the
    # encoder raise. Doing it first avoids truncating an existing output file
    # (or leaving a half-written one) when that happens.
    payload = json.dumps(serializable_dict, ensure_ascii=False, indent=4)

    # Make sure the destination directory exists so open() does not fail
    # with FileNotFoundError on a fresh checkout.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(payload)


if __name__ == "__main__":
    main()

13 changes: 6 additions & 7 deletions tulit/parsers/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ def __init__(self):

self.schema = None
self.valid = None
self.format = None
self.validation_errors = None

self.namespaces = {}
Expand Down Expand Up @@ -115,7 +116,7 @@ def load_schema(self, schema):
except Exception as e:
print(f"Error loading schema: {e}")

def validate(self, format, file: str) -> bool:
def validate(self, file: str, format: str) -> bool:
"""
Validates an XML file against the loaded XSD schema.
Expand Down Expand Up @@ -145,9 +146,9 @@ def validate(self, format, file: str) -> bool:
print(f"{file} is not a valid {format} file. Validation errors: {e}")
self.valid = False
self.validation_errors = e.error_log
except Exception as e:
print(f"An error occurred during validation: {e}")
self.valid = False
#except Exception as e:
# print(f"An error occurred during validation: {e}")
# self.valid = False

def remove_node(self, tree, node):
"""
Expand Down Expand Up @@ -288,7 +289,6 @@ def get_citations(self, citations_xpath, citation_xpath, extract_eId=None):
text = text.replace('\n', '').replace('\t', '').replace('\r', '') # remove newline and tab characters
text = re.sub(' +', ' ', text) # replace multiple spaces with a single space

# Get an eId for the citation, depending on the XML format
eId = extract_eId(citation, index) if extract_eId else index

citations.append({
Expand All @@ -313,7 +313,6 @@ def get_recitals(self, recitals_xpath, recital_xpath, text_xpath, extract_intro=
return None

recitals = []
# Get an eId for the citation, depending on the XML format
intro_eId, intro_text = extract_intro(recitals_section) if extract_intro else (None, None)

recitals.append({
Expand Down Expand Up @@ -444,7 +443,7 @@ def parse(self, file: str, schema, format) -> None:
"""
try:
self.load_schema(schema)
self.validate(file, format)
self.validate(file=file, format=format)
if self.valid == True:
try:
self.get_root(file)
Expand Down

0 comments on commit 864ae70

Please sign in to comment.