Skip to content

Commit

Permalink
Harmonised methods between formex and akn
Browse files Browse the repository at this point in the history
  • Loading branch information
AlessioNar committed Dec 14, 2024
1 parent 95558f0 commit 26b559a
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 41 deletions.
64 changes: 28 additions & 36 deletions tests/parsers/test_formex.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,26 @@

import os

# Directory holding the Formex sample documents, located relative to this test
# module. The path is assembled from separate components so that os.path.join
# inserts the correct separator on every platform; a hard-coded
# "..\\data\\formex" only resolves on Windows, because on POSIX a backslash is
# an ordinary filename character rather than a directory separator.
DATA_DIR = os.path.join(os.path.dirname(__file__), "..", "data", "formex")
# Sample Formex document loaded by the tests in this module.
file_path = os.path.join(DATA_DIR, "L_2011334EN.01002501.xml")

class TestFormex4Parser(unittest.TestCase):
def setUp(self):
self.formex_parser = Formex4Parser()
self.maxDiff = None # Allow full diff if needed
self.parser = Formex4Parser()
self.parser.get_root(file_path)

def test_parse_metadata(self):
def test_get_root(self):
"""Test parsing and root element retrieval from the Formex file."""
self.assertTrue(os.path.exists(file_path), f"Test file not found at {file_path}")
self.assertIsNotNone(self.parser.root, "Root element should not be None")

def test_get_metadata(self):
self.maxDiff = None # Allow the full diff to be displayed
file_path = os.path.join(DATA_DIR, "L_2011334EN.01002501.xml")

self.formex_parser.load_xml(file_path)
self.parser.load_xml(file_path)

result = self.formex_parser.get_metadata()
result = self.parser.get_metadata()
expected = {
"file": "L_2011334EN.01002501.doc.xml",
"collection": "L",
Expand All @@ -35,34 +42,23 @@ def test_parse_metadata(self):
}
self.assertEqual(result, expected)

def test_parse_title(self):
def test_get_preface(self):
self.maxDiff = None # Allow full diff if needed
file_path = os.path.join(DATA_DIR, "L_2011334EN.01002501.xml")

# Parse the XML tree and pass the root to _parse_title
with open(file_path, 'r', encoding='utf-8') as f:
tree = ET.parse(f)
root = tree.getroot()

result = self.formex_parser.get_title(root)


result = self.parser.get_preface()
expected = (
"Commission Implementing Regulation (EU) No 1319/2011 of 15 December 2011 "
"fixing representative prices in the poultrymeat and egg sectors and for egg "
"albumin, and amending Regulation (EC) No 1484/95"
)
self.assertEqual(result, expected)
self.assertEqual(self.parser.preface, expected)

def test_parse_preamble(self):
def test_get_preamble(self):
"""Test parsing the preamble section with quotations and numbered considerations in Formex4Parser."""
self.maxDiff = None # Allow full diff if needed
file_path = os.path.join(DATA_DIR, "L_2011334EN.01002501.xml")

# Parse the XML tree and pass the root to _parse_preamble
with open(file_path, 'r', encoding='utf-8') as f:
tree = ET.parse(f)
root = tree.getroot()

result = self.formex_parser.get_preamble(root)
result = self.parser.get_preamble()

# Expected preamble structure
# @todo - see main function
Expand All @@ -84,18 +80,14 @@ def test_parse_preamble(self):
}

self.assertEqual(result, expected)



def test_parse_articles(self):
self.maxDiff = None # Allow full diff if needed
file_path = os.path.join(DATA_DIR, "L_2011334EN.01002501.xml")

self.formex_parser.load_xml(file_path)
self.formex_parser.get_body()

self.formex_parser.get_articles()

def test_get_body(self):
self.parser.get_body()
self.assertIsNotNone(self.parser.body, "Body element should not be None")

def test_get_articles(self):
self.parser.get_body()
self.parser.get_articles()

# Expected articles based on sample data in XML file
expected = [
Expand All @@ -111,7 +103,7 @@ def test_parse_articles(self):
}
]

self.assertEqual(self.formex_parser.articles, expected)
self.assertEqual(self.parser.articles, expected)

# Run the tests
if __name__ == "__main__":
Expand Down
21 changes: 16 additions & 5 deletions ulit/parsers/formex.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@ def __init__(self):
"""
# Define the namespace mapping
self.root = None
self.namespaces = {}

self.preface = None
self.metadata = {}

self.namespaces = FMX_NAMESPACES


Expand Down Expand Up @@ -67,7 +73,7 @@ def get_metadata(self):

return metadata

def get_title(self, root):
def get_preface(self):
"""
Extracts title information from the TITLE section.
Expand All @@ -77,17 +83,18 @@ def get_title(self, root):
Returns:
str: Concatenated title text.
"""
title_element = root.find('TITLE')
title_element = self.root.find('TITLE')
title_text = ""

if title_element is not None:
for paragraph in title_element.iter('P'):
paragraph_text = "".join(paragraph.itertext()).strip()
title_text += paragraph_text + " "
self.preface = title_text.strip()

return title_text.strip()
return self.preface

def get_preamble(self, root):
def get_preamble(self):
"""
Extracts the preamble section, including initial statements and considerations.
Expand All @@ -98,7 +105,7 @@ def get_preamble(self, root):
dict: Preamble details, including quotations and considerations.
"""
preamble_data = {"initial_statement": None, "quotations": [], "consid_init": None, "considerations": [], "preamble_final": None}
preamble = root.find('PREAMBLE')
preamble = self.root.find('PREAMBLE')

if preamble is not None:
# Initial statement
Expand All @@ -119,6 +126,8 @@ def get_preamble(self, root):
text = text.replace('\n', '').replace('\t', '').replace('\r', '') # remove newline, tab and carriage-return characters
text = re.sub(' +', ' ', text) # replace multiple spaces with a single space
preamble_data["quotations"].append(text)

self.citations = preamble_data['quotations']

preamble_data["consid_init"] = preamble.findtext('.//GR.CONSID/GR.CONSID.INIT')

Expand Down Expand Up @@ -183,5 +192,7 @@ def parse(self, file):
dict: Parsed data containing metadata, title, preamble, and articles.
"""
self.load_xml(file)
self.get_preface()
self.get_preamble()
self.get_body()
self.get_articles()

0 comments on commit 26b559a

Please sign in to comment.