Skip to content

Commit

Permalink
Merge branch 'development'
Browse files Browse the repository at this point in the history
  • Loading branch information
AlessioNar committed Feb 18, 2025
2 parents fbdc27c + 53beff3 commit 10ce21b
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 21 deletions.
4 changes: 2 additions & 2 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
project = 'tulit'
author = 'AlessioNar'

release = '0.1.2'
version = '0.1.2'
release = '0.2.0'
version = '0.2.0'

# -- General configuration
sys.path.insert(0, os.path.abspath('../../tulit'))
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
[project]
name = "tulit"
version = "0.1.2"
version = "0.2.0"
description = "TULIT - The Universal Legal Informatics Toolkit, is set of legal informatics utilities collected in a Python package that focuses on the retrieval of legal data and metadata from official sources in the EU, and their transformation in pythonic data structures"

[tool.poetry]
name = "tulit"
version = "0.1.2"
version = "0.2.0"
description = "TULIT - The Universal Legal Informatics Toolkit, is set of legal informatics utilities collected in a Python package that focuses on the retrieval of legal data and metadata from official sources in the EU, and their transformation in pythonic data structures"
authors = ["AlessioNar <[email protected]>"]
license = "EUPL 1.2"
Expand Down
4 changes: 2 additions & 2 deletions tests/parsers/xml/test_formex.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,15 +108,15 @@ def test_get_articles(self):
"num": "Article 1",
"heading": None,
"children": [
{"eId": 0, "text": "Annex I to Regulation (EC) No 1484/95 is replaced by the Annex to this Regulation."}
{"eId": 0, "text": "Annex I to Regulation (EC) No 1484/95 is replaced by the Annex to this Regulation.", "contains_amendment": False, "amendment": None}
]
},
{
"eId": "art_2",
"num": "Article 2",
"heading": None,
"children": [
{"eId": 0, "text": "This Regulation shall enter into force on the day of its publication in the Official Journal of the European Union."}
{"eId": 0, "text": "This Regulation shall enter into force on the day of its publication in the Official Journal of the European Union.", "contains_amendment": False, "amendment": None}
]
}
]
Expand Down
64 changes: 49 additions & 15 deletions tulit/parsers/xml/formex.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,25 +187,35 @@ def get_articles(self):
article_eId = article_eId.lstrip('0')
article_eId = f'art_{article_eId}'
children = []
amendments = []

# Check if the article contains <QUOT.S> tag
if article.findall('.//QUOT.S'):
article, amendments = self._handle_amendments(article)
print('Amendment article found!')
print('\n')

print('Amendments:', amendments)
print('\n')

# Extract text and metadata from all relevant elements within the article
if article.findall('.//PARAG'):
self._extract_elements(article, './/PARAG', children)
self._extract_elements(article, './/PARAG', children, amendments)
elif article.findall('.//ALINEA'):
# If no PARAG elements, check for ALINEA elements
alineas = article.findall('.//ALINEA')
for alinea in alineas:
# if there are P elements within the ALINEA, extract them first, then extract LIST//ITEM elements, if they are still absent, extract the text from the ALINEA
p_elements = alinea.findall('.//P')
self._extract_elements(alinea, './/P', children)
self._extract_elements(alinea, './/LIST//ITEM', children, start_index=len(p_elements))
self._extract_elements(alinea, './/P', children, amendments)
self._extract_elements(alinea, './/LIST//ITEM', children, amendments, start_index=len(p_elements))
if not p_elements:
self._extract_elements(alinea, '.', children)
self._extract_elements(alinea, '.', children, amendments)

self.articles.append({
"eId": article_eId,
"num": article.findtext('.//TI.ART'),
"heading": article.findtext('.//STI.ART'),
"num": article.findtext('.//TI.ART') or article.findtext('.//TI.ART//P'),
"heading": article.findtext('.//STI.ART') or article.findtext('.//STI.ART//P'),
"children": children
})

Expand All @@ -214,7 +224,7 @@ def get_articles(self):
print('No enacting terms XML tag has been found')
return []

def _extract_elements(self, parent, xpath, children, start_index=0):
def _extract_elements(self, parent, xpath, children, amendments, start_index=0):
"""
Helper method to extract text and metadata from elements.
Expand All @@ -226,12 +236,13 @@ def _extract_elements(self, parent, xpath, children, start_index=0):
The XPath expression to locate the elements.
children : list
The list to append the extracted elements to.
is_list : bool, optional
Whether the elements are part of a list (default is False).
amendments : list
List of amendments extracted from the article.
start_index : int, optional
The starting index for the elements (default is 0).
"""
elements = parent.findall(xpath)
amendment_index = 0
for index, element in enumerate(elements, start=start_index):
for sub_element in element.iter():
if sub_element.tag == 'QUOT.START':
Expand All @@ -245,12 +256,35 @@ def _extract_elements(self, parent, xpath, children, start_index=0):
text = text.replace('\u00A0', ' ') # replace non-breaking spaces with regular spaces
text = re.sub(' +', ' ', text) # replace multiple spaces with a single space
text = re.sub(r'\s+([.,!?;:’])', r'\1', text) # replace spaces before punctuation with nothing

child = {
"eId": element.get("IDENTIFIER") or element.get("ID") or element.get("NO.P") or index,
"text": text
}
children.append(child)
if text is not None and text != '' and text != ';':
child = {
"eId": element.get("IDENTIFIER") or element.get("ID") or element.get("NO.P") or index,
"text": text,
"contains_amendment": amendment_index < len(amendments),
"amendment": amendments[amendment_index] if amendment_index < len(amendments) else None
}
children.append(child)
amendment_index += 1

def _handle_amendments(self, article):
"""
Collect the text of every <QUOT.S> element found in an article and
pass the article through ``self.remove_node`` for the same tag.

Parameters
----------
article : lxml.etree._Element
The article element to process.

Returns
-------
tuple
A ``(article, amendments)`` pair. ``article`` is whatever
``self.remove_node(article, './/QUOT.S')`` returned, and
``amendments`` is a list with one string per <QUOT.S> element,
in the order ``findall`` yielded them.
"""
amendments = []
for quot_s in article.findall('.//QUOT.S'):
# Join every text fragment inside the quoted block with single spaces
# and trim the ends; inner whitespace/newlines are not normalised here.
amendment_text = " ".join(quot_s.itertext()).strip()
# The raw amendment text is only collected at this point; no further
# processing is applied to it in this method.
amendments.append(amendment_text)
# Presumably strips the <QUOT.S> elements from the article so their text is
# not extracted again as regular article content - confirm against
# self.remove_node, which is defined outside this view.
# NOTE(review): the nesting of this call relative to the loop above cannot
# be determined from this view - verify against the repository file.
article = self.remove_node(article, './/QUOT.S')
return article, amendments


def get_conclusions(self):
"""
Expand Down

0 comments on commit 10ce21b

Please sign in to comment.