Merge branch 'development'

AlessioNar · Feb 18, 2025 · 10ce21b · 10ce21b
2 parents fbdc27c + 53beff3
commit 10ce21b
Show file tree

Hide file tree

Showing 4 changed files with 55 additions and 21 deletions.
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -8,8 +8,8 @@
 project = 'tulit'
 author = 'AlessioNar'
 
-release = '0.1.2'
-version = '0.1.2'
+release = '0.2.0'
+version = '0.2.0'
 
 # -- General configuration
 sys.path.insert(0, os.path.abspath('../../tulit'))

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,11 +1,11 @@
 [project]
 name = "tulit"
-version = "0.1.2"
+version = "0.2.0"
 description = "TULIT - The Universal Legal Informatics Toolkit, is set of legal informatics utilities collected in a Python package that focuses on the retrieval of legal data and metadata from official sources in the EU, and their transformation in pythonic data structures"
 
 [tool.poetry]
 name = "tulit"
-version = "0.1.2"
+version = "0.2.0"
 description = "TULIT - The Universal Legal Informatics Toolkit, is set of legal informatics utilities collected in a Python package that focuses on the retrieval of legal data and metadata from official sources in the EU, and their transformation in pythonic data structures"
 authors = ["AlessioNar <[email protected]>"]
 license = "EUPL 1.2"

diff --git a/tests/parsers/xml/test_formex.py b/tests/parsers/xml/test_formex.py
@@ -108,15 +108,15 @@ def test_get_articles(self):
                 "num": "Article 1",
                 "heading": None,
                 "children": [
-                    {"eId": 0, "text": "Annex I to Regulation (EC) No 1484/95 is replaced by the Annex to this Regulation."}
+                    {"eId": 0, "text": "Annex I to Regulation (EC) No 1484/95 is replaced by the Annex to this Regulation.", "contains_amendment": False, "amendment": None}
                 ]
             },
             {
                 "eId": "art_2",
                 "num": "Article 2",
                 "heading": None,
                 "children": [
-                    {"eId": 0, "text": "This Regulation shall enter into force on the day of its publication in the Official Journal of the European Union."}
+                    {"eId": 0, "text": "This Regulation shall enter into force on the day of its publication in the Official Journal of the European Union.", "contains_amendment": False, "amendment": None}
                 ]
             }
         ]

diff --git a/tulit/parsers/xml/formex.py b/tulit/parsers/xml/formex.py
@@ -187,25 +187,35 @@ def get_articles(self):
                 article_eId = article_eId.lstrip('0')
                 article_eId = f'art_{article_eId}'
                 children = []
+                amendments = []
+
+                # Check whether the article contains any <QUOT.S> elements (handled as amendments below)
+                if article.findall('.//QUOT.S'):
+                    article, amendments = self._handle_amendments(article)
+                    print('Amendment article found!')
+                    print('\n')
+
+                    print('Amendments:', amendments)
+                    print('\n')
 
                 # Extract text and metadata from all relevant elements within the article
                 if article.findall('.//PARAG'):
-                    self._extract_elements(article, './/PARAG', children)
+                    self._extract_elements(article, './/PARAG', children, amendments)
                 elif article.findall('.//ALINEA'):
                     # If no PARAG elements, check for ALINEA elements
                     alineas = article.findall('.//ALINEA')
                     for alinea in alineas:
                         # if there are P elements within the ALINEA, extract them first, then extract LIST//ITEM elements, if they are still absent, extract the text from the ALINEA
                         p_elements = alinea.findall('.//P')
-                        self._extract_elements(alinea, './/P', children)
-                        self._extract_elements(alinea, './/LIST//ITEM', children, start_index=len(p_elements))
+                        self._extract_elements(alinea, './/P', children, amendments)
+                        self._extract_elements(alinea, './/LIST//ITEM', children, amendments, start_index=len(p_elements))
                         if not p_elements:
-                            self._extract_elements(alinea, '.', children)                        
+                            self._extract_elements(alinea, '.', children, amendments)                        
 
                 self.articles.append({
                     "eId": article_eId,
-                    "num": article.findtext('.//TI.ART'),
-                    "heading": article.findtext('.//STI.ART'),
+                    "num": article.findtext('.//TI.ART') or article.findtext('.//TI.ART//P'),
+                    "heading": article.findtext('.//STI.ART') or article.findtext('.//STI.ART//P'),
                     "children": children
                 })
 
@@ -214,7 +224,7 @@ def get_articles(self):
             print('No enacting terms XML tag has been found')
             return []
 
-    def _extract_elements(self, parent, xpath, children, start_index=0):
+    def _extract_elements(self, parent, xpath, children, amendments, start_index=0):
         """
         Helper method to extract text and metadata from elements.
 
@@ -226,12 +236,13 @@ def _extract_elements(self, parent, xpath, children, start_index=0):
             The XPath expression to locate the elements.
         children : list
             The list to append the extracted elements to.
-        is_list : bool, optional
-            Whether the elements are part of a list (default is False).
+        amendments : list
+            List of amendments extracted from the article.
         start_index : int, optional
             The starting index for the elements (default is 0).
         """
         elements = parent.findall(xpath)
+        amendment_index = 0
         for index, element in enumerate(elements, start=start_index):
             for sub_element in element.iter():
                 if sub_element.tag == 'QUOT.START':                    
@@ -245,12 +256,35 @@ def _extract_elements(self, parent, xpath, children, start_index=0):
             text = text.replace('\u00A0', ' ')  # replace non-breaking spaces with regular spaces
             text = re.sub(' +', ' ', text)  # replace multiple spaces with a single space
             text = re.sub(r'\s+([.,!?;:’])', r'\1', text)  # replace spaces before punctuation with nothing
-
-            child = {
-            "eId": element.get("IDENTIFIER") or element.get("ID") or element.get("NO.P") or index,
-            "text": text
-            }
-            children.append(child)
+            if text is not None and text != '' and text != ';':
+                child = {
+                    "eId": element.get("IDENTIFIER") or element.get("ID") or element.get("NO.P") or index,
+                    "text": text,
+                    "contains_amendment": amendment_index < len(amendments),
+                    "amendment": amendments[amendment_index] if amendment_index < len(amendments) else None
+                }
+                children.append(child)
+                amendment_index += 1
+
+    def _handle_amendments(self, article):
+        """
+        Collects the text of every <QUOT.S> element, then strips those elements via self.remove_node.
+
+        Parameters
+        ----------
+        article : lxml.etree._Element
+            The article element to process.
+        """
+        amendments = []
+        for quot_s in article.findall('.//QUOT.S'):
+            amendment_text = " ".join(quot_s.itertext()).strip()
+            # Keep the amendment text as extracted; it is only collected here,
+            # not applied to the article text.
+            amendments.append(amendment_text)
+        # Remove the QUOT.S tags from the article using the self.remove_node method
+        article = self.remove_node(article, './/QUOT.S')
+        return article, amendments
+
 
     def get_conclusions(self):
         """