Some small fixes in the text processing

AlessioNar · Jan 6, 2025 · a53f5ea
1 parent f7598e6
commit a53f5ea
Show file tree

Hide file tree

Showing 2 changed files with 27 additions and 7 deletions.
diff --git a/tulit/parsers/html/cellar.py b/tulit/parsers/html/cellar.py
@@ -198,7 +198,7 @@ def get_articles(self):
             if paragraphs and len(article.find_all('table')) == 0:
                 for paragraph in paragraphs:
                     text = ' '.join(paragraph.get_text(separator= ' ', strip=True).split())
-
+                    text = re.sub(r'\s+([.,!?;:’])', r'\1', text)  # replace spaces before punctuation with nothing
                     children.append({
                         # Get parent of the paragraph: Use the id of the parent div as the eId
                         'eId': paragraph.find_parent('div').get('id'),
@@ -218,8 +218,11 @@ def get_articles(self):
                         cols = row.find_all('td')
                         if len(cols) == 2:
                             number = cols[0].get_text(strip=True)
+                            number = number.strip('()')  # Remove parentheses
+                            number = int(number)
                             text = ' '.join(cols[1].get_text(separator = ' ', strip=True).split())
-
+                            text = re.sub(r'\s+([.,!?;:’])', r'\1', text)  # replace spaces before punctuation with nothing
+
                             children.append({
                                 'eId': number,
                                 'text': text
@@ -229,9 +232,11 @@ def get_articles(self):
                 paragraphs = article.find_all('div', id=lambda x: x and '.' in x)
                 for paragraph in paragraphs:
                     if not paragraph.get('class'):
+                        text = ' '.join(paragraph.get_text(separator = ' ', strip=True).split())
+                        text = re.sub(r'\s+([.,!?;:’])', r'\1', text)  # replace spaces before punctuation with nothing
                         children.append({
                                 'eId': paragraph.get('id'),
-                                'text': ' '.join(paragraph.get_text(separator = ' ', strip=True).split())
+                                'text': text
                         })
 
             # Store the article with its eId and subdivisions

diff --git a/tulit/parsers/xml/formex.py b/tulit/parsers/xml/formex.py
@@ -69,7 +69,8 @@ def get_citations(self):
             - 'text': Citation text
         """
         def extract_eId(citation, index):
-            return index
+            return f'cit_{index + 1}'
+
 
         return super().get_citations(
             citations_xpath='.//GR.VISA',
@@ -95,7 +96,10 @@ def extract_intro(recitals_section):
             self.recitals_intro = intro_text            
 
         def extract_eId(recital):
-            return recital.findtext('.//NO.P')
+            eId = recital.findtext('.//NO.P')
+            # Remove () and return eId in the format rct_{number}
+            eId = eId.strip('()')  # Remove parentheses
+            return f'rct_{eId}'
 
         return super().get_recitals(
             recitals_xpath='.//GR.CONSID', 
@@ -135,7 +139,7 @@ def get_chapters(self) -> None:
             - 'chapter_heading': Chapter heading text
         """
         def extract_eId(chapter, index):
-            return index
+            return f'cpt_{index+1}'
 
         def get_headings(chapter):
             if len(chapter.findall('.//HT')) > 0:
@@ -179,6 +183,9 @@ def get_articles(self):
         self.articles = []
         if self.body is not None:
             for article in self.body.findall('.//ARTICLE'):
+                article_eId = article.get("IDENTIFIER")
+                article_eId = article_eId.lstrip('0')
+                article_eId = f'art_{article_eId}'
                 children = []
 
                 # Extract text and metadata from all relevant elements within the article
@@ -196,7 +203,7 @@ def get_articles(self):
                             self._extract_elements(alinea, '.', children)                        
 
                 self.articles.append({
-                    "eId": article.get("IDENTIFIER"),
+                    "eId": article_eId,
                     "num": article.findtext('.//TI.ART'),
                     "heading": article.findtext('.//STI.ART'),
                     "children": children
@@ -226,10 +233,18 @@ def _extract_elements(self, parent, xpath, children, start_index=0):
         """
         elements = parent.findall(xpath)
         for index, element in enumerate(elements, start=start_index):
+            for sub_element in element.iter():
+                if sub_element.tag == 'QUOT.START':                    
+                    sub_element.text = "‘"                    
+                elif sub_element.tag == 'QUOT.END':                    
+                    sub_element.text = "’"
+
             text = "".join(element.itertext()).strip()
+            text = re.sub(r'^\(\d+\)', '', text).strip()
             text = text.replace('\n', '').replace('\t', '').replace('\r', '')  # remove newline and tab characters
             text = text.replace('\u00A0', ' ')  # replace non-breaking spaces with regular spaces
             text = re.sub(' +', ' ', text)  # replace multiple spaces with a single space
+            text = re.sub(r'\s+([.,!?;:’])', r'\1', text)  # replace spaces before punctuation with nothing
 
             child = {
             "eId": element.get("IDENTIFIER") or element.get("ID") or element.get("NO.P") or index,