Skip to content

Commit

Permalink
Some small fixes in the text processing
Browse files Browse the repository at this point in the history
  • Loading branch information
AlessioNar committed Jan 6, 2025
1 parent f7598e6 commit a53f5ea
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 7 deletions.
11 changes: 8 additions & 3 deletions tulit/parsers/html/cellar.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ def get_articles(self):
if paragraphs and len(article.find_all('table')) == 0:
for paragraph in paragraphs:
text = ' '.join(paragraph.get_text(separator= ' ', strip=True).split())

text = re.sub(r'\s+([.,!?;:’])', r'\1', text) # remove any whitespace that precedes punctuation or a closing quote
children.append({
# Get parent of the paragraph: Use the id of the parent div as the eId
'eId': paragraph.find_parent('div').get('id'),
Expand All @@ -218,8 +218,11 @@ def get_articles(self):
cols = row.find_all('td')
if len(cols) == 2:
number = cols[0].get_text(strip=True)
number = number.strip('()') # Remove parentheses
number = int(number)
text = ' '.join(cols[1].get_text(separator = ' ', strip=True).split())

text = re.sub(r'\s+([.,!?;:’])', r'\1', text) # remove any whitespace that precedes punctuation or a closing quote

children.append({
'eId': number,
'text': text
Expand All @@ -229,9 +232,11 @@ def get_articles(self):
paragraphs = article.find_all('div', id=lambda x: x and '.' in x)
for paragraph in paragraphs:
if not paragraph.get('class'):
text = ' '.join(paragraph.get_text(separator = ' ', strip=True).split())
text = re.sub(r'\s+([.,!?;:’])', r'\1', text) # remove any whitespace that precedes punctuation or a closing quote
children.append({
'eId': paragraph.get('id'),
'text': ' '.join(paragraph.get_text(separator = ' ', strip=True).split())
'text': text
})

# Store the article with its eId and subdivisions
Expand Down
23 changes: 19 additions & 4 deletions tulit/parsers/xml/formex.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@ def get_citations(self):
- 'text': Citation text
"""
def extract_eId(citation, index):
return index
return f'cit_{index + 1}'


return super().get_citations(
citations_xpath='.//GR.VISA',
Expand All @@ -95,7 +96,10 @@ def extract_intro(recitals_section):
self.recitals_intro = intro_text

def extract_eId(recital):
return recital.findtext('.//NO.P')
eId = recital.findtext('.//NO.P')
# Remove () and return eId in the format rct_{number}
eId = eId.strip('()') # Remove parentheses
return f'rct_{eId}'

return super().get_recitals(
recitals_xpath='.//GR.CONSID',
Expand Down Expand Up @@ -135,7 +139,7 @@ def get_chapters(self) -> None:
- 'chapter_heading': Chapter heading text
"""
def extract_eId(chapter, index):
return index
return f'cpt_{index+1}'

def get_headings(chapter):
if len(chapter.findall('.//HT')) > 0:
Expand Down Expand Up @@ -179,6 +183,9 @@ def get_articles(self):
self.articles = []
if self.body is not None:
for article in self.body.findall('.//ARTICLE'):
article_eId = article.get("IDENTIFIER")
article_eId = article_eId.lstrip('0')
article_eId = f'art_{article_eId}'
children = []

# Extract text and metadata from all relevant elements within the article
Expand All @@ -196,7 +203,7 @@ def get_articles(self):
self._extract_elements(alinea, '.', children)

self.articles.append({
"eId": article.get("IDENTIFIER"),
"eId": article_eId,
"num": article.findtext('.//TI.ART'),
"heading": article.findtext('.//STI.ART'),
"children": children
Expand Down Expand Up @@ -226,10 +233,18 @@ def _extract_elements(self, parent, xpath, children, start_index=0):
"""
elements = parent.findall(xpath)
for index, element in enumerate(elements, start=start_index):
for sub_element in element.iter():
if sub_element.tag == 'QUOT.START':
sub_element.text = "‘"
elif sub_element.tag == 'QUOT.END':
sub_element.text = "’"

text = "".join(element.itertext()).strip()
text = re.sub(r'^\(\d+\)', '', text).strip()
text = text.replace('\n', '').replace('\t', '').replace('\r', '') # remove newline, tab, and carriage-return characters
text = text.replace('\u00A0', ' ') # replace non-breaking spaces with regular spaces
text = re.sub(' +', ' ', text) # replace multiple spaces with a single space
text = re.sub(r'\s+([.,!?;:’])', r'\1', text) # remove any whitespace that precedes punctuation or a closing quote

child = {
"eId": element.get("IDENTIFIER") or element.get("ID") or element.get("NO.P") or index,
Expand Down

0 comments on commit a53f5ea

Please sign in to comment.