Skip to content

Commit

Permalink
Finalised generalisation for get_citations() method
Browse files Browse the repository at this point in the history
  • Loading branch information
AlessioNar committed Dec 24, 2024
1 parent 72ca8cf commit b6967e2
Show file tree
Hide file tree
Showing 6 changed files with 2,101 additions and 12 deletions.

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion tests/parsers/test_akomantoso.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,11 +67,12 @@ def test_get_citations(self):
"""Test citation extraction in the preamble section."""
self.parser.get_preamble(preamble_xpath='.//akn:preamble', notes_xpath='.//akn:authorialNote')
self.parser.get_citations()

self.assertIsNotNone(self.parser.citations, "Citations data not found")

first_citation = self.parser.citations[0]
expected_text = "Having regard to the Treaty on the Functioning of the European Union, and in particular Article 114"
self.assertIn(expected_text, first_citation['citation_text'])
self.assertIn(expected_text, first_citation['text'])

def test_get_recitals(self):
"""Test retrieval and content verification of recitals in the preamble."""
Expand Down
6 changes: 3 additions & 3 deletions tests/parsers/test_formex.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,9 +78,9 @@ def test_get_citations(self):


citations = [
{'eId': 0, 'citation_text': "Having regard to the Treaty on the Functioning of the European Union,"},
{"eId": 1, 'citation_text':"Having regard to Council Regulation (EC) No 1234/2007 of 22 October 2007 establishing a common organisation of agricultural markets and on specific provisions for certain agricultural products (Single CMO Regulation) , and in particular Article 143 thereof,"},
{"eId": 2, 'citation_text':"Having regard to Council Regulation (EC) No 614/2009 of 7 July 2009 on the common system of trade for ovalbumin and lactalbumin , and in particular Article 3(4) thereof,"},
{'eId': 0, 'text': "Having regard to the Treaty on the Functioning of the European Union,"},
{"eId": 1, 'text':"Having regard to Council Regulation (EC) No 1234/2007 of 22 October 2007 establishing a common organisation of agricultural markets and on specific provisions for certain agricultural products (Single CMO Regulation) , and in particular Article 143 thereof,"},
{"eId": 2, 'text':"Having regard to Council Regulation (EC) No 614/2009 of 7 July 2009 on the common system of trade for ovalbumin and lactalbumin , and in particular Article 3(4) thereof,"},
]

self.assertEqual(self.parser.citations, citations)
Expand Down
5 changes: 3 additions & 2 deletions tulit/parsers/akomantoso.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,8 +246,9 @@ def get_citations(self) -> list:
Returns
-------
list
List of dictionaries containing citation text without the associated
authorial notes.
List of dictionaries containing citation data with keys:
- 'eId': Citation identifier, which is retrieved from the 'eId' attribute
- 'text': Citation text
"""
def extract_eId(citation, index):
return citation.get('eId')
Expand Down
4 changes: 3 additions & 1 deletion tulit/parsers/formex.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,9 @@ def get_citations(self):
Returns
-------
list
List of dictionaries containing citation text.
List of dictionaries containing citation data with keys:
- 'eId': Citation identifier, which is the index of the citation in the preamble
- 'text': Citation text
"""
def extract_eId(citation, index):
return index
Expand Down
8 changes: 4 additions & 4 deletions tulit/parsers/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,15 +268,15 @@ def get_citations(self, citations_xpath, citation_xpath, extract_eId=None):

citations = []
for index, citation in enumerate(citations_section.findall(citation_xpath, namespaces=self.namespaces)):
citation_text = "".join(citation.itertext()).strip()
citation_text = citation_text.replace('\n', '').replace('\t', '').replace('\r', '') # remove newline and tab characters
citation_text = re.sub(' +', ' ', citation_text) # replace multiple spaces with a single space
text = "".join(citation.itertext()).strip()
text = text.replace('\n', '').replace('\t', '').replace('\r', '') # remove newline and tab characters
text = re.sub(' +', ' ', text) # replace multiple spaces with a single space

eId = extract_eId(citation, index) if extract_eId else index
# Up until here, the code is the same as for Formex
citations.append({
'eId' : eId,
'citation_text': citation_text,
'text': text,
})

self.citations = citations
Expand Down

0 comments on commit b6967e2

Please sign in to comment.