Commit
Improved stability and reliability of the package.
AlessioNar committed Dec 14, 2024
1 parent 2fa8014 commit 73d7df6
Showing 14 changed files with 340 additions and 7,245 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -1,2 +1,3 @@
**/__pycache__/*
-tests/logs
+tests/logs
+tests/data
1,782 changes: 0 additions & 1,782 deletions poetry.lock

This file was deleted.

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -4,8 +4,8 @@ version = "0.0.1"
description = "ULIT - a Universal Legal Informatics Toolkit, is set of legal informatics utilities collected in a Python package that focuses on the retrieval of legal data and metadata from official sources in the EU, and their transformation in pythonic data structures"

[tool.poetry]
name = "op_cellar"
version = "0.0.3"
name = "ulit"
version = "0.0.1"
description = "ULIT - a Universal Legal Informatics Toolkit, is set of legal informatics utilities collected in a Python package that focuses on the retrieval of legal data and metadata from official sources in the EU, and their transformation in pythonic data structures"
authors = ["AlessioNar <[email protected]>"]
license = "EUPL 1.2"
1,927 changes: 0 additions & 1,927 deletions tests/data/formex/L_202400903EN.000101.fmx.xml

This file was deleted.

3,448 changes: 0 additions & 3,448 deletions tests/data/html/c008bcb6-e7ec-11ee-9ea8-01aa75ed71a1.0006.03/DOC_1.xhtml

This file was deleted.

File renamed without changes.
17 changes: 17 additions & 0 deletions tests/metadata/queries/html_query.rq
@@ -0,0 +1,17 @@
PREFIX cdm: <http://publications.europa.eu/ontology/cdm#>
PREFIX purl: <http://purl.org/dc/elements/1.1/>

SELECT DISTINCT ?cellarURIs, ?manif, ?format, ?expr
WHERE {
?work owl:sameAs <http://publications.europa.eu/resource/celex/{CELEX}> .
?expr cdm:expression_belongs_to_work ?work ;
cdm:expression_uses_language ?lang .
?lang purl:identifier ?langCode .
?manif cdm:manifestation_manifests_expression ?expr;
cdm:manifestation_type ?format.
?cellarURIs cdm:item_belongs_to_manifestation ?manif.

FILTER(str(?format)="xhtml" && str(?langCode)="ENG")
}
ORDER BY ?cellarURIs
LIMIT 10
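
Two details are worth noting before reusing this query: it uses the owl: prefix without declaring it, and its SELECT list is comma-separated. Both are tolerated by the Virtuoso instance behind the Cellar endpoint, but a standards-strict SPARQL engine would need an explicit PREFIX owl: <http://www.w3.org/2002/07/owl#> declaration and whitespace-separated variables. For orientation, below is a minimal, illustrative sketch of the round trip that a template like this goes through: the {CELEX} placeholder is substituted and the query is posted to the public Cellar endpoint. The helper and endpoint usage are assumptions for illustration, not the package's actual send_sparql_query implementation.

    import requests

    # Public Cellar SPARQL endpoint (assumed; served by Virtuoso)
    CELLAR_SPARQL = "http://publications.europa.eu/webapi/rdf/sparql"

    def run_query_template(path: str, celex: str) -> dict:
        # Read the .rq template and fill in the CELEX placeholder
        with open(path, encoding="utf-8") as f:
            query = f.read().replace("{CELEX}", celex)
        # Ask for SPARQL JSON results ('format' is a Virtuoso convenience parameter)
        response = requests.get(
            CELLAR_SPARQL,
            params={"query": query, "format": "application/sparql-results+json"},
            timeout=60,
        )
        response.raise_for_status()
        return response.json()

    # e.g. run_query_template("tests/metadata/queries/html_query.rq", "32024R0903")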
93 changes: 93 additions & 0 deletions tests/metadata/query_results/query_results.json
@@ -0,0 +1,93 @@
{
"head": {
"link": [],
"vars": [
"cellarURIs",
"manif",
"format",
"expr"
]
},
"results": {
"distinct": false,
"ordered": true,
"bindings": [
{
"cellarURIs": {
"type": "uri",
"value": "http://publications.europa.eu/resource/cellar/e115172d-3ab3-4b14-b0a4-dfdcc9871793.0006.04/DOC_1"
},
"manif": {
"type": "uri",
"value": "http://publications.europa.eu/resource/cellar/e115172d-3ab3-4b14-b0a4-dfdcc9871793.0006.04"
},
"format": {
"type": "typed-literal",
"datatype": "http://www.w3.org/2001/XMLSchema#string",
"value": "fmx4"
},
"expr": {
"type": "uri",
"value": "http://publications.europa.eu/resource/cellar/e115172d-3ab3-4b14-b0a4-dfdcc9871793.0006"
}
},
{
"cellarURIs": {
"type": "uri",
"value": "http://publications.europa.eu/resource/cellar/e115172d-3ab3-4b14-b0a4-dfdcc9871793.0006.04/DOC_2"
},
"manif": {
"type": "uri",
"value": "http://publications.europa.eu/resource/cellar/e115172d-3ab3-4b14-b0a4-dfdcc9871793.0006.04"
},
"format": {
"type": "typed-literal",
"datatype": "http://www.w3.org/2001/XMLSchema#string",
"value": "fmx4"
},
"expr": {
"type": "uri",
"value": "http://publications.europa.eu/resource/cellar/e115172d-3ab3-4b14-b0a4-dfdcc9871793.0006"
}
},
{
"cellarURIs": {
"type": "uri",
"value": "http://publications.europa.eu/resource/cellar/e115172d-3ab3-4b14-b0a4-dfdcc9871793.0006.04/DOC_3"
},
"manif": {
"type": "uri",
"value": "http://publications.europa.eu/resource/cellar/e115172d-3ab3-4b14-b0a4-dfdcc9871793.0006.04"
},
"format": {
"type": "typed-literal",
"datatype": "http://www.w3.org/2001/XMLSchema#string",
"value": "fmx4"
},
"expr": {
"type": "uri",
"value": "http://publications.europa.eu/resource/cellar/e115172d-3ab3-4b14-b0a4-dfdcc9871793.0006"
}
},
{
"cellarURIs": {
"type": "uri",
"value": "http://publications.europa.eu/resource/cellar/e115172d-3ab3-4b14-b0a4-dfdcc9871793.0006.04/DOC_4"
},
"manif": {
"type": "uri",
"value": "http://publications.europa.eu/resource/cellar/e115172d-3ab3-4b14-b0a4-dfdcc9871793.0006.04"
},
"format": {
"type": "typed-literal",
"datatype": "http://www.w3.org/2001/XMLSchema#string",
"value": "fmx4"
},
"expr": {
"type": "uri",
"value": "http://publications.europa.eu/resource/cellar/e115172d-3ab3-4b14-b0a4-dfdcc9871793.0006"
}
}
]
}
}
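
Each binding above describes one manifestation item of the same expression (four Formex documents, DOC_1 through DOC_4). Pulling the downloadable URIs back out of such a file takes only a few lines; the helper below is a simplified, illustrative stand-in for what get_cellar_ids_from_json_results in ulit/download.py works from, not the package's own code.

    import json

    def list_cellar_uris(results_path: str) -> list:
        # Collect the item URIs from a SPARQL JSON results file
        with open(results_path, encoding="utf-8") as f:
            data = json.load(f)
        return [b["cellarURIs"]["value"] for b in data["results"]["bindings"]]

    # list_cellar_uris("tests/metadata/query_results/query_results.json")
    # -> four URIs ending in .../DOC_1 through .../DOC_4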
File renamed without changes.
11 changes: 5 additions & 6 deletions tests/test_sparql.py
@@ -4,7 +4,7 @@
from ulit.sparql import send_sparql_query, get_results_table
import os

-DATA_DIR = os.path.join(os.path.dirname(__file__), ".")
+DATA_DIR = os.path.join(os.path.dirname(__file__), "./metadata/queries")

class TestSendSparqlQuery(unittest.TestCase):
    def setUp(self):
@@ -13,14 +13,13 @@ def setUp(self):
    def test_send_sparql_query(self):

        self.maxDiff = None  # Allow the full diff to be displayed
-        sparql_file_path = os.path.join(DATA_DIR, "query.rq")
+        sparql_file_path = os.path.join(DATA_DIR, "formex_query.rq")
        celex = "32024R0903"
        # Act
        # Send query
        response = send_sparql_query(sparql_query_filepath=sparql_file_path, celex=celex)
-        print(response)
+        #print(response)
        expected_results = json.loads('''{"head": {"link": [], "vars": ["cellarURIs", "manif", "format", "expr"]}, "results": {"distinct": false, "ordered": true, "bindings": [{"cellarURIs": {"type": "uri", "value": "http://publications.europa.eu/resource/cellar/c008bcb6-e7ec-11ee-9ea8-01aa75ed71a1.0006.02/DOC_1"}, "manif": {"type": "uri", "value": "http://publications.europa.eu/resource/cellar/c008bcb6-e7ec-11ee-9ea8-01aa75ed71a1.0006.02"}, "format": {"type": "typed-literal", "datatype": "http://www.w3.org/2001/XMLSchema#string", "value": "fmx4"}, "expr": {"type": "uri", "value": "http://publications.europa.eu/resource/cellar/c008bcb6-e7ec-11ee-9ea8-01aa75ed71a1.0006"}}]}}''')

-        print(expected_results)
+        #print(expected_results)
        self.assertEqual(response, expected_results)

if __name__ == "__main__":
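
Assuming the package is importable from the repository root, the updated test runs with the standard library runner; the exact invocation depends on whether tests/ is a package:

    python -m unittest tests.test_sparql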
127 changes: 86 additions & 41 deletions ulit/download.py
@@ -34,8 +34,7 @@ def download_documents(results, download_dir, log_dir, format=None):

    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
-
-    document_paths = process_range(ids=cellar_ids, folder_path=os.path.join(download_dir), log_dir=log_dir, format=format)
+    document_paths = download_document(ids=cellar_ids, folder_path=os.path.join(download_dir), log_dir=log_dir, format=format)

    return document_paths

@@ -87,7 +86,7 @@ def get_cellar_ids_from_json_results(cellar_results, format):
    return cellar_uris

# Function to process a list of ids to download the corresponding zip files
-def process_range(ids: list, folder_path: str, log_dir: str, format: str):
+def download_document(ids: list, folder_path: str, log_dir: str, format: str):
"""
Process a list of ids to download the corresponding zip files.
Expand Down Expand Up @@ -121,48 +120,94 @@ def process_range(ids: list, folder_path: str, log_dir: str, format: str):
>>> process_range(ids, folder_path)
"""
    try:
-        zip_files = []
-        single_files = []
-        other_downloads = []
        file_paths = []

        for id in ids:

-            response = fetch_content(id.strip())
-            if response is None:
-                continue
-
-            if 'Content-Type' in response.headers:
-                if 'zip' in response.headers['Content-Type']:
-                    file_path = os.path.join(folder_path, id)
-                    zip_files.append(id)
-                    extract_zip(response, file_path)
-                else:
-                    file_path = os.path.join(folder_path, id + '.' + format)
-                    single_files.append(id)
-                    out_file = os.path.join(file_path)
-                    os.makedirs(os.path.dirname(out_file), exist_ok=True)
-                    with open(out_file, 'w+', encoding="utf-8") as f:
-                        f.write(response.text)
-
-                file_paths.append(file_path)
-            else:
-                other_downloads.append(id)
-
-        if len(other_downloads) != 0:
-            # Log results
-            id_logs_path = os.path.join(log_dir, 'failed_' + get_current_timestamp() + '.txt')
-            os.makedirs(os.path.dirname(id_logs_path), exist_ok=True)
-            with open(id_logs_path, 'w+') as f:
-                f.write('Failed downloads ' + get_current_timestamp() + '\n' + str(other_downloads))
-
-        with open(os.path.join(log_dir, get_current_timestamp() + '.txt'), 'w+') as f:
-            f.write(f"Zip files: {len(zip_files)}, Single files: {len(single_files)}, Failed downloads: {len(other_downloads)}")
+            print(id)
+            response = fetch_content(id)
+            file_path = handle_response(response=response, folder_path=folder_path, cellar_id=id)
+            file_paths.append(file_path)
        return file_paths

    except Exception as e:
        logging.error(f"Error processing range: {e}")


def handle_response(response, folder_path, cellar_id):
    """
    Handle a server response by saving or extracting its content.

    Parameters
    ----------
    response : requests.Response
        The HTTP response object.
    folder_path : str
        Directory where the file will be saved.
    cellar_id : str
        CELLAR ID of the document.

    Returns
    -------
    str or None
        Path to the saved file, or None if the response couldn't be processed.
    """
    content_type = response.headers.get('Content-Type', '')

    # The returned file is usually either a zip archive, or a file named DOC_* inside a folder named after the cellar_id
    target_path = os.path.join(folder_path, cellar_id)
    os.makedirs(os.path.dirname(target_path), exist_ok=True)

    if 'zip' in content_type:
        extract_zip(response, target_path)
        return target_path
    else:
        extension = get_extension_from_content_type(content_type)
        if not extension:
            logging.warning(f"Unknown content type for ID {cellar_id}: {content_type}")
            return None

        file_path = f"{target_path}.{extension}"
        file_path = os.path.normpath(file_path)
        is_binary = 'text' not in content_type  # Assume binary if not explicitly text (currently unused; content is always written as bytes)

        with open(file_path, mode='wb+') as f:
            f.write(response.content)
        #print(response.content)
        #print(file_path)
        return file_path

def get_extension_from_content_type(content_type):
    """Map Content-Type to a file extension."""
    content_type_mapping = [
        'html',
        'json',
        'xml',
        'txt',
        'zip'
    ]
    for ext in content_type_mapping:
        if ext in content_type:
            return ext
    return None
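# e.g. 'application/xhtml+xml' -> 'html' (first substring hit in list order);
# 'application/zip;mtype=fmx4' -> 'zip'; anything unmatched returns None.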

def save_file(content, file_path, binary=False):
    """
    Save content to a file.

    Parameters
    ----------
    content : bytes or str
        The content to save.
    file_path : str
        Path to the file.
    binary : bool
        Whether the content is binary or text.
    """
    mode = 'wb' if binary else 'w'
    with open(file_path, mode) as f:
        f.write(content)


# Function to send a GET request to download a zip file for the given id under the CELLAR URI
def fetch_content(id: str) -> requests.Response:
"""
Expand Down Expand Up @@ -205,7 +250,7 @@ def fetch_content(id: str) -> requests.Response:
    try:
        url = BASE_URL + id
        headers = {
-            'Accept': "application/zip, application/zip;mtype=fmx4, application/xml;mtype=fmx4, application/xhtml+xml, text/html, text/html;type=simplified, application/msword, text/plain, application/xml;notice=object",
+            'Accept': "*, application/zip, application/zip;mtype=fmx4, application/xml;mtype=fmx4, application/xhtml+xml, text/html, text/html;type=simplified, application/msword, text/plain, application/xml, application/xml;notice=object",
            'Accept-Language': "eng",
            'Content-Type': "application/x-www-form-urlencoded",
            'Host': "publications.europa.eu"
@@ -238,9 +283,9 @@ def extract_zip(response: requests.Response, folder_path: str):
    logging.basicConfig(level=logging.INFO)
    # Simulate getting results from somewhere

-    with open('./tests/results_html.json', 'r') as f:
+    with open('./tests/metadata/query_results/query_results.json', 'r') as f:
        results = json.loads(f.read())  # Load the JSON data
-    document_paths = download_documents(results, './tests/data/html', log_dir='./tests/logs', format='xhtml')
-    #document_paths = download_documents(results, './tests/data/formex', log_dir='./tests/logs', format='fmx4')
+    #document_paths = download_documents(results, './tests/data/html', log_dir='./tests/logs', format='xhtml')
+    document_paths = download_documents(results, './tests/data/formex', log_dir='./tests/logs', format='fmx4')

    print(document_paths)
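
This demo code at the bottom of the module lets the refactored flow be smoke-tested on its own once the query-results file above exists; running it from the repository root keeps the relative ./tests/... paths valid (invocation assumed, not shown in the commit):

    python ulit/download.py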
64 changes: 64 additions & 0 deletions ulit/main.py
@@ -0,0 +1,64 @@
import json
import logging
import os
from download import download_documents
from sparql import send_sparql_query
from parsers.html import HTMLParser
from parsers.formex import Formex4Parser

def main():
    """
    Main function to execute SPARQL query and download documents
    """
    # Configure logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    try:
        # Send SPARQL query
        logger.info("Executing SPARQL query")
        results = send_sparql_query('./tests/metadata/queries/formex_query.rq', celex='32008R1137')

        # Save query results to JSON
        results_file = './tests/metadata/query_results/query_results.json'
        with open(results_file, "w") as f:
            json.dump(results, f, indent=4)
        logger.info(f"Results dumped in {results_file}")

        # Load query results
        with open('./tests/metadata/query_results/query_results.json', 'r') as f:
            results = json.loads(f.read())

        # Download documents
        logger.info("Downloading documents")
        downloaded_document_paths = download_documents(
            results,
            './tests/data/formex',
            log_dir='./tests/logs',
            format='fmx4'
        )
        logger.info(f'{len(downloaded_document_paths)} documents downloaded in {downloaded_document_paths}')

        # List the contents of the first downloaded directory and take its first file
        first_path = downloaded_document_paths[0]
        first_item = os.listdir(first_path)[0]
        file_path = os.path.join(*first_path.split('/'), first_item)

        print(f'Parsing {file_path}')

        parser = Formex4Parser()
        parser.parse(file_path)
        print(parser.articles)
        #print(document_tree)

    except Exception as e:
        logger.error(f"An error occurred: {e}")
        raise

if __name__ == "__main__":
    main()
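
Note that the imports here are top-level (from download import ..., not from ulit.download import ...), so the script is evidently meant to be launched directly rather than imported as part of the package; run from the repository root, the relative ./tests/... paths then resolve as written (invocation assumed):

    python ulit/main.py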