Commit
Improved stability and reliability of the package.
AlessioNar committed Dec 14, 2024
1 parent 2fa8014 commit 73d7df6
Showing 14 changed files with 340 additions and 7,245 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -1,2 +1,3 @@
**/__pycache__/*
-tests/logs
+tests/logs
+tests/data
1,782 changes: 0 additions & 1,782 deletions poetry.lock

This file was deleted.

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -4,8 +4,8 @@ version = "0.0.1"
description = "ULIT - a Universal Legal Informatics Toolkit, is set of legal informatics utilities collected in a Python package that focuses on the retrieval of legal data and metadata from official sources in the EU, and their transformation in pythonic data structures"

[tool.poetry]
name = "op_cellar"
version = "0.0.3"
name = "ulit"
version = "0.0.1"
description = "ULIT - a Universal Legal Informatics Toolkit, is set of legal informatics utilities collected in a Python package that focuses on the retrieval of legal data and metadata from official sources in the EU, and their transformation in pythonic data structures"
authors = ["AlessioNar <[email protected]>"]
license = "EUPL 1.2"
1,927 changes: 0 additions & 1,927 deletions tests/data/formex/L_202400903EN.000101.fmx.xml

This file was deleted.

3,448 changes: 0 additions & 3,448 deletions tests/data/html/c008bcb6-e7ec-11ee-9ea8-01aa75ed71a1.0006.03/DOC_1.xhtml

This file was deleted.

File renamed without changes.
17 changes: 17 additions & 0 deletions tests/metadata/queries/html_query.rq
@@ -0,0 +1,17 @@
PREFIX cdm: <http://publications.europa.eu/ontology/cdm#>
PREFIX purl: <http://purl.org/dc/elements/1.1/>

SELECT DISTINCT ?cellarURIs, ?manif, ?format, ?expr
WHERE {
?work owl:sameAs <http://publications.europa.eu/resource/celex/{CELEX}> .
?expr cdm:expression_belongs_to_work ?work ;
cdm:expression_uses_language ?lang .
?lang purl:identifier ?langCode .
?manif cdm:manifestation_manifests_expression ?expr;
cdm:manifestation_type ?format.
?cellarURIs cdm:item_belongs_to_manifestation ?manif.

FILTER(str(?format)="xhtml" && str(?langCode)="ENG")
}
ORDER BY ?cellarURIs
LIMIT 10
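
Two details are worth noting before reusing this query: it uses the owl: prefix without declaring it, and its SELECT list is comma-separated. Both are tolerated by the Virtuoso instance behind the Cellar endpoint, but a standards-strict SPARQL engine would need an explicit PREFIX owl: <http://www.w3.org/2002/07/owl#> declaration and whitespace-separated variables. For orientation, below is a minimal, illustrative sketch of the round trip that a template like this goes through: the {CELEX} placeholder is substituted and the query is posted to the public Cellar endpoint. The helper and endpoint usage are assumptions for illustration, not the package's actual send_sparql_query implementation.

    import requests

    # Public Cellar SPARQL endpoint (assumed; served by Virtuoso)
    CELLAR_SPARQL = "http://publications.europa.eu/webapi/rdf/sparql"

    def run_query_template(path: str, celex: str) -> dict:
        # Read the .rq template and fill in the CELEX placeholder
        with open(path, encoding="utf-8") as f:
            query = f.read().replace("{CELEX}", celex)
        # Ask for SPARQL JSON results ('format' is a Virtuoso convenience parameter)
        response = requests.get(
            CELLAR_SPARQL,
            params={"query": query, "format": "application/sparql-results+json"},
            timeout=60,
        )
        response.raise_for_status()
        return response.json()

    # e.g. run_query_template("tests/metadata/queries/html_query.rq", "32024R0903")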
93 changes: 93 additions & 0 deletions tests/metadata/query_results/query_results.json
@@ -0,0 +1,93 @@
{
"head": {
"link": [],
"vars": [
"cellarURIs",
"manif",
"format",
"expr"
]
},
"results": {
"distinct": false,
"ordered": true,
"bindings": [
{
"cellarURIs": {
"type": "uri",
"value": "http://publications.europa.eu/resource/cellar/e115172d-3ab3-4b14-b0a4-dfdcc9871793.0006.04/DOC_1"
},
"manif": {
"type": "uri",
"value": "http://publications.europa.eu/resource/cellar/e115172d-3ab3-4b14-b0a4-dfdcc9871793.0006.04"
},
"format": {
"type": "typed-literal",
"datatype": "http://www.w3.org/2001/XMLSchema#string",
"value": "fmx4"
},
"expr": {
"type": "uri",
"value": "http://publications.europa.eu/resource/cellar/e115172d-3ab3-4b14-b0a4-dfdcc9871793.0006"
}
},
{
"cellarURIs": {
"type": "uri",
"value": "http://publications.europa.eu/resource/cellar/e115172d-3ab3-4b14-b0a4-dfdcc9871793.0006.04/DOC_2"
},
"manif": {
"type": "uri",
"value": "http://publications.europa.eu/resource/cellar/e115172d-3ab3-4b14-b0a4-dfdcc9871793.0006.04"
},
"format": {
"type": "typed-literal",
"datatype": "http://www.w3.org/2001/XMLSchema#string",
"value": "fmx4"
},
"expr": {
"type": "uri",
"value": "http://publications.europa.eu/resource/cellar/e115172d-3ab3-4b14-b0a4-dfdcc9871793.0006"
}
},
{
"cellarURIs": {
"type": "uri",
"value": "http://publications.europa.eu/resource/cellar/e115172d-3ab3-4b14-b0a4-dfdcc9871793.0006.04/DOC_3"
},
"manif": {
"type": "uri",
"value": "http://publications.europa.eu/resource/cellar/e115172d-3ab3-4b14-b0a4-dfdcc9871793.0006.04"
},
"format": {
"type": "typed-literal",
"datatype": "http://www.w3.org/2001/XMLSchema#string",
"value": "fmx4"
},
"expr": {
"type": "uri",
"value": "http://publications.europa.eu/resource/cellar/e115172d-3ab3-4b14-b0a4-dfdcc9871793.0006"
}
},
{
"cellarURIs": {
"type": "uri",
"value": "http://publications.europa.eu/resource/cellar/e115172d-3ab3-4b14-b0a4-dfdcc9871793.0006.04/DOC_4"
},
"manif": {
"type": "uri",
"value": "http://publications.europa.eu/resource/cellar/e115172d-3ab3-4b14-b0a4-dfdcc9871793.0006.04"
},
"format": {
"type": "typed-literal",
"datatype": "http://www.w3.org/2001/XMLSchema#string",
"value": "fmx4"
},
"expr": {
"type": "uri",
"value": "http://publications.europa.eu/resource/cellar/e115172d-3ab3-4b14-b0a4-dfdcc9871793.0006"
}
}
]
}
}
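
Each binding above describes one manifestation item of the same expression (four Formex documents, DOC_1 through DOC_4). Pulling the downloadable URIs back out of such a file takes only a few lines; the helper below is a simplified, illustrative stand-in for what get_cellar_ids_from_json_results in ulit/download.py works from, not the package's own code.

    import json

    def list_cellar_uris(results_path: str) -> list:
        # Collect the item URIs from a SPARQL JSON results file
        with open(results_path, encoding="utf-8") as f:
            data = json.load(f)
        return [b["cellarURIs"]["value"] for b in data["results"]["bindings"]]

    # list_cellar_uris("tests/metadata/query_results/query_results.json")
    # -> four URIs ending in .../DOC_1 through .../DOC_4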
File renamed without changes.
11 changes: 5 additions & 6 deletions tests/test_sparql.py
@@ -4,7 +4,7 @@
from ulit.sparql import send_sparql_query, get_results_table
import os

-DATA_DIR = os.path.join(os.path.dirname(__file__), ".")
+DATA_DIR = os.path.join(os.path.dirname(__file__), "./metadata/queries")

class TestSendSparqlQuery(unittest.TestCase):
    def setUp(self):
@@ -13,14 +13,13 @@ def setUp(self):
    def test_send_sparql_query(self):

        self.maxDiff = None  # Allow the full diff to be displayed
-        sparql_file_path = os.path.join(DATA_DIR, "query.rq")
+        sparql_file_path = os.path.join(DATA_DIR, "formex_query.rq")
        celex = "32024R0903"
        # Act
        # Send query
        response = send_sparql_query(sparql_query_filepath=sparql_file_path, celex=celex)
-        print(response)
+        #print(response)
        expected_results = json.loads('''{"head": {"link": [], "vars": ["cellarURIs", "manif", "format", "expr"]}, "results": {"distinct": false, "ordered": true, "bindings": [{"cellarURIs": {"type": "uri", "value": "http://publications.europa.eu/resource/cellar/c008bcb6-e7ec-11ee-9ea8-01aa75ed71a1.0006.02/DOC_1"}, "manif": {"type": "uri", "value": "http://publications.europa.eu/resource/cellar/c008bcb6-e7ec-11ee-9ea8-01aa75ed71a1.0006.02"}, "format": {"type": "typed-literal", "datatype": "http://www.w3.org/2001/XMLSchema#string", "value": "fmx4"}, "expr": {"type": "uri", "value": "http://publications.europa.eu/resource/cellar/c008bcb6-e7ec-11ee-9ea8-01aa75ed71a1.0006"}}]}}''')

-        print(expected_results)
+        #print(expected_results)
        self.assertEqual(response, expected_results)

if __name__ == "__main__":
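
Assuming the package is importable from the repository root, the updated test runs with the standard library runner; the exact invocation depends on whether tests/ is a package:

    python -m unittest tests.test_sparql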
127 changes: 86 additions & 41 deletions ulit/download.py
@@ -34,8 +34,7 @@ def download_documents(results, download_dir, log_dir, format=None):

    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
-
-    document_paths = process_range(ids=cellar_ids, folder_path=os.path.join(download_dir), log_dir=log_dir, format=format)
+    document_paths = download_document(ids=cellar_ids, folder_path=os.path.join(download_dir), log_dir=log_dir, format=format)

    return document_paths

@@ -87,7 +86,7 @@ def get_cellar_ids_from_json_results(cellar_results, format):
    return cellar_uris

# Function to process a list of ids to download the corresponding zip files
-def process_range(ids: list, folder_path: str, log_dir: str, format: str):
+def download_document(ids: list, folder_path: str, log_dir: str, format: str):
"""
Process a list of ids to download the corresponding zip files.
Expand Down Expand Up @@ -121,48 +120,94 @@ def process_range(ids: list, folder_path: str, log_dir: str, format: str):
>>> process_range(ids, folder_path)
"""
    try:
-        zip_files = []
-        single_files = []
-        other_downloads = []
        file_paths = []

        for id in ids:

-            response = fetch_content(id.strip())
-            if response is None:
-                continue
-
-            if 'Content-Type' in response.headers:
-                if 'zip' in response.headers['Content-Type']:
-                    file_path = os.path.join(folder_path, id)
-                    zip_files.append(id)
-                    extract_zip(response, file_path)
-                else:
-                    file_path = os.path.join(folder_path, id + '.' + format)
-                    single_files.append(id)
-                    out_file = os.path.join(file_path)
-                    os.makedirs(os.path.dirname(out_file), exist_ok=True)
-                    with open(out_file, 'w+', encoding="utf-8") as f:
-                        f.write(response.text)
-
-                file_paths.append(file_path)
-            else:
-                other_downloads.append(id)
-
-        if len(other_downloads) != 0:
-            # Log results
-            id_logs_path = os.path.join(log_dir, 'failed_' + get_current_timestamp() + '.txt')
-            os.makedirs(os.path.dirname(id_logs_path), exist_ok=True)
-            with open(id_logs_path, 'w+') as f:
-                f.write('Failed downloads ' + get_current_timestamp() + '\n' + str(other_downloads))
-
-        with open(os.path.join(log_dir, get_current_timestamp() + '.txt'), 'w+') as f:
-            f.write(f"Zip files: {len(zip_files)}, Single files: {len(single_files)}, Failed downloads: {len(other_downloads)}")
+            print(id)
+            response = fetch_content(id)
+            file_path = handle_response(response=response, folder_path=folder_path, cellar_id=id)
+            file_paths.append(file_path)
        return file_paths

    except Exception as e:
        logging.error(f"Error processing range: {e}")


def handle_response(response, folder_path, cellar_id):
    """
    Handle a server response by saving or extracting its content.

    Parameters
    ----------
    response : requests.Response
        The HTTP response object.
    folder_path : str
        Directory where the file will be saved.
    cellar_id : str
        CELLAR ID of the document.

    Returns
    -------
    str or None
        Path to the saved file, or None if the response couldn't be processed.
    """
    content_type = response.headers.get('Content-Type', '')

    # The returned file is usually either a zip archive, or a file named DOC_* inside a folder named after the cellar_id
    target_path = os.path.join(folder_path, cellar_id)
    os.makedirs(os.path.dirname(target_path), exist_ok=True)

    if 'zip' in content_type:
        extract_zip(response, target_path)
        return target_path
    else:
        extension = get_extension_from_content_type(content_type)
        if not extension:
            logging.warning(f"Unknown content type for ID {cellar_id}: {content_type}")
            return None

        file_path = f"{target_path}.{extension}"
        file_path = os.path.normpath(file_path)
        is_binary = 'text' not in content_type  # Assume binary if not explicitly text (currently unused; content is always written as bytes)

        with open(file_path, mode='wb+') as f:
            f.write(response.content)
        #print(response.content)
        #print(file_path)
        return file_path

def get_extension_from_content_type(content_type):
    """Map Content-Type to a file extension."""
    content_type_mapping = [
        'html',
        'json',
        'xml',
        'txt',
        'zip'
    ]
    for ext in content_type_mapping:
        if ext in content_type:
            return ext
    return None
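# e.g. 'application/xhtml+xml' -> 'html' (first substring hit in list order);
# 'application/zip;mtype=fmx4' -> 'zip'; anything unmatched returns None.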

def save_file(content, file_path, binary=False):
    """
    Save content to a file.

    Parameters
    ----------
    content : bytes or str
        The content to save.
    file_path : str
        Path to the file.
    binary : bool
        Whether the content is binary or text.
    """
    mode = 'wb' if binary else 'w'
    with open(file_path, mode) as f:
        f.write(content)


# Function to send a GET request to download a zip file for the given id under the CELLAR URI
def fetch_content(id: str) -> requests.Response:
"""
Expand Down Expand Up @@ -205,7 +250,7 @@ def fetch_content(id: str) -> requests.Response:
    try:
        url = BASE_URL + id
        headers = {
-            'Accept': "application/zip, application/zip;mtype=fmx4, application/xml;mtype=fmx4, application/xhtml+xml, text/html, text/html;type=simplified, application/msword, text/plain, application/xml;notice=object",
+            'Accept': "*, application/zip, application/zip;mtype=fmx4, application/xml;mtype=fmx4, application/xhtml+xml, text/html, text/html;type=simplified, application/msword, text/plain, application/xml, application/xml;notice=object",
            'Accept-Language': "eng",
            'Content-Type': "application/x-www-form-urlencoded",
            'Host': "publications.europa.eu"
@@ -238,9 +283,9 @@ def extract_zip(response: requests.Response, folder_path: str):
    logging.basicConfig(level=logging.INFO)
    # Simulate getting results from somewhere

-    with open('./tests/results_html.json', 'r') as f:
+    with open('./tests/metadata/query_results/query_results.json', 'r') as f:
        results = json.loads(f.read())  # Load the JSON data
-    document_paths = download_documents(results, './tests/data/html', log_dir='./tests/logs', format='xhtml')
-    #document_paths = download_documents(results, './tests/data/formex', log_dir='./tests/logs', format='fmx4')
+    #document_paths = download_documents(results, './tests/data/html', log_dir='./tests/logs', format='xhtml')
+    document_paths = download_documents(results, './tests/data/formex', log_dir='./tests/logs', format='fmx4')

    print(document_paths)
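
This demo code at the bottom of the module lets the refactored flow be smoke-tested on its own once the query-results file above exists; running it from the repository root keeps the relative ./tests/... paths valid (invocation assumed, not shown in the commit):

    python ulit/download.py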
64 changes: 64 additions & 0 deletions ulit/main.py
@@ -0,0 +1,64 @@
import json
import logging
import os
from download import download_documents
from sparql import send_sparql_query
from parsers.html import HTMLParser
from parsers.formex import Formex4Parser

def main():
    """
    Main function to execute SPARQL query and download documents
    """
    # Configure logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    try:
        # Send SPARQL query
        logger.info("Executing SPARQL query")
        results = send_sparql_query('./tests/metadata/queries/formex_query.rq', celex='32008R1137')

        # Save query results to JSON
        results_file = './tests/metadata/query_results/query_results.json'
        with open(results_file, "w") as f:
            json.dump(results, f, indent=4)
        logger.info(f"Results dumped in {results_file}")

        # Load query results
        with open('./tests/metadata/query_results/query_results.json', 'r') as f:
            results = json.loads(f.read())

        # Download documents
        logger.info("Downloading documents")
        downloaded_document_paths = download_documents(
            results,
            './tests/data/formex',
            log_dir='./tests/logs',
            format='fmx4'
        )
        logger.info(f'{len(downloaded_document_paths)} documents downloaded in {downloaded_document_paths}')

        # List the contents of the first downloaded directory and take its first file
        first_path = downloaded_document_paths[0]
        first_item = os.listdir(first_path)[0]
        file_path = os.path.join(*first_path.split('/'), first_item)

        print(f'Parsing {file_path}')

        parser = Formex4Parser()
        parser.parse(file_path)
        print(parser.articles)
        #print(document_tree)

    except Exception as e:
        logger.error(f"An error occurred: {e}")
        raise

if __name__ == "__main__":
    main()
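
Note that the imports here are top-level (from download import ..., not from ulit.download import ...), so the script is evidently meant to be launched directly rather than imported as part of the package; run from the repository root, the relative ./tests/... paths then resolve as written (invocation assumed):

    python ulit/main.py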