Skip to content

Commit

Permalink
Reorganised example files outside of parsers folder
Browse files Browse the repository at this point in the history
  • Loading branch information
AlessioNar committed Dec 14, 2024
1 parent 208dcfd commit 475cccc
Show file tree
Hide file tree
Showing 10 changed files with 29 additions and 20 deletions.
File renamed without changes.
File renamed without changes.
File renamed without changes.
1 change: 1 addition & 0 deletions tests/logs/2024-12-14_12-00-56.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Zip files: 0, Single files: 1, Failed downloads: 0
1 change: 1 addition & 0 deletions tests/logs/2024-12-14_12-01-34.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Zip files: 0, Single files: 1, Failed downloads: 0
2 changes: 1 addition & 1 deletion tests/parsers/test_akomantoso.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import lxml.etree as etree

# Define constants for file paths and directories
DATA_DIR = os.path.join(os.path.dirname(__file__), "data/akn")
DATA_DIR = os.path.join(os.path.dirname(__file__), "../data/akn")
file_path = os.path.join(DATA_DIR, "32014L0092.akn")

class TestAkomaNtosoParser(unittest.TestCase):
Expand Down
2 changes: 1 addition & 1 deletion tests/parsers/test_formex.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import os

DATA_DIR = os.path.join(os.path.dirname(__file__), "data/formex")
DATA_DIR = os.path.join(os.path.dirname(__file__), "../data/formex")

class TestFormex4Parser(unittest.TestCase):
def setUp(self):
Expand Down
43 changes: 25 additions & 18 deletions ulit/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,12 @@
import zipfile
import requests
from datetime import datetime
import threading
import json

# Constants
BASE_URL = 'http://publications.europa.eu/resource/cellar/'

def download_documents(results, download_dir, log_dir, format=None, nthreads=1):
def download_documents(results, download_dir, log_dir, format=None):
"""
    Download Cellar documents and return the paths of the downloaded files.
Expand All @@ -31,17 +30,15 @@ def download_documents(results, download_dir, log_dir, format=None, nthreads=1):
    Documents are fetched sequentially, one Cellar id at a time; the former
    multi-threaded implementation and its `nthreads` parameter were removed.
"""
cellar_ids = get_cellar_ids_from_json_results(results, format)
cellar_ids = get_cellar_ids_from_json_results(cellar_results=results, format=format)

if not os.path.exists(log_dir):
os.makedirs(log_dir)
threads = []
for i in range(nthreads):
cellar_ids_subset = cellar_ids[i::nthreads]
t = threading.Thread(target=process_range, args=(cellar_ids_subset, os.path.join(download_dir), log_dir))
threads.append(t)
[t.start() for t in threads]
[t.join() for t in threads]

document_paths = process_range(ids=cellar_ids, folder_path=os.path.join(download_dir), log_dir=log_dir)

return document_paths


def get_cellar_ids_from_json_results(cellar_results, format):
"""
Expand Down Expand Up @@ -127,38 +124,47 @@ def process_range(ids: list, folder_path: str, log_dir: str):
zip_files = []
single_files = []
other_downloads = []
file_paths = []

for id in ids:
sub_folder_path = os.path.join(folder_path, id)
file_path = os.path.join(folder_path, id)

response = rest_get_call(id.strip())
response = fetch_content(id.strip())
if response is None:
continue

if 'Content-Type' in response.headers:
if 'zip' in response.headers['Content-Type']:
zip_files.append(id)
extract_zip(response, sub_folder_path)
extract_zip(response, file_path)
file_paths.append(file_path)
else:
single_files.append(id)
process_single_file(response, sub_folder_path)
process_single_file(response, file_path)
file_paths.append(file_path)
else:
other_downloads.append(id)

if len(other_downloads) != 0:
# Log results
id_logs_path = log_dir + 'failed_' + get_current_timestamp() + '.txt'
id_logs_path = os.path.join(log_dir, 'failed_' + get_current_timestamp() + '.txt')
os.makedirs(os.path.dirname(id_logs_path), exist_ok=True)
with open(id_logs_path, 'w+') as f:
f.write('Failed downloads ' + get_current_timestamp() + '\n' + str(other_downloads))

with open(log_dir + get_current_timestamp() + '.txt', 'w+') as f:
with open(os.path.join(log_dir, get_current_timestamp() + '.txt'), 'w+') as f:
f.write(f"Zip files: {len(zip_files)}, Single files: {len(single_files)}, Failed downloads: {len(other_downloads)}")
return file_paths
except Exception as e:
logging.error(f"Error processing range: {e}")

def save_file(content, file_path):
    """Write text content to a file, creating or truncating it.

    Args:
        content (str): Text to write.
        file_path (str): Destination path; created if missing,
            overwritten if present.
    """
    # 'w' (truncate + write) is sufficient here; the previous 'w+'
    # read/write mode was never used for reading.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(content)

# Function to send a GET request to download a zip file for the given id under the CELLAR URI
def rest_get_call(id: str) -> requests.Response:
def fetch_content(id: str) -> requests.Response:
"""
Send a GET request to download a zip file for the given id under the CELLAR URI.
Expand Down Expand Up @@ -274,4 +280,5 @@ def print_list_to_file(filename, lst):
# Simulate getting results from somewhere
with open('./tests/results.json', 'r') as f:
results = json.loads(f.read()) # Load the JSON data
document_path = download_documents(results, './tests/parsers/data/html', log_dir='./tests/logs', format='xhtml')
document_paths = download_documents(results, './tests/data/html', log_dir='./tests/logs', format='xhtml')
print(document_paths)

0 comments on commit 475cccc

Please sign in to comment.