Skip to content

Commit

Permalink
Reorganised example files outside of parsers folder
Browse files Browse the repository at this point in the history
  • Loading branch information
AlessioNar committed Dec 14, 2024
1 parent 208dcfd commit 475cccc
Show file tree
Hide file tree
Showing 10 changed files with 29 additions and 20 deletions.
File renamed without changes.
File renamed without changes.
File renamed without changes.
1 change: 1 addition & 0 deletions tests/logs/2024-12-14_12-00-56.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Zip files: 0, Single files: 1, Failed downloads: 0
1 change: 1 addition & 0 deletions tests/logs/2024-12-14_12-01-34.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Zip files: 0, Single files: 1, Failed downloads: 0
2 changes: 1 addition & 1 deletion tests/parsers/test_akomantoso.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import lxml.etree as etree

# Define constants for file paths and directories
DATA_DIR = os.path.join(os.path.dirname(__file__), "data/akn")
DATA_DIR = os.path.join(os.path.dirname(__file__), "../data/akn")
file_path = os.path.join(DATA_DIR, "32014L0092.akn")

class TestAkomaNtosoParser(unittest.TestCase):
Expand Down
2 changes: 1 addition & 1 deletion tests/parsers/test_formex.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import os

DATA_DIR = os.path.join(os.path.dirname(__file__), "data/formex")
DATA_DIR = os.path.join(os.path.dirname(__file__), "../data/formex")

class TestFormex4Parser(unittest.TestCase):
def setUp(self):
Expand Down
43 changes: 25 additions & 18 deletions ulit/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,12 @@
import zipfile
import requests
from datetime import datetime
import threading
import json

# Constants
BASE_URL = 'http://publications.europa.eu/resource/cellar/'

def download_documents(results, download_dir, log_dir, format=None, nthreads=1):
def download_documents(results, download_dir, log_dir, format=None):
"""
    Download Cellar documents and return the paths of the downloaded files.
Expand All @@ -31,17 +30,15 @@ def download_documents(results, download_dir, log_dir, format=None, nthreads=1):
    Documents are fetched sequentially, one Cellar id at a time; the former
    multi-threaded implementation and its `nthreads` parameter were removed.
"""
cellar_ids = get_cellar_ids_from_json_results(results, format)
cellar_ids = get_cellar_ids_from_json_results(cellar_results=results, format=format)

if not os.path.exists(log_dir):
os.makedirs(log_dir)
threads = []
for i in range(nthreads):
cellar_ids_subset = cellar_ids[i::nthreads]
t = threading.Thread(target=process_range, args=(cellar_ids_subset, os.path.join(download_dir), log_dir))
threads.append(t)
[t.start() for t in threads]
[t.join() for t in threads]

document_paths = process_range(ids=cellar_ids, folder_path=os.path.join(download_dir), log_dir=log_dir)

return document_paths


def get_cellar_ids_from_json_results(cellar_results, format):
"""
Expand Down Expand Up @@ -127,38 +124,47 @@ def process_range(ids: list, folder_path: str, log_dir: str):
zip_files = []
single_files = []
other_downloads = []
file_paths = []

for id in ids:
sub_folder_path = os.path.join(folder_path, id)
file_path = os.path.join(folder_path, id)

response = rest_get_call(id.strip())
response = fetch_content(id.strip())
if response is None:
continue

if 'Content-Type' in response.headers:
if 'zip' in response.headers['Content-Type']:
zip_files.append(id)
extract_zip(response, sub_folder_path)
extract_zip(response, file_path)
file_paths.append(file_path)
else:
single_files.append(id)
process_single_file(response, sub_folder_path)
process_single_file(response, file_path)
file_paths.append(file_path)
else:
other_downloads.append(id)

if len(other_downloads) != 0:
# Log results
id_logs_path = log_dir + 'failed_' + get_current_timestamp() + '.txt'
id_logs_path = os.path.join(log_dir, 'failed_' + get_current_timestamp() + '.txt')
os.makedirs(os.path.dirname(id_logs_path), exist_ok=True)
with open(id_logs_path, 'w+') as f:
f.write('Failed downloads ' + get_current_timestamp() + '\n' + str(other_downloads))

with open(log_dir + get_current_timestamp() + '.txt', 'w+') as f:
with open(os.path.join(log_dir, get_current_timestamp() + '.txt'), 'w+') as f:
f.write(f"Zip files: {len(zip_files)}, Single files: {len(single_files)}, Failed downloads: {len(other_downloads)}")
return file_paths
except Exception as e:
logging.error(f"Error processing range: {e}")

def save_file(content, file_path):
    """Write text content to a file, creating or truncating it.

    Args:
        content (str): Text to write.
        file_path (str): Destination path; created if missing,
            overwritten if present.
    """
    # 'w' (truncate + write) is sufficient here; the previous 'w+'
    # read/write mode was never used for reading.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(content)

# Function to send a GET request to download a zip file for the given id under the CELLAR URI
def rest_get_call(id: str) -> requests.Response:
def fetch_content(id: str) -> requests.Response:
"""
Send a GET request to download a zip file for the given id under the CELLAR URI.
Expand Down Expand Up @@ -274,4 +280,5 @@ def print_list_to_file(filename, lst):
# Simulate getting results from somewhere
with open('./tests/results.json', 'r') as f:
results = json.loads(f.read()) # Load the JSON data
document_path = download_documents(results, './tests/parsers/data/html', log_dir='./tests/logs', format='xhtml')
document_paths = download_documents(results, './tests/data/html', log_dir='./tests/logs', format='xhtml')
print(document_paths)

0 comments on commit 475cccc

Please sign in to comment.