From 79582b93b80dafeeec305e50732bfd73a980a476 Mon Sep 17 00:00:00 2001
From: AlessioNar <alessio.nardin@gmail.com>
Date: Wed, 16 Oct 2024 15:15:11 +0200
Subject: [PATCH 1/5] Restructured documents.py and added docstrings

---
 op_cellar/documents.py | 195 ++++++++++++++++++++++++++++++-----------
 1 file changed, 145 insertions(+), 50 deletions(-)

diff --git a/op_cellar/documents.py b/op_cellar/documents.py
index 339b50e..de39d98 100644
--- a/op_cellar/documents.py
+++ b/op_cellar/documents.py
@@ -11,33 +11,46 @@
 BASE_URL = 'http://publications.europa.eu/resource/cellar/'
 LOG_DIR = 'logs/'
 
-# Function to get the current timestamp
-def get_current_timestamp():
-    return datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
 
-# Function to print a list to a file
-def print_list_to_file(filename, lst):
-    with open(filename, 'w+') as f:
-        for item in lst:
-            f.write(item + '\n')
+# Function to send a GET request to download a zip file for the given id under the CELLAR URI
+def rest_get_call(id: str) -> requests.Response:
+    """
+    Send a GET request to download a zip file for the given id under the CELLAR URI.
 
-# Function to download a zip file and extract it
-def extract_zip(response: requests.Response, folder_path: str):
-    try:
-        z = zipfile.ZipFile(io.BytesIO(response.content))
-        z.extractall(folder_path)
-    except Exception as e:
-        logging.error(f"Error downloading zip: {e}")
+    Parameters
+    ----------
+    id : str
+        The id of the resource to be retrieved.
 
-# Function to process a single file
-def process_single_file(response: requests.Response, folder_path: str, id: str):
-    out_file = folder_path + '/' + id + '.html'
-    os.makedirs(os.path.dirname(out_file), exist_ok=True)
-    with open(out_file, 'w+', encoding="utf-8") as f:
-        f.write(response.text)
+    Returns
+    -------
+    requests.Response
+        The response from the server.
 
-# Function to send a GET request to download a zip file for the given id under the CELLAR URI
-def rest_get_call(id: str) -> requests.Response:
+    Notes
+    -----
+    The request is sent with the following headers:
+    - Accept: application/xhtml+xml
+    - Accept-Language: eng
+    - Content-Type: application/x-www-form-urlencoded
+    - Host: publications.europa.eu
+
+    Raises
+    ------
+    requests.RequestException
+        If there is an error sending the request.
+
+    See Also
+    --------
+    requests : The underlying library used for making HTTP requests.
+
+    Examples
+    --------
+    >>> import requests
+    >>> response = rest_get_call('some_id')
+    >>> if response is not None:
+    ...     print(response.status_code)
+    """
     try:
         url = BASE_URL + id
         headers = {
@@ -55,21 +68,89 @@ def rest_get_call(id: str) -> requests.Response:
 
 # Function to create a list of CELLAR ids from the given cellar_results JSON dictionary and return the list
 def get_cellar_ids_from_json_results(cellar_results):
+    """
+    Extract CELLAR ids from a JSON dictionary.
+
+    Parameters
+    ----------
+    cellar_results : dict
+        A dictionary containing the response of the CELLAR SPARQL query
+
+    Returns
+    -------
+    list
+        A list of CELLAR ids.
+
+    Notes
+    -----
+    The function assumes that the JSON dictionary has the following structure:
+    - The dictionary contains a key "results" that maps to another dictionary.
+    - The inner dictionary contains a key "bindings" that maps to a list of dictionaries.
+    - Each dictionary in the list contains a key "cellarURIs" that maps to a dictionary.
+    - The innermost dictionary contains a key "value" that maps to a string representing the CELLAR URI.
+
+    The function extracts the CELLAR id by splitting the CELLAR URI at "cellar/" and taking the second part.
+
+    Examples
+    --------
+    >>> cellar_results = {
+    ...     "results": {
+    ...         "bindings": [
+    ...             {"cellarURIs": {"value": "https://example.com/cellar/some_id"}},
+    ...             {"cellarURIs": {"value": "https://example.com/cellar/another_id"}}
+    ...         ]
+    ...     }
+    ... }
+    >>> cellar_ids = get_cellar_ids_from_json_results(cellar_results)
+    >>> print(cellar_ids)
+    ['some_id', 'another_id']
+    """
     results_list = cellar_results["results"]["bindings"]
     cellar_ids_list = [results_list[i]["cellarURIs"]["value"].split("cellar/")[1] for i in range(len(results_list))]
     return cellar_ids_list
 
-# Function to log downloaded files
-def log_downloaded_files(downloaded_files: list, dir_to_check: str):
-    in_dir_name = LOG_DIR + 'in_dir_lists/'
-    os.makedirs(os.path.dirname(in_dir_name), exist_ok=True)
-    print_list_to_file(in_dir_name + 'in_dir_' + get_current_timestamp() + '.txt', downloaded_files)
+def download_documents(results, download_dir, nthreads=1):
+    """
+    Download Cellar documents in parallel using multiple threads.
+
+    Sends a REST query to the Publications Office APIs and downloads the documents
+    corresponding to the given results.
+
+    Parameters
+    ----------
+    results : dict
+        A dictionary containing the JSON results from the Publications Office APIs.
+    download_dir : str
+        The directory where the downloaded documents will be saved.
+    nthreads : int
+        The number of threads to use to make the request
+
+    Notes
+    -----
+    The function uses a separate thread for each subset of Cellar ids.
+    The number of threads can be adjusted by modifying the `nthreads` parameter.
+    """
+    cellar_ids = get_cellar_ids_from_json_results(results)
+
+    if not os.path.exists(LOG_DIR):
+        os.makedirs(LOG_DIR)
+    
+    threads = []
+    for i in range(nthreads):  
+        sub_list = cellar_ids[i::nthreads]
+        t = threading.Thread(target=process_range, args=(sub_list, os.path.join(download_dir, str(sub_list))))
+        threads.append(t)
+    [t.start() for t in threads]
+    [t.join() for t in threads]
+
+
+# Function to process a single file
+def process_single_file(response: requests.Response, folder_path: str, id: str):
+    out_file = folder_path + '/' + id + '.html'
+    os.makedirs(os.path.dirname(out_file), exist_ok=True)
+    with open(out_file, 'w+', encoding="utf-8") as f:
+        f.write(response.text)
 
-# Function to log missing ids
-def log_missing_ids(missing_ids: list):
-    new_ids_dir_name = LOG_DIR + 'cellar_ids/'
-    os.makedirs(os.path.dirname(new_ids_dir_name), exist_ok=True)
-    print_list_to_file(new_ids_dir_name + 'cellar_ids_' + get_current_timestamp() + '.txt', missing_ids)
 
 # Function to process a list of ids to download the corresponding zip files
 def process_range(ids: list, folder_path: str):
@@ -107,23 +188,37 @@ def process_range(ids: list, folder_path: str):
     except Exception as e:
         logging.error(f"Error processing range: {e}")
 
-# Function to download files in parallel using multiple threads
-def download_documents(results, download_dir):
-    """
-    Sends a REST query to the Publications Office APIs and downloads the Cellar documents
-    """
-    cellar_ids = get_cellar_ids_from_json_results(results)
-    if not os.path.exists(LOG_DIR):
-        os.makedirs(LOG_DIR)
-    
-    nthreads = 1
-    threads = []
-    for i in range(nthreads):  
-        sub_list = cellar_ids[i::nthreads]
-        t = threading.Thread(target=process_range, args=(sub_list, os.path.join(download_dir, str(sub_list))))
-        threads.append(t)
-    [t.start() for t in threads]
-    [t.join() for t in threads]
+# Function to log downloaded files
+def log_downloaded_files(downloaded_files: list, dir_to_check: str):
+    in_dir_name = LOG_DIR + 'in_dir_lists/'
+    os.makedirs(os.path.dirname(in_dir_name), exist_ok=True)
+    print_list_to_file(in_dir_name + 'in_dir_' + get_current_timestamp() + '.txt', downloaded_files)
+
+# Function to log missing ids
+def log_missing_ids(missing_ids: list):
+    new_ids_dir_name = LOG_DIR + 'cellar_ids/'
+    os.makedirs(os.path.dirname(new_ids_dir_name), exist_ok=True)
+    print_list_to_file(new_ids_dir_name + 'cellar_ids_' + get_current_timestamp() + '.txt', missing_ids)
+
+
+# Function to get the current timestamp
+def get_current_timestamp():
+    return datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
+
+# Function to print a list to a file
+def print_list_to_file(filename, lst):
+    with open(filename, 'w+') as f:
+        for item in lst:
+            f.write(item + '\n')
+
+# Function to download a zip file and extract it
+def extract_zip(response: requests.Response, folder_path: str):
+    try:
+        z = zipfile.ZipFile(io.BytesIO(response.content))
+        z.extractall(folder_path)
+    except Exception as e:
+        logging.error(f"Error downloading zip: {e}")
+
 
 # Main function
 if __name__ == "__main__":

From cf84d9d73ebbfd2135aeccca053c629a70f74019 Mon Sep 17 00:00:00 2001
From: AlessioNar <alessio.nardin@gmail.com>
Date: Wed, 16 Oct 2024 15:17:20 +0200
Subject: [PATCH 2/5] Reordered functions in documents.py

---
 op_cellar/documents.py | 141 ++++++++++++++++++++---------------------
 1 file changed, 69 insertions(+), 72 deletions(-)

diff --git a/op_cellar/documents.py b/op_cellar/documents.py
index de39d98..277ad19 100644
--- a/op_cellar/documents.py
+++ b/op_cellar/documents.py
@@ -11,60 +11,39 @@
 BASE_URL = 'http://publications.europa.eu/resource/cellar/'
 LOG_DIR = 'logs/'
 
-
-# Function to send a GET request to download a zip file for the given id under the CELLAR URI
-def rest_get_call(id: str) -> requests.Response:
+def download_documents(results, download_dir, nthreads=1):
     """
-    Send a GET request to download a zip file for the given id under the CELLAR URI.
+    Download Cellar documents in parallel using multiple threads.
+
+    Sends a REST query to the Publications Office APIs and downloads the documents
+    corresponding to the given results.
 
     Parameters
     ----------
-    id : str
-        The id of the resource to be retrieved.
-
-    Returns
-    -------
-    requests.Response
-        The response from the server.
+    results : dict
+        A dictionary containing the JSON results from the Publications Office APIs.
+    download_dir : str
+        The directory where the downloaded documents will be saved.
+    nthreads : int
+        The number of threads to use to make the request
 
     Notes
     -----
-    The request is sent with the following headers:
-    - Accept: application/xhtml+xml
-    - Accept-Language: eng
-    - Content-Type: application/x-www-form-urlencoded
-    - Host: publications.europa.eu
-
-    Raises
-    ------
-    requests.RequestException
-        If there is an error sending the request.
-
-    See Also
-    --------
-    requests : The underlying library used for making HTTP requests.
-
-    Examples
-    --------
-    >>> import requests
-    >>> response = rest_get_call('some_id')
-    >>> if response is not None:
-    ...     print(response.status_code)
+    The function uses a separate thread for each subset of Cellar ids.
+    The number of threads can be adjusted by modifying the `nthreads` parameter.
     """
-    try:
-        url = BASE_URL + id
-        headers = {
-            'Accept': "application/xhtml+xml",
-            'Accept-Language': "eng",
-            'Content-Type': "application/x-www-form-urlencoded",
-            'Host': "publications.europa.eu"
-        }
-        response = requests.request("GET", url, headers=headers)
-        response.raise_for_status()
-        return response
-    except requests.RequestException as e:
-        logging.error(f"Error sending GET request: {e}")
-        return None
+    cellar_ids = get_cellar_ids_from_json_results(results)
+
+    if not os.path.exists(LOG_DIR):
+        os.makedirs(LOG_DIR)
+    
+    threads = []
+    for i in range(nthreads):  
+        sub_list = cellar_ids[i::nthreads]
+        t = threading.Thread(target=process_range, args=(sub_list, os.path.join(download_dir, str(sub_list))))
+        threads.append(t)
+    [t.start() for t in threads]
+    [t.join() for t in threads]
 
 # Function to create a list of CELLAR ids from the given cellar_results JSON dictionary and return the list
 def get_cellar_ids_from_json_results(cellar_results):
@@ -109,40 +88,59 @@ def get_cellar_ids_from_json_results(cellar_results):
     cellar_ids_list = [results_list[i]["cellarURIs"]["value"].split("cellar/")[1] for i in range(len(results_list))]
     return cellar_ids_list
 
-def download_documents(results, download_dir, nthreads=1):
+# Function to send a GET request to download a zip file for the given id under the CELLAR URI
+def rest_get_call(id: str) -> requests.Response:
     """
-    Download Cellar documents in parallel using multiple threads.
-
-    Sends a REST query to the Publications Office APIs and downloads the documents
-    corresponding to the given results.
+    Send a GET request to download a zip file for the given id under the CELLAR URI.
 
     Parameters
     ----------
-    results : dict
-        A dictionary containing the JSON results from the Publications Office APIs.
-    download_dir : str
-        The directory where the downloaded documents will be saved.
-    nthreads : int
-        The number of threads to use to make the request
+    id : str
+        The id of the resource to be retrieved.
+
+    Returns
+    -------
+    requests.Response
+        The response from the server.
 
     Notes
     -----
-    The function uses a separate thread for each subset of Cellar ids.
-    The number of threads can be adjusted by modifying the `nthreads` parameter.
-    """
-    cellar_ids = get_cellar_ids_from_json_results(results)
+    The request is sent with the following headers:
+    - Accept: application/xhtml+xml
+    - Accept-Language: eng
+    - Content-Type: application/x-www-form-urlencoded
+    - Host: publications.europa.eu
 
-    if not os.path.exists(LOG_DIR):
-        os.makedirs(LOG_DIR)
-    
-    threads = []
-    for i in range(nthreads):  
-        sub_list = cellar_ids[i::nthreads]
-        t = threading.Thread(target=process_range, args=(sub_list, os.path.join(download_dir, str(sub_list))))
-        threads.append(t)
-    [t.start() for t in threads]
-    [t.join() for t in threads]
+    Raises
+    ------
+    requests.RequestException
+        If there is an error sending the request.
 
+    See Also
+    --------
+    requests : The underlying library used for making HTTP requests.
+
+    Examples
+    --------
+    >>> import requests
+    >>> response = rest_get_call('some_id')
+    >>> if response is not None:
+    ...     print(response.status_code)
+    """
+    try:
+        url = BASE_URL + id
+        headers = {
+            'Accept': "application/xhtml+xml",
+            'Accept-Language': "eng",
+            'Content-Type': "application/x-www-form-urlencoded",
+            'Host': "publications.europa.eu"
+        }
+        response = requests.request("GET", url, headers=headers)
+        response.raise_for_status()
+        return response
+    except requests.RequestException as e:
+        logging.error(f"Error sending GET request: {e}")
+        return None
 
 # Function to process a single file
 def process_single_file(response: requests.Response, folder_path: str, id: str):
@@ -219,7 +217,6 @@ def extract_zip(response: requests.Response, folder_path: str):
     except Exception as e:
         logging.error(f"Error downloading zip: {e}")
 
-
 # Main function
 if __name__ == "__main__":
     logging.basicConfig(level=logging.INFO)

From 535746619cabac9eb1431c3b0533b143f1e31c00 Mon Sep 17 00:00:00 2001
From: AlessioNar <alessio.nardin@gmail.com>
Date: Wed, 16 Oct 2024 16:00:35 +0200
Subject: [PATCH 3/5] Added further docstrings

---
 op_cellar/documents.py | 90 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 77 insertions(+), 13 deletions(-)

diff --git a/op_cellar/documents.py b/op_cellar/documents.py
index 277ad19..a56345c 100644
--- a/op_cellar/documents.py
+++ b/op_cellar/documents.py
@@ -45,7 +45,6 @@ def download_documents(results, download_dir, nthreads=1):
     [t.start() for t in threads]
     [t.join() for t in threads]
 
-# Function to create a list of CELLAR ids from the given cellar_results JSON dictionary and return the list
 def get_cellar_ids_from_json_results(cellar_results):
     """
     Extract CELLAR ids from a JSON dictionary.
@@ -106,7 +105,7 @@ def rest_get_call(id: str) -> requests.Response:
     Notes
     -----
     The request is sent with the following headers:
-    - Accept: application/xhtml+xml
+    - Accept: application/xhtml+xml (TODO: cater for other kinds of requests too)
     - Accept-Language: eng
     - Content-Type: application/x-www-form-urlencoded
     - Host: publications.europa.eu
@@ -144,6 +143,36 @@ def rest_get_call(id: str) -> requests.Response:
 
 # Function to process a single file
 def process_single_file(response: requests.Response, folder_path: str, id: str):
+    """
+    Process a single file by saving its contents to a file.
+
+    Parameters
+    ----------
+    response : requests.Response
+        The HTTP response object containing the file contents.
+    folder_path : str
+        The path to the folder where the file will be saved.
+    id : str
+        The id of the file, used to construct the file name.
+
+    Returns
+    -------
+    None
+
+    Notes
+    -----
+    This function saves the contents of a single file from an HTTP response to a
+    file on disk. The file name is constructed by appending the id to the folder
+    path with an '.html' extension. The function ensures that the directory path
+    exists before attempting to write the file.
+
+    Examples
+    --------
+    >>> response = requests.get('http://example.com/file')
+    >>> folder_path = '/path/to/folder'
+    >>> id = 'file_id'
+    >>> process_single_file(response, folder_path, id)
+    """
     out_file = folder_path + '/' + id + '.html'
     os.makedirs(os.path.dirname(out_file), exist_ok=True)
     with open(out_file, 'w+', encoding="utf-8") as f:
@@ -152,6 +181,38 @@ def process_single_file(response: requests.Response, folder_path: str, id: str):
 
 # Function to process a list of ids to download the corresponding zip files
 def process_range(ids: list, folder_path: str):
+    """
+    Process a list of ids to download the corresponding zip files.
+
+    Parameters
+    ----------
+    ids : list
+        List of ids to process.
+    folder_path : str
+        Path to the folder where the files will be downloaded.
+
+    Returns
+    -------
+    None
+
+    Raises
+    ------
+    Exception
+        If an error occurs during the processing.
+
+    Notes
+    -----
+    This function iterates over the list of ids, sends a GET request for each id,
+    and downloads the corresponding file. If the file is a zip file, it is extracted
+    to the specified folder. If the file is not a zip file, it is processed as a
+    single file. If the file cannot be downloaded, the id is logged to a file.
+
+    Examples
+    --------
+    >>> ids = ['id1', 'id2', 'id3']
+    >>> folder_path = '/path/to/folder'
+    >>> process_range(ids, folder_path)
+    """
     try:
         zip_files = []
         single_files = []
@@ -186,6 +247,20 @@ def process_range(ids: list, folder_path: str):
     except Exception as e:
         logging.error(f"Error processing range: {e}")
 
+
+# Function to get the current timestamp
+def get_current_timestamp():
+    return datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
+
+
+# Function to download a zip file and extract it
+def extract_zip(response: requests.Response, folder_path: str):
+    try:
+        z = zipfile.ZipFile(io.BytesIO(response.content))
+        z.extractall(folder_path)
+    except Exception as e:
+        logging.error(f"Error downloading zip: {e}")
+
 # Function to log downloaded files
 def log_downloaded_files(downloaded_files: list, dir_to_check: str):
     in_dir_name = LOG_DIR + 'in_dir_lists/'
@@ -199,23 +274,12 @@ def log_missing_ids(missing_ids: list):
     print_list_to_file(new_ids_dir_name + 'cellar_ids_' + get_current_timestamp() + '.txt', missing_ids)
 
 
-# Function to get the current timestamp
-def get_current_timestamp():
-    return datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
-
 # Function to print a list to a file
 def print_list_to_file(filename, lst):
     with open(filename, 'w+') as f:
         for item in lst:
             f.write(item + '\n')
 
-# Function to download a zip file and extract it
-def extract_zip(response: requests.Response, folder_path: str):
-    try:
-        z = zipfile.ZipFile(io.BytesIO(response.content))
-        z.extractall(folder_path)
-    except Exception as e:
-        logging.error(f"Error downloading zip: {e}")
 
 # Main function
 if __name__ == "__main__":

From 564570eabf4d48a5b59407762a99d0c86f8b8852 Mon Sep 17 00:00:00 2001
From: AlessioNar <alessio.nardin@gmail.com>
Date: Wed, 16 Oct 2024 17:20:55 +0200
Subject: [PATCH 4/5] Reorganised functions and fixed a bug when storing files

---
 op_cellar/documents.py | 154 ++++++++++++++++++++---------------------
 1 file changed, 76 insertions(+), 78 deletions(-)

diff --git a/op_cellar/documents.py b/op_cellar/documents.py
index a56345c..0309827 100644
--- a/op_cellar/documents.py
+++ b/op_cellar/documents.py
@@ -36,11 +36,11 @@ def download_documents(results, download_dir, nthreads=1):
 
     if not os.path.exists(LOG_DIR):
         os.makedirs(LOG_DIR)
-    
     threads = []
     for i in range(nthreads):  
-        sub_list = cellar_ids[i::nthreads]
-        t = threading.Thread(target=process_range, args=(sub_list, os.path.join(download_dir, str(sub_list))))
+        cellar_ids_subset = cellar_ids[i::nthreads]
+        print(cellar_ids_subset)
+        t = threading.Thread(target=process_range, args=(cellar_ids_subset, os.path.join(download_dir)))
         threads.append(t)
     [t.start() for t in threads]
     [t.join() for t in threads]
@@ -87,6 +87,74 @@ def get_cellar_ids_from_json_results(cellar_results):
     cellar_ids_list = [results_list[i]["cellarURIs"]["value"].split("cellar/")[1] for i in range(len(results_list))]
     return cellar_ids_list
 
+# Function to process a list of ids to download the corresponding zip files
+def process_range(ids: list, folder_path: str):
+    """
+    Process a list of ids to download the corresponding zip files.
+
+    Parameters
+    ----------
+    ids : list
+        List of ids to process.
+    folder_path : str
+        Path to the folder where the files will be downloaded.
+
+    Returns
+    -------
+    None
+
+    Raises
+    ------
+    Exception
+        If an error occurs during the processing.
+
+    Notes
+    -----
+    This function iterates over the list of ids, sends a GET request for each id,
+    and downloads the corresponding file. If the file is a zip file, it is extracted
+    to the specified folder. If the file is not a zip file, it is processed as a
+    single file. If the file cannot be downloaded, the id is logged to a file.
+
+    Examples
+    --------
+    >>> ids = ['id1', 'id2', 'id3']
+    >>> folder_path = '/path/to/folder'
+    >>> process_range(ids, folder_path)
+    """
+    try:
+        zip_files = []
+        single_files = []
+        other_downloads = []
+        
+        for id in ids:
+            sub_folder_path = os.path.join(folder_path, id)
+            
+            response = rest_get_call(id.strip())
+            if response is None:
+                continue
+            
+            if 'Content-Type' in response.headers:
+                if 'zip' in response.headers['Content-Type']:
+                    zip_files.append(id)
+                    extract_zip(response, sub_folder_path)
+                else:
+                    single_files.append(id)
+                    process_single_file(response, sub_folder_path)
+            else:
+                other_downloads.append(id)
+        
+        if len(other_downloads) != 0:
+            # Log results
+            id_logs_path = LOG_DIR + 'failed_' + get_current_timestamp() + '.txt'
+            os.makedirs(os.path.dirname(id_logs_path), exist_ok=True)
+            with open(id_logs_path, 'w+') as f:
+                f.write('Failed downloads ' + get_current_timestamp() + '\n' + str(other_downloads))
+        
+        with open(LOG_DIR + get_current_timestamp() + '.txt', 'w+') as f:
+            f.write(f"Zip files: {len(zip_files)}, Single files: {len(single_files)}, Failed downloads: {len(other_downloads)}")
+    except Exception as e:
+        logging.error(f"Error processing range: {e}")
+
 # Function to send a GET request to download a zip file for the given id under the CELLAR URI
 def rest_get_call(id: str) -> requests.Response:
     """
@@ -129,7 +197,7 @@ def rest_get_call(id: str) -> requests.Response:
     try:
         url = BASE_URL + id
         headers = {
-            'Accept': "application/xhtml+xml",
+            'Accept': "application/zip;mtype=fmx4, application/xml;mtype=fmx4, application/xhtml+xml, text/html, text/html;type=simplified, application/msword, text/plain, application/xml;notice=object",
             'Accept-Language': "eng",
             'Content-Type': "application/x-www-form-urlencoded",
             'Host': "publications.europa.eu"
@@ -141,8 +209,9 @@ def rest_get_call(id: str) -> requests.Response:
         logging.error(f"Error sending GET request: {e}")
         return None
 
+
 # Function to process a single file
-def process_single_file(response: requests.Response, folder_path: str, id: str):
+def process_single_file(response: requests.Response, folder_path: str):
     """
     Process a single file by saving its contents to a file.
 
@@ -152,8 +221,6 @@ def process_single_file(response: requests.Response, folder_path: str, id: str):
         The HTTP response object containing the file contents.
     folder_path : str
         The path to the folder where the file will be saved.
-    id : str
-        The id of the file, used to construct the file name.
 
     Returns
     -------
@@ -170,83 +237,14 @@ def process_single_file(response: requests.Response, folder_path: str, id: str):
     --------
     >>> response = requests.get('http://example.com/file')
     >>> folder_path = '/path/to/folder'
-    >>> id = 'file_id'
-    >>> process_single_file(response, folder_path, id)
+    >>> process_single_file(response, folder_path)
     """
-    out_file = folder_path + '/' + id + '.html'
+    out_file = folder_path + '.html'
     os.makedirs(os.path.dirname(out_file), exist_ok=True)
     with open(out_file, 'w+', encoding="utf-8") as f:
         f.write(response.text)
 
 
-# Function to process a list of ids to download the corresponding zip files
-def process_range(ids: list, folder_path: str):
-    """
-    Process a list of ids to download the corresponding zip files.
-
-    Parameters
-    ----------
-    ids : list
-        List of ids to process.
-    folder_path : str
-        Path to the folder where the files will be downloaded.
-
-    Returns
-    -------
-    None
-
-    Raises
-    ------
-    Exception
-        If an error occurs during the processing.
-
-    Notes
-    -----
-    This function iterates over the list of ids, sends a GET request for each id,
-    and downloads the corresponding file. If the file is a zip file, it is extracted
-    to the specified folder. If the file is not a zip file, it is processed as a
-    single file. If the file cannot be downloaded, the id is logged to a file.
-
-    Examples
-    --------
-    >>> ids = ['id1', 'id2', 'id3']
-    >>> folder_path = '/path/to/folder'
-    >>> process_range(ids, folder_path)
-    """
-    try:
-        zip_files = []
-        single_files = []
-        other_downloads = []
-        
-        for id in ids:
-            sub_folder_path = folder_path
-            
-            response = rest_get_call(id.strip())
-            if response is None:
-                continue
-            
-            if 'Content-Type' in response.headers:
-                if 'zip' in response.headers['Content-Type']:
-                    zip_files.append(id)
-                    extract_zip(response, sub_folder_path)
-                else:
-                    single_files.append(id)
-                    process_single_file(response, sub_folder_path, id)
-            else:
-                other_downloads.append(id)
-        
-        if len(other_downloads) != 0:
-            # Log results
-            id_logs_path = LOG_DIR + 'failed_' + get_current_timestamp() + '.txt'
-            os.makedirs(os.path.dirname(id_logs_path), exist_ok=True)
-            with open(id_logs_path, 'w+') as f:
-                f.write('Failed downloads ' + get_current_timestamp() + '\n' + str(other_downloads))
-        
-        with open(LOG_DIR + get_current_timestamp() + '.txt', 'w+') as f:
-            f.write(f"Zip files: {len(zip_files)}, Single files: {len(single_files)}, Failed downloads: {len(other_downloads)}")
-    except Exception as e:
-        logging.error(f"Error processing range: {e}")
-
 
 # Function to get the current timestamp
 def get_current_timestamp():

From fcb5524ad0cfd7c14a1a0a7c7f179294ec441df5 Mon Sep 17 00:00:00 2001
From: AlessioNar <alessio.nardin@gmail.com>
Date: Wed, 16 Oct 2024 17:21:18 +0200
Subject: [PATCH 5/5] Added gitignore and named poetry project

---
 .gitignore     | 3 +++
 pyproject.toml | 7 ++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2b4ee8c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+build/*
+dist/*
+op_cellar.egg-info/*
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 36fb864..6637964 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,8 +1,13 @@
+[project]
+name = "op_cellar"
+version = "0.0.2"
+description = "A generic package to query and retrieve documents from Cellar, the common data repository of the Publications Office of the European Union."
+
 [tool.poetry]
 name = "op_cellar"
 version = "0.0.2"
 description = "A generic package to query and retrieve documents from Cellar, the common data repository of the Publications Office of the European Union."
-authors = ["AlessioNar <alessio.nardin@gmail.com>", "seljaseppala"]
+authors = ["AlessioNar <alessio.nardin@gmail.com>"]
 license = "EUPL 1.2"
 readme = "README.md"
 classifiers = [