Swap module for a script and download all raw tables at once
Radonirinaunimi committed Nov 27, 2024
1 parent 1019586 commit 37cace1
Showing 2 changed files with 71 additions and 18 deletions.
86 changes: 68 additions & 18 deletions nnpdf_data/nnpdf_data/filter_utils/download_hepdata.py
@@ -14,45 +14,59 @@
 import os
 import pathlib
 import tarfile
+from typing import Generator

 from hepdata_cli.api import Client, getFilename_fromCd
 from hepdata_cli.resilient_requests import resilient_requests
 from yaml import safe_load

 CLIENT = Client(verbose=True)
 COMMONDATA_PATH = pathlib.Path(__file__).parents[1]


 class HepDataTables:
     """A commondata class to download the raw HepData tables.
-    Parameters:
-    -----------
+    Parameters
+    ----------
     metadata: str
         a path the metadata of the current data
     """

-    def __init__(self, metadata: str) -> None:
-        self.metadata = safe_load(pathlib.Path(metadata).read_text())
+    def __init__(self) -> None:
+        self.metadata_files = self._get_metadata_files()

-    def get_hepdata_url(self) -> str:
-        # TODO: possibly Check the metadata vs HepData versions here
-        hepdata_id = self.metadata["hepdata"]["url"].split("/")[-1]
-        # NOTE: `id_list` can support many IDs allowing for downloading
-        # mutiple datasets at the same time.
-        urls = CLIENT._build_urls(
-            id_list=hepdata_id, file_format='yaml', ids='hepdata', table_name=''
-        )
-        return urls[0]
-    def download(self) -> None:
-        url = self.get_hepdata_url()
+    @staticmethod
+    def _get_metadata_files() -> Generator:
+        return COMMONDATA_PATH.glob("**/metadata.yaml")

+    @staticmethod
+    def _get_hepdata_id(metadata_file: pathlib.Path) -> str | None:
+        """Get the HepData ID from the metadata for a given dataset.
+        Parameters
+        ----------
+        metadata_file: pathlib.Path
+            path to the metadata file
+        Returns
+        -------
+        str | None: returns the HepData ID if the dataset is on HepData
+        """
+        metadata_yaml = safe_load(pathlib.Path(metadata_file).read_text())
+        if "hepdata" in metadata_yaml.keys():
+            url = metadata_yaml["hepdata"].get("url", None)
+            return url.split("/")[-1] if url is not None else None
+        else:
+            return None

+    @staticmethod
+    def _download_data(url: str, dataset_name: str) -> None:
         print(f"Downloading tables from: {url}")
         response = resilient_requests('get', url, allow_redirects=True)

         filename = getFilename_fromCd(response.headers.get('content-disposition'))
-        filepath = pathlib.Path().parent
-        print(filepath)
-        # filepath.mkdir(exist_ok=True)
+        filepath = COMMONDATA_PATH.joinpath(f"commondata/{dataset_name}")
         raw_path = f"{filepath}/{filename}"

         open(raw_path, 'wb').write(response.content)
@@ -61,3 +75,39 @@ def download(self) -> None:
         os.remove(raw_path)

         return
+
+    def get_hepdata_urls(self) -> tuple[list[str], list[str]]:
+        id_list = []
+        dataset = []
+        for meta in self.metadata_files:
+            hep_id = self._get_hepdata_id(metadata_file=meta)
+            if hep_id is not None and hep_id.startswith("ins"):
+                id_list.append(hep_id)
+                dataset.append(str(meta).split("/")[-2])
+        hepdata_id = " ".join(id_list)
+
+        urls = CLIENT._build_urls(
+            id_list=hepdata_id, file_format='yaml', ids='hepdata', table_name=''
+        )
+        return urls, dataset
+
+    def download_all(self) -> None:
+        urls, dataset_names = self.get_hepdata_urls()
+        print("Finished fetching all the URLs.")
+        for url, dataname in zip(urls, dataset_names):
+            print(f"Downloading: {dataname}")
+            self._download_data(url=url, dataset_name=dataname)
+        print("Raw data tables for all the datasets have been downloaded.")
+        return
+
+    def download_from_server(self) -> None:
+        """Download all of the datasets from the NNPDF server."""
+        return
+
+    def compress_raw_data(self) -> None:
+        """Compress the raw data tables to be uploaded to the NNPDF server."""
+        return
+
+
+def main():
+    HepDataTables().download_all()
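
For reference, the new `_get_hepdata_id` helper relies only on the `hepdata.url` entry of each dataset's `metadata.yaml` and keeps the last path component, the INSPIRE record ID that hepdata_cli expects. The short sketch below replays that parsing step on a self-contained example; the URL and the `hepdata` block shown here are made-up placeholders illustrating the expected format, not an actual NNPDF dataset entry.

from yaml import safe_load

# Hypothetical metadata.yaml excerpt (illustration only, not part of the commit):
example_metadata = """
hepdata:
  url: https://www.hepdata.net/record/ins1234567
"""

metadata_yaml = safe_load(example_metadata)
url = metadata_yaml["hepdata"].get("url", None)
hepdata_id = url.split("/")[-1] if url is not None else None
print(hepdata_id)  # -> ins1234567, kept because it starts with "ins"

`get_hepdata_urls` then joins all such IDs into a single space-separated string, so that hepdata_cli can build the download URLs for every dataset in one call.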
3 changes: 3 additions & 0 deletions nnpdf_data/pyproject.toml
@@ -31,6 +31,9 @@ include = [
"nnpdf_data/_version.py",
]

[tool.poetry.scripts]
download-hepdata = "nnpdf_data.filter_utils.download_hepdata:main"

[tool.poetry.dependencies]
python = "^3.9"
"ruamel.yaml" = "<0.18"
