Swap module for a script and download all raw tables at once
Radonirinaunimi committed Nov 27, 2024
1 parent 1019586 commit 37cace1
Showing 2 changed files with 71 additions and 18 deletions.
86 changes: 68 additions & 18 deletions nnpdf_data/nnpdf_data/filter_utils/download_hepdata.py
@@ -14,45 +14,59 @@
 import os
 import pathlib
 import tarfile
+from typing import Generator

 from hepdata_cli.api import Client, getFilename_fromCd
 from hepdata_cli.resilient_requests import resilient_requests
 from yaml import safe_load

 CLIENT = Client(verbose=True)
 COMMONDATA_PATH = pathlib.Path(__file__).parents[1]


 class HepDataTables:
     """A commondata class to download the raw HepData tables.
-    Parameters:
-    -----------
+    Parameters
+    ----------
     metadata: str
         a path the metadata of the current data
     """

-    def __init__(self, metadata: str) -> None:
-        self.metadata = safe_load(pathlib.Path(metadata).read_text())
+    def __init__(self) -> None:
+        self.metadata_files = self._get_metadata_files()

-    def get_hepdata_url(self) -> str:
-        # TODO: possibly Check the metadata vs HepData versions here
-        hepdata_id = self.metadata["hepdata"]["url"].split("/")[-1]
-        # NOTE: `id_list` can support many IDs allowing for downloading
-        # mutiple datasets at the same time.
-        urls = CLIENT._build_urls(
-            id_list=hepdata_id, file_format='yaml', ids='hepdata', table_name=''
-        )
-        return urls[0]
-    def download(self) -> None:
-        url = self.get_hepdata_url()
+    @staticmethod
+    def _get_metadata_files() -> Generator:
+        return COMMONDATA_PATH.glob("**/metadata.yaml")

+    @staticmethod
+    def _get_hepdata_id(metadata_file: pathlib.Path) -> str | None:
+        """Get the HepData ID from the metadata for a given dataset.
+        Parameters
+        ----------
+        metadata_file: pathlib.Path
+            path to the metadata file
+        Returns
+        -------
+        str | None: returns the HepData ID if the dataset is on HepData
+        """
+        metadata_yaml = safe_load(pathlib.Path(metadata_file).read_text())
+        if "hepdata" in metadata_yaml.keys():
+            url = metadata_yaml["hepdata"].get("url", None)
+            return url.split("/")[-1] if url is not None else None
+        else:
+            return None

+    @staticmethod
+    def _download_data(url: str, dataset_name: str) -> None:
         print(f"Downloading tables from: {url}")
         response = resilient_requests('get', url, allow_redirects=True)

         filename = getFilename_fromCd(response.headers.get('content-disposition'))
-        filepath = pathlib.Path().parent
-        print(filepath)
-        # filepath.mkdir(exist_ok=True)
+        filepath = COMMONDATA_PATH.joinpath(f"commondata/{dataset_name}")
         raw_path = f"{filepath}/{filename}"

         open(raw_path, 'wb').write(response.content)
@@ -61,3 +75,39 @@ def download(self) -> None:
         os.remove(raw_path)

         return
+
+    def get_hepdata_urls(self) -> tuple[list[str], list[str]]:
+        id_list = []
+        dataset = []
+        for meta in self.metadata_files:
+            hep_id = self._get_hepdata_id(metadata_file=meta)
+            if hep_id is not None and hep_id.startswith("ins"):
+                id_list.append(hep_id)
+                dataset.append(str(meta).split("/")[-2])
+        hepdata_id = " ".join(id_list)
+
+        urls = CLIENT._build_urls(
+            id_list=hepdata_id, file_format='yaml', ids='hepdata', table_name=''
+        )
+        return urls, dataset
+
+    def download_all(self) -> None:
+        urls, dataset_names = self.get_hepdata_urls()
+        print("Finished fetching all the URLs.")
+        for url, dataname in zip(urls, dataset_names):
+            print(f"Downloading: {dataname}")
+            self._download_data(url=url, dataset_name=dataname)
+        print("Raw data tables for all the datasets have been downloaded.")
+        return
+
+    def download_from_server(self) -> None:
+        """Download all of the datasets from the NNPDF server."""
+        return
+
+    def compress_raw_data(self) -> None:
+        """Compress the raw data tables to be uploaded to the NNPDF server."""
+        return
+
+
+def main():
+    HepDataTables().download_all()
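
For reference, the new `_get_hepdata_id` helper relies only on the `hepdata.url` entry of each dataset's `metadata.yaml` and keeps the last path component, the INSPIRE record ID that hepdata_cli expects. The short sketch below replays that parsing step on a self-contained example; the URL and the `hepdata` block shown here are made-up placeholders illustrating the expected format, not an actual NNPDF dataset entry.

from yaml import safe_load

# Hypothetical metadata.yaml excerpt (illustration only, not part of the commit):
example_metadata = """
hepdata:
  url: https://www.hepdata.net/record/ins1234567
"""

metadata_yaml = safe_load(example_metadata)
url = metadata_yaml["hepdata"].get("url", None)
hepdata_id = url.split("/")[-1] if url is not None else None
print(hepdata_id)  # -> ins1234567, kept because it starts with "ins"

`get_hepdata_urls` then joins all such IDs into a single space-separated string, so that hepdata_cli can build the download URLs for every dataset in one call.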
3 changes: 3 additions & 0 deletions nnpdf_data/pyproject.toml
@@ -31,6 +31,9 @@ include = [
"nnpdf_data/_version.py",
]

[tool.poetry.scripts]
download-hepdata = "nnpdf_data.filter_utils.download_hepdata:main"

[tool.poetry.dependencies]
python = "^3.9"
"ruamel.yaml" = "<0.18"
