diff --git a/README.md b/README.md index 258725c2..41529529 100644 --- a/README.md +++ b/README.md @@ -68,8 +68,8 @@ df_2015_and_newer = df.loc[df.year_from > 2014] df.year_from.value_counts().sort_index().plot.line() ``` - +![](docs/_static/logolink_OP_VVV_hor_barva_eng.jpg) diff --git a/src/sec_certs/configuration.py b/src/sec_certs/configuration.py index 47ac5821..7667a8ec 100644 --- a/src/sec_certs/configuration.py +++ b/src/sec_certs/configuration.py @@ -46,6 +46,10 @@ class Configuration(BaseSettings): "https://sec-certs.org/cc/dataset.json", description="URL from where to fetch the latest snapshot of fully processed CC dataset.", ) + cc_latest_full_archive: AnyHttpUrl = Field( + "https://sec-certs.org/cc/cc.tar.gz", + description="URL from where to fetch the latest full archive of fully processed CC dataset.", + ) cc_maintenances_latest_snapshot: AnyHttpUrl = Field( "https://sec-certs.org/cc/maintenance_updates.json", description="URL from where to fetch the latest snapshot of CC maintenance updates", @@ -57,6 +61,10 @@ class Configuration(BaseSettings): fips_latest_snapshot: AnyHttpUrl = Field( "https://sec-certs.org/fips/dataset.json", description="URL for the latest snapshot of FIPS dataset." ) + fips_latest_full_archive: AnyHttpUrl = Field( + "https://sec-certs.org/fips/fips.tar.gz", + description="URL from where to fetch the latest full archive of fully processed FIPS dataset.", + ) fips_iut_dataset: AnyHttpUrl = Field( "https://sec-certs.org/fips/iut/dataset.json", description="URL for the dataset of FIPS IUT data." 
)
diff --git a/src/sec_certs/dataset/cc.py b/src/sec_certs/dataset/cc.py
index ebffd912..ec125542 100644
--- a/src/sec_certs/dataset/cc.py
+++ b/src/sec_certs/dataset/cc.py
@@ -3,6 +3,7 @@
 import itertools
 import locale
 import shutil
+import tarfile
 import tempfile
 from collections.abc import Iterator
 from dataclasses import dataclass
@@ -253,6 +254,43 @@ def from_web_latest(cls) -> CCDataset:
         """
         return cls.from_web(config.cc_latest_snapshot, "Downloading CC Dataset", "cc_latest_dataset.json")
 
+    @classmethod
+    def from_web_latest_full(cls, path: str | Path) -> CCDataset:
+        """
+        Fetches the full (and fresh) archive of the CCDataset from sec-certs.org, including the PDFs and auxiliary datasets.
+
+        Note that this is quite large (several gigabytes).
+
+        :param path: Directory to extract the archive into; it is created (with parents) if it does not exist.
+        :raises ValueError: If `path` is empty, exists but is not a directory, or the download does not succeed.
+        :return: The dataset loaded from `dataset.json` inside the extracted directory.
+        """
+        if not path:
+            raise ValueError("Path needs to be defined.")
+        path = Path(path)
+        if not path.exists():
+            path.mkdir(parents=True)
+        if not path.is_dir():
+            raise ValueError("Path needs to be a directory.")
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            dset_path = Path(tmp_dir) / "cc.tar.gz"
+            res = helpers.download_file(
+                config.cc_latest_full_archive,
+                dset_path,
+                show_progress_bar=True,
+                progress_bar_desc="Downloading CC archive",
+            )
+            if res != constants.RESPONSE_OK:
+                raise ValueError("Download failed.")
+            with tarfile.open(dset_path, "r:gz") as tar:
+                # The archive was just downloaded, so reject members that would be written outside
+                # `path` (absolute names, "..", escaping links) whenever tarfile offers the filter.
+                if hasattr(tarfile, "data_filter"):
+                    tar.extractall(path, filter="data")
+                else:
+                    tar.extractall(path)
+        return cls.from_json(path / "dataset.json")
+
     def _set_local_paths(self):
         super()._set_local_paths()
 
@@ -262,6 +300,9 @@ def _set_local_paths(self):
         if self.auxiliary_datasets.mu_dset:
             self.auxiliary_datasets.mu_dset.root_dir = self.mu_dataset_dir
 
+        if self.auxiliary_datasets.scheme_dset:
+            self.auxiliary_datasets.scheme_dset.json_path = self.scheme_dataset_path
+
         for cert in self:
             cert.set_local_paths(
                 self.reports_pdf_dir,
@@ -271,7 +312,6 @@ def _set_local_paths(self):
                 self.targets_txt_dir,
                 self.certificates_txt_dir,
             )
-        # TODO: This forgets to set local paths for other auxiliary datasets
 
     def _merge_certs(self, certs:
dict[str, CCCertificate], cert_source: str | None = None) -> None:
         """
diff --git a/src/sec_certs/dataset/fips.py b/src/sec_certs/dataset/fips.py
index 77f38754..d279f407 100644
--- a/src/sec_certs/dataset/fips.py
+++ b/src/sec_certs/dataset/fips.py
@@ -4,6 +4,8 @@
 import itertools
 import logging
 import shutil
+import tarfile
+import tempfile
 from pathlib import Path
 from typing import Final
 
@@ -221,6 +223,43 @@ def from_web_latest(cls) -> FIPSDataset:
         """
         return cls.from_web(config.fips_latest_snapshot, "Downloading FIPS Dataset", "fips_latest_dataset.json")
 
+    @classmethod
+    def from_web_latest_full(cls, path: str | Path) -> FIPSDataset:
+        """
+        Fetches the full (and fresh) archive of the FIPSDataset from sec-certs.org, including the PDFs and auxiliary datasets.
+
+        Note that this is quite large (several gigabytes).
+
+        :param path: Directory to extract the archive into; it is created (with parents) if it does not exist.
+        :raises ValueError: If `path` is empty, exists but is not a directory, or the download does not succeed.
+        :return: The dataset loaded from `dataset.json` inside the extracted directory.
+        """
+        if not path:
+            raise ValueError("Path needs to be defined.")
+        path = Path(path)
+        if not path.exists():
+            path.mkdir(parents=True)
+        if not path.is_dir():
+            raise ValueError("Path needs to be a directory.")
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            dset_path = Path(tmp_dir) / "fips.tar.gz"
+            res = helpers.download_file(
+                config.fips_latest_full_archive,
+                dset_path,
+                show_progress_bar=True,
+                progress_bar_desc="Downloading FIPS archive",
+            )
+            if res != constants.RESPONSE_OK:
+                raise ValueError("Download failed.")
+            with tarfile.open(dset_path, "r:gz") as tar:
+                # The archive was just downloaded, so reject members that would be written outside
+                # `path` (absolute names, "..", escaping links) whenever tarfile offers the filter.
+                if hasattr(tarfile, "data_filter"):
+                    tar.extractall(path, filter="data")
+                else:
+                    tar.extractall(path)
+        return cls.from_json(path / "dataset.json")
+
     def _set_local_paths(self) -> None:
         super()._set_local_paths()
         if self.auxiliary_datasets.algorithm_dset: