Skip to content

Commit

Permalink
Add a way to download full dataset archive (including PDFs) from the …
Browse files Browse the repository at this point in the history
…web.
  • Loading branch information
J08nY committed Oct 17, 2024
1 parent c8c91a0 commit fc3a612
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 3 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,8 @@ df_2015_and_newer = df.loc[df.year_from > 2014]
df.year_from.value_counts().sort_index().plot.line()
```

<!-- ## Authors
## Authors

This work is being done at [CRoCS MUNI](https://crocs.fi.muni.cz/) by Adam Janovsky, Jan Jancar, Petr Svenda, Jiri Michalik, Lukasz Chmielewski and other contributors. This work was supported by the Internal grant agency of Masaryk University, CZ.02.2.69/0.0/0.0/19_073/0016943.

![](docs/_static/logolink_OP_VVV_hor_barva_eng.jpg) -->
![](docs/_static/logolink_OP_VVV_hor_barva_eng.jpg)
8 changes: 8 additions & 0 deletions src/sec_certs/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ class Configuration(BaseSettings):
"https://sec-certs.org/cc/dataset.json",
description="URL from where to fetch the latest snapshot of fully processed CC dataset.",
)
cc_latest_full_archive: AnyHttpUrl = Field(
"https://sec-certs.org/cc/cc.tar.gz",
description="URL from where to fetch the latest full archive of fully processed CC dataset.",
)
cc_maintenances_latest_snapshot: AnyHttpUrl = Field(
"https://sec-certs.org/cc/maintenance_updates.json",
description="URL from where to fetch the latest snapshot of CC maintenance updates",
Expand All @@ -57,6 +61,10 @@ class Configuration(BaseSettings):
fips_latest_snapshot: AnyHttpUrl = Field(
"https://sec-certs.org/fips/dataset.json", description="URL for the latest snapshot of FIPS dataset."
)
fips_latest_full_archive: AnyHttpUrl = Field(
"https://sec-certs.org/fips/fips.tar.gz",
description="URL from where to fetch the latest full archive of fully processed FIPS dataset.",
)
fips_iut_dataset: AnyHttpUrl = Field(
"https://sec-certs.org/fips/iut/dataset.json", description="URL for the dataset of FIPS IUT data."
)
Expand Down
33 changes: 32 additions & 1 deletion src/sec_certs/dataset/cc.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import itertools
import locale
import shutil
import tarfile
import tempfile
from collections.abc import Iterator
from dataclasses import dataclass
Expand Down Expand Up @@ -253,6 +254,34 @@ def from_web_latest(cls) -> CCDataset:
"""
return cls.from_web(config.cc_latest_snapshot, "Downloading CC Dataset", "cc_latest_dataset.json")

@classmethod
def from_web_latest_full(cls, path: str | Path) -> CCDataset:
    """
    Fetches the full (and fresh) archive of the CCDataset from sec-certs.org, including the PDFs and auxiliary datasets.
    Note that this is quite large (several gigabytes).

    :param path: Directory to extract the archive into. It is created (including parents) when missing.
    :return: Dataset loaded from the ``dataset.json`` file found in ``path`` after extraction.
    :raises ValueError: If ``path`` is empty, if it exists but is not a directory, or if the download fails.
    """
    if not path:
        raise ValueError("Path needs to be defined.")
    path = Path(path)
    # Refuse an existing non-directory up front, before any download is attempted.
    if path.exists() and not path.is_dir():
        raise ValueError("Path needs to be a directory.")
    # exist_ok avoids the check-then-create race of a separate exists() test.
    path.mkdir(parents=True, exist_ok=True)

    # The compressed archive only lives in a temporary directory; it is removed once extracted.
    with tempfile.TemporaryDirectory() as tmp_dir:
        dset_path = Path(tmp_dir) / "cc.tar.gz"
        res = helpers.download_file(
            config.cc_latest_full_archive,
            dset_path,
            show_progress_bar=True,
            progress_bar_desc="Downloading CC archive",
        )
        if res != constants.RESPONSE_OK:
            raise ValueError("Download failed.")
        with tarfile.open(dset_path, "r:gz") as tar:
            # The archive comes from the network, so use the "data" extraction filter where the
            # running Python supports it: it rejects members that would land outside ``path``
            # (absolute paths, "..", escaping links) and avoids the Python 3.12+ DeprecationWarning.
            if hasattr(tarfile, "data_filter"):
                tar.extractall(path, filter="data")
            else:
                tar.extractall(path)
    return cls.from_json(path / "dataset.json")

Check warning on line 283 in src/sec_certs/dataset/cc.py

View check run for this annotation

Codecov / codecov/patch

src/sec_certs/dataset/cc.py#L279-L283

Added lines #L279 - L283 were not covered by tests

def _set_local_paths(self):
super()._set_local_paths()

Expand All @@ -262,6 +291,9 @@ def _set_local_paths(self):
if self.auxiliary_datasets.mu_dset:
self.auxiliary_datasets.mu_dset.root_dir = self.mu_dataset_dir

if self.auxiliary_datasets.scheme_dset:
self.auxiliary_datasets.scheme_dset.json_path = self.scheme_dataset_path

Check warning on line 295 in src/sec_certs/dataset/cc.py

View check run for this annotation

Codecov / codecov/patch

src/sec_certs/dataset/cc.py#L295

Added line #L295 was not covered by tests

for cert in self:
cert.set_local_paths(
self.reports_pdf_dir,
Expand All @@ -271,7 +303,6 @@ def _set_local_paths(self):
self.targets_txt_dir,
self.certificates_txt_dir,
)
# TODO: This forgets to set local paths for other auxiliary datasets

def _merge_certs(self, certs: dict[str, CCCertificate], cert_source: str | None = None) -> None:
"""
Expand Down
30 changes: 30 additions & 0 deletions src/sec_certs/dataset/fips.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import itertools
import logging
import shutil
import tarfile
import tempfile
from pathlib import Path
from typing import Final

Expand Down Expand Up @@ -221,6 +223,34 @@ def from_web_latest(cls) -> FIPSDataset:
"""
return cls.from_web(config.fips_latest_snapshot, "Downloading FIPS Dataset", "fips_latest_dataset.json")

@classmethod
def from_web_latest_full(cls, path: str | Path) -> FIPSDataset:
    """
    Fetches the full (and fresh) archive of the FIPSDataset from sec-certs.org, including the PDFs and auxiliary datasets.
    Note that this is quite large (several gigabytes).

    :param path: Directory to extract the archive into. It is created (including parents) when missing.
    :return: Dataset loaded from the ``dataset.json`` file found in ``path`` after extraction.
    :raises ValueError: If ``path`` is empty, if it exists but is not a directory, or if the download fails.
    """
    if not path:
        raise ValueError("Path needs to be defined.")
    path = Path(path)
    # Refuse an existing non-directory up front, before any download is attempted.
    if path.exists() and not path.is_dir():
        raise ValueError("Path needs to be a directory.")
    # exist_ok avoids the check-then-create race of a separate exists() test.
    path.mkdir(parents=True, exist_ok=True)

    # The compressed archive only lives in a temporary directory; it is removed once extracted.
    with tempfile.TemporaryDirectory() as tmp_dir:
        dset_path = Path(tmp_dir) / "fips.tar.gz"
        res = helpers.download_file(
            config.fips_latest_full_archive,
            dset_path,
            show_progress_bar=True,
            progress_bar_desc="Downloading FIPS archive",
        )
        if res != constants.RESPONSE_OK:
            raise ValueError("Download failed.")
        with tarfile.open(dset_path, "r:gz") as tar:
            # The archive comes from the network, so use the "data" extraction filter where the
            # running Python supports it: it rejects members that would land outside ``path``
            # (absolute paths, "..", escaping links) and avoids the Python 3.12+ DeprecationWarning.
            if hasattr(tarfile, "data_filter"):
                tar.extractall(path, filter="data")
            else:
                tar.extractall(path)
    return cls.from_json(path / "dataset.json")

Check warning on line 252 in src/sec_certs/dataset/fips.py

View check run for this annotation

Codecov / codecov/patch

src/sec_certs/dataset/fips.py#L248-L252

Added lines #L248 - L252 were not covered by tests

def _set_local_paths(self) -> None:
super()._set_local_paths()
if self.auxiliary_datasets.algorithm_dset:
Expand Down

0 comments on commit fc3a612

Please sign in to comment.