Skip to content

Commit

Permalink
Merge pull request #424 from crocs-muni/fix/new-dgsts
Browse files (browse the repository at this point in the history)
New digests
  • Loading branch information
J08nY authored Jul 21, 2024
2 parents: 2806225 + ee63131; commit da10d50
Show file tree
Hide file tree
Showing 22 changed files with 1,053 additions and 202 deletions.
7 changes: 3 additions & 4 deletions src/sec_certs/constants.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -145,7 +145,6 @@
CC_SWEDEN_ARCHIVED_URL = CC_SWEDEN_BASE_URL + "/verksamhet/ovrig-verksamhet/csec/arkiverade-certifikat-aldre-an-5-ar/"
CC_TURKEY_ARCHIVED_URL = "https://statik.tse.org.tr/upload/tr/dosya/icerikyonetimi/3300/03112021143434-2.pdf"
CC_USA_BASE_URL = "https://www.niap-ccevs.org"
CC_USA_PRODUCT_URL = CC_USA_BASE_URL + "/Product/"
CC_USA_CERTIFIED_URL = CC_USA_BASE_URL + "/Product/PCL.cfm"
CC_USA_INEVAL_URL = CC_USA_BASE_URL + "/Product/PINE.cfm"
CC_USA_ARCHIVED_URL = CC_USA_BASE_URL + "/Product/Archived.cfm"
CC_USA_PRODUCTS_URL = CC_USA_BASE_URL + "/api/project/product/pcl_products/"
CC_USA_FILES_URL = CC_USA_BASE_URL + "/api/file/get_pcl_files/"
CC_USA_GETFILE_URL = CC_USA_BASE_URL + "/api/file/get_public_file/"
23 changes: 17 additions & 6 deletions src/sec_certs/dataset/cc.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -34,7 +34,7 @@
from sec_certs.sample.cc_scheme import EntryType
from sec_certs.sample.protection_profile import ProtectionProfile
from sec_certs.serialization.json import ComplexSerializableType, serialize
from sec_certs.utils import helpers
from sec_certs.utils import helpers, sanitization
from sec_certs.utils import parallel_processing as cert_processing
from sec_certs.utils.profiling import staged

Expand Down Expand Up @@ -368,7 +368,14 @@ def map_ip_to_hostname(url: str) -> str:
return CCDataset.BASE_URL + relative_path

def _get_primary_key_str(row: Tag):
return row["category"] + row["cert_name"] + row["report_link"]
return "|".join(
[
row["category"],
row["cert_name"],
sanitization.sanitize_link_fname(row["report_link"]) or "None",
sanitization.sanitize_link_fname(row["st_link"]) or "None",
]
)

cert_status = "active" if "active" in str(file) else "archived"

Expand Down Expand Up @@ -408,11 +415,15 @@ def _get_primary_key_str(row: Tag):
df_base = df.loc[~df.is_maintenance].copy()
df_main = df.loc[df.is_maintenance].copy()

df_base.report_link = df_base.report_link.map(map_ip_to_hostname)
df_base.st_link = df_base.st_link.map(map_ip_to_hostname)
df_base.report_link = df_base.report_link.map(map_ip_to_hostname).map(sanitization.sanitize_link)
df_base.st_link = df_base.st_link.map(map_ip_to_hostname).map(sanitization.sanitize_link)

df_main.maintenance_report_link = df_main.maintenance_report_link.map(map_ip_to_hostname)
df_main.maintenance_st_link = df_main.maintenance_st_link.map(map_ip_to_hostname)
df_main.maintenance_report_link = df_main.maintenance_report_link.map(map_ip_to_hostname).map(
sanitization.sanitize_link
)
df_main.maintenance_st_link = df_main.maintenance_st_link.map(map_ip_to_hostname).map(
sanitization.sanitize_link
)

n_all = len(df_base)
n_deduplicated = len(df_base.drop_duplicates(subset=["dgst"]))
Expand Down
58 changes: 37 additions & 21 deletions src/sec_certs/sample/cc.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -15,7 +15,6 @@

import sec_certs.utils.extract
import sec_certs.utils.pdf
import sec_certs.utils.sanitization
from sec_certs import constants
from sec_certs.cert_rules import SARS_IMPLIED_FROM_EAL, cc_rules, rules, security_level_csv_scan
from sec_certs.configuration import config
Expand All @@ -27,7 +26,7 @@
from sec_certs.sample.sar import SAR
from sec_certs.serialization.json import ComplexSerializableType
from sec_certs.serialization.pandas import PandasSerializableType
from sec_certs.utils import helpers
from sec_certs.utils import helpers, sanitization
from sec_certs.utils.extract import normalize_match_string, scheme_frontpage_functions


Expand Down Expand Up @@ -57,16 +56,10 @@ class MaintenanceReport(ComplexSerializableType):
maintenance_st_link: str | None

def __post_init__(self):
super().__setattr__(
"maintenance_report_link", sec_certs.utils.sanitization.sanitize_cc_link(self.maintenance_report_link)
)
super().__setattr__(
"maintenance_st_link", sec_certs.utils.sanitization.sanitize_cc_link(self.maintenance_st_link)
)
super().__setattr__(
"maintenance_title", sec_certs.utils.sanitization.sanitize_string(self.maintenance_title)
)
super().__setattr__("maintenance_date", sec_certs.utils.sanitization.sanitize_date(self.maintenance_date))
super().__setattr__("maintenance_report_link", sanitization.sanitize_link(self.maintenance_report_link))
super().__setattr__("maintenance_st_link", sanitization.sanitize_link(self.maintenance_st_link))
super().__setattr__("maintenance_title", sanitization.sanitize_string(self.maintenance_title))
super().__setattr__("maintenance_date", sanitization.sanitize_date(self.maintenance_date))

@classmethod
def from_dict(cls, dct: dict) -> CCCertificate.MaintenanceReport:
Expand Down Expand Up @@ -420,20 +413,20 @@ def __init__(

self.status = status
self.category = category
self.name = sec_certs.utils.sanitization.sanitize_string(name)
self.name = sanitization.sanitize_string(name)

self.manufacturer = None
if manufacturer:
self.manufacturer = sec_certs.utils.sanitization.sanitize_string(manufacturer)
self.manufacturer = sanitization.sanitize_string(manufacturer)

self.scheme = scheme
self.security_level = sec_certs.utils.sanitization.sanitize_security_levels(security_level)
self.not_valid_before = sec_certs.utils.sanitization.sanitize_date(not_valid_before)
self.not_valid_after = sec_certs.utils.sanitization.sanitize_date(not_valid_after)
self.report_link = sec_certs.utils.sanitization.sanitize_cc_link(report_link)
self.st_link = sec_certs.utils.sanitization.sanitize_cc_link(st_link)
self.cert_link = sec_certs.utils.sanitization.sanitize_cc_link(cert_link)
self.manufacturer_web = sec_certs.utils.sanitization.sanitize_link(manufacturer_web)
self.security_level = sanitization.sanitize_security_levels(security_level)
self.not_valid_before = sanitization.sanitize_date(not_valid_before)
self.not_valid_after = sanitization.sanitize_date(not_valid_after)
self.report_link = sanitization.sanitize_link(report_link)
self.st_link = sanitization.sanitize_link(st_link)
self.cert_link = sanitization.sanitize_link(cert_link)
self.manufacturer_web = sanitization.sanitize_link(manufacturer_web)
self.protection_profiles = protection_profiles
self.maintenance_updates = maintenance_updates
self.state = state if state else self.InternalState()
Expand All @@ -445,6 +438,29 @@ def dgst(self) -> str:
"""
Computes the primary key of the sample using first 16 bytes of SHA-256 digest
"""
if not (self.name is not None and self.category is not None):
raise RuntimeError("Certificate digest can't be computed, because information is missing.")
return helpers.get_first_16_bytes_sha256(
"|".join(
[
self.category,
self.name,
sanitization.sanitize_link_fname(self.report_link) or "None",
sanitization.sanitize_link_fname(self.st_link) or "None",
]
)
)

@property
def old_dgst(self) -> str:
    """
    Computes a digest from the category, the name and the report link after
    it has been passed through `sanitization.sanitize_cc_link`.

    NOTE(review): judging by the name, this is a superseded digest scheme,
    presumably kept so previously computed digests can be matched - confirm.

    :raises RuntimeError: if the name, the report link or the category is None.
    :return str: result of `helpers.get_first_16_bytes_sha256` on the concatenated fields.
    """
    # All three fields are concatenated below, so none of them may be missing.
    if self.name is None or self.report_link is None or self.category is None:
        raise RuntimeError("Certificate digest can't be computed, because information is missing.")

    digest_input = self.category + self.name + sanitization.sanitize_cc_link(self.report_link)  # type: ignore
    return helpers.get_first_16_bytes_sha256(digest_input)

@property
def older_dgst(self) -> str:
    """
    Computes a digest from the plain concatenation of the category, the name
    and the report link (the link is used as stored, without extra sanitization).

    NOTE(review): judging by the name, this is the earliest digest scheme,
    presumably kept so previously computed digests can be matched - confirm.

    :raises RuntimeError: if the name, the report link or the category is None.
    :return str: result of `helpers.get_first_16_bytes_sha256` on the concatenated fields.
    """
    # All three fields are concatenated below, so none of them may be missing.
    if self.name is None or self.report_link is None or self.category is None:
        raise RuntimeError("Certificate digest can't be computed, because information is missing.")

    digest_input = self.category + self.name + self.report_link
    return helpers.get_first_16_bytes_sha256(digest_input)
Expand Down
Loading

0 comments on commit da10d50

Please sign in to comment.