-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
295 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,295 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "10337a8316f35aeb", | ||
"metadata": {}, | ||
"source": [ | ||
"# Scheme data matching evaluation\n", | ||
"This notebook evaluates the performance of matching the data extracted from scheme websites to data from the commoncriteriaportal." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "initial_id", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import pandas as pd\n", | ||
"from pprint import pprint\n", | ||
"from tqdm.auto import trange, tqdm\n", | ||
"\n", | ||
"from sec_certs.dataset import CCDataset, CCSchemeDataset\n", | ||
"from sec_certs.model import CCSchemeMatcher\n", | ||
"from sec_certs.sample.cc_certificate_id import canonicalize\n", | ||
"from sec_certs.sample.cc_scheme import CCScheme, EntryType\n", | ||
"from sec_certs.configuration import config" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "1fccbff4e5cee78a", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"dset = CCDataset.from_json(\"../../dset.json\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "2d711cd0bebf1daa", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"schemes = CCSchemeDataset.from_json(\"../../schemes_new.json\")\n", | ||
"#schemes = CCSchemeDataset.from_web(enhanced=True)\n", | ||
"#schemes.to_json(\"../../schemes_new.json\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "b5470ec4719da0d9", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"dset.auxiliary_datasets.scheme_dset = schemes\n", | ||
"\n", | ||
"count_was = 0\n", | ||
"count_is = 0\n", | ||
"for cert in dset:\n", | ||
" if cert.heuristics.scheme_data is not None:\n", | ||
" count_was += 1\n", | ||
" cert.heuristics.old_scheme_data = cert.heuristics.scheme_data\n", | ||
" cert.heuristics.scheme_data = None\n", | ||
"dset._compute_scheme_data()\n", | ||
"for cert in dset:\n", | ||
" if cert.heuristics.scheme_data is not None:\n", | ||
" count_is += 1\n", | ||
"print(count_was, count_is)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "4d6b6db4956b4774", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def build_df(dset):\n", | ||
" df = pd.DataFrame([(cert.scheme, cert.name, cert.manufacturer, cert.status, cert.heuristics.cert_id, cert.not_valid_before, cert.heuristics.scheme_data) for cert in dset],\n", | ||
" columns=[\"scheme\", \"name\", \"vendor\", \"status\", \"cert_id\", \"cert_date\", \"scheme_data\"])\n", | ||
" df[\"scheme_cert_id\"] = df[\"scheme_data\"].map(lambda data: (data.get(\"cert_id\") or data.get(\"enhanced\", {}).get(\"cert_id\")) if data else None)\n", | ||
" def try_canonicalize(cert_id, scheme):\n", | ||
" try:\n", | ||
" return canonicalize(cert_id, scheme)\n", | ||
" except:\n", | ||
" return None\n", | ||
" df[\"scheme_cert_id_canonical\"] = df.apply(lambda x: try_canonicalize(x[\"scheme_cert_id\"], x[\"scheme\"]), axis=1)\n", | ||
" def get_from_entry(entry, *keys: str):\n", | ||
" if e := entry.get(\"enhanced\"):\n", | ||
" for key in keys:\n", | ||
" if val := e.get(key):\n", | ||
" return val\n", | ||
" for key in keys:\n", | ||
" if val := entry.get(key):\n", | ||
" return val\n", | ||
" return None\n", | ||
" df[\"scheme_cert_date\"] = df[\"scheme_data\"].map(lambda data: get_from_entry(data, \"certification_date\") if data else None)\n", | ||
" return df" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "89cbf34713ce6c6a", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"df = build_df(dset)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "8a7976ae31969150", | ||
"metadata": {}, | ||
"source": [ | ||
"## Evaluate all schemes" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "f2e5c8d5-e08a-4fb9-919a-4de0718f5de5", | ||
"metadata": {}, | ||
"source": [ | ||
"Let's look at how the threshold setting changes the match rate." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "e8a7140f3738166f", | ||
"metadata": { | ||
"scrolled": true | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"original_threshold = config.cc_matching_threshold\n", | ||
"thresholds = list(range(100, -10, -10))\n", | ||
"rates = {}\n", | ||
"dfs = {}\n", | ||
"for threshold in tqdm(thresholds):\n", | ||
" config.cc_matching_threshold = threshold\n", | ||
" for cert in dset:\n", | ||
" cert.heuristics.scheme_data = None\n", | ||
" dset._compute_scheme_data()\n", | ||
" count = 0\n", | ||
" for cert in dset:\n", | ||
" if cert.heuristics.scheme_data is not None:\n", | ||
" count += 1\n", | ||
" print(f\"Threshold: {threshold}\")\n", | ||
" print(f\"Assigned count: {count}\")\n", | ||
" df = build_df(dset)\n", | ||
" dfs[threshold] = df\n", | ||
" for scheme in schemes:\n", | ||
" country = scheme.country\n", | ||
" total = df[df[\"scheme\"] == country]\n", | ||
" assigned = df[(df[\"scheme\"] == country) & df[\"scheme_data\"].notnull()]\n", | ||
" rate = len(assigned)/len(total) * 100 if len(total) != 0 else 0\n", | ||
" rate_list = rates.setdefault(country, [])\n", | ||
" rate_list.append(rate)\n", | ||
" \n", | ||
" print(f\"{country}: {len(assigned)} assigned out of {len(total)} -> {rate:.1f}%\")\n", | ||
" total_active = total[total[\"status\"] == \"active\"]\n", | ||
" assigned_active = assigned[assigned[\"status\"] == \"active\"]\n", | ||
" print(f\"\\t- active: {len(assigned_active)} out of {len(total_active)}, entries: {len(scheme.lists.get(EntryType.Certified, []))}\")\n", | ||
" total_archived = total[total[\"status\"] == \"archived\"]\n", | ||
" assigned_archived = assigned[assigned[\"status\"] == \"archived\"]\n", | ||
" print(f\"\\t- archived: {len(assigned_archived)} out of {len(total_archived)}, entries: {len(scheme.lists.get(EntryType.Archived, []))}\")\n", | ||
" print()\n", | ||
"\n", | ||
"config.cc_matching_threshold = original_threshold" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "05ef5991-1b3f-4fd3-9f4b-808ddb51a89f", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"%matplotlib inline\n", | ||
"from matplotlib import pyplot as plt\n", | ||
"from itertools import cycle\n", | ||
"\n", | ||
"lines = [\"-\",\"--\",\"-.\",\":\"]\n", | ||
"linecycler = cycle(lines)\n", | ||
"\n", | ||
"fig, ax = plt.subplots(figsize=(12,4))\n", | ||
"for scheme in schemes:\n", | ||
" ax.plot(thresholds, rates[scheme.country], next(linecycler), label=scheme.country)\n", | ||
"ax.legend(bbox_to_anchor=(1.04, 1), loc=\"upper left\");" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "62df8d488e204ac2", | ||
"metadata": {}, | ||
"source": [ | ||
"## Evaluate a scheme" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "ab0d21164906fe3f", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"scheme = \"DE\"\n", | ||
"threshold = 70\n", | ||
"df = dfs[threshold]\n", | ||
"df[df[\"scheme\"] == scheme].sample(10)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "709598fe26cc4371", | ||
"metadata": { | ||
"scrolled": true | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"un_df = pd.DataFrame(schemes[scheme].lists[EntryType.Certified])\n", | ||
"un_df" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "11c70f74ce9c4d56", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"sd = list(df[\"scheme_data\"])\n", | ||
"unmatched_certs = [cert for cert in dset if cert.scheme == scheme and cert.heuristics.scheme_data is None and cert.status == \"active\"]\n", | ||
"unmatched_entries = [entry for entry in schemes[scheme].lists[EntryType.Certified] if entry not in sd]\n", | ||
"matches = CCSchemeMatcher.match_all(unmatched_entries, scheme, unmatched_certs)\n", | ||
"matches" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "f5fa79176664dac8", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"pd.DataFrame([cert.pandas_tuple[:5] for cert in unmatched_certs])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "95a6e048e53601c7", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"pd.DataFrame(unmatched_entries)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "ff67f13e-bf99-4ba5-bef0-37c85dd3e2c8", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.12.7" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |