From 740777367530119e365aa9ccc7de1a3241639d3c Mon Sep 17 00:00:00 2001
From: J08nY
Date: Thu, 21 Nov 2024 12:40:44 +0100
Subject: [PATCH] Add forgotten scheme eval notebook.

---
 notebooks/cc/scheme_eval.ipynb | 295 +++++++++++++++++++++++++++++++++
 1 file changed, 295 insertions(+)
 create mode 100644 notebooks/cc/scheme_eval.ipynb

diff --git a/notebooks/cc/scheme_eval.ipynb b/notebooks/cc/scheme_eval.ipynb
new file mode 100644
index 00000000..7ffc6f16
--- /dev/null
+++ b/notebooks/cc/scheme_eval.ipynb
@@ -0,0 +1,295 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "10337a8316f35aeb",
+   "metadata": {},
+   "source": [
+    "# Scheme data matching evaluation\n",
+    "This notebook evaluates the performance of matching the data extracted from the national certification scheme websites to the certificate data from the Common Criteria portal (commoncriteriaportal.org)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "initial_id",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from pprint import pprint\n",
+    "from tqdm.auto import trange, tqdm\n",
+    "\n",
+    "from sec_certs.dataset import CCDataset, CCSchemeDataset\n",
+    "from sec_certs.model import CCSchemeMatcher\n",
+    "from sec_certs.sample.cc_certificate_id import canonicalize\n",
+    "from sec_certs.sample.cc_scheme import CCScheme, EntryType\n",
+    "from sec_certs.configuration import config"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1fccbff4e5cee78a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dset = CCDataset.from_json(\"../../dset.json\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2d711cd0bebf1daa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "schemes = CCSchemeDataset.from_json(\"../../schemes_new.json\")\n",
+    "#schemes = CCSchemeDataset.from_web(enhanced=True)\n",
+    "#schemes.to_json(\"../../schemes_new.json\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b5470ec4719da0d9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dset.auxiliary_datasets.scheme_dset = schemes\n",
+    "\n",
+    "count_was = 0\n",
+    "count_is = 0\n",
+    "for cert in dset:\n",
+    "    if cert.heuristics.scheme_data is not None:\n",
+    "        count_was += 1\n",
+    "        cert.heuristics.old_scheme_data = cert.heuristics.scheme_data\n",
+    "        cert.heuristics.scheme_data = None\n",
+    "dset._compute_scheme_data()\n",
+    "for cert in dset:\n",
+    "    if cert.heuristics.scheme_data is not None:\n",
+    "        count_is += 1\n",
+    "print(count_was, count_is)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4d6b6db4956b4774",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def build_df(dset):\n",
+    "    df = pd.DataFrame([(cert.scheme, cert.name, cert.manufacturer, cert.status, cert.heuristics.cert_id, cert.not_valid_before, cert.heuristics.scheme_data) for cert in dset],\n",
+    "                      columns=[\"scheme\", \"name\", \"vendor\", \"status\", \"cert_id\", \"cert_date\", \"scheme_data\"])\n",
+    "    df[\"scheme_cert_id\"] = df[\"scheme_data\"].map(lambda data: (data.get(\"cert_id\") or data.get(\"enhanced\", {}).get(\"cert_id\")) if data else None)\n",
+    "    def try_canonicalize(cert_id, scheme):\n",
+    "        try:\n",
+    "            return canonicalize(cert_id, scheme)\n",
+    "        except Exception:\n",
+    "            return None\n",
+    "    df[\"scheme_cert_id_canonical\"] = df.apply(lambda x: try_canonicalize(x[\"scheme_cert_id\"], x[\"scheme\"]), axis=1)\n",
+    "    def get_from_entry(entry, *keys: str):\n",
+    "        if e := entry.get(\"enhanced\"):\n",
+    "            for key in keys:\n",
+    "                if val := e.get(key):\n",
+    "                    return val\n",
+    "        for key in keys:\n",
+    "            if val := entry.get(key):\n",
+    "                return val\n",
+    "        return None\n",
+    "    df[\"scheme_cert_date\"] = df[\"scheme_data\"].map(lambda data: get_from_entry(data, \"certification_date\") if data else None)\n",
+    "    return df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "89cbf34713ce6c6a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = build_df(dset)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8a7976ae31969150",
+   "metadata": {},
+   "source": [
+    "## Evaluate all schemes"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f2e5c8d5-e08a-4fb9-919a-4de0718f5de5",
+   "metadata": {},
+   "source": [
+    "Let's look at how the threshold setting changes the match rate."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e8a7140f3738166f",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "original_threshold = config.cc_matching_threshold\n",
+    "thresholds = list(range(100, -10, -10))\n",
+    "rates = {}\n",
+    "dfs = {}\n",
+    "for threshold in tqdm(thresholds):\n",
+    "    config.cc_matching_threshold = threshold\n",
+    "    for cert in dset:\n",
+    "        cert.heuristics.scheme_data = None\n",
+    "    dset._compute_scheme_data()\n",
+    "    count = 0\n",
+    "    for cert in dset:\n",
+    "        if cert.heuristics.scheme_data is not None:\n",
+    "            count += 1\n",
+    "    print(f\"Threshold: {threshold}\")\n",
+    "    print(f\"Assigned count: {count}\")\n",
+    "    df = build_df(dset)\n",
+    "    dfs[threshold] = df\n",
+    "    for scheme in schemes:\n",
+    "        country = scheme.country\n",
+    "        total = df[df[\"scheme\"] == country]\n",
+    "        assigned = df[(df[\"scheme\"] == country) & df[\"scheme_data\"].notnull()]\n",
+    "        rate = len(assigned)/len(total) * 100 if len(total) != 0 else 0\n",
+    "        rate_list = rates.setdefault(country, [])\n",
+    "        rate_list.append(rate)\n",
+    "        \n",
+    "        print(f\"{country}: {len(assigned)} assigned out of {len(total)} -> {rate:.1f}%\")\n",
+    "        total_active = total[total[\"status\"] == \"active\"]\n",
+    "        assigned_active = assigned[assigned[\"status\"] == \"active\"]\n",
+    "        print(f\"\\t- active: {len(assigned_active)} out of {len(total_active)}, entries: {len(scheme.lists.get(EntryType.Certified, []))}\")\n",
+    "        total_archived = total[total[\"status\"] == \"archived\"]\n",
+    "        assigned_archived = assigned[assigned[\"status\"] == \"archived\"]\n",
+    "        print(f\"\\t- archived: {len(assigned_archived)} out of {len(total_archived)}, entries: {len(scheme.lists.get(EntryType.Archived, []))}\")\n",
+    "        print()\n",
+    "\n",
+    "config.cc_matching_threshold = original_threshold"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "05ef5991-1b3f-4fd3-9f4b-808ddb51a89f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%matplotlib inline\n",
+    "from matplotlib import pyplot as plt\n",
+    "from itertools import cycle\n",
+    "\n",
+    "lines = [\"-\",\"--\",\"-.\",\":\"]\n",
+    "linecycler = cycle(lines)\n",
+    "\n",
+    "fig, ax = plt.subplots(figsize=(12,4))\n",
+    "for scheme in schemes:\n",
+    "    ax.plot(thresholds, rates[scheme.country], next(linecycler), label=scheme.country)\n",
+    "ax.legend(bbox_to_anchor=(1.04, 1), loc=\"upper left\");"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "62df8d488e204ac2",
+   "metadata": {},
+   "source": [
+    "## Evaluate a scheme"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ab0d21164906fe3f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "scheme = \"DE\"\n",
+    "threshold = 70\n",
+    "df = dfs[threshold]\n",
+    "df[df[\"scheme\"] == scheme].sample(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "709598fe26cc4371",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "un_df = pd.DataFrame(schemes[scheme].lists[EntryType.Certified])\n",
+    "un_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "11c70f74ce9c4d56",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sd = list(df[\"scheme_data\"])\n",
+    "unmatched_certs = [cert for cert in dset if cert.scheme == scheme and cert.heuristics.scheme_data is None and cert.status == \"active\"]\n",
+    "unmatched_entries = [entry for entry in schemes[scheme].lists[EntryType.Certified] if entry not in sd]\n",
+    "matches = CCSchemeMatcher.match_all(unmatched_entries, scheme, unmatched_certs)\n",
+    "matches"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f5fa79176664dac8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.DataFrame([cert.pandas_tuple[:5] for cert in unmatched_certs])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "95a6e048e53601c7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.DataFrame(unmatched_entries)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ff67f13e-bf99-4ba5-bef0-37c85dd3e2c8",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}