Add forgotten scheme eval notebook.

crocs-muni · Nov 21, 2024 · 7407773 · 7407773
1 parent 6da6641
commit 7407773
Showing 1 changed file with 295 additions and 0 deletions.
diff --git a/notebooks/cc/scheme_eval.ipynb b/notebooks/cc/scheme_eval.ipynb
@@ -0,0 +1,295 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "10337a8316f35aeb",
+   "metadata": {},
+   "source": [
+    "# Scheme data matching evaluation\n",
+    "This notebook evaluates the performance of matching the data extracted from scheme websites to data from the commoncriteriaportal."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "initial_id",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from pprint import pprint\n",
+    "from tqdm.auto import trange, tqdm\n",
+    "\n",
+    "from sec_certs.dataset import CCDataset, CCSchemeDataset\n",
+    "from sec_certs.model import CCSchemeMatcher\n",
+    "from sec_certs.sample.cc_certificate_id import canonicalize\n",
+    "from sec_certs.sample.cc_scheme import CCScheme, EntryType\n",
+    "from sec_certs.configuration import config"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1fccbff4e5cee78a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dset = CCDataset.from_json(\"../../dset.json\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2d711cd0bebf1daa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "schemes = CCSchemeDataset.from_json(\"../../schemes_new.json\")\n",
+    "#schemes = CCSchemeDataset.from_web(enhanced=True)\n",
+    "#schemes.to_json(\"../../schemes_new.json\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b5470ec4719da0d9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dset.auxiliary_datasets.scheme_dset = schemes\n",
+    "\n",
+    "count_was = 0\n",
+    "count_is = 0\n",
+    "for cert in dset:\n",
+    "    if cert.heuristics.scheme_data is not None:\n",
+    "        count_was += 1\n",
+    "    cert.heuristics.old_scheme_data = cert.heuristics.scheme_data\n",
+    "    cert.heuristics.scheme_data = None\n",
+    "dset._compute_scheme_data()\n",
+    "for cert in dset:\n",
+    "    if cert.heuristics.scheme_data is not None:\n",
+    "        count_is += 1\n",
+    "print(count_was, count_is)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4d6b6db4956b4774",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def build_df(dset):\n",
+    "    df = pd.DataFrame([(cert.scheme, cert.name, cert.manufacturer, cert.status, cert.heuristics.cert_id, cert.not_valid_before, cert.heuristics.scheme_data)  for cert in dset],\n",
+    "                      columns=[\"scheme\", \"name\", \"vendor\", \"status\", \"cert_id\", \"cert_date\", \"scheme_data\"])\n",
+    "    df[\"scheme_cert_id\"] = df[\"scheme_data\"].map(lambda data: (data.get(\"cert_id\") or data.get(\"enhanced\", {}).get(\"cert_id\")) if data else None)\n",
+    "    def try_canonicalize(cert_id, scheme):\n",
+    "        try:\n",
+    "            return canonicalize(cert_id, scheme)\n",
+    "        except:\n",
+    "            return None\n",
+    "    df[\"scheme_cert_id_canonical\"] = df.apply(lambda x: try_canonicalize(x[\"scheme_cert_id\"], x[\"scheme\"]), axis=1)\n",
+    "    def get_from_entry(entry, *keys: str):\n",
+    "        if e := entry.get(\"enhanced\"):\n",
+    "            for key in keys:\n",
+    "                if val := e.get(key):\n",
+    "                    return val\n",
+    "        for key in keys:\n",
+    "            if val := entry.get(key):\n",
+    "                return val\n",
+    "        return None\n",
+    "    df[\"scheme_cert_date\"] = df[\"scheme_data\"].map(lambda data: get_from_entry(data, \"certification_date\") if data else None)\n",
+    "    return df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "89cbf34713ce6c6a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = build_df(dset)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8a7976ae31969150",
+   "metadata": {},
+   "source": [
+    "## Evaluate all schemes"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f2e5c8d5-e08a-4fb9-919a-4de0718f5de5",
+   "metadata": {},
+   "source": [
+    "Let's look at how the threshold setting changes the match rate."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e8a7140f3738166f",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "original_threshold = config.cc_matching_threshold\n",
+    "thresholds = list(range(100, -10, -10))\n",
+    "rates = {}\n",
+    "dfs = {}\n",
+    "for threshold in tqdm(thresholds):\n",
+    "    config.cc_matching_threshold = threshold\n",
+    "    for cert in dset:\n",
+    "        cert.heuristics.scheme_data = None\n",
+    "    dset._compute_scheme_data()\n",
+    "    count = 0\n",
+    "    for cert in dset:\n",
+    "        if cert.heuristics.scheme_data is not None:\n",
+    "            count += 1\n",
+    "    print(f\"Threshold: {threshold}\")\n",
+    "    print(f\"Assigned count: {count}\")\n",
+    "    df = build_df(dset)\n",
+    "    dfs[threshold] = df\n",
+    "    for scheme in schemes:\n",
+    "        country = scheme.country\n",
+    "        total = df[df[\"scheme\"] == country]\n",
+    "        assigned = df[(df[\"scheme\"] == country) & df[\"scheme_data\"].notnull()]\n",
+    "        rate = len(assigned)/len(total) * 100 if len(total) != 0 else 0\n",
+    "        rate_list = rates.setdefault(country, [])\n",
+    "        rate_list.append(rate)\n",
+    "        \n",
+    "        print(f\"{country}: {len(assigned)} assigned out of {len(total)} -> {rate:.1f}%\")\n",
+    "        total_active = total[total[\"status\"] == \"active\"]\n",
+    "        assigned_active = assigned[assigned[\"status\"] == \"active\"]\n",
+    "        print(f\"\\t- active: {len(assigned_active)} out of {len(total_active)}, entries: {len(scheme.lists.get(EntryType.Certified, []))}\")\n",
+    "        total_archived = total[total[\"status\"] == \"archived\"]\n",
+    "        assigned_archived = assigned[assigned[\"status\"] == \"archived\"]\n",
+    "        print(f\"\\t- archived: {len(assigned_archived)} out of {len(total_archived)}, entries: {len(scheme.lists.get(EntryType.Archived, []))}\")\n",
+    "    print()\n",
+    "\n",
+    "config.cc_matching_threshold = original_threshold"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "05ef5991-1b3f-4fd3-9f4b-808ddb51a89f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%matplotlib inline\n",
+    "from matplotlib import pyplot as plt\n",
+    "from itertools import cycle\n",
+    "\n",
+    "lines = [\"-\",\"--\",\"-.\",\":\"]\n",
+    "linecycler = cycle(lines)\n",
+    "\n",
+    "fig, ax = plt.subplots(figsize=(12,4))\n",
+    "for scheme in schemes:\n",
+    "    ax.plot(thresholds, rates[scheme.country], next(linecycler), label=scheme.country)\n",
+    "ax.legend(bbox_to_anchor=(1.04, 1), loc=\"upper left\");"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "62df8d488e204ac2",
+   "metadata": {},
+   "source": [
+    "## Evaluate a scheme"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ab0d21164906fe3f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "scheme = \"DE\"\n",
+    "threshold = 70\n",
+    "df = dfs[threshold]\n",
+    "df[df[\"scheme\"] == scheme].sample(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "709598fe26cc4371",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "un_df = pd.DataFrame(schemes[scheme].lists[EntryType.Certified])\n",
+    "un_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "11c70f74ce9c4d56",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sd = list(df[\"scheme_data\"])\n",
+    "unmatched_certs = [cert for cert in dset if cert.scheme == scheme and cert.heuristics.scheme_data is None and cert.status == \"active\"]\n",
+    "unmatched_entries = [entry for entry in schemes[scheme].lists[EntryType.Certified] if entry not in sd]\n",
+    "matches = CCSchemeMatcher.match_all(unmatched_entries, scheme, unmatched_certs)\n",
+    "matches"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f5fa79176664dac8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.DataFrame([cert.pandas_tuple[:5] for cert in unmatched_certs])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "95a6e048e53601c7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.DataFrame(unmatched_entries)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ff67f13e-bf99-4ba5-bef0-37c85dd3e2c8",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}