From 844ae8ae0eb8a08461d38e41eaf0d9dba3e90261 Mon Sep 17 00:00:00 2001
From: adamjanovsky <janovsky.vm@gmail.com>
Date: Fri, 24 Nov 2023 17:10:33 +0100
Subject: [PATCH] add inter-annotator agreement on simplified labels

---
 .../inter_annotator_agreement.ipynb           | 51 +++++++++++++++----
 1 file changed, 42 insertions(+), 9 deletions(-)

diff --git a/notebooks/cc/reference_annotations/inter_annotator_agreement.ipynb b/notebooks/cc/reference_annotations/inter_annotator_agreement.ipynb
index d02a8975..2f8f0b78 100644
--- a/notebooks/cc/reference_annotations/inter_annotator_agreement.ipynb
+++ b/notebooks/cc/reference_annotations/inter_annotator_agreement.ipynb
@@ -2,23 +2,36 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 37,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Cohen's Kappa: 0.7101271765978729\n",
-      "Percentage agreement: 0.8225\n"
+      "Results on 5 classes:\n",
+      "\t- Cohen's Kappa: 0.7101271765978729\n",
+      "\t- Percentage agreement: 0.8225\n",
+      "Results on simplified 2 classes:\n",
+      "\t- Cohen's Kappa: 0.8424203759140207\n",
+      "\t- Percentage agreement: 0.9437869822485208\n"
      ]
     }
    ],
    "source": [
-    "import pandas as pd\n",
     "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
     "from sklearn.metrics import cohen_kappa_score\n",
     "\n",
+    "label_mapping = {  # collapses the five annotation labels into two coarse classes\n",
+    "    \"COMPONENT_USED\": \"COMPONENT_USED\",\n",
+    "    \"COMPONENT_SHARED\": \"COMPONENT_USED\",\n",
+    "    \"EVALUATION_REUSED\": \"COMPONENT_USED\",\n",
+    "    \"PREVIOUS_VERSION\": \"PREVIOUS_VERSION\",\n",
+    "    \"RE-EVALUATION\": \"PREVIOUS_VERSION\",\n",
+    "}\n",
+    "\n",
     "\n",
     "def load_all_dataframes(base_folder: Path) -> pd.DataFrame:\n",
     "    splits = [\"train\", \"valid\", \"test\"]\n",
@@ -33,17 +46,37 @@
     "        else:\n",
     "            df_test = df\n",
     "\n",
-    "    return pd.concat([df_train, df_valid, df_test])\n",
+    "    df_to_return = pd.concat([df_train, df_valid, df_test])\n",
+    "    return df_to_return.assign(label=lambda df_: df_.label.fillna(\"unknown\")).assign(\n",
+    "        label=lambda df_: df_.label.str.upper(),\n",
+    "        simplified_label=lambda df_: df_.label.map(label_mapping),\n",
+    "    )\n",
     "\n",
     "\n",
-    "REPO_ROOT = Path(\".\")\n",
+    "REPO_ROOT = Path()\n",
+    "\n",
     "\n",
     "adam_df = load_all_dataframes(REPO_ROOT / \"src/sec_certs/data/reference_annotations/adam\")\n",
     "jano_df = load_all_dataframes(REPO_ROOT / \"src/sec_certs/data/reference_annotations/jano\")\n",
     "agreement_series = adam_df.label == jano_df.label\n",
     "\n",
-    "print(f\"Cohen's Kappa: {cohen_kappa_score(adam_df.label, jano_df.label)}\")\n",
-    "print(f\"Percentage agreement: {agreement_series.loc[agreement_series == True].count() / agreement_series.count()}\")\n"
+    "print(\"Results on 5 classes:\")\n",
+    "print(f\"\\t- Cohen's Kappa: {cohen_kappa_score(adam_df.label, jano_df.label)}\")\n",
+    "print(f\"\\t- Percentage agreement: {agreement_series.mean()}\")\n",
+    "\n",
+    "# Keep only rows where both annotators chose a label covered by label_mapping. The mask is positional,\n",
+    "# so rows that merely share an index label with an unmapped row (e.g. across splits) are not dropped.\n",
+    "both_mapped = adam_df.simplified_label.notna().to_numpy() & jano_df.simplified_label.notna().to_numpy()\n",
+    "adam_df_simplified = adam_df.loc[both_mapped]\n",
+    "jano_df_simplified = jano_df.loc[both_mapped]\n",
+    "agreement_series = adam_df_simplified.simplified_label == jano_df_simplified.simplified_label\n",
+    "\n",
+    "\n",
+    "print(\"Results on simplified 2 classes:\")\n",
+    "print(\n",
+    "    f\"\\t- Cohen's Kappa: {cohen_kappa_score(adam_df_simplified.simplified_label, jano_df_simplified.simplified_label)}\"\n",
+    ")\n",
+    "print(f\"\\t- Percentage agreement: {agreement_series.mean()}\")\n"
    ]
   }
  ],
@@ -63,7 +96,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.13"
+   "version": "3.11.6"
   },
   "orig_nbformat": 4
  },