From e1a63860e089ba2d6dae19e17ad08f643a6475a6 Mon Sep 17 00:00:00 2001 From: adamjanovsky Date: Sun, 22 Oct 2023 14:01:33 +0200 Subject: [PATCH] dim. red. prediction WiP --- .../cc/reference_annotations/prediction.ipynb | 893 ++++++++++++------ 1 file changed, 616 insertions(+), 277 deletions(-) diff --git a/notebooks/cc/reference_annotations/prediction.ipynb b/notebooks/cc/reference_annotations/prediction.ipynb index fe51f611..65a251b0 100644 --- a/notebooks/cc/reference_annotations/prediction.ipynb +++ b/notebooks/cc/reference_annotations/prediction.ipynb @@ -2,11 +2,18 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 97, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GPU available: True\n" + ] + } + ], "source": [ - "from __future__ import annotations\n", "import os\n", "\n", "# When on Aura, it is important to first set CUDA_VISIBLE_DEVICES environment variable directly from notebook\n", @@ -15,170 +22,533 @@ "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"MIG-56c53afb-6f08-5e5b-83fa-32fc6f09eeb0\"\n", "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"FALSE\"\n", "\n", + "\n", + "from rapidfuzz import fuzz\n", + "from sec_certs.model.references.segment_extractor import ReferenceSegmentExtractor\n", + "import spacy\n", + "import numpy as np\n", + "import torch\n", "import pandas as pd\n", - "from sec_certs.dataset import CCDataset\n", - "from shutil import copy\n", "from pathlib import Path\n", - "from sec_certs.model.references.segment_extractor import ReferenceSegmentExtractor\n", - "from sec_certs.utils.nlp import prec_recall_metric\n", - "from sklearn.dummy import DummyClassifier\n", - "from sec_certs.utils.nlp import prec_recall_metric\n", + "from sec_certs.model.references.annotator import ReferenceAnnotator\n", "from sec_certs.model.references.annotator_trainer import ReferenceAnnotatorTrainer\n", - "from sklearn.metrics import ConfusionMatrixDisplay\n", - "from sec_certs.utils.helpers import compute_heuristics_version\n", - "from rapidfuzz import fuzz\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "import umap\n", + "import umap.plot\n", + "from sklearn.preprocessing import LabelEncoder\n", + "import numpy as np\n", + "from scipy.spatial import ConvexHull, distance_matrix, QhullError\n", + "from scipy.stats import skew, kurtosis\n", + "from collections import Counter\n", + "from ast import literal_eval\n", + "import matplotlib.pyplot as plt\n", "from sklearn.ensemble import RandomForestClassifier\n", - "from sklearn.metrics import classification_report, f1_score\n", - "import torch\n", - "import optuna\n", - "from matplotlib import pyplot as plt\n", - "\n", + "from sklearn.metrics import classification_report\n", + "from sklearn.metrics import ConfusionMatrixDisplay\n", + "import plotly.express as px\n", + "from sklearn.decomposition import PCA\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.base import ClassifierMixin\n", + "from typing import Literal\n", + "from sec_certs.dataset import CCDataset\n", + "from sec_certs.utils.nlp import prec_recall_metric\n", + "from typing import Final\n", "\n", "REPO_ROOT = Path(\".\").resolve()\n", "DATASET_PATH = REPO_ROOT / \"dataset/cc_final_run_may_23/dataset.json\"\n", - "ANNOTATIONS_PATH = REPO_ROOT / \"src/sec_certs/data/reference_annotations/final/\"\n", + "TENSORBOARD_DATA_DIR = REPO_ROOT / \"dataset/tensorboard_visualisation/\"\n", + "TRAINED_MODEL_PATH = REPO_ROOT / 
\"dataset/reference_prediction/final_model\"\n", "\n", - "def replace_all(text: str, to_replce: set[str]) -> str:\n", - " for i in to_replce:\n", - " text = text.replace(i, \"\")\n", - " return text\n", + "print(f\"GPU available: {torch.cuda.is_available()}\")\n", "\n", - "print(f\"GPU available: {torch.cuda.is_available()}\")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Load data\n", + "nlp = spacy.load(\"en_core_web_sm\")\n", "\n", - "Enrich annotations with string similarity of cert. and referenced cert." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "train_annotations = pd.read_csv(ANNOTATIONS_PATH / \"train.csv\")\n", - "valid_annotations = pd.read_csv(ANNOTATIONS_PATH / \"valid.csv\")\n", - "all_annotations = pd.concat([train_annotations, valid_annotations])\n", - "all_annotations = all_annotations[all_annotations.label != \"None\"].assign(label=lambda df: df.label.str.upper())\n", - "\n", - "dset = CCDataset.from_json(DATASET_PATH)\n", - "all_certs = {x.dgst: x for x in dset.certs.values()}\n", - "dset.certs = {x.dgst: x for x in dset.certs.values() if x.dgst in all_annotations.dgst.unique()}\n", - "\n", - "cert_id_to_name_mapping = {x.heuristics.cert_id: x.name for x in all_certs.values()}\n", - "all_annotations[\"referenced_cert_name\"] = all_annotations[\"referenced_cert_id\"].map(cert_id_to_name_mapping)\n", - "all_annotations[\"cert_name\"] = all_annotations[\"dgst\"].map(lambda x: dset[x].name)\n", - "all_annotations[\"cert_versions\"] = all_annotations[\"cert_name\"].map(compute_heuristics_version)\n", - "all_annotations = all_annotations.loc[all_annotations[\"referenced_cert_name\"].notnull()].copy()\n", - "all_annotations[\"referenced_cert_versions\"] = all_annotations[\"referenced_cert_name\"].map(compute_heuristics_version)\n", - "all_annotations[\"cert_name_stripped_version\"] = all_annotations.apply(lambda x: replace_all(x[\"cert_name\"], x[\"cert_versions\"]), axis=1)\n", - "all_annotations[\"referenced_cert_name_stripped_version\"] = all_annotations.apply(lambda x: replace_all(x[\"referenced_cert_name\"], x[\"referenced_cert_versions\"]), axis=1)\n", - "all_annotations[\"name_similarity\"] = all_annotations.apply(lambda x: fuzz.token_set_ratio(x[\"cert_name\"], x[\"referenced_cert_name\"]), axis=1)\n", - "all_annotations[\"name_similarity_stripped_version\"] = all_annotations.apply(lambda x: fuzz.token_set_ratio(x[\"cert_name_stripped_version\"], x[\"referenced_cert_name_stripped_version\"]), axis=1)\n", - "all_annotations[\"name_len_diff\"] = all_annotations.apply(lambda x: abs(len(x[\"cert_name_stripped_version\"]) - len(x[\"referenced_cert_name_stripped_version\"])), axis=1)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Retrieve segments" + "RANDOM_STATE: Final[int] = 42\n", + "MODES = Literal[\"training\", \"evaluation\", \"production\"]\n", + "EMBEDDING_METHOD = Literal[\"tf_idf\", \"transformer\"]\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 153, "metadata": {}, "outputs": [], "source": [ - "df = ReferenceSegmentExtractor()(dset.certs.values())\n", - "df = df.loc[df.label.notnull()].copy()\n", - "df = df.merge(all_annotations.loc[:, [\"dgst\", \"referenced_cert_id\", \"name_similarity_stripped_version\", \"name_len_diff\", \"cert_name\", \"referenced_cert_name\"]], on=[\"dgst\", \"referenced_cert_id\"])\n", + "def extract_segments(cc_dset: CCDataset, mode: MODES) -> pd.DataFrame:\n", + " df = 
ReferenceSegmentExtractor()(list(cc_dset.certs.values()))\n", + " if mode == \"training\":\n", + " return df.loc[(df.label.notnull()) & ((df.split == \"train\") | (df.split == \"valid\"))]\n", + " elif mode == \"evaluation\":\n", + " return df.loc[df.label.notnull()]\n", + " elif mode == \"production\":\n", + " return df\n", + " else:\n", + " raise ValueError(f\"Unknown mode {mode}\")\n", "\n", - "# Simplified binary labels\n", - "# label_mapping = {\"COMPONENT_USED\": \"COMPONENT_SHARED\", \"REEVALUATION\": \"PREVIOUS_VERSION\"}\n", - "# df.label = df.label.map(lambda x: label_mapping[x] if x in label_mapping else x)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Segment post-processing" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def process_segment(segment: str, referenced_cert_id: str) -> str:\n", - " segment = segment.replace(referenced_cert_id, \"the referenced product\")\n", - " return segment\n", "\n", - "df.segments = df.apply(lambda row: [process_segment(x, row.referenced_cert_id) for x in row.segments], axis=1)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Train & evaluate the baseline classifier (majority class)" + "def _build_transformer_embeddings(segments: pd.DataFrame, mode: MODES, model_path: Path | None = None) -> pd.DataFrame:\n", + " should_save_model = model_path is None\n", + " annotator = None\n", + " if model_path:\n", + " try:\n", + " annotator = ReferenceAnnotator.from_pretrained(model_path)\n", + " should_save_model = False\n", + " except Exception as e:\n", + " print(f\"Failed to load ReferenceAnnotator from {model_path}.\")\n", + " should_save_model = True\n", + "\n", + " if not annotator:\n", + " print(f\"Training ReferenceAnnotator from scratch.\")\n", + " trainer = ReferenceAnnotatorTrainer.from_df(\n", + " segments,\n", + " prec_recall_metric,\n", + " mode=mode,\n", + " n_iterations=20,\n", + " n_epochs=1,\n", + " batch_size=16,\n", + " segmenter_metric=\"f1\",\n", + " ensemble_soft_voting_power=2,\n", + " )\n", + " trainer.train()\n", + " annotator = trainer.clf\n", + "\n", + " if should_save_model:\n", + " annotator.save_pretrained(model_path)\n", + "\n", + " return segments.copy().assign(embeddings=lambda df_: df_.segments.map(annotator._model.model_body.encode))\n", + "\n", + "\n", + "def _build_tf_idf_embeddings(segments: pd.DataFrame, mode: MODES) -> pd.DataFrame:\n", + " def choose_values_to_fit(df_: pd.DataFrame) -> list[str]:\n", + " if mode == \"training\":\n", + " return df_.loc[df_.split == \"train\"].copy().explode(\"segments\").segments.values\n", + " elif mode == \"evaluation\":\n", + " return df_.loc[df_.split != \"test\"].copy().explode(\"segments\").segments.values\n", + " elif mode == \"production\":\n", + " return df_.copy().explode(\"segments\").segments.values\n", + " else:\n", + " raise ValueError(f\"Unknown mode {mode}\")\n", + "\n", + " tf_idf = TfidfVectorizer()\n", + " tf_idf = tf_idf.fit(choose_values_to_fit(segments))\n", + "\n", + " return segments.copy().assign(\n", + " embeddings=lambda df_: df_.segments.map(lambda x: tf_idf.transform(x).toarray().tolist())\n", + " )\n", + "\n", + "\n", + "def build_embeddings(\n", + " segments: pd.DataFrame, mode: MODES, method: EMBEDDING_METHOD, model_path: Path | None = None\n", + ") -> pd.DataFrame:\n", + " return (\n", + " _build_transformer_embeddings(segments, mode, model_path)\n", + " if method == \"transformer\"\n", + " else 
_build_tf_idf_embeddings(segments, mode)\n", + " )\n", + "\n", + "\n", + "def extract_language_features(df: pd.DataFrame, cc_dset: CCDataset) -> pd.DataFrame:\n", + " def strip_all(text: str, to_strip) -> str:\n", + " if pd.isna(to_strip):\n", + " return text\n", + " for i in to_strip:\n", + " text = text.replace(i, \"\")\n", + " return text\n", + "\n", + " def compute_ngram_overlap_spacy(string1, string2, n):\n", + " doc1 = nlp(string1)\n", + " doc2 = nlp(string2)\n", + "\n", + " ngrams1 = [\" \".join([token.text for token in doc1[i : i + n]]) for i in range(len(doc1) - n + 1)]\n", + " ngrams2 = [\" \".join([token.text for token in doc2[i : i + n]]) for i in range(len(doc2) - n + 1)]\n", + "\n", + " overlap = sum((Counter(ngrams1) & Counter(ngrams2)).values())\n", + " return overlap\n", + "\n", + " def compute_character_ngram_overlap(str1, str2, n):\n", + " ngrams1 = [str1[i : i + n] for i in range(len(str1) - n + 1)]\n", + " ngrams2 = [str2[i : i + n] for i in range(len(str2) - n + 1)]\n", + " overlap = sum((Counter(ngrams1) & Counter(ngrams2)).values())\n", + " return overlap\n", + "\n", + " def compute_common_length(str1, str2, prefix=True):\n", + " length = 0\n", + " min_length = min(len(str1), len(str2))\n", + " if prefix:\n", + " for i in range(min_length):\n", + " if str1[i] == str2[i]:\n", + " length += 1\n", + " else:\n", + " break\n", + " else:\n", + " for i in range(1, min_length + 1):\n", + " if str1[-i] == str2[-i]:\n", + " length += 1\n", + " else:\n", + " break\n", + " return length\n", + "\n", + " def compute_numeric_token_overlap(str1, str2):\n", + " doc1 = nlp(str1)\n", + " doc2 = nlp(str2)\n", + "\n", + " tokens1 = [token.text for token in doc1 if token.like_num]\n", + " tokens2 = [token.text for token in doc2 if token.like_num]\n", + "\n", + " overlap = sum((Counter(tokens1) & Counter(tokens2)).values())\n", + " return overlap\n", + "\n", + " def extract_language_features(base_name: str, referenced_name: str) -> tuple:\n", + " common_numeric_words = compute_numeric_token_overlap(base_name, referenced_name)\n", + " common_words = compute_ngram_overlap_spacy(base_name, referenced_name, 1)\n", + " bigram_overlap = compute_ngram_overlap_spacy(base_name, referenced_name, 2)\n", + " trigram_overlap = compute_ngram_overlap_spacy(base_name, referenced_name, 3)\n", + " common_prefix_len = compute_common_length(base_name, referenced_name, True)\n", + " common_suffix_len = compute_common_length(base_name, referenced_name, False)\n", + " character_bigram_overlap = compute_character_ngram_overlap(base_name, referenced_name, 2)\n", + " character_trigram_overlap = compute_character_ngram_overlap(base_name, referenced_name, 3)\n", + " base_len = len(base_name)\n", + " referenced_len = len(referenced_name)\n", + " len_difference = abs(base_len - referenced_len)\n", + "\n", + " return (\n", + " common_numeric_words,\n", + " common_words,\n", + " bigram_overlap,\n", + " trigram_overlap,\n", + " common_prefix_len,\n", + " common_suffix_len,\n", + " character_bigram_overlap,\n", + " character_trigram_overlap,\n", + " base_len,\n", + " referenced_len,\n", + " len_difference,\n", + " )\n", + "\n", + " certs = list(cc_dset.certs.values())\n", + " dgst_to_cert_name = {x.dgst: x.name for x in certs}\n", + " cert_id_to_cert_name = {x.heuristics.cert_id: x.name for x in certs}\n", + " dgst_to_extracted_versions = {x.dgst: x.heuristics.extracted_versions for x in certs}\n", + " cert_id_to_extracted_versions = {x.heuristics.cert_id: x.heuristics.extracted_versions for x in certs}\n", + "\n", + " 
df_lang = (\n", + " df.copy()\n", + " .assign(\n", + " cert_name=lambda df_: df_.dgst.map(dgst_to_cert_name),\n", + " referenced_cert_name=lambda df_: df_.canonical_reference_keyword.map(cert_id_to_cert_name),\n", + " cert_versions=lambda df_: df_.dgst.map(dgst_to_extracted_versions),\n", + " referenced_cert_versions=lambda df_: df_.canonical_reference_keyword.map(cert_id_to_extracted_versions),\n", + " cert_name_stripped_version=lambda df_: df_.apply(\n", + " lambda x: strip_all(x[\"cert_name\"], x[\"cert_versions\"]), axis=1\n", + " ),\n", + " referenced_cert_name_stripped_version=lambda df_: df_.apply(\n", + " lambda x: strip_all(x[\"referenced_cert_name\"], x[\"referenced_cert_versions\"]), axis=1\n", + " ),\n", + " lang_token_set_ratio=lambda df_: df_.apply(\n", + " lambda x: fuzz.token_set_ratio(\n", + " x[\"cert_name_stripped_version\"], x[\"referenced_cert_name_stripped_version\"]\n", + " ),\n", + " axis=1,\n", + " ),\n", + " lang_partial_ratio=lambda df_: df_.apply(\n", + " lambda x: fuzz.partial_ratio(\n", + " x[\"cert_name_stripped_version\"], x[\"referenced_cert_name_stripped_version\"]\n", + " ),\n", + " axis=1,\n", + " ),\n", + " lang_token_sort_ratio=lambda df_: df_.apply(\n", + " lambda x: fuzz.token_sort_ratio(\n", + " x[\"cert_name_stripped_version\"], x[\"referenced_cert_name_stripped_version\"]\n", + " ),\n", + " axis=1,\n", + " ),\n", + " lang_n_segments=lambda df_: df_.segments.map(lambda x: len(x) if x else 0),\n", + " )\n", + " .assign(\n", + " lang_n_extracted_versions=lambda df_: df_.cert_versions.map(lambda x: len(x) if x else 0),\n", + " lang_n_intersection_versions=lambda df_: df_.apply(\n", + " lambda x: len(set(x[\"cert_versions\"]).intersection(set(x[\"referenced_cert_versions\"]))), axis=1\n", + " ),\n", + " )\n", + " )\n", + "\n", + " df_lang_other_features = df_lang.apply(\n", + " lambda row: extract_language_features(row[\"cert_name\"], row[\"referenced_cert_name\"]), axis=1\n", + " ).apply(pd.Series)\n", + " lang_features = [\n", + " \"common_numeric_words\",\n", + " \"common_words\",\n", + " \"bigram_overlap\",\n", + " \"trigram_overlap\",\n", + " \"common_prefix_len\",\n", + " \"common_suffix_len\",\n", + " \"character_bigram_overlap\",\n", + " \"character_trigram_overlap\",\n", + " \"base_len\",\n", + " \"referenced_len\",\n", + " \"len_difference\",\n", + " ]\n", + " df_lang_other_features.columns = [\"lang_\" + x for x in lang_features]\n", + "\n", + " return pd.concat([df_lang, df_lang_other_features], axis=1)\n", + "\n", + "\n", + "def perform_dimensionality_reduction(df: pd.DataFrame, mode: MODES) -> pd.DataFrame:\n", + " def choose_values_to_fit(df_: pd.DataFrame):\n", + " if mode == \"training\":\n", + " return df_.loc[df_.split == \"train\"].copy().embeddings.values\n", + " elif mode == \"evaluation\":\n", + " return df_.loc[df_.split != \"test\"].copy().embeddings.values\n", + " elif mode == \"production\":\n", + " return df_.copy().embeddings.values\n", + " else:\n", + " raise ValueError(f\"Unknown mode {mode}\")\n", + "\n", + " def choose_labels_to_fit(df_: pd.DataFrame):\n", + " if mode == \"training\":\n", + " return df_.loc[df_.split == \"train\"].copy().label.values\n", + " elif mode == \"evaluation\":\n", + " return df_.loc[df_.split != \"test\"].copy().label.values\n", + " elif mode == \"production\":\n", + " return df_.copy().label.values\n", + " else:\n", + " raise ValueError(f\"Unknown mode {mode}\")\n", + "\n", + " df_exploded = df.copy().explode([\"segments\", \"embeddings\"]).reset_index(drop=True)\n", + " label_encoder = 
LabelEncoder()\n", + "\n", + " embeddings_to_fit = np.vstack(choose_values_to_fit(df_exploded))\n", + " labels_to_fit = label_encoder.fit_transform(choose_labels_to_fit(df_exploded))\n", + "\n", + " scaler = StandardScaler()\n", + " embeddings_to_fit_scaled = scaler.fit_transform(embeddings_to_fit)\n", + "\n", + " # parallel UMAP not available with random state\n", + " umapper = umap.UMAP(n_neighbors=5, random_state=RANDOM_STATE, n_jobs=1).fit(embeddings_to_fit, y=labels_to_fit)\n", + " pca_mapper = PCA(n_components=2, random_state=RANDOM_STATE).fit(embeddings_to_fit_scaled, y=labels_to_fit)\n", + "\n", + " all_embeddings = np.vstack(df.embeddings.values)\n", + " all_embeddings_scaled = scaler.transform(all_embeddings)\n", + "\n", + " df_exploded[\"umap\"] = umapper.transform(all_embeddings).tolist()\n", + " df_exploded[\"pca\"] = pca_mapper.transform(all_embeddings_scaled).tolist()\n", + "\n", + " return (\n", + " df_exploded.groupby([\"dgst\", \"canonical_reference_keyword\"])\n", + " .agg(\n", + " {\n", + " \"segments\": lambda x: x.tolist(),\n", + " \"actual_reference_keywords\": \"first\",\n", + " \"label\": \"first\",\n", + " \"split\": \"first\",\n", + " \"embeddings\": lambda x: x.tolist(),\n", + " \"umap\": lambda x: x.tolist(),\n", + " \"pca\": lambda x: x.tolist(),\n", + " }\n", + " )\n", + " .reset_index()\n", + " )\n", + "\n", + "\n", + "def extract_geometrical_features(df: pd.DataFrame) -> pd.DataFrame:\n", + " def extract_features(points):\n", + " # Convert list of points to a numpy array\n", + " points = np.array(points)\n", + " xs = points[:, 0]\n", + " ys = points[:, 1]\n", + "\n", + " # Basic Descriptive Statistics\n", + " mean_x, mean_y = np.mean(xs), np.mean(ys)\n", + " var_x, var_y = np.var(xs), np.var(ys)\n", + " std_x, std_y = np.std(xs), np.std(ys)\n", + " if len(points) > 1:\n", + " skew_x, skew_y = skew(xs), skew(ys)\n", + " kurt_x, kurt_y = kurtosis(xs), kurtosis(ys)\n", + " else:\n", + " skew_x, skew_y = 0, 0\n", + " kurt_x, kurt_y = 0, 0\n", + "\n", + " # Spatial Spread\n", + " range_x, range_y = np.ptp(xs), np.ptp(ys)\n", + " cov_xy = np.cov(xs, ys)[0, 1] if len(points) > 1 else 0\n", + " median_x, median_y = np.median(xs), np.median(ys)\n", + "\n", + " # Distance-based Features\n", + " centroid = [mean_x, mean_y]\n", + " distances_to_centroid = np.linalg.norm(points - centroid, axis=1) if len(points) > 1 else [0]\n", + " mean_distance = np.mean(distances_to_centroid)\n", + " max_distance = np.max(distances_to_centroid)\n", + " min_distance = np.min(distances_to_centroid)\n", + " std_distance = np.std(distances_to_centroid)\n", + " max_min_distance = max_distance - min_distance\n", + "\n", + " sorted_points = points[np.argsort(distances_to_centroid)]\n", + " total_distance = np.sum(np.linalg.norm(sorted_points[1:] - sorted_points[:-1], axis=1))\n", + "\n", + " # Geometric Features\n", + " hull_area, hull_perimeter = (0, 0)\n", + " if len(points) > 2: # ConvexHull needs at least 3 points\n", + " try:\n", + " hull = ConvexHull(points)\n", + " hull_area = hull.volume\n", + " hull_perimeter = hull.area\n", + " except QhullError:\n", + " pass\n", + "\n", + " pairwise_distances = distance_matrix(points, points) if len(points) > 1 else np.array([[0]])\n", + " mean_pairwise_distance = np.mean(pairwise_distances)\n", + " max_pairwise_distance = np.max(pairwise_distances)\n", + "\n", + " if len(points) > 1:\n", + " min_coords = np.min(points, axis=0)\n", + " max_coords = np.max(points, axis=0)\n", + " bounding_box_width = max_coords[0] - min_coords[0]\n", + " 
bounding_box_height = max_coords[1] - min_coords[1]\n", + " bounding_box_area = bounding_box_width * bounding_box_height\n", + "\n", + " aspect_ratio = bounding_box_width / bounding_box_height if bounding_box_height != 0 else 1\n", + " point_density = len(points) / bounding_box_area\n", + " else:\n", + " aspect_ratio = 0\n", + " point_density = 0\n", + "\n", + " # Gather all features into a list\n", + " features = [\n", + " mean_x,\n", + " mean_y,\n", + " var_x,\n", + " var_y,\n", + " std_x,\n", + " std_y,\n", + " skew_x,\n", + " skew_y,\n", + " kurt_x,\n", + " kurt_y,\n", + " range_x,\n", + " range_y,\n", + " cov_xy,\n", + " median_x,\n", + " median_y,\n", + " mean_distance,\n", + " max_distance,\n", + " min_distance,\n", + " max_min_distance,\n", + " std_distance,\n", + " total_distance,\n", + " hull_area,\n", + " hull_perimeter,\n", + " mean_pairwise_distance,\n", + " max_pairwise_distance,\n", + " aspect_ratio,\n", + " point_density,\n", + " ]\n", + "\n", + " return features\n", + "\n", + " feature_names = [\n", + " \"mean_x\",\n", + " \"mean_y\",\n", + " \"var_x\",\n", + " \"var_y\",\n", + " \"std_x\",\n", + " \"std_y\",\n", + " \"skew_x\",\n", + " \"skew_y\",\n", + " \"kurt_x\",\n", + " \"kurt_y\",\n", + " \"range_x\",\n", + " \"range_y\",\n", + " \"cov_xy\",\n", + " \"median_x\",\n", + " \"median_y\",\n", + " \"mean_distance_to_centroid\",\n", + " \"max_distance_to_centroid\",\n", + " \"min_distance_to_centroid\",\n", + " \"max_min_distance_to_centroid\",\n", + " \"std_distance_to_centroid\",\n", + " \"total_distances_to_centroid\",\n", + " \"hull_area\",\n", + " \"hull_perimeter\",\n", + " \"mean_pairwise_distance\",\n", + " \"max_pairwise_distance\",\n", + " \"aspect_ratio\",\n", + " \"point_density\",\n", + " ]\n", + "\n", + " df_features_pca = df.pca.apply(extract_features).apply(pd.Series)\n", + " feature_names_pca = [\"pca_\" + x for x in feature_names]\n", + " df_features_pca.columns = feature_names_pca\n", + "\n", + " df_features_umap = df.umap.apply(extract_features).apply(pd.Series)\n", + " feature_names_umap = [\"umap_\" + x for x in feature_names]\n", + " df_features_umap.columns = feature_names_umap\n", + "\n", + " return pd.concat([df, df_features_pca, df_features_umap], axis=1)\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 141, "metadata": {}, "outputs": [], "source": [ - "dummy_clf = DummyClassifier()\n", - "dummy_clf.fit(df.loc[df.split == \"train\", [\"segments\"]], df.loc[df.split == \"train\"].label)\n", - "y_pred_dummy = dummy_clf.predict(df.loc[df.split == \"valid\", [\"segments\"]])\n", - "print(classification_report(df.loc[df.split == \"valid\"].label, y_pred_dummy, zero_division=0))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Train & evaluate the transformer" + "def build_clf_and_predict(\n", + " df: pd.DataFrame, mode: MODES, train_baseline: bool = False\n", + ") -> tuple[ClassifierMixin, pd.DataFrame]:\n", + " pass\n", + "\n", + "\n", + "def evaluate_clf(clf: ClassifierMixin, df: pd.DataFrame, mode: MODES) -> None:\n", + " pass\n", + "\n", + "\n", + "def display_dim_red_scatter(df: pd.DataFrame, dim_red: Literal[\"umap\", \"pca\"]) -> None:\n", + " df_exploded = df.explode([\"segments\", dim_red]).reset_index()\n", + "\n", + " x_col = dim_red + \"_x\"\n", + " y_col = dim_red + \"_y\"\n", + "\n", + " df_exploded[x_col] = df_exploded[dim_red].map(lambda x: x[0])\n", + " df_exploded[y_col] = df_exploded[dim_red].map(lambda x: x[1])\n", + " df_exploded[\"wrapped_segment\"] = 
df_exploded.segments.str.wrap(60).map(lambda x: x.replace(\"\\n\", \"
\"))\n", + "\n", + " fig = px.scatter(\n", + " df_exploded,\n", + " x=x_col,\n", + " y=y_col,\n", + " color=\"label\",\n", + " hover_data=[\"dgst\", \"canonical_reference_keyword\", \"wrapped_segment\"],\n", + " width=1500,\n", + " height=1000,\n", + " title=f\"{dim_red.upper()} projection of segment embeddings.\",\n", + " )\n", + " fig.show()\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 146, "metadata": {}, "outputs": [], "source": [ - "trainer = ReferenceAnnotatorTrainer.from_df(df, prec_recall_metric, mode=\"training\", use_analytical_rule_name_similarity=True, n_iterations=20, n_epochs=1, batch_size=16, segmenter_metric=\"f1\", ensemble_soft_voting_power=2)\n", - "trainer.train()\n", - "trainer.evaluate()\n", + "cc_dset = CCDataset.from_json(DATASET_PATH)\n", "\n", - "annotator = trainer.clf\n", - "df_predicted = annotator.predict_df(df)\n", + "# df = extract_segments(cc_dset, mode=\"training\")\n", + "# df.to_csv(REPO_ROOT / \"dataset/reference_prediction/dataset.csv\", index=False)\n", "\n", - "print(classification_report(df_predicted.loc[df_predicted.split == \"valid\", [\"y_pred\"]], df_predicted.loc[df_predicted.split == \"valid\", [\"label\"]], zero_division=0))\n", + "df = pd.read_csv(REPO_ROOT / \"dataset/reference_prediction/dataset.csv\")\n", + "df.segments = df.segments.apply(literal_eval)\n", + "df.actual_reference_keywords = df.actual_reference_keywords.apply(literal_eval)\n", + "# df.label = df.label.map(lambda x: np.nan if x == \"IRRELEVANT\" else x)\n", "\n", - "# Print confusion matrix\n", - "ConfusionMatrixDisplay.from_predictions(df_predicted.loc[df_predicted.split == \"valid\", [\"label\"]], df_predicted.loc[df_predicted.split == \"valid\", [\"y_pred\"]], labels=list(trainer.label_mapping.values()), display_labels=list(trainer.label_mapping.values()), xticks_rotation=90)\n", + "mode = \"training\"\n", + "# TODO: Plug this into pandas pipeline\n", + "df = build_embeddings(df, mode=mode, method=\"transformer\", model_path=TRAINED_MODEL_PATH)\n", + "df = perform_dimensionality_reduction(df, mode=mode)\n", + "df = extract_language_features(df, cc_dset)\n", + "df = extract_geometrical_features(df)\n", "\n", - "# Serialize errors into file\n", - "df_predicted.y_proba = df_predicted.y_proba.map(lambda x: {y: z for y, z in zip(trainer.label_mapping.values(), x)})\n", - "df_predicted.loc[~df_predicted.correct].to_json(\"/var/tmp/xjanovsk/certs/sec-certs/dataset/annotator_errors.json\", orient=\"records\", indent=4)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Hyperparameter optimization" + "# display_dim_red_scatter(df.loc[df.split == \"valid\"], \"umap\")\n" ] }, { @@ -187,178 +557,148 @@ "metadata": {}, "outputs": [], "source": [ - "def define_trainer(trial, df):\n", - " use_analytical_rule_name_similarity = trial.suggest_categorical(\"use_analytical_rule_name_similarity\", [True, False])\n", - " n_iterations = trial.suggest_int(\"n_iterations\", 1, 50)\n", - " n_epochs = trial.suggest_int(\"n_epochs\", 1, 5)\n", - " batch_size = trial.suggest_int(\"batch_size\", 8, 32)\n", - " segmenter_metric = trial.suggest_categorical(\"segmenter_metric\", [\"accuracy\", \"f1\"])\n", - " ensemble_soft_voting_power = trial.suggest_int(\"ensemble_soft_voting_power\", 1, 5)\n", - " return ReferenceAnnotatorTrainer.from_df(df, prec_recall_metric, mode=\"training\", use_analytical_rule_name_similarity=use_analytical_rule_name_similarity, n_iterations=n_iterations, n_epochs=n_epochs, batch_size=batch_size, 
segmenter_metric=segmenter_metric, ensemble_soft_voting_power=ensemble_soft_voting_power)\n", - "\n", - "def objective(trial):\n", - " trainer = define_trainer(trial, df)\n", - " trainer.train()\n", - "\n", - " annotator = trainer.clf\n", - " df_predicted = annotator.predict_df(df)\n", - " return f1_score(df_predicted.loc[df_predicted.split == \"valid\", [\"y_pred\"]], df_predicted.loc[df_predicted.split == \"valid\", [\"label\"]], zero_division=\"warn\", average=\"weighted\")\n" + "def build_clf_and_predict(\n", + " df: pd.DataFrame,\n", + " mode: MODES,\n", + " train_baseline: bool = False,\n", + " use_pca: bool = True,\n", + " use_umap: bool = True,\n", + " use_lang: bool = True,\n", + ") -> tuple[ClassifierMixin, pd.DataFrame]:\n", + " pass\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 152, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['dgst', 'canonical_reference_keyword', 'segments',\n", + " 'actual_reference_keywords', 'label', 'split', 'embeddings', 'umap',\n", + " 'pca', 'n_sentences', 'cert_name', 'referenced_cert_name',\n", + " 'cert_versions', 'referenced_cert_versions',\n", + " 'cert_name_stripped_version', 'referenced_cert_name_stripped_version',\n", + " 'lang_token_set_ratio', 'lang_partial_ratio', 'lang_token_sort_ratio',\n", + " 'lang_n_segments', 'lang_n_extracted_versions',\n", + " 'lang_n_intersection_versions', 'lang_common_numeric_words',\n", + " 'lang_common_words', 'lang_bigram_overlap', 'lang_trigram_overlap',\n", + " 'lang_common_prefix_len', 'lang_common_suffix_len',\n", + " 'lang_character_bigram_overlap', 'lang_character_trigram_overlap',\n", + " 'lang_base_len', 'lang_referenced_len', 'lang_len_difference',\n", + " 'pca_mean_x', 'pca_mean_y', 'pca_var_x', 'pca_var_y', 'pca_std_x',\n", + " 'pca_std_y', 'pca_skew_x', 'pca_skew_y', 'pca_kurt_x', 'pca_kurt_y',\n", + " 'pca_range_x', 'pca_range_y', 'pca_cov_xy', 'pca_median_x',\n", + " 'pca_median_y', 'pca_mean_distance_to_centroid',\n", + " 'pca_max_distance_to_centroid', 'pca_min_distance_to_centroid',\n", + " 'pca_max_min_distance_to_centroid', 'pca_std_distance_to_centroid',\n", + " 'pca_total_distances_to_centroid', 'pca_hull_area',\n", + " 'pca_hull_perimeter', 'pca_mean_pairwise_distance',\n", + " 'pca_max_pairwise_distance', 'pca_aspect_ratio', 'pca_point_density',\n", + " 'umap_mean_x', 'umap_mean_y', 'umap_var_x', 'umap_var_y', 'umap_std_x',\n", + " 'umap_std_y', 'umap_skew_x', 'umap_skew_y', 'umap_kurt_x',\n", + " 'umap_kurt_y', 'umap_range_x', 'umap_range_y', 'umap_cov_xy',\n", + " 'umap_median_x', 'umap_median_y', 'umap_mean_distance_to_centroid',\n", + " 'umap_max_distance_to_centroid', 'umap_min_distance_to_centroid',\n", + " 'umap_max_min_distance_to_centroid', 'umap_std_distance_to_centroid',\n", + " 'umap_total_distances_to_centroid', 'umap_hull_area',\n", + " 'umap_hull_perimeter', 'umap_mean_pairwise_distance',\n", + " 'umap_max_pairwise_distance', 'umap_aspect_ratio',\n", + " 'umap_point_density'],\n", + " dtype='object')" + ] + }, + "execution_count": 152, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "study = optuna.create_study(direction=\"maximize\")\n", - "study.optimize(objective, n_trials=3, timeout=60*60*24)\n", - "\n", - "best_trial = study.best_trial\n", - "print(\"Best Trial:\", best_trial.params)\n", - "print(\"Best Trial Value:\", best_trial.value)\n" + "df.columns\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 150, "metadata": {}, - "outputs": 
[], + "outputs": [ + { + "data": { + "text/plain": [ + "['lang_token_set_ratio',\n", + " 'lang_partial_ratio',\n", + " 'lang_token_sort_ratio',\n", + " 'lang_n_segments',\n", + " 'lang_n_extracted_versions',\n", + " 'lang_n_intersection_versions',\n", + " 'lang_common_numeric_words',\n", + " 'lang_common_words',\n", + " 'lang_bigram_overlap',\n", + " 'lang_trigram_overlap',\n", + " 'lang_common_prefix_len',\n", + " 'lang_common_suffix_len',\n", + " 'lang_character_bigram_overlap',\n", + " 'lang_character_trigram_overlap',\n", + " 'lang_base_len',\n", + " 'lang_referenced_len',\n", + " 'lang_len_difference']" + ] + }, + "execution_count": 150, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "ax = optuna.visualization.matplotlib.plot_optimization_history(study)\n", - "ax.figure.savefig(\"/var/tmp/xjanovsk/certs/sec-certs/dataset/cc_refs_hyperparam_search/optimization_history.pdf\", bbox_inches=\"tight\")\n", - "\n", - "ax = optuna.visualization.matplotlib.plot_param_importances(study)\n", - "ax.figure.savefig(\"/var/tmp/xjanovsk/certs/sec-certs/dataset/cc_refs_hyperparam_search/param_importances.pdf\", bbox_inches=\"tight\")\n", - "\n", - "ax = optuna.visualization.matplotlib.plot_timeline(study)\n", - "ax.figure.savefig(\"/var/tmp/xjanovsk/certs/sec-certs/dataset/cc_refs_hyperparam_search/timeline.pdf\", bbox_inches=\"tight\")\n" + "[x for x in df.columns if x.startswith(\"lang_\")]\n" ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 151, "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
" - ], "text/plain": [ - " number value datetime_start datetime_complete \\\n", - "0 0 0.872814 2023-08-24 13:52:46.410796 2023-08-24 13:55:53.970011 \n", - "1 1 0.884345 2023-08-24 13:55:53.971459 2023-08-24 14:09:15.589868 \n", - "2 2 0.869785 2023-08-24 14:09:15.591965 2023-08-24 14:17:14.780016 \n", - "\n", - " duration params_batch_size \\\n", - "0 0 days 00:03:07.559215 19 \n", - "1 0 days 00:13:21.618409 18 \n", - "2 0 days 00:07:59.188051 10 \n", - "\n", - " params_ensemble_soft_voting_power params_n_epochs params_n_iterations \\\n", - "0 5 2 10 \n", - "1 5 2 43 \n", - "2 4 1 45 \n", - "\n", - " params_segmenter_metric params_use_analytical_rule_name_similarity \\\n", - "0 accuracy True \n", - "1 f1 True \n", - "2 f1 False \n", - "\n", - " state \n", - "0 COMPLETE \n", - "1 COMPLETE \n", - "2 COMPLETE " + "['umap_mean_x',\n", + " 'umap_mean_y',\n", + " 'umap_var_x',\n", + " 'umap_var_y',\n", + " 'umap_std_x',\n", + " 'umap_std_y',\n", + " 'umap_skew_x',\n", + " 'umap_skew_y',\n", + " 'umap_kurt_x',\n", + " 'umap_kurt_y',\n", + " 'umap_range_x',\n", + " 'umap_range_y',\n", + " 'umap_cov_xy',\n", + " 'umap_median_x',\n", + " 'umap_median_y',\n", + " 'umap_mean_distance_to_centroid',\n", + " 'umap_max_distance_to_centroid',\n", + " 'umap_min_distance_to_centroid',\n", + " 'umap_max_min_distance_to_centroid',\n", + " 'umap_std_distance_to_centroid',\n", + " 'umap_total_distances_to_centroid',\n", + " 'umap_hull_area',\n", + " 'umap_hull_perimeter',\n", + " 'umap_mean_pairwise_distance',\n", + " 'umap_max_pairwise_distance',\n", + " 'umap_aspect_ratio',\n", + " 'umap_point_density']" ] }, - "execution_count": 35, + "execution_count": 151, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "study.trials_dataframe()\n" + "[x for x in df.columns if x.startswith(\"umap_\")]\n" ] } ], @@ -378,9 +718,8 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.1" - }, - "orig_nbformat": 4 + "version": "3.10.13" + } }, "nbformat": 4, "nbformat_minor": 2