diff --git a/src/sec_certs/model/references_nlp/feature_extraction.py b/src/sec_certs/model/references_nlp/feature_extraction.py
index c875cfab..9e62879e 100644
--- a/src/sec_certs/model/references_nlp/feature_extraction.py
+++ b/src/sec_certs/model/references_nlp/feature_extraction.py
@@ -121,7 +121,10 @@ def get_lang_features(base_name: str, referenced_name: str) -> tuple:
 
 
 def extract_segments(
-    cc_dset: CCDataset, mode: REF_ANNOTATION_MODES, n_sents_before: int = 2, n_sents_after: int = 1
+    cc_dset: CCDataset,
+    mode: REF_ANNOTATION_MODES,
+    n_sents_before: int = 2,
+    n_sents_after: int = 1,
 ) -> pd.DataFrame:
     logger.info("Extracting segments.")
     df = ReferenceSegmentExtractor(n_sents_before, n_sents_after)(list(cc_dset.certs.values()))
@@ -197,7 +200,10 @@ def choose_values_to_fit(df_: pd.DataFrame) -> list[str]:
 
 
 def build_embeddings(
-    segments: pd.DataFrame, mode: REF_ANNOTATION_MODES, method: REF_EMBEDDING_METHOD, model_path: Path | None = None
+    segments: pd.DataFrame,
+    mode: REF_ANNOTATION_MODES,
+    method: REF_EMBEDDING_METHOD,
+    model_path: Path | None = None,
 ) -> pd.DataFrame:
     return (
         _build_transformer_embeddings(segments, mode, model_path)
@@ -225,23 +231,27 @@ def extract_language_features(df: pd.DataFrame, cc_dset: CCDataset) -> pd.DataFr
                 lambda x: strip_all(x["cert_name"], x["cert_versions"]), axis=1
             ),
             referenced_cert_name_stripped_version=lambda df_: df_.apply(
-                lambda x: strip_all(x["referenced_cert_name"], x["referenced_cert_versions"]), axis=1
+                lambda x: strip_all(x["referenced_cert_name"], x["referenced_cert_versions"]),
+                axis=1,
             ),
             lang_token_set_ratio=lambda df_: df_.apply(
                 lambda x: fuzz.token_set_ratio(
-                    x["cert_name_stripped_version"], x["referenced_cert_name_stripped_version"]
+                    x["cert_name_stripped_version"],
+                    x["referenced_cert_name_stripped_version"],
                 ),
                 axis=1,
             ),
             lang_partial_ratio=lambda df_: df_.apply(
                 lambda x: fuzz.partial_ratio(
-                    x["cert_name_stripped_version"], x["referenced_cert_name_stripped_version"]
+                    x["cert_name_stripped_version"],
+                    x["referenced_cert_name_stripped_version"],
                 ),
                 axis=1,
             ),
             lang_token_sort_ratio=lambda df_: df_.apply(
                 lambda x: fuzz.token_sort_ratio(
-                    x["cert_name_stripped_version"], x["referenced_cert_name_stripped_version"]
+                    x["cert_name_stripped_version"],
+                    x["referenced_cert_name_stripped_version"],
                 ),
                 axis=1,
             ),
@@ -251,13 +261,15 @@ def extract_language_features(df: pd.DataFrame, cc_dset: CCDataset) -> pd.DataFr
         .assign(
             lang_n_extracted_versions=lambda df_: df_.cert_versions.map(lambda x: len(x) if x else 0),
             lang_n_intersection_versions=lambda df_: df_.apply(
-                lambda x: len(set(x["cert_versions"]).intersection(set(x["referenced_cert_versions"]))), axis=1
+                lambda x: len(set(x["cert_versions"]).intersection(set(x["referenced_cert_versions"]))),
+                axis=1,
             ),
         )
     )
 
     df_lang_other_features = df_lang.apply(
-        lambda row: get_lang_features(row["cert_name"], row["referenced_cert_name"]), axis=1
+        lambda row: get_lang_features(row["cert_name"], row["referenced_cert_name"]),
+        axis=1,
     ).apply(pd.Series)
     lang_features = [
         "common_numeric_words",
@@ -276,7 +288,8 @@ def extract_language_features(df: pd.DataFrame, cc_dset: CCDataset) -> pd.DataFr
 
     df_lang = pd.concat([df_lang, df_lang_other_features], axis=1).assign(
         lang_should_not_be_component=lambda df_: df_.apply(
-            lambda x: x.lang_len_difference < 5 and x.lang_token_set_ratio == 100, axis=1
+            lambda x: x.lang_len_difference < 5 and x.lang_token_set_ratio == 100,
+            axis=1,
         ),
     )
     for col in df_lang.columns:
@@ -289,9 +302,9 @@
 def perform_dimensionality_reduction(
     df: pd.DataFrame,
     mode: REF_ANNOTATION_MODES,
-    umap_n_neighbors: int = 5,
-    umap_min_dist: float = 0.1,
-    umap_metric: Literal["cosine", "euclidean", "manhattan"] = "euclidean",
+    umap_n_neighbors: int = 10,
+    umap_min_dist: float = 0.51026,
+    umap_metric: Literal["cosine", "euclidean", "manhattan"] = "cosine",
 ) -> pd.DataFrame:
     def choose_values_to_fit(df_: pd.DataFrame):
         if mode == "training":
@@ -325,7 +338,11 @@ def choose_labels_to_fit(df_: pd.DataFrame):
 
     # parallel UMAP not available with random state
     umapper = umap.UMAP(
-        n_neighbors=umap_n_neighbors, min_dist=umap_min_dist, metric=umap_metric, random_state=RANDOM_STATE, n_jobs=1
+        n_neighbors=umap_n_neighbors,
+        min_dist=umap_min_dist,
+        metric=umap_metric,
+        random_state=RANDOM_STATE,
+        n_jobs=1,
     ).fit(embeddings_to_fit, y=labels_to_fit)
 
     pca_mapper = PCA(n_components=2, random_state=RANDOM_STATE).fit(embeddings_to_fit_scaled, y=labels_to_fit)
@@ -539,4 +556,9 @@ def get_data_for_clf(
     else:
         raise ValueError(f"Unknown mode {mode}")
 
-    return np.vstack(train_df[feature_columns].values), train_df.label.values, eval_df, feature_columns
+    return (
+        np.vstack(train_df[feature_columns].values),
+        train_df.label.values,
+        eval_df,
+        feature_columns,
+    )
diff --git a/src/sec_certs/model/references_nlp/segment_extractor.py b/src/sec_certs/model/references_nlp/segment_extractor.py
index d5d727a1..b30d5db7 100644
--- a/src/sec_certs/model/references_nlp/segment_extractor.py
+++ b/src/sec_certs/model/references_nlp/segment_extractor.py
@@ -34,7 +34,7 @@ def swap_and_filter_dict(dct: dict[str, Any], filter_to_keys: set[str]):
     return {key: frozenset(val) for key, val in new_dct.items() if key in filter_to_keys}
 
 
-def fill_reference_segments(record: ReferenceRecord, n_sent_before: int = 1, n_sent_after: int = 0) -> ReferenceRecord:
+def fill_reference_segments(record: ReferenceRecord, n_sent_before: int = 4, n_sent_after: int = 4) -> ReferenceRecord:
     """
     Compute indices of the sentences containing the reference keyword, take their surrounding sentences and join them.
     """
@@ -186,7 +186,10 @@ def _prepare_df_from_cc_dset(self, certs: Iterable[CCCertificate]) -> pd.DataFra
 
     def _build_records(self, certs: list[CCCertificate], source: Literal["target", "report"]) -> list[ReferenceRecord]:
         def get_cert_records(cert: CCCertificate, source: Literal["target", "report"]) -> list[ReferenceRecord]:
-            canonical_ref_var = {"target": "st_references", "report": "report_references"}
+            canonical_ref_var = {
+                "target": "st_references",
+                "report": "report_references",
+            }
             actual_ref_var = {"target": "st_keywords", "report": "report_keywords"}
             raw_source_var = {"target": "st_txt_path", "report": "report_txt_path"}
 
@@ -242,7 +245,13 @@ def _build_df(self, certs: list[CCCertificate], source: Literal["target", "repor
         print(f"I now have {len(results)} in {source} mode")
         return pd.DataFrame.from_records(
             [x.to_pandas_tuple() for x in results],
-            columns=["dgst", "canonical_reference_keyword", "actual_reference_keywords", "source", "segments"],
+            columns=[
+                "dgst",
+                "canonical_reference_keyword",
+                "actual_reference_keywords",
+                "source",
+                "segments",
+            ],
         )
 
     @staticmethod
@@ -282,7 +291,10 @@ def load_single_df(pth: Path, split_name: str) -> pd.DataFrame:
                 },
             )
             .dropna(subset="label")
-            .assign(label=lambda df_: df_.label.str.replace(" ", "_").str.upper(), split=split_name)
+            .assign(
+                label=lambda df_: df_.label.str.replace(" ", "_").str.upper(),
+                split=split_name,
+            )
         )
 
     annotations_directory = Path(str(files("sec_certs.data") / "reference_annotations/final/"))
@@ -350,6 +362,7 @@ def unique_elements(series):
         )
     )
     df_processed.segments = df_processed.apply(
-        lambda row: [process_segment(x, row.actual_reference_keywords) for x in row.segments], axis=1
+        lambda row: [process_segment(x, row.actual_reference_keywords) for x in row.segments],
+        axis=1,
     )
     return df_processed
diff --git a/src/sec_certs/model/references_nlp/training.py b/src/sec_certs/model/references_nlp/training.py
index 4e0f82ce..d65bae59 100644
--- a/src/sec_certs/model/references_nlp/training.py
+++ b/src/sec_certs/model/references_nlp/training.py
@@ -14,7 +14,15 @@
 logger = logging.getLogger(__name__)
 
 
-def _train_model(x_train, y_train, x_eval, y_eval, learning_rate: float = 0.03, depth: int = 6, l2_leaf_reg: int = 3):
+def _train_model(
+    x_train,
+    y_train,
+    x_eval,
+    y_eval,
+    learning_rate: float = 0.03,
+    depth: int = 6,
+    l2_leaf_reg: float = 3,
+):
     clf = CatBoostClassifier(
         learning_rate=learning_rate,
         depth=depth,
@@ -26,7 +34,14 @@ def _train_model(x_train, y_train, x_eval, y_eval, learning_rate: float = 0.03,
 
     train_pool = Pool(x_train, y_train)
     eval_pool = Pool(x_eval, y_eval)
-    clf.fit(train_pool, eval_set=eval_pool, verbose=False, plot=True, early_stopping_rounds=100, use_best_model=True)
+    clf.fit(
+        train_pool,
+        eval_set=eval_pool,
+        verbose=False,
+        plot=True,
+        early_stopping_rounds=100,
+        use_best_model=True,
+    )
     return clf
 
 
@@ -38,9 +53,9 @@ def train_model(
     use_umap: bool = True,
     use_lang: bool = True,
     use_pred: bool = True,
-    learning_rate: float = 0.03,
-    depth: int = 6,
-    l2_leaf_reg: int = 3,
+    learning_rate: float = 0.079573,
+    depth: int = 10,
+    l2_leaf_reg: float = 7.303517,
 ) -> tuple[DummyClassifier | CatBoostClassifier, pd.DataFrame, list[str]]:
     logger.info(f"Training model for mode {mode}")
     X_train, y_train, eval_df, feature_cols = get_data_for_clf(df, mode, use_pca, use_umap, use_lang, use_pred)
@@ -49,7 +64,15 @@
         clf.fit(X_train, y_train)
     else:
         assert eval_df is not None
-        clf = _train_model(X_train, y_train, eval_df[feature_cols], eval_df.label, learning_rate, depth, l2_leaf_reg)
+        clf = _train_model(
+            X_train,
+            y_train,
+            eval_df[feature_cols],
+            eval_df.label,
+            learning_rate,
+            depth,
+            l2_leaf_reg,
+        )
 
     return clf, eval_df, feature_cols