hardcode hyperparams for all stages
adamjanovsky committed Nov 23, 2023
1 parent 258bed5 commit dbe3fed
Showing 3 changed files with 83 additions and 25 deletions.
50 changes: 36 additions & 14 deletions src/sec_certs/model/references_nlp/feature_extraction.py
@@ -121,7 +121,10 @@ def get_lang_features(base_name: str, referenced_name: str) -> tuple:


def extract_segments(
cc_dset: CCDataset, mode: REF_ANNOTATION_MODES, n_sents_before: int = 2, n_sents_after: int = 1
cc_dset: CCDataset,
mode: REF_ANNOTATION_MODES,
n_sents_before: int = 2,
n_sents_after: int = 1,
) -> pd.DataFrame:
logger.info("Extracting segments.")
df = ReferenceSegmentExtractor(n_sents_before, n_sents_after)(list(cc_dset.certs.values()))
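The n_sents_before / n_sents_after knobs control a sentence window that ReferenceSegmentExtractor builds around each reference hit (see the fill_reference_segments docstring in segment_extractor.py below). A rough standalone sketch of that windowing idea, assuming sentences are already split and keyword matching is a plain substring test (both simplifications, not the project's actual logic):

```python
# Rough sketch of the sentence-window idea; placeholder logic, not the real extractor.
def window_segments(sentences: list[str], keyword: str,
                    n_sents_before: int = 2, n_sents_after: int = 1) -> list[str]:
    segments = []
    for i, sentence in enumerate(sentences):
        if keyword in sentence:  # simplistic match; the real code may normalize text first
            start = max(0, i - n_sents_before)
            end = min(len(sentences), i + n_sents_after + 1)
            segments.append(" ".join(sentences[start:end]))
    return segments

sents = ["Intro.", "The TOE reuses CERT-123.", "More detail.", "Unrelated."]
print(window_segments(sents, "CERT-123"))
# ['Intro. The TOE reuses CERT-123. More detail.']
```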
@@ -197,7 +200,10 @@ def choose_values_to_fit(df_: pd.DataFrame) -> list[str]:


def build_embeddings(
segments: pd.DataFrame, mode: REF_ANNOTATION_MODES, method: REF_EMBEDDING_METHOD, model_path: Path | None = None
segments: pd.DataFrame,
mode: REF_ANNOTATION_MODES,
method: REF_EMBEDDING_METHOD,
model_path: Path | None = None,
) -> pd.DataFrame:
return (
_build_transformer_embeddings(segments, mode, model_path)
@@ -225,23 +231,27 @@ def extract_language_features(df: pd.DataFrame, cc_dset: CCDataset) -> pd.DataFr
lambda x: strip_all(x["cert_name"], x["cert_versions"]), axis=1
),
referenced_cert_name_stripped_version=lambda df_: df_.apply(
lambda x: strip_all(x["referenced_cert_name"], x["referenced_cert_versions"]), axis=1
lambda x: strip_all(x["referenced_cert_name"], x["referenced_cert_versions"]),
axis=1,
),
lang_token_set_ratio=lambda df_: df_.apply(
lambda x: fuzz.token_set_ratio(
x["cert_name_stripped_version"], x["referenced_cert_name_stripped_version"]
x["cert_name_stripped_version"],
x["referenced_cert_name_stripped_version"],
),
axis=1,
),
lang_partial_ratio=lambda df_: df_.apply(
lambda x: fuzz.partial_ratio(
x["cert_name_stripped_version"], x["referenced_cert_name_stripped_version"]
x["cert_name_stripped_version"],
x["referenced_cert_name_stripped_version"],
),
axis=1,
),
lang_token_sort_ratio=lambda df_: df_.apply(
lambda x: fuzz.token_sort_ratio(
x["cert_name_stripped_version"], x["referenced_cert_name_stripped_version"]
x["cert_name_stripped_version"],
x["referenced_cert_name_stripped_version"],
),
axis=1,
),
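As an aside, the three lang_*_ratio features above come from a fuzzy string-matching library exposing a fuzz module (rapidfuzz or a compatible package; which one the project pins is an assumption here). A minimal illustration on made-up, version-stripped certificate names:

```python
# Illustration only: names are invented and the import assumes rapidfuzz;
# a compatible fuzz module (e.g. thefuzz) behaves the same way.
from rapidfuzz import fuzz

a = "Acme Secure Element"           # stand-in for cert_name_stripped_version
b = "Secure Element by Acme Corp."  # stand-in for referenced_cert_name_stripped_version

print(fuzz.token_set_ratio(a, b))   # compares token sets, ignoring extra/duplicate tokens
print(fuzz.partial_ratio(a, b))     # best-matching substring alignment
print(fuzz.token_sort_ratio(a, b))  # tokens sorted alphabetically before comparing
```

All three return a similarity score between 0 and 100, which is what the lang_token_set_ratio, lang_partial_ratio, and lang_token_sort_ratio columns store.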
@@ -251,13 +261,15 @@ def extract_language_features(df: pd.DataFrame, cc_dset: CCDataset) -> pd.DataFr
.assign(
lang_n_extracted_versions=lambda df_: df_.cert_versions.map(lambda x: len(x) if x else 0),
lang_n_intersection_versions=lambda df_: df_.apply(
lambda x: len(set(x["cert_versions"]).intersection(set(x["referenced_cert_versions"]))), axis=1
lambda x: len(set(x["cert_versions"]).intersection(set(x["referenced_cert_versions"]))),
axis=1,
),
)
)

df_lang_other_features = df_lang.apply(
lambda row: get_lang_features(row["cert_name"], row["referenced_cert_name"]), axis=1
lambda row: get_lang_features(row["cert_name"], row["referenced_cert_name"]),
axis=1,
).apply(pd.Series)
lang_features = [
"common_numeric_words",
@@ -276,7 +288,8 @@ def extract_language_features(df: pd.DataFrame, cc_dset: CCDataset) -> pd.DataFr

df_lang = pd.concat([df_lang, df_lang_other_features], axis=1).assign(
lang_should_not_be_component=lambda df_: df_.apply(
lambda x: x.lang_len_difference < 5 and x.lang_token_set_ratio == 100, axis=1
lambda x: x.lang_len_difference < 5 and x.lang_token_set_ratio == 100,
axis=1,
),
)
for col in df_lang.columns:
@@ -289,9 +302,9 @@ def extract_language_features(df: pd.DataFrame, cc_dset: CCDataset) -> pd.DataFr
def perform_dimensionality_reduction(
df: pd.DataFrame,
mode: REF_ANNOTATION_MODES,
umap_n_neighbors: int = 5,
umap_min_dist: float = 0.1,
umap_metric: Literal["cosine", "euclidean", "manhattan"] = "euclidean",
umap_n_neighbors: int = 10,
umap_min_dist: float = 0.51026,
umap_metric: Literal["cosine", "euclidean", "manhattan"] = "cosine",
) -> pd.DataFrame:
def choose_values_to_fit(df_: pd.DataFrame):
if mode == "training":
@@ -325,7 +338,11 @@ def choose_labels_to_fit(df_: pd.DataFrame):

# parallel UMAP not available with random state
umapper = umap.UMAP(
n_neighbors=umap_n_neighbors, min_dist=umap_min_dist, metric=umap_metric, random_state=RANDOM_STATE, n_jobs=1
n_neighbors=umap_n_neighbors,
min_dist=umap_min_dist,
metric=umap_metric,
random_state=RANDOM_STATE,
n_jobs=1,
).fit(embeddings_to_fit, y=labels_to_fit)
pca_mapper = PCA(n_components=2, random_state=RANDOM_STATE).fit(embeddings_to_fit_scaled, y=labels_to_fit)
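To make the newly hardcoded reduction settings concrete, here is a self-contained sketch of the same supervised UMAP + PCA fit with placeholder data; the embedding matrix, labels, and RANDOM_STATE value are stand-ins, and the separate scaling of the PCA input used in the real code is omitted:

```python
# Placeholder data; the UMAP arguments mirror the hardcoded defaults from this commit.
import numpy as np
import umap
from sklearn.decomposition import PCA

RANDOM_STATE = 42                      # stand-in for the project's constant
embeddings = np.random.rand(200, 64)   # stand-in for segment embeddings
labels = np.random.randint(0, 3, 200)  # stand-in for annotation labels

# Fixing random_state rules out UMAP's parallel implementation, hence n_jobs=1.
umapper = umap.UMAP(
    n_neighbors=10,
    min_dist=0.51026,
    metric="cosine",
    random_state=RANDOM_STATE,
    n_jobs=1,
).fit(embeddings, y=labels)
pca_mapper = PCA(n_components=2, random_state=RANDOM_STATE).fit(embeddings, y=labels)

umap_2d = umapper.transform(embeddings)    # shape (200, 2)
pca_2d = pca_mapper.transform(embeddings)  # shape (200, 2)
```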

@@ -539,4 +556,9 @@ def get_data_for_clf(
else:
raise ValueError(f"Unknown mode {mode}")

return np.vstack(train_df[feature_columns].values), train_df.label.values, eval_df, feature_columns
return (
np.vstack(train_df[feature_columns].values),
train_df.label.values,
eval_df,
feature_columns,
)
23 changes: 18 additions & 5 deletions src/sec_certs/model/references_nlp/segment_extractor.py
@@ -34,7 +34,7 @@ def swap_and_filter_dict(dct: dict[str, Any], filter_to_keys: set[str]):
return {key: frozenset(val) for key, val in new_dct.items() if key in filter_to_keys}


def fill_reference_segments(record: ReferenceRecord, n_sent_before: int = 1, n_sent_after: int = 0) -> ReferenceRecord:
def fill_reference_segments(record: ReferenceRecord, n_sent_before: int = 4, n_sent_after: int = 4) -> ReferenceRecord:
"""
Compute indices of the sentences containing the reference keyword, take their surrounding sentences and join them.
"""
@@ -186,7 +186,10 @@ def _prepare_df_from_cc_dset(self, certs: Iterable[CCCertificate]) -> pd.DataFra

def _build_records(self, certs: list[CCCertificate], source: Literal["target", "report"]) -> list[ReferenceRecord]:
def get_cert_records(cert: CCCertificate, source: Literal["target", "report"]) -> list[ReferenceRecord]:
canonical_ref_var = {"target": "st_references", "report": "report_references"}
canonical_ref_var = {
"target": "st_references",
"report": "report_references",
}
actual_ref_var = {"target": "st_keywords", "report": "report_keywords"}
raw_source_var = {"target": "st_txt_path", "report": "report_txt_path"}

@@ -242,7 +245,13 @@ def _build_df(self, certs: list[CCCertificate], source: Literal["target", "repor
print(f"I now have {len(results)} in {source} mode")
return pd.DataFrame.from_records(
[x.to_pandas_tuple() for x in results],
columns=["dgst", "canonical_reference_keyword", "actual_reference_keywords", "source", "segments"],
columns=[
"dgst",
"canonical_reference_keyword",
"actual_reference_keywords",
"source",
"segments",
],
)

@staticmethod
@@ -282,7 +291,10 @@ def load_single_df(pth: Path, split_name: str) -> pd.DataFrame:
},
)
.dropna(subset="label")
.assign(label=lambda df_: df_.label.str.replace(" ", "_").str.upper(), split=split_name)
.assign(
label=lambda df_: df_.label.str.replace(" ", "_").str.upper(),
split=split_name,
)
)

annotations_directory = Path(str(files("sec_certs.data") / "reference_annotations/final/"))
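The label normalization in load_single_df above is a plain string transform; a tiny illustration with invented label values:

```python
# Invented labels; shows only the replace/upper-case normalization step.
import pandas as pd

raw = pd.DataFrame({"label": ["component used", "previous version", None]})
normalized = raw.dropna(subset="label").assign(
    label=lambda df_: df_.label.str.replace(" ", "_").str.upper()
)
print(normalized.label.tolist())  # ['COMPONENT_USED', 'PREVIOUS_VERSION']
```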
@@ -350,6 +362,7 @@ def unique_elements(series):
)
)
df_processed.segments = df_processed.apply(
lambda row: [process_segment(x, row.actual_reference_keywords) for x in row.segments], axis=1
lambda row: [process_segment(x, row.actual_reference_keywords) for x in row.segments],
axis=1,
)
return df_processed
35 changes: 29 additions & 6 deletions src/sec_certs/model/references_nlp/training.py
@@ -14,7 +14,7 @@
logger = logging.getLogger(__name__)


def _train_model(x_train, y_train, x_eval, y_eval, learning_rate: float = 0.03, depth: int = 6, l2_leaf_reg: int = 3):
def _train_model(
x_train,
y_train,
x_eval,
y_eval,
learning_rate: float = 0.03,
depth: int = 6,
l2_leaf_reg: float = 3,
):
clf = CatBoostClassifier(
learning_rate=learning_rate,
depth=depth,
@@ -26,7 +34,14 @@ def _train_model(x_train, y_train, x_eval, y_eval, learning_rate: float = 0.03,

train_pool = Pool(x_train, y_train)
eval_pool = Pool(x_eval, y_eval)
clf.fit(train_pool, eval_set=eval_pool, verbose=False, plot=True, early_stopping_rounds=100, use_best_model=True)
clf.fit(
train_pool,
eval_set=eval_pool,
verbose=False,
plot=True,
early_stopping_rounds=100,
use_best_model=True,
)
return clf
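For context, a self-contained sketch of this early-stopped CatBoost setup with synthetic data; the hyperparameter values are the new hardcoded defaults that train_model passes in below, and the random seed is a placeholder:

```python
# Synthetic data; hyperparameters mirror the commit's new defaults in train_model.
import numpy as np
from catboost import CatBoostClassifier, Pool

rng = np.random.default_rng(0)
x_train, y_train = rng.random((500, 20)), rng.integers(0, 2, 500)
x_eval, y_eval = rng.random((100, 20)), rng.integers(0, 2, 100)

clf = CatBoostClassifier(
    learning_rate=0.079573,
    depth=10,
    l2_leaf_reg=7.303517,
    random_seed=42,  # placeholder seed, not the project's RANDOM_STATE
)
clf.fit(
    Pool(x_train, y_train),
    eval_set=Pool(x_eval, y_eval),
    verbose=False,
    early_stopping_rounds=100,
    use_best_model=True,
)
print(clf.get_best_iteration())  # iteration kept by use_best_model
```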


@@ -38,9 +53,9 @@ def train_model(
use_umap: bool = True,
use_lang: bool = True,
use_pred: bool = True,
learning_rate: float = 0.03,
depth: int = 6,
l2_leaf_reg: int = 3,
learning_rate: float = 0.079573,
depth: int = 10,
l2_leaf_reg: float = 7.303517,
) -> tuple[DummyClassifier | CatBoostClassifier, pd.DataFrame, list[str]]:
logger.info(f"Training model for mode {mode}")
X_train, y_train, eval_df, feature_cols = get_data_for_clf(df, mode, use_pca, use_umap, use_lang, use_pred)
@@ -49,7 +64,15 @@
clf.fit(X_train, y_train)
else:
assert eval_df is not None
clf = _train_model(X_train, y_train, eval_df[feature_cols], eval_df.label, learning_rate, depth, l2_leaf_reg)
clf = _train_model(
X_train,
y_train,
eval_df[feature_cols],
eval_df.label,
learning_rate,
depth,
l2_leaf_reg,
)

return clf, eval_df, feature_cols
