Add parameter to config and gui to set the min precursors required for update #460
base: main
Changes from 21 commits
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,8 +21,7 @@ def __init__( # noqa: PLR0913 Too many arguments in function definition (> 5) | |
second_classifier: Classifier, | ||
first_fdr_cutoff: float = 0.6, | ||
second_fdr_cutoff: float = 0.01, | ||
min_precursors_for_update: int = 5000, | ||
max_iterations: int = 5, | ||
min_precursors_for_update: int = 200, | ||
train_on_top_n: int = 1, | ||
): | ||
"""Initializing a two-step classifier. | ||
|
@@ -37,10 +36,8 @@ def __init__( # noqa: PLR0913 Too many arguments in function definition (> 5) | |
The fdr threshold for the first classifier, determining how selective the first classification step is. | ||
second_fdr_cutoff : float, default=0.01 | ||
The fdr threshold for the second classifier, typically set stricter to ensure high confidence in the final classification results. | ||
min_precursors_for_update : int, default=5000 | ||
min_precursors_for_update : int, default=200 | ||
The minimum number of precursors required to update the first classifier. | ||
max_iterations : int | ||
Maximum number of refinement iterations during training. | ||
train_on_top_n : int | ||
Use candidates up to this rank for training. During inference, all ranks are used. | ||
|
||
|
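For readers of this diff, a minimal sketch of how the constructor might be called with the changed parameter; the classifier variables are placeholders, and only the keyword names and defaults come from the signature above.

```python
# Sketch only: `linear_clf` and `nn_clf` stand in for whatever Classifier
# implementations the pipeline actually provides.
two_step = TwoStepClassifier(
    first_classifier=linear_clf,    # fast pre-filtering classifier
    second_classifier=nn_clf,       # neural network used for refinement
    first_fdr_cutoff=0.6,
    second_fdr_cutoff=0.01,
    min_precursors_for_update=200,  # default lowered from 5000 to 200 in this PR
    train_on_top_n=1,
)
```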
@@ -51,7 +48,6 @@ def __init__( # noqa: PLR0913 Too many arguments in function definition (> 5) | |
self.second_fdr_cutoff = second_fdr_cutoff | ||
|
||
self._min_precursors_for_update = min_precursors_for_update | ||
self._max_iterations = max_iterations | ||
self._train_on_top_n = train_on_top_n | ||
|
||
logger.info( | ||
|
@@ -67,11 +63,11 @@ def fit_predict( | |
y_col: str = "decoy", | ||
group_columns: list[str] | None = None, | ||
) -> pd.DataFrame: | ||
"""Train the two-step classifier and predict precursors using an iterative approach. | ||
"""Train the two-step classifier and predict precursors using the following approach. | ||
|
||
1. First iteration: Train neural network on top-n candidates. | ||
2. Subsequent iterations: Use linear classifier to filter data, then refine with neural network. | ||
3. Update linear classifier if enough high-confidence predictions are found, else break. | ||
1. Train neural network on top-n candidates. | ||
2. Update linear classifier if enough high-confidence predictions are found, else break. | ||
3. Use linear classifier to filter data, then refine with neural network. | ||
|
||
Parameters | ||
---------- | ||
|
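A hedged usage sketch of `fit_predict`; the feature and grouping column names are assumptions, only `rank`, `decoy`, and `qval` appear in the code in this diff.

```python
# Illustrative call; "feature_*" and "precursor_idx" are assumed column names.
feature_cols = ["feature_1", "feature_2", "feature_3"]

result_df = two_step.fit_predict(
    df=candidates_df,                 # must contain feature_cols, "rank" and "decoy"
    x_cols=feature_cols,
    y_col="decoy",
    group_columns=["precursor_idx"],  # best candidate per group is kept before q-values
)

# keep identifications passing the stricter second FDR cutoff
confident = result_df[result_df["qval"] <= two_step.second_fdr_cutoff]
```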
@@ -90,91 +86,63 @@ def fit_predict( | |
DataFrame containing predictions and q-values | ||
|
||
""" | ||
min_train_size = 1 | ||
logger.info("=== Starting training of TwoStepClassifier ===") | ||
|
||
df = self._preprocess_data(df, x_cols) | ||
best_result = None | ||
df_train = df[df["rank"] < self._train_on_top_n] | ||
df_predict = df | ||
|
||
# tracking precursors identified at fdr cutoffs `self.first_fdr_cutoff` and `self.second_fdr_cutoff`` | ||
previous_target_count_after_first_clf = -1 | ||
previous_target_count_after_second_clf = -1 | ||
|
||
for i in range(self._max_iterations): | ||
logger.info(f"Starting iteration {i + 1} / {self._max_iterations}.") | ||
|
||
# extract preselection using first classifier if it is fitted | ||
if self.first_classifier.fitted and i > 0: | ||
df_train = self._apply_filtering_with_first_classifier( | ||
df, x_cols, group_columns | ||
) | ||
df_predict = df_train # using the same df for training and predicting, unlike in the following else block. | ||
logger.info( | ||
f"Application of first classifier at fdr={self.first_fdr_cutoff} results in " | ||
f"{len(df_train):,} samples ({get_target_count(df_train):,} precursors)" | ||
) | ||
|
||
previous_target_count_after_first_clf = get_target_count(df_train) | ||
self.second_classifier.epochs = 50 | ||
else: | ||
logger.debug("First classifier not fitted yet. Proceeding without it.") | ||
df_train = df[df["rank"] < self._train_on_top_n] | ||
df_predict = df | ||
# train and apply NN classifier | ||
self.second_classifier.epochs = 10 | ||
Review comment: deliberately hardcoded?
Reply: Yes, but the value was chosen somewhat arbitrarily. The reason I set it to 10 was to avoid the error where
Review comment: at least, make it either a module-wide constant or create a new method parameter and set it as default (then it is more obvious that there is a knob to tune).
Reply: oops sorry, I mixed something up here, and was talking about something else. Yes, this one is deliberately set to 10, but I will add a
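As a follow-up to the thread above, one way the suggestion could look; the constant names are hypothetical and not part of this PR.

```python
# Hypothetical module-level constants (names are illustrative only)
EPOCHS_INITIAL_FIT = 10  # short first pass on the top-n candidates
EPOCHS_REFINEMENT = 50   # longer training after pre-filtering with the first classifier

# inside fit_predict, the hardcoded assignments would then read:
#   self.second_classifier.epochs = EPOCHS_INITIAL_FIT
#   ...
#   self.second_classifier.epochs = EPOCHS_REFINEMENT
```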
||
df_after_second_clf = self._train_and_apply_second_classifier( | ||
df_train, df_predict, x_cols, y_col, group_columns | ||
) | ||
best_result = df_after_second_clf | ||
|
||
self.second_classifier.epochs = 10 | ||
df_filtered = filter_by_qval(df_after_second_clf, self.second_fdr_cutoff) | ||
target_count_after_second_clf = get_target_count(df_filtered) | ||
logger.info( | ||
f"{target_count_after_second_clf:,} targets found " | ||
f"after second classifier, at fdr={self.second_fdr_cutoff}" | ||
) | ||
|
||
# train and apply second classifier | ||
df_after_second_clf = self._train_and_apply_second_classifier( | ||
df_train, df_predict, x_cols, y_col, group_columns | ||
) | ||
# stop if not enough targets found after NN classifier | ||
if target_count_after_second_clf < self._min_precursors_for_update: | ||
return best_result | ||
|
||
df_filtered = filter_by_qval(df_after_second_clf, self.second_fdr_cutoff) | ||
current_target_count = get_target_count(df_filtered) | ||
# update and use the linear classifier | ||
self._update_first_classifier(df_filtered, df, x_cols, y_col, group_columns) | ||
df_train = self._apply_filtering_with_first_classifier( | ||
df, x_cols, group_columns | ||
) | ||
if len(df_train) < min_train_size: | ||
return best_result | ||
|
||
if current_target_count < previous_target_count_after_second_clf: | ||
logger.info( | ||
f"Training stopped on iteration {i + 1}. Decrease in precursor count from " | ||
f"{previous_target_count_after_second_clf:,} to {current_target_count:,}." | ||
) | ||
return best_result | ||
df_predict = df_train # using the same df for training and predicting. | ||
previous_target_count_after_first_clf = get_target_count(df_train) | ||
|
||
previous_target_count_after_second_clf = current_target_count | ||
best_result = df_after_second_clf # TODO: Remove if multiple iterations are dropped to save memory. | ||
# train and apply second classifier | ||
self.second_classifier.epochs = 50 | ||
df_after_second_clf = self._train_and_apply_second_classifier( | ||
df_train, df_predict, x_cols, y_col, group_columns | ||
) | ||
df_filtered = filter_by_qval(df_after_second_clf, self.second_fdr_cutoff) | ||
current_target_count = get_target_count(df_filtered) | ||
|
||
logger.info( | ||
f"Application of second classifier at fdr={self.second_fdr_cutoff} results in " | ||
f"{get_target_count(df_train):,} precursors." | ||
) | ||
if current_target_count > target_count_after_second_clf: | ||
target_count_after_second_clf = current_target_count | ||
best_result = df_after_second_clf | ||
|
||
# update first classifier if enough confident predictions | ||
if current_target_count > self._min_precursors_for_update: | ||
target_count_after_first_clf, new_classifier = ( | ||
self._fit_and_eval_first_classifier( | ||
df_filtered, df, x_cols, y_col, group_columns | ||
) | ||
self._update_first_classifier( | ||
df_filtered, | ||
df, | ||
x_cols, | ||
y_col, | ||
group_columns, | ||
previous_target_count_after_first_clf, | ||
) | ||
if target_count_after_first_clf > previous_target_count_after_first_clf: | ||
logger.debug( | ||
f"Update of first classifier initiated: previous version had {previous_target_count_after_first_clf:,} " | ||
f"precursors, current version has {target_count_after_first_clf:,} precursors." | ||
) | ||
self.first_classifier = new_classifier | ||
previous_target_count_after_first_clf = target_count_after_first_clf | ||
|
||
else: | ||
logger.debug( | ||
f"Update of first classifier skipped: previous version had {previous_target_count_after_first_clf:,} " | ||
f"precursors, current version has {target_count_after_first_clf:,} precursors." | ||
) | ||
else: | ||
logger.info( | ||
f"=== Insufficient precursors detected; ending after {i + 1} iterations ===" | ||
) | ||
break | ||
else: | ||
logger.info( | ||
f"=== Stopping fitting after reaching the maximum number of iterations: " | ||
f"{self._max_iterations} / {self._max_iterations} ===" | ||
) | ||
|
||
return best_result | ||
|
||
|
@@ -187,11 +155,22 @@ def _apply_filtering_with_first_classifier( | |
self, df: pd.DataFrame, x_cols: list[str], group_columns: list[str] | ||
) -> pd.DataFrame: | ||
"""Apply first classifier to filter data for the training of the second classifier.""" | ||
n_precursors = get_target_count(df) | ||
logger.info( | ||
f"Applying first classifier to {len(df):,} precursors ({n_precursors:,} targets)" | ||
) | ||
|
||
df["proba"] = self.first_classifier.predict_proba(df[x_cols].to_numpy())[:, 1] | ||
|
||
return compute_and_filter_q_values( | ||
filtered_df = compute_and_filter_q_values( | ||
df, self.first_fdr_cutoff, group_columns, remove_decoys=False | ||
) | ||
logger.info( | ||
f"Preselection of first classifier at fdr={self.first_fdr_cutoff} results in " | ||
f"{len(filtered_df):,} precursors ({get_target_count(filtered_df):,} targets)" | ||
) | ||
|
||
return filtered_df | ||
|
||
def _train_and_apply_second_classifier( | ||
self, | ||
|
@@ -202,24 +181,35 @@ def _train_and_apply_second_classifier( | |
group_columns: list[str], | ||
) -> pd.DataFrame: | ||
"""Train second_classifier and apply it to get predictions.""" | ||
logger.info( | ||
f"Training second classifier on {len(train_df):,} precursors " | ||
f"({get_target_count(train_df):,} targets, top_n={self._train_on_top_n})" | ||
) | ||
|
||
self.second_classifier.fit( | ||
train_df[x_cols].to_numpy().astype(np.float32), | ||
train_df[y_col].to_numpy().astype(np.float32), | ||
) | ||
|
||
logger.info( | ||
f"Applying second classifier to {len(predict_df):,} precursors " | ||
f"({get_target_count(predict_df):,} targets, top_n={max(predict_df['rank']) + 1})" | ||
) | ||
|
||
x = predict_df[x_cols].to_numpy().astype(np.float32) | ||
predict_df["proba"] = self.second_classifier.predict_proba(x)[:, 1] | ||
|
||
return compute_q_values(predict_df, group_columns) | ||
|
||
def _fit_and_eval_first_classifier( | ||
def _update_first_classifier( # noqa: PLR0913 | ||
self, | ||
subset_df: pd.DataFrame, | ||
full_df: pd.DataFrame, | ||
x_cols: list[str], | ||
y_col: str, | ||
group_columns: list[str], | ||
) -> tuple[int, Classifier]: | ||
previous_count: int = -1, | ||
) -> None: | ||
"""Fits a copy of the first classifier on a given subset and applies it to the full dataset. | ||
|
||
Returns the number of targets found and the trained classifier. | ||
|
@@ -231,18 +221,27 @@ def _fit_and_eval_first_classifier( | |
x_all = full_df[x_cols].to_numpy() | ||
reduced_df = full_df[[*group_columns, "decoy"]] | ||
|
||
logger.info(f"Fitting first classifier on {len(df_train):,} samples.") | ||
logger.info( | ||
f"Fitting first classifier on {len(df_train):,} precursors, applying it to {len(x_all):,} precursors." | ||
) | ||
new_classifier = copy.deepcopy(self.first_classifier) | ||
new_classifier.fit(x_train, y_train) | ||
|
||
logger.info(f"Applying first classifier to {len(x_all):,} samples.") | ||
reduced_df["proba"] = new_classifier.predict_proba(x_all)[:, 1] | ||
df_targets = compute_and_filter_q_values( | ||
reduced_df, self.first_fdr_cutoff, group_columns | ||
) | ||
n_targets = get_target_count(df_targets) | ||
|
||
return n_targets, new_classifier | ||
# update first classifier if improvement | ||
if n_targets > previous_count: | ||
logger.info( | ||
f"Updating the first classifier as new target count increased: {n_targets:,} > {previous_count:,}" | ||
) | ||
self.first_classifier = new_classifier | ||
previous_count = n_targets | ||
|
||
# return previous_count | ||
|
||
@property | ||
def fitted(self) -> bool: | ||
|
@@ -288,12 +287,30 @@ def get_target_count(df: pd.DataFrame) -> int: | |
|
||
|
||
def compute_q_values( | ||
df: pd.DataFrame, group_columns: list[str] | None = None | ||
df: pd.DataFrame, | ||
group_columns: list[str] | None = None, | ||
qval_col: str = "qval", | ||
scale_by_target_decoy_ratio: bool = True, # noqa: FBT001, FBT002 | ||
) -> pd.DataFrame: | ||
"""Compute q-values for each entry after keeping only best entries per group.""" | ||
scaling_factor = 1.0 | ||
if scale_by_target_decoy_ratio: | ||
n_targets = (df["decoy"] == 0).sum() | ||
n_decoys = (df["decoy"] == 1).sum() | ||
scaling_factor = round(n_targets / n_decoys, 3) | ||
Review comment: please avoid raising
||
if not np.isfinite(scaling_factor) or scaling_factor == 0: | ||
scaling_factor = 1.0 | ||
|
||
df.sort_values("proba", ascending=True, inplace=True) | ||
df = keep_best(df, group_columns=group_columns) | ||
return get_q_values(df, "proba", "decoy") | ||
df = get_q_values(df, "proba", "decoy", qval_col) | ||
|
||
logger.info( | ||
f"Normalizing q-values using {n_targets:,} targets and {n_decoys:,} decoys (scaling factor = {scaling_factor})" | ||
) | ||
df[qval_col] = df[qval_col] * scaling_factor | ||
|
||
return df | ||
|
||
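Regarding the "please avoid raising" note above, a small sketch of how the target/decoy scaling factor could be guarded before dividing; this helper is an illustration, not code from this PR.

```python
import numpy as np
import pandas as pd


def safe_target_decoy_ratio(df: pd.DataFrame, decoy_col: str = "decoy") -> float:
    """Return the target/decoy ratio, falling back to 1.0 when it is undefined.

    Illustrative helper, not part of this PR.
    """
    n_targets = int((df[decoy_col] == 0).sum())
    n_decoys = int((df[decoy_col] == 1).sum())
    if n_decoys == 0:  # guard instead of relying on a post-hoc isfinite check
        return 1.0
    ratio = round(n_targets / n_decoys, 3)
    return ratio if np.isfinite(ratio) and ratio != 0 else 1.0
```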
|
||
def filter_by_qval(df: pd.DataFrame, fdr_cutoff: float) -> pd.DataFrame: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -786,6 +786,8 @@ def save_classifier_store(self, path: None | str = None, version: int = -1): | |
path = os.path.join( | ||
os.path.dirname(alphadia.__file__), "constants", "classifier" | ||
) | ||
if self.is_two_step_classifier: | ||
path = os.path.join(path, "two_step_classifier") | ||
|
||
logger.info(f"Saving classifier store to {path}") | ||
|
||
|
@@ -808,22 +810,21 @@ def load_classifier_store(self, path: None | str = None): | |
path = os.path.join( | ||
os.path.dirname(alphadia.__file__), "constants", "classifier" | ||
) | ||
if self.is_two_step_classifier: | ||
path = os.path.join(path, "two_step_classifier") | ||
|
||
logger.info(f"Loading classifier store from {path}") | ||
|
||
if ( | ||
not self.is_two_step_classifier | ||
): # TODO add pretrained model for TwoStepClassifier | ||
for file in os.listdir(path): | ||
if file.endswith(".pth"): | ||
classifier_hash = file.split(".")[0] | ||
|
||
if classifier_hash not in self.classifier_store: | ||
classifier = deepcopy(self.classifier_base) | ||
classifier.from_state_dict( | ||
torch.load(os.path.join(path, file), weights_only=False) | ||
) | ||
self.classifier_store[classifier_hash].append(classifier) | ||
for file in os.listdir(path): | ||
Review comment: https://www.stuartellis.name/articles/python-modern-practices/#use-osscandir-instead-of-oslistdir
||
if file.endswith(".pth"): | ||
classifier_hash = file.split(".")[0] | ||
|
||
if classifier_hash not in self.classifier_store: | ||
classifier = deepcopy(self.classifier_base) | ||
classifier.from_state_dict( | ||
torch.load(os.path.join(path, file), weights_only=False) | ||
) | ||
self.classifier_store[classifier_hash].append(classifier) | ||
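Following the style note linked above, the directory walk could use `os.scandir` instead of `os.listdir`; a sketch of how the loop inside `load_classifier_store` might look under that suggestion (not what this PR currently does).

```python
# Reviewer-suggested variant using os.scandir (sketch, not part of this PR);
# relies on the same deepcopy/torch imports already used by this module.
with os.scandir(path) as entries:
    for entry in entries:
        if entry.is_file() and entry.name.endswith(".pth"):
            classifier_hash = entry.name.split(".")[0]
            if classifier_hash not in self.classifier_store:
                classifier = deepcopy(self.classifier_base)
                classifier.from_state_dict(
                    torch.load(entry.path, weights_only=False)
                )
                self.classifier_store[classifier_hash].append(classifier)
```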
|
||
def get_classifier(self, available_columns: list, version: int = -1): | ||
"""Gets the classifier for a given set of feature columns and version. If the classifier is not found in the store, gets the base classifier instead. | ||
|
Review comment: just from the name, I would assume `_train_on_top_n` is a boolean, but then this line would make no sense; maybe find a better name?