New Workflow: LDA then XGBoost #155

Open · wants to merge 8 commits into master
4 changes: 2 additions & 2 deletions pyprophet/_config.py
@@ -90,7 +90,7 @@ class RunnerConfig:
Configuration for scoring, classifier setup, learning parameters, and optional features.

Attributes:
classifier (str): Classifier type used for semi-supervised learning ('LDA', 'SVM' or 'XGBoost').
classifier (str): Classifier type used for semi-supervised learning. Can either be a single classifier ('LDA', 'SVM', 'XGBoost') or a multi-classifier ('LDA_XGBoost').
autotune (bool): Whether to autotune hyperparameters for the classifier (XGBoost / SVM)
ss_main_score (str): Starting main score for semi-supervised learning (can be 'auto').
main_score_selection_report (bool): Whether to generate a report for main score selection.
@@ -127,7 +127,7 @@ class RunnerConfig:
"""

# Scoring / classifier options
classifier: Literal["LDA", "SVM", "XGBoost"] = "LDA"
classifier: Literal["LDA", "SVM", "XGBoost", "LDA_XGBoost"] = "LDA"
autotune: bool = False
ss_main_score: str = "auto"
main_score_selection_report: bool = False
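Note (illustration, not part of the diff): with the widened Literal, any value outside the four listed strings should be rejected. A minimal sketch of such a runtime check, using only names visible in this hunk; RunnerConfig itself is not constructed here because its remaining fields are not shown in the diff.

from typing import Literal, get_args

Classifier = Literal["LDA", "SVM", "XGBoost", "LDA_XGBoost"]  # mirrors the annotation above

def validate_classifier(name: str) -> str:
    # Reject anything that is not one of the accepted classifier names.
    if name not in get_args(Classifier):
        raise ValueError(f"Unknown classifier: {name!r}")
    return name

validate_classifier("LDA_XGBoost")  # accepted with this change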
21 changes: 14 additions & 7 deletions pyprophet/cli/score.py
@@ -12,7 +12,7 @@
memray_profile,
)
from .._config import RunnerIOConfig
from ..scoring.runner import PyProphetLearner, PyProphetWeightApplier
from ..scoring.runner import PyProphetLearner, PyProphetWeightApplier, LDA_XGBoostMultiLearner


# PyProphet semi-supervised learning and scoring
@@ -43,7 +43,7 @@
"--classifier",
default="LDA",
show_default=True,
type=click.Choice(["LDA", "SVM", "XGBoost"]),
type=click.Choice(["LDA", "SVM", "XGBoost", "LDA_XGBoost"]),
help='Either a "LDA", "SVM" or "XGBoost" classifier is used for semi-supervised learning.',
)
@click.option(
@@ -360,7 +360,7 @@ def score(
config.subsample_ratio = 1.0

if not apply_weights:
if config.subsample_ratio < 1.0:
if config.subsample_ratio < 1.0: # currently LDA_XGBoostMultiLearner does not support subsampling
logger.info(
f"Conducting {level} semi-supervised learning on {config.subsample_ratio * 100}% of the data.",
)
@@ -399,11 +399,18 @@ def score(
PyProphetWeightApplier(weights_path, run_config).run()
else:
PyProphetWeightApplier(weights_path, config).run()
-else:
-    logger.info(
-        f"Conducting {level} semi-supervised learning.",
-    )
-    PyProphetLearner(config).run()
+else:  # No subsampling
+    if config.runner.classifier == "LDA_XGBoost":
+        logger.info(
+            f"Conducting {level} semi-supervised learning with LDA followed by XGBoost.",
+        )
+        LDA_XGBoostMultiLearner(config).run()
+
+    else:
+        logger.info(
+            f"Conducting {level} semi-supervised learning.",
+        )
+        PyProphetLearner(config).run()
else:
logger.info(
f"Applying {level} weights from {apply_weights} to the full data set.",
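Note (not part of the diff): a minimal sketch of exercising the new LDA_XGBoost branch end to end via click's test runner. The import path of the score command and the --in/--level options are assumptions based on this file; only --classifier LDA_XGBoost is taken directly from the change above.

from click.testing import CliRunner

from pyprophet.cli.score import score  # assumed import path for the command defined in this file

runner = CliRunner()
result = runner.invoke(
    score,
    ["--in", "test_data.osw", "--level", "ms2", "--classifier", "LDA_XGBoost"],
)
print(result.exit_code)
print(result.output)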
2 changes: 1 addition & 1 deletion pyprophet/io/_base.py
@@ -152,7 +152,7 @@ def _finalize_feature_table(self, df, ss_main_score):
f"Main score ({main_score}) not found in input columns: {df.columns}"
)

if self.classifier == "XGBoost" and self.level != "alignment":
if self.classifier in ["XGBoost", "LDA_XGBoost"] and self.level != "alignment":
logger.info(
"Enable number of transitions & precursor / product charge scores for XGBoost-based classifier"
)
54 changes: 54 additions & 0 deletions pyprophet/scoring/runner.py
@@ -255,6 +255,60 @@ def print_summary(self, result):
logger.opt(raw=True).info("\n")


class PyProphetMultiLearner(PyProphetRunner):
"""
Implements the learning and scoring workflow for PyProphet with multiple classifiers run sequentially.
"""

__metaclass__ = abc.ABCMeta

@abc.abstractmethod
def run_algo(self, part=None):
if self.glyco:
raise click.ClickException(
"Multi-classifier learning is not supported for glycopeptide workflows."
)


class LDA_XGBoostMultiLearner(PyProphetMultiLearner):
"""
Implements the learning and scoring workflow for PyProphet with LDA followed by XGBoost: the LDA discriminant score is used as the main score for the XGBoost stage.
"""

def run_algo(self, part=None):
"""
Runs the learning and scoring algorithm for multiple classifiers.

Returns:
tuple: A tuple containing the result, scorer, and weights.
"""

super(LDA_XGBoostMultiLearner, self).run_algo(part)

config_lda = self.config.copy()
config_lda.runner.classifier = "LDA"

# remove columns that are not needed for LDA
table_lda = self.table.drop(columns=["var_precursor_charge", "var_product_charge", "var_transition_count"], errors='ignore')

(result_lda, scorer_lda, weights_lda) = PyProphet(config_lda).learn_and_apply(table_lda)

# rename the column that was the main score
self.table.columns = self.table.columns.str.replace('^main', '', regex=True)

self.table['main_var_lda_score'] = result_lda.scored_tables['d_score']

logger.info("LDA scores computed! Now running XGBoost using the LDA score as the main score")

config_xgb = self.config.copy()
config_xgb.runner.ss_main_score = 'var_lda_score' # use lda score as the main score for XGBoost
config_xgb.runner.classifier = "XGBoost"
config_xgb.runner.ss_use_dynamic_main_score = False # the LDA score is fixed as the main score, so dynamic main-score selection is not needed
self.config.runner.classifier = "XGBoost" # need to change to XGBoost for saving the weights

(result_xgb, scorer_xgb, weights_xgb) = PyProphet(config_xgb).learn_and_apply(self.table)
return (result_xgb, scorer_xgb, weights_xgb)

class PyProphetLearner(PyProphetRunner):
"""
Implements the learning and scoring workflow for PyProphet.
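Note (illustration, not pyprophet internals): LDA_XGBoostMultiLearner chains the two classifiers by turning the LDA discriminant score into an additional feature that XGBoost treats as the main score. A self-contained sketch of that stacking idea on synthetic data, using scikit-learn and xgboost directly:

import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from xgboost import XGBClassifier

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 10))                                         # sub-scores ("var_" columns)
y = (X[:, 0] + 0.5 * X[:, 1] + rng.normal(size=500) > 0).astype(int)   # target (1) vs decoy (0)

# Stage 1: LDA collapses the sub-scores into a single discriminant score.
lda = LinearDiscriminantAnalysis().fit(X, y)
lda_score = lda.decision_function(X).reshape(-1, 1)

# Stage 2: XGBoost is trained on the original sub-scores plus the LDA score,
# mirroring how the runner adds 'main_var_lda_score' before the XGBoost pass.
X_xgb = np.hstack([X, lda_score])
xgb = XGBClassifier(n_estimators=50, max_depth=3, eval_metric="logloss").fit(X_xgb, y)
print("training accuracy:", xgb.score(X_xgb, y))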
14 changes: 14 additions & 0 deletions tests/_regtest_outputs/test_pyprophet_score.test_osw_11.out
@@ -0,0 +1,14 @@
feature_id ms1_precursor_pep ms2_peakgroup_pep ms2_precursor_pep
0 -9078977811506172301 0.0063 0.0022 0.0025
1 -9009602369958523731 0.0063 0.0022 0.0325
2 -8990894093332793487 0.0063 0.0022 0.0025
3 -8915955323477460297 0.0063 0.0022 0.0071
4 -8858715981476206597 0.0063 0.0022 0.0025
.. ... ... ... ...
95 -2912234918591861719 0.0063 0.0022 0.0025
96 -2872329084347808160 0.0063 0.0022 0.0025
97 -2789098353857361973 1.0000 0.0022 0.0025
98 -2788620575140019858 0.0063 0.0022 0.0025
99 -2741276427609241638 0.0063 0.0022 0.0325

[100 rows x 4 columns]
15 changes: 15 additions & 0 deletions tests/test_pyprophet_score.py
@@ -190,6 +190,8 @@ def execute(self, levels=None, **kwargs):
level_cmd += " --classifier=XGBoost"
if kwargs.get("xgboost_tune"):
level_cmd += " --autotune"
if kwargs.get("lda_xgboost"):
level_cmd += " --classifier=LDA_XGBoost"
if kwargs.get("score_filter"):
level_cmd = self.config.add_score_filter(level_cmd, level)

@@ -770,6 +772,19 @@ def test_osw_9(test_runner, test_config, regtest):
def test_osw_10(test_runner, test_config, regtest):
run_metabo_test(test_runner, test_config, regtest, ms1ms2=True, score_filter=True)

# Tests LDA then XGBoost
def test_osw_11(test_runner, test_config, regtest):
run_generic_test(
test_runner,
test_config,
OSWTestStrategy,
regtest,
pfdr=True,
pi0_lambda="0 0 0",
ms1ms2=True,
lda_xgboost=True,
)


# Parquet Tests
def test_parquet_0(test_runner, test_config, regtest):