From 592410bb267142bb71133cf448d98dc117cba781 Mon Sep 17 00:00:00 2001
From: Sven <6873303+gieses@users.noreply.github.com>
Date: Sun, 15 Oct 2023 21:24:16 +0200
Subject: [PATCH] generation of qc plots failing, but training succeeds,
 adding a fast config
---
 Makefile                                                              |   8 +-
 environment.yml                                                       |   6 +-
 .../learning_params_training_cv_fast.yaml                             |  92 +++++++++++
 .../xirt_params_rp_fast.yaml                                          | 155 ++++++++++++++++++
 setup.py                                                              |  12 +-
 xirt/__main__.py                                                      |   2 +-
 6 files changed, 263 insertions(+), 12 deletions(-)
 create mode 100644 sample_data/parameter_examples/learning_params_training_cv_fast.yaml
 create mode 100644 sample_data/parameter_examples/xirt_params_rp_fast.yaml

diff --git a/Makefile b/Makefile
index 1d341f8..60bed58 100644
--- a/Makefile
+++ b/Makefile
@@ -28,12 +28,16 @@ clean: ## clean up - remove docs, dist and build
 	rm -r docs/build
 	rm -r dist
 	rm -r build
+	rm -r htmlcov
 
 sample:
-	xirt -i DSS_xisearch_fdr_CSM50percent_minimal.csv -o out_dir -x parameter_examples//xirt_params_rp.yaml -l parameter_examples//learning_params_training_cv.yaml
+	xirt -i sample_data/DSS_xisearch_fdr_CSM50percent_minimal.csv -o sample_data/rt_test -x sample_data//parameter_examples//xirt_params_rp.yaml -l sample_data//parameter_examples//learning_params_training_cv.yaml
+
+sample_fast:
+	xirt -i sample_data/DSS_xisearch_fdr_CSM50percent_minimal.csv -o sample_data/rt_test_fast -x sample_data//parameter_examples//xirt_params_rp_fast.yaml -l sample_data//parameter_examples//learning_params_training_cv_fast.yaml
 
 env:
-	conda env update -f environment.yml
+	conda env update -f environment.yml --prune
 
 pip_me:
 	pip install -e . --no-deps
\ No newline at end of file
diff --git a/environment.yml b/environment.yml
index 17997f3..99eae43 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,4 +1,4 @@
-name: xirt
+name: xirt_test
 channels:
   - conda-forge
   - bioconda
@@ -13,6 +13,7 @@ dependencies:
   - scikit-learn
   - pandas
   - anaconda::tensorflow
+  #- anaconda::tensorflow-gpu
   - tqdm
   - pyyaml
   - seaborn
@@ -42,4 +43,5 @@ dependencies:
   - sphinx
   - sphinx_rtd_theme
   - recommonmark
-  - pytest-xdist
\ No newline at end of file
+  - pytest-xdist
+  - -e .
\ No newline at end of file
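For reference, the new `make sample_fast` target boils down to a single CLI call. A minimal Python sketch of the same invocation (assuming `xirt` is on the PATH, e.g. after the `pip_me` target, and that the sample data paths above exist) could look like this:

import subprocess

# Invoke the xiRT CLI with the new "fast" parameter files, mirroring the
# sample_fast Makefile target above. Paths are relative to the repo root.
cmd = [
    "xirt",
    "-i", "sample_data/DSS_xisearch_fdr_CSM50percent_minimal.csv",
    "-o", "sample_data/rt_test_fast",
    "-x", "sample_data/parameter_examples/xirt_params_rp_fast.yaml",
    "-l", "sample_data/parameter_examples/learning_params_training_cv_fast.yaml",
]
subprocess.run(cmd, check=True)  # raises CalledProcessError if xiRT exits non-zero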
diff --git a/sample_data/parameter_examples/learning_params_training_cv_fast.yaml b/sample_data/parameter_examples/learning_params_training_cv_fast.yaml
new file mode 100644
index 0000000..47b0000
--- /dev/null
+++ b/sample_data/parameter_examples/learning_params_training_cv_fast.yaml
@@ -0,0 +1,92 @@

# Learning options generated with xiRT v. 1.2.3+2.g84a5484

# the preprocessing options define how the sequences are encoded / filtered. Usually, default
# values are fine.
# If transfer learning is intended, the label encoder and max_length parameters need to be adapted.

preprocessing:
  # label encoder, str or none. If str, use a previously trained label encoder to translate
  # amino acids to specific integers. If you are using xiRT on a single data file, set to None.
  # default None
  le: None

  # max sequence length, integer. Filter all sequences longer than this number. Disable by
  # setting it to -1.
  # default -1
  max_length: -1

  # for crosslinks only, bool: encode crosslinked residues as different residues than their
  # unmodified counterparts,
  # e.g. a crosslinked K will be encoded as clK in modX format.
  # default True
  cl_residue: True

  # filter, str. String filter that must be contained in the description for a CSM to be included.
  # default ""
  filter: "_ECOLI"

# these options are crucial for setting up xiRT with the correct training mode. Stay strong!
# It's easier than it seems right now.
# Check the readthedocs documentation if you need more info / examples.
train:
  # float value, defines the FDR cutoff used to filter the input CSMs; e.g. all CSMs with a
  # lower FDR are used for training.
  # default 0.01
  fdr: 0.01

  # int, the number of cross-validation folds. 1 = no CV, 3 = minimal value; recommended
  # alternatives with higher run time: 5 or 10.
  # default 1
  ncv: 2

  # bool, if True the training data is used to fit a new neural network model after the
  # cross-validation step; this model is used for the prediction of RTs for all peptides above
  # the given FDR value.
  # refit=False: use the best CV predictor; refit=True: retrain on all CSMs < 0.01 FDR.
  # default False
  refit: False

  # str, defines the training mode (important!):
  # "train", train on the entire data set
  # "crossvalidation", perform cross-validation on the input data (trains multiple classifiers)
  # "predict", do NOT train on the supplied CSMs but simply predict with an already trained model
  # default "train"
  mode: "crossvalidation"

  # bool, augment the input data by swapping sequences (peptide1, peptide2). Only marginal gains
  # in prediction were observed with this.
  # Can usually be left as False. If you are dealing with very small data sets, this option
  # might also help.
  # default False
  augment: False

  # str, multiple sequence types are supported: "linear", "crosslink", "pseudolinear" (concatenate
  # peptide1 and peptide2 sequences)
  # default "crosslink"
  sequence_type: "crosslink"

  # str (file location), this option can be combined with any of the above described options.
  # If a valid weight set is supplied, the network is initialized with the given weights.
  # default "None"
  pretrained_weights: "None"

  # str (file location), similarly to the option above, a pretrained model can be supplied.
  # This is necessary when (extreme) transfer-learning applications are intended (e.g. a
  # different number of fractions for e.g. SCX);
  # this requires adjustments of the network architecture.
  # default "None"
  pretrained_model: "None"

  # float, defines the fraction of test data (a small fraction of the training folds that is
  # used for validation).
  # default 0.10
  test_frac: 0.10

  # float, used for downsampling the input data (e.g. to create learning curves). Can usually
  # be left at 1.
  # default 1
  sample_frac: 1

  # int, seed value for the sampling described above
  # default 21
  sample_state: 21
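To make the interplay of fdr, ncv, test_frac and sample_state concrete, here is an illustrative sketch on a toy CSM table — not xiRT's internal implementation; column names and data are hypothetical:

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, train_test_split

# Toy CSM table: only rows below the FDR cutoff are used for training,
# the rest only receive predictions (mirrors the options above).
params = {"fdr": 0.01, "ncv": 2, "test_frac": 0.10, "sample_state": 21}
rng = np.random.default_rng(params["sample_state"])
csms = pd.DataFrame({"fdr": rng.random(200) / 10, "rp": rng.random(200)})

train_df = csms[csms["fdr"] <= params["fdr"]]   # trained on
predict_df = csms[csms["fdr"] > params["fdr"]]  # predicted only

kf = KFold(n_splits=params["ncv"], shuffle=True, random_state=params["sample_state"])
for fold, (train_idx, val_idx) in enumerate(kf.split(train_df), start=1):
    # a small slice of each training fold is held out for validation (test_frac)
    fit_idx, stop_idx = train_test_split(
        train_idx, test_size=params["test_frac"], random_state=params["sample_state"])
    print(f"fold {fold}: fit={len(fit_idx)} early-stop={len(stop_idx)} held-out={len(val_idx)}")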
diff --git a/sample_data/parameter_examples/xirt_params_rp_fast.yaml b/sample_data/parameter_examples/xirt_params_rp_fast.yaml
new file mode 100644
index 0000000..03e37c3
--- /dev/null
+++ b/sample_data/parameter_examples/xirt_params_rp_fast.yaml
@@ -0,0 +1,155 @@

# xiRT options generated with xiRT v. 1.2.3+2.g84a5484
# options for the recurrent layer used in xiRT
# can usually be used with default values, except for type
LSTM:
  # activation parameters, leave as default unless you know what you are doing
  activation: tanh
  activity_regularization: l2
  activityregularizer_value: 0.001

  # option that wraps the recurrent layer in a bidirectional layer
  bidirectional: true

  # kernel regularization, leave as default
  kernel_regularization: l2
  kernelregularizer_value: 0.001
  lstm_bn: true

  # central layer parameters
  # increasing the values here will drastically increase runtime but might also improve results
  # usually, 1 layer and GRU (for CPUs) or CuDNNGRU (for GPUs) will deliver good performance
  nlayers: 1
  type: GRU
  units: 10

# dense parameters are used for the individual task subnetworks (e.g. RP, SCX, ...)
dense:
  # activation functions in the layers between the embedding and the prediction layer
  # recommended to leave on defaults for most applications
  activation:
  - relu
  - relu
  - relu

  # boolean indicator whether batch normalization should be used
  # recommended to leave on defaults for most applications
  dense_bn:
  - true
  - true
  - true

  # dropout rate to use
  # recommended to leave on defaults for most applications
  dropout:
  - 0.1
  - 0.1
  - 0.1

  # regularization methods to use on the kernels, leave on defaults
  kernel_regularizer:
  - l2
  - l2
  - l2
  regularization:
  - true
  - true
  - true
  regularizer_value:
  - 0.001
  - 0.001
  - 0.001
  # size of the individual layers; defaults deliver good results. Changes here might need
  # adjustments to dropout rates and other hyper-parameters
  neurons:
  - 300
  - 150
  - 50

  # int, number of layers to use. Note that all other parameters in the 'dense' section
  # must be adapted to the new number used in this variable
  nlayers: 3

# dimension of the embedding output
embedding:
  length: 50

# parameters influencing the learning
learning:
  # number of samples to pass through the network in a single iteration
  batch_size: 512
  # number of epochs to train
  epochs: 15
  # other tested/reasonable values for the learning rate: 0.003, 0.001
  learningrate: 0.01
  verbose: 1
  # default optimizer; most tensorflow optimizers are implemented as well
  optimizer: adam

#!!!!!!!!!!!!!!!!!! most important parameters!!!!!!!!!!!!!!!
output:
  # task parameters. Here the prefixes hsax and rp are used to build and parameterize the
  # respective sub-networks (these prefixes must also match the "predictions" section).
  # Each task needs to contain the suffixes: activation, column, dimension, loss, metrics
  # and weight.
  # They must be carefully adapted for each prediction task.
  # recommended to use sigmoid for fractions (SCX/hSAX) if the ordinal regression method is used
  hsax-activation: sigmoid
  # column of the CSV input that holds the fraction RT
  hsax-column: hsax_ordinal
  # the number of unique / distinct values (e.g. fractions)
  hsax-dimension: 10
  # must be binary_crossentropy for sigmoid activations
  hsax-loss: binary_crossentropy
  # must be mse
  hsax-metrics: mse
  # weight parameter to combine the loss of this task with any other defined task
  hsax-weight: 50

  # use linear for regression tasks (reversed phase)
  rp-activation: linear
  rp-column: rp
  # dimension is always 1 for regression
  rp-dimension: 1
  # loss and metrics should not be changed from mse
  rp-loss: mse
  rp-metrics: mse
  # again, a weight parameter that might need tuning for multi-task settings
  rp-weight: 1

# siamese network parameters
siamese:
  # set to True for crosslinks (default)
  use: True
  # define how to combine the outputs of the siamese layers; most tensorflow options are
  # supported. default value should be fine
  merge_type: add
  # add predictions for single peptides based on the crosslink model (default)
  single_predictions: True
callbacks:
  # for debugging and model storage
  # define which callbacks to use.
  # default values are fine here and should not be changed
  # options define the metadata that is written throughout the training process. The results
  # can be found in the callback folder in the specified output directory
  check_point: True
  log_csv: True
  # early stopping callback
  early_stopping: True
  early_stopping_patience: 15
  tensor_board: False
  progressbar: True
  # reduce learning rate callback
  reduce_lr: True
  reduce_lr_factor: 0.5
  reduce_lr_patience: 15
predictions:
  # define the prediction tasks unambiguously as they appear in the output file; these need
  # to match the column labels defined in the output section.
  # "continues" is reserved for regression problems, e.g. reversed-phase chromatography here
  continues:
  - rp
  # "fractions" is reserved for classification or ordinal regression problems, e.g. a
  # fractionation method that led to discrete fractions.
  # use [] if no fraction prediction is desired
  fractions: []
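The following is a simplified illustration of the kind of siamese, multi-task network the YAML above parameterizes — a sketch, not xiRT's actual model code; max_len and vocab_size are hypothetical placeholders:

from tensorflow.keras import Model, layers, regularizers

# Shared embedding + bidirectional GRU encoder (units: 10), merged with "add",
# three dense blocks (300/150/50, relu, batch norm, dropout 0.1, l2 0.001),
# and two task heads mirroring the config: rp (linear/mse, weight 1) and
# hsax (sigmoid/binary_crossentropy, dimension 10, weight 50).
max_len, vocab_size = 60, 30

inp = layers.Input(shape=(max_len,))
emb = layers.Embedding(vocab_size, 50)(inp)        # embedding length: 50
enc = layers.Bidirectional(layers.GRU(10))(emb)    # type: GRU, bidirectional: true
encoder = Model(inp, enc, name="siamese_encoder")

pep1 = layers.Input(shape=(max_len,), name="peptide1")
pep2 = layers.Input(shape=(max_len,), name="peptide2")
h = layers.Add()([encoder(pep1), encoder(pep2)])   # merge_type: add

for units in (300, 150, 50):                       # dense: neurons / nlayers: 3
    h = layers.Dense(units, activation="relu",
                     kernel_regularizer=regularizers.l2(0.001))(h)
    h = layers.BatchNormalization()(h)             # dense_bn: true
    h = layers.Dropout(0.1)(h)                     # dropout: 0.1

rp_out = layers.Dense(1, activation="linear", name="rp")(h)
hsax_out = layers.Dense(10, activation="sigmoid", name="hsax")(h)

model = Model([pep1, pep2], [rp_out, hsax_out])
model.compile(optimizer="adam",
              loss={"rp": "mse", "hsax": "binary_crossentropy"},
              loss_weights={"rp": 1.0, "hsax": 50.0},   # rp-weight / hsax-weight
              metrics={"rp": ["mse"], "hsax": ["mse"]})
model.summary()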
diff --git a/setup.py b/setup.py
index 2c3e812..a8cfc63 100644
--- a/setup.py
+++ b/setup.py
@@ -27,12 +27,11 @@
     "Chromatography", "Peptides"]
 RAPPSILBER_SOFTWARE = "https://www.rappsilberlab.org/software/"
 # What packages are required for this module to be executed?
-REQUIRED = ['numpy', 'pandas', 'tensorflow', 'seaborn', 'xlwt', 'pyyaml',
-            'pyteomics', 'scikit-learn', 'tqdm', 'biopython', 'palettable', 'statannot',
-            'tensorflow_addons']
+# REQUIRED = ['numpy', 'pandas', 'tensorflow', 'seaborn', 'xlwt', 'pyyaml',
+#             'pyteomics', 'scikit-learn', 'tqdm', 'biopython', 'palettable', 'statannot',
+#             'tensorflow_addons']
+REQUIRED = []
 
-# What packages are optional?
-# 'fancy feature': ['django'],}
 EXTRAS = {}
 
 # The rest you shouldn't have to touch too much :)
@@ -53,8 +52,7 @@
 # Load the package's __version__.py module as a dictionary.
 about = {}
 project_slug = "xirt"
-# with open(os.path.join(here, project_slug, '__version__.py')) as f:
-#     exec(f.read(), about)
+
 
 
 class UploadCommand(Command):
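With REQUIRED emptied, pip no longer resolves runtime dependencies at install time; they must come from the conda environment above (note the `- -e .` entry and the `--no-deps` editable install in the Makefile). An illustrative sanity-check sketch, not part of the patch:

import importlib

# Verify the core runtime dependencies are importable; with REQUIRED = [],
# a bare pip install would otherwise leave these missing.
for mod in ["numpy", "pandas", "tensorflow", "yaml", "sklearn", "pyteomics"]:
    try:
        importlib.import_module(mod)
        print(f"{mod}: ok")
    except ImportError as err:
        print(f"{mod}: MISSING -> install via environment.yml ({err})")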
diff --git a/xirt/__main__.py b/xirt/__main__.py
index d6c9a75..76f5979 100644
--- a/xirt/__main__.py
+++ b/xirt/__main__.py
@@ -313,7 +313,7 @@ def xirt_runner(peptides_file, out_dir, xirt_loc, setup_loc, nrows=None, perform
                                          "xirt_weights_{}.h5".format(
                                              str(best_model_idx + 1).zfill(2))))
         logger.info("Model Summary:")
-        logger.info(model_summary_df.groupby("Split").agg([np.mean, np.std]).to_string())
+        logger.info(model_summary_df.groupby("Split").agg([np.mean, np.std]).round(2).to_string())
     else:
         logger.info("Loading model weights.")
         xirtnetwork.build_model(siamese=xirt_params["siamese"]["use"],
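The only functional change here rounds the aggregated cross-validation summary to two decimals before logging. A toy pandas example of the effect (column names and values are hypothetical stand-ins for model_summary_df):

import numpy as np
import pandas as pd

# Stand-in for the per-split model summary that xirt_runner logs.
model_summary_df = pd.DataFrame({
    "Split": ["train", "train", "val", "val"],
    "loss":  [0.123456, 0.134567, 0.245678, 0.256789],
})

summary = model_summary_df.groupby("Split").agg([np.mean, np.std])
# Without .round(2) the log shows full float precision; with it the table
# stays readable, e.g. train loss mean 0.13, std 0.01 (approximate output).
print(summary.round(2).to_string())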