From 592410bb267142bb71133cf448d98dc117cba781 Mon Sep 17 00:00:00 2001
From: Sven <6873303+gieses@users.noreply.github.com>
Date: Sun, 15 Oct 2023 21:24:16 +0200
Subject: [PATCH] generation of qc plots failing, but training succeeds,
 adding a fast config
---
 Makefile                                                              |   8 +-
 environment.yml                                                       |   6 +-
 .../learning_params_training_cv_fast.yaml                             |  92 +++++++++++
 .../xirt_params_rp_fast.yaml                                          | 155 ++++++++++++++++++
 setup.py                                                              |  12 +-
 xirt/__main__.py                                                      |   2 +-
 6 files changed, 263 insertions(+), 12 deletions(-)
 create mode 100644 sample_data/parameter_examples/learning_params_training_cv_fast.yaml
 create mode 100644 sample_data/parameter_examples/xirt_params_rp_fast.yaml

diff --git a/Makefile b/Makefile
index 1d341f8..60bed58 100644
--- a/Makefile
+++ b/Makefile
@@ -28,12 +28,16 @@ clean: ## clean up - remove docs, dist and build
 	rm -r docs/build
 	rm -r dist
 	rm -r build
+	rm -r htmlcov
 
 sample:
-	xirt -i DSS_xisearch_fdr_CSM50percent_minimal.csv -o out_dir -x parameter_examples//xirt_params_rp.yaml -l parameter_examples//learning_params_training_cv.yaml
+	xirt -i sample_data/DSS_xisearch_fdr_CSM50percent_minimal.csv -o sample_data/rt_test -x sample_data//parameter_examples//xirt_params_rp.yaml -l sample_data//parameter_examples//learning_params_training_cv.yaml
+
+sample_fast:
+	xirt -i sample_data/DSS_xisearch_fdr_CSM50percent_minimal.csv -o sample_data/rt_test_fast -x sample_data//parameter_examples//xirt_params_rp_fast.yaml -l sample_data//parameter_examples//learning_params_training_cv_fast.yaml
 
 env:
-	conda env update -f environment.yml
+	conda env update -f environment.yml --prune
 
 pip_me:
 	pip install -e . --no-deps
\ No newline at end of file
diff --git a/environment.yml b/environment.yml
index 17997f3..99eae43 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,4 +1,4 @@
-name: xirt
+name: xirt_test
 channels:
   - conda-forge
   - bioconda
@@ -13,6 +13,7 @@ dependencies:
   - scikit-learn
   - pandas
   - anaconda::tensorflow
+  #- anaconda::tensorflow-gpu
   - tqdm
   - pyyaml
   - seaborn
@@ -42,4 +43,5 @@ dependencies:
   - sphinx
   - sphinx_rtd_theme
   - recommonmark
-  - pytest-xdist
\ No newline at end of file
+  - pytest-xdist
+  - -e .
\ No newline at end of file
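For reference, the new `make sample_fast` target boils down to a single CLI call. A minimal Python sketch of the same invocation (assuming `xirt` is on the PATH, e.g. after the `pip_me` target, and that the sample data paths above exist) could look like this:

import subprocess

# Invoke the xiRT CLI with the new "fast" parameter files, mirroring the
# sample_fast Makefile target above. Paths are relative to the repo root.
cmd = [
    "xirt",
    "-i", "sample_data/DSS_xisearch_fdr_CSM50percent_minimal.csv",
    "-o", "sample_data/rt_test_fast",
    "-x", "sample_data/parameter_examples/xirt_params_rp_fast.yaml",
    "-l", "sample_data/parameter_examples/learning_params_training_cv_fast.yaml",
]
subprocess.run(cmd, check=True)  # raises CalledProcessError if xiRT exits non-zero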
diff --git a/sample_data/parameter_examples/learning_params_training_cv_fast.yaml b/sample_data/parameter_examples/learning_params_training_cv_fast.yaml
new file mode 100644
index 0000000..47b0000
--- /dev/null
+++ b/sample_data/parameter_examples/learning_params_training_cv_fast.yaml
@@ -0,0 +1,92 @@

# Learning options generated with xiRT v. 1.2.3+2.g84a5484

# the preprocessing options define how the sequences are encoded / filtered. Usually, default
# values are fine.
# If transfer learning is intended, the label encoder and max_length parameters need to be adapted.

preprocessing:
  # label encoder, str or none. If str, use a previously trained label encoder to translate
  # amino acids to specific integers. If you are using xiRT on a single data file, set to None.
  # default None
  le: None

  # max sequence length, integer. Filter all sequences longer than this number. Disable by
  # setting it to -1.
  # default -1
  max_length: -1

  # for crosslinks only, bool: encode crosslinked residues as different residues than their
  # unmodified counterparts,
  # e.g. a crosslinked K will be encoded as clK in modX format.
  # default True
  cl_residue: True

  # filter, str. String filter that must be contained in the description for a CSM to be included.
  # default ""
  filter: "_ECOLI"

# these options are crucial for setting up xiRT with the correct training mode. Stay strong!
# It's easier than it seems right now.
# Check the readthedocs documentation if you need more info / examples.
train:
  # float value, defines the FDR cutoff used to filter the input CSMs; e.g. all CSMs with a
  # lower FDR are used for training.
  # default 0.01
  fdr: 0.01

  # int, the number of cross-validation folds. 1 = no CV, 3 = minimal value; recommended
  # alternatives with higher run time: 5 or 10.
  # default 1
  ncv: 2

  # bool, if True the training data is used to fit a new neural network model after the
  # cross-validation step; this model is used for the prediction of RTs for all peptides above
  # the given FDR value.
  # refit=False: use the best CV predictor; refit=True: retrain on all CSMs < 0.01 FDR.
  # default False
  refit: False

  # str, defines the training mode (important!):
  # "train", train on the entire data set
  # "crossvalidation", perform cross-validation on the input data (trains multiple classifiers)
  # "predict", do NOT train on the supplied CSMs but simply predict with an already trained model
  # default "train"
  mode: "crossvalidation"

  # bool, augment the input data by swapping sequences (peptide1, peptide2). Only marginal gains
  # in prediction were observed with this.
  # Can usually be left as False. If you are dealing with very small data sets, this option
  # might also help.
  # default False
  augment: False

  # str, multiple sequence types are supported: "linear", "crosslink", "pseudolinear" (concatenate
  # peptide1 and peptide2 sequences)
  # default "crosslink"
  sequence_type: "crosslink"

  # str (file location), this option can be combined with any of the above described options.
  # If a valid weight set is supplied, the network is initialized with the given weights.
  # default "None"
  pretrained_weights: "None"

  # str (file location), similarly to the option above, a pretrained model can be supplied.
  # This is necessary when (extreme) transfer-learning applications are intended (e.g. a
  # different number of fractions for e.g. SCX);
  # this requires adjustments of the network architecture.
  # default "None"
  pretrained_model: "None"

  # float, defines the fraction of test data (a small fraction of the training folds that is
  # used for validation).
  # default 0.10
  test_frac: 0.10

  # float, used for downsampling the input data (e.g. to create learning curves). Can usually
  # be left at 1.
  # default 1
  sample_frac: 1

  # int, seed value for the sampling described above
  # default 21
  sample_state: 21
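To make the interplay of fdr, ncv, test_frac and sample_state concrete, here is an illustrative sketch on a toy CSM table — not xiRT's internal implementation; column names and data are hypothetical:

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, train_test_split

# Toy CSM table: only rows below the FDR cutoff are used for training,
# the rest only receive predictions (mirrors the options above).
params = {"fdr": 0.01, "ncv": 2, "test_frac": 0.10, "sample_state": 21}
rng = np.random.default_rng(params["sample_state"])
csms = pd.DataFrame({"fdr": rng.random(200) / 10, "rp": rng.random(200)})

train_df = csms[csms["fdr"] <= params["fdr"]]   # trained on
predict_df = csms[csms["fdr"] > params["fdr"]]  # predicted only

kf = KFold(n_splits=params["ncv"], shuffle=True, random_state=params["sample_state"])
for fold, (train_idx, val_idx) in enumerate(kf.split(train_df), start=1):
    # a small slice of each training fold is held out for validation (test_frac)
    fit_idx, stop_idx = train_test_split(
        train_idx, test_size=params["test_frac"], random_state=params["sample_state"])
    print(f"fold {fold}: fit={len(fit_idx)} early-stop={len(stop_idx)} held-out={len(val_idx)}")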
diff --git a/sample_data/parameter_examples/xirt_params_rp_fast.yaml b/sample_data/parameter_examples/xirt_params_rp_fast.yaml
new file mode 100644
index 0000000..03e37c3
--- /dev/null
+++ b/sample_data/parameter_examples/xirt_params_rp_fast.yaml
@@ -0,0 +1,155 @@

# xiRT options generated with xiRT v. 1.2.3+2.g84a5484
# options for the recurrent layer used in xiRT
# can usually be used with default values, except for type
LSTM:
  # activation parameters, leave as default unless you know what you are doing
  activation: tanh
  activity_regularization: l2
  activityregularizer_value: 0.001

  # option that wraps the recurrent layer in a bidirectional layer
  bidirectional: true

  # kernel regularization, leave as default
  kernel_regularization: l2
  kernelregularizer_value: 0.001
  lstm_bn: true

  # central layer parameters
  # increasing the values here will drastically increase runtime but might also improve results
  # usually, 1 layer and GRU (for CPUs) or CuDNNGRU (for GPUs) will deliver good performance
  nlayers: 1
  type: GRU
  units: 10

# dense parameters are used for the individual task subnetworks (e.g. RP, SCX, ...)
dense:
  # activation functions in the layers between the embedding and the prediction layer
  # recommended to leave on defaults for most applications
  activation:
  - relu
  - relu
  - relu

  # boolean indicator whether batch normalization should be used
  # recommended to leave on defaults for most applications
  dense_bn:
  - true
  - true
  - true

  # dropout rate to use
  # recommended to leave on defaults for most applications
  dropout:
  - 0.1
  - 0.1
  - 0.1

  # regularization methods to use on the kernels, leave on defaults
  kernel_regularizer:
  - l2
  - l2
  - l2
  regularization:
  - true
  - true
  - true
  regularizer_value:
  - 0.001
  - 0.001
  - 0.001
  # size of the individual layers; defaults deliver good results. Changes here might need
  # adjustments to dropout rates and other hyper-parameters
  neurons:
  - 300
  - 150
  - 50

  # int, number of layers to use. Note that all other parameters in the 'dense' section
  # must be adapted to the new number used in this variable
  nlayers: 3

# dimension of the embedding output
embedding:
  length: 50

# parameters influencing the learning
learning:
  # number of samples to pass through the network in a single iteration
  batch_size: 512
  # number of epochs to train
  epochs: 15
  # other tested/reasonable values for the learning rate: 0.003, 0.001
  learningrate: 0.01
  verbose: 1
  # default optimizer; most tensorflow optimizers are implemented as well
  optimizer: adam

#!!!!!!!!!!!!!!!!!! most important parameters!!!!!!!!!!!!!!!
output:
  # task parameters. Here the prefixes hsax and rp are used to build and parameterize the
  # respective sub-networks (these prefixes must also match the "predictions" section).
  # Each task needs to contain the suffixes: activation, column, dimension, loss, metrics
  # and weight.
  # They must be carefully adapted for each prediction task.
  # recommended to use sigmoid for fractions (SCX/hSAX) if the ordinal regression method is used
  hsax-activation: sigmoid
  # column of the CSV input that holds the fraction RT
  hsax-column: hsax_ordinal
  # the number of unique / distinct values (e.g. fractions)
  hsax-dimension: 10
  # must be binary_crossentropy for sigmoid activations
  hsax-loss: binary_crossentropy
  # must be mse
  hsax-metrics: mse
  # weight parameter to combine the loss of this task with any other defined task
  hsax-weight: 50

  # use linear for regression tasks (reversed phase)
  rp-activation: linear
  rp-column: rp
  # dimension is always 1 for regression
  rp-dimension: 1
  # loss and metrics should not be changed from mse
  rp-loss: mse
  rp-metrics: mse
  # again, a weight parameter that might need tuning for multi-task settings
  rp-weight: 1

# siamese network parameters
siamese:
  # set to True for crosslinks (default)
  use: True
  # define how to combine the outputs of the siamese layers; most tensorflow options are
  # supported. default value should be fine
  merge_type: add
  # add predictions for single peptides based on the crosslink model (default)
  single_predictions: True
callbacks:
  # for debugging and model storage
  # define which callbacks to use.
  # default values are fine here and should not be changed
  # options define the metadata that is written throughout the training process. The results
  # can be found in the callback folder in the specified output directory
  check_point: True
  log_csv: True
  # early stopping callback
  early_stopping: True
  early_stopping_patience: 15
  tensor_board: False
  progressbar: True
  # reduce learning rate callback
  reduce_lr: True
  reduce_lr_factor: 0.5
  reduce_lr_patience: 15
predictions:
  # define the prediction tasks unambiguously as they appear in the output file; these need
  # to match the column labels defined in the output section.
  # "continues" is reserved for regression problems, e.g. reversed-phase chromatography here
  continues:
  - rp
  # "fractions" is reserved for classification or ordinal regression problems, e.g. a
  # fractionation method that led to discrete fractions.
  # use [] if no fraction prediction is desired
  fractions: []
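The following is a simplified illustration of the kind of siamese, multi-task network the YAML above parameterizes — a sketch, not xiRT's actual model code; max_len and vocab_size are hypothetical placeholders:

from tensorflow.keras import Model, layers, regularizers

# Shared embedding + bidirectional GRU encoder (units: 10), merged with "add",
# three dense blocks (300/150/50, relu, batch norm, dropout 0.1, l2 0.001),
# and two task heads mirroring the config: rp (linear/mse, weight 1) and
# hsax (sigmoid/binary_crossentropy, dimension 10, weight 50).
max_len, vocab_size = 60, 30

inp = layers.Input(shape=(max_len,))
emb = layers.Embedding(vocab_size, 50)(inp)        # embedding length: 50
enc = layers.Bidirectional(layers.GRU(10))(emb)    # type: GRU, bidirectional: true
encoder = Model(inp, enc, name="siamese_encoder")

pep1 = layers.Input(shape=(max_len,), name="peptide1")
pep2 = layers.Input(shape=(max_len,), name="peptide2")
h = layers.Add()([encoder(pep1), encoder(pep2)])   # merge_type: add

for units in (300, 150, 50):                       # dense: neurons / nlayers: 3
    h = layers.Dense(units, activation="relu",
                     kernel_regularizer=regularizers.l2(0.001))(h)
    h = layers.BatchNormalization()(h)             # dense_bn: true
    h = layers.Dropout(0.1)(h)                     # dropout: 0.1

rp_out = layers.Dense(1, activation="linear", name="rp")(h)
hsax_out = layers.Dense(10, activation="sigmoid", name="hsax")(h)

model = Model([pep1, pep2], [rp_out, hsax_out])
model.compile(optimizer="adam",
              loss={"rp": "mse", "hsax": "binary_crossentropy"},
              loss_weights={"rp": 1.0, "hsax": 50.0},   # rp-weight / hsax-weight
              metrics={"rp": ["mse"], "hsax": ["mse"]})
model.summary()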
diff --git a/setup.py b/setup.py
index 2c3e812..a8cfc63 100644
--- a/setup.py
+++ b/setup.py
@@ -27,12 +27,11 @@
     "Chromatography", "Peptides"]
 RAPPSILBER_SOFTWARE = "https://www.rappsilberlab.org/software/"
 # What packages are required for this module to be executed?
-REQUIRED = ['numpy', 'pandas', 'tensorflow', 'seaborn', 'xlwt', 'pyyaml',
-            'pyteomics', 'scikit-learn', 'tqdm', 'biopython', 'palettable', 'statannot',
-            'tensorflow_addons']
+# REQUIRED = ['numpy', 'pandas', 'tensorflow', 'seaborn', 'xlwt', 'pyyaml',
+#             'pyteomics', 'scikit-learn', 'tqdm', 'biopython', 'palettable', 'statannot',
+#             'tensorflow_addons']
+REQUIRED = []
 
-# What packages are optional?
-# 'fancy feature': ['django'],}
 EXTRAS = {}
 
 # The rest you shouldn't have to touch too much :)
@@ -53,8 +52,7 @@
 # Load the package's __version__.py module as a dictionary.
 about = {}
 project_slug = "xirt"
-# with open(os.path.join(here, project_slug, '__version__.py')) as f:
-#     exec(f.read(), about)
+
 
 
 class UploadCommand(Command):
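With REQUIRED emptied, pip no longer resolves runtime dependencies at install time; they must come from the conda environment above (note the `- -e .` entry and the `--no-deps` editable install in the Makefile). An illustrative sanity-check sketch, not part of the patch:

import importlib

# Verify the core runtime dependencies are importable; with REQUIRED = [],
# a bare pip install would otherwise leave these missing.
for mod in ["numpy", "pandas", "tensorflow", "yaml", "sklearn", "pyteomics"]:
    try:
        importlib.import_module(mod)
        print(f"{mod}: ok")
    except ImportError as err:
        print(f"{mod}: MISSING -> install via environment.yml ({err})")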
diff --git a/xirt/__main__.py b/xirt/__main__.py
index d6c9a75..76f5979 100644
--- a/xirt/__main__.py
+++ b/xirt/__main__.py
@@ -313,7 +313,7 @@ def xirt_runner(peptides_file, out_dir, xirt_loc, setup_loc, nrows=None, perform
                                          "xirt_weights_{}.h5".format(
                                              str(best_model_idx + 1).zfill(2))))
         logger.info("Model Summary:")
-        logger.info(model_summary_df.groupby("Split").agg([np.mean, np.std]).to_string())
+        logger.info(model_summary_df.groupby("Split").agg([np.mean, np.std]).round(2).to_string())
     else:
         logger.info("Loading model weights.")
         xirtnetwork.build_model(siamese=xirt_params["siamese"]["use"],
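The only functional change here rounds the aggregated cross-validation summary to two decimals before logging. A toy pandas example of the effect (column names and values are hypothetical stand-ins for model_summary_df):

import numpy as np
import pandas as pd

# Stand-in for the per-split model summary that xirt_runner logs.
model_summary_df = pd.DataFrame({
    "Split": ["train", "train", "val", "val"],
    "loss":  [0.123456, 0.134567, 0.245678, 0.256789],
})

summary = model_summary_df.groupby("Split").agg([np.mean, np.std])
# Without .round(2) the log shows full float precision; with it the table
# stays readable, e.g. train loss mean 0.13, std 0.01 (approximate output).
print(summary.round(2).to_string())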