diff --git a/docs/yaml_docs/pipeline_preprocess_yml.md b/docs/yaml_docs/pipeline_preprocess_yml.md index ffaa581f..819375e9 100644 --- a/docs/yaml_docs/pipeline_preprocess_yml.md +++ b/docs/yaml_docs/pipeline_preprocess_yml.md @@ -17,11 +17,12 @@ When running the preprocess workflow, panpipes provides a basic `pipeline.yml` f To run the workflow on your own data, you need to specify the parameters described below in the `pipeline.yml` file to meet the requirements of your data. However, we do provide pre-filled versions of the `pipeline.yml` file for individual [tutorials](https://panpipes-pipelines.readthedocs.io/en/latest/tutorials/index.html). -For more information on functionalities implemented in `panpipes` to read the configuration files, such as reading blocks of parameters and reusing blocks with `&anchors` and `*scalars`, please check [our documentation](./useful_info_on_yml.md) +For more information on functionalities implemented in `panpipes` to read the configuration files, such as reading blocks of parameters and reusing blocks with `&anchors` and `*scalars`, please check [our documentation](./useful_info_on_yml.md). You can download the different preprocess `pipeline.yml` files here: - Basic `pipeline.yml` file (not prefilled) that is generated when calling `panpipes preprocess config: [Download here] +- ## Compute resources options @@ -362,13 +363,72 @@ Whether applying scaling or not is still a matter of debate, as stated in the [L - color_by `String`, Default: sample_id
Column to be fetched from the protein layer .obs to color the PCA plot by. -## ATAC steps preprocessing steps +## ATAC preprocessing steps atac
Parameters for the preprocessing of the ATAC modality. + - binarize `Boolean`, Default: False
+ If set to True, the data will be binarized. + - normalize `String`, Default: TFIDF
+ What normalisation method to use. Available options are "log1p" or "TFIDF". + - TFIDF_flavour `String`, Default: signac
+ TFIDF normalisation flavour. Leave blank if you don't use TFIDF normalisation. + Available options are: "signac", "logTF" or "logIDF". + - feature_selection_flavour `String`, Default: signac
+ Flavour for selecting highly variable features (HVF). + HVF selection is performed either with scanpy's `pp.highly_variable_genes()` function or a `pseudo-FindTopFeatures()` function of the signac package. + Accordingly, available options are: "signac" or "scanpy". + + - min_mean `Float`, Default: 0.05
+ Applicable if `feature_selection_flavour` is set to "scanpy". + You can leave this parameter blank if you want to use the default value. + + - max_mean `Float`, Default: 1.5
+ Applicable if `feature_selection_flavour` is set to "scanpy". + You can leave this parameter blank if you want to use the default value. + + - min_disp `Float`, Default: 0.5
+ Applicable if `feature_selection_flavour` is set to "scanpy". + You can leave this parameter blank if you want to use the default value. + + - n_top_features `Integer`
+ Applicable if `feature_selection_flavour` is set to "scanpy". + Number of highly-variable features to keep. + If specified, overwrites previous defaults for HVF selection. + + - filter_by_hvf `Boolean`, Default: False
+ Applicable if `feature_selection_flavour` is set to "scanpy". + Set to True if you want to filter the ATAC layer to retain only HVFs. + + - min_cutoff `String`, Default: q5
+ Applicable if `feature_selection_flavour` is set to "signac". + Can be specified as follows: + - "q[x]": "q" followed by the minimum percentile, e.g. q5 will set the top 95% most common features as highly variable. + - "c[x]": "c" followed by a minimum cell count, e.g. c100 will set features present in > 100 cells as highly variable. + - "tc[x]": "tc" followed by a minimum total count, e.g. tc100 will set features with total counts > 100 as highly variable. + - "NULL": All features are assigned as highly variable. + - "NA": Highly variable features won't be changed. + + - dimred `String`, Default: LSI
+ Available options are: PCA or LSI. + LSI will only be computed if TFIDF normalisation was used. + + - n_comps `Integer`, Default: 50
+ Number of components to compute. + + - solver `String`, Default: default
+ If using PCA, which solver to use. Setting this parameter to "default" will use the 'arpack' solver. + + - color_by `String`, Default: sample_id
+ Specify the covariate you want to use to color the dimensionality reduction plot. + + - dim_remove `X`, Default: X
+ Whether to remove the component(s) associated with technical artifacts. + For instance, it is common to remove the first LSI component, as it is often associated with batch effects. + Leave blank to avoid removing any. diff --git a/panpipes/panpipes/pipeline_preprocess/pipeline.yml b/panpipes/panpipes/pipeline_preprocess/pipeline.yml index 0bac851a..4d920fc4 100644 --- a/panpipes/panpipes/pipeline_preprocess/pipeline.yml +++ b/panpipes/panpipes/pipeline_preprocess/pipeline.yml @@ -168,53 +168,36 @@ prot: store_as_X: save_norm_prot_mtx: False - #---------------------------- + #--------------------------------- # Protein Dimensionality reduction pca: False n_pcs: 50 solver: default color_by: sample_id -# ------------------------------ -# ATAC steps preprocessing steps -# ------------------------------ +# ------------------------ +# ATAC preprocessing steps +# ------------------------ atac: binarize: False - normalize: TFIDF # "log1p" or "TFIDF" - # if normalize = "TFIDF", else leave blank: - TFIDF_flavour: signac # "signac", "logTF" or "logIDF" - # highly variable feature selection: - # HVF selection either with scanpy's pp.highly_variable_genes() function or a pseudo-FindTopFeatures() function of the signac package - feature_selection_flavour: signac # "signac" or "scanpy" - # parameters for HVF flavour == "scanpy", leave the below blank to use defaults - min_mean: #default 0.05 - max_mean: #default 1.5 - min_disp: #default 0.5 - # if n_top_features is specified, it overwrites previous defaults for HVF selection - n_top_features: - # Filter the atac layer to retain only HVF + normalize: TFIDF #"log1p" or "TFIDF" + TFIDF_flavour: signac #"signac", "logTF" or "logIDF" + feature_selection_flavour: signac #"signac" or "scanpy" + + # parameters for feature_selection_flavour == "scanpy", leave blank to use defaults + min_mean: #default 0.05 + max_mean: #default 1.5 + min_disp: #default 0.5 + n_top_features: #if specified, overwrites previous defaults for HVF selection 
filter_by_hvf: False - # parameter for HVF flavour == "signac" + + # parameter for feature_selection_flavour == "signac" min_cutoff: q5 - # min_cutoff can be specified as follows: - # "q[x]": "q" followed by the minimum percentile, e.g. q5 will set the top 95% most common features as higly variable - # "c[x]": "c" followed by a minimum cell count, e.g. c100 will set features present in > 100 cells as highly variable - # "tc[x]": "tc" followed by a minimum total count, e.g. tc100 will set features with total counts > 100 as highly variable - # "NULL": All features are assigned as highly variable - # "NA": Highly variable features won't be changed - #---------------------------- + + #------------------------------ # ATAC Dimensionality reduction - #---------------------------- - dimred: LSI #PCA or LSI (LSI will only be computed if the normalize param is set to TFIDF) - n_comps: 50 # how many components to compute - # which dimension to exclude from further processing (sometimes useful to remove PC/LSI_1 if it's associated to tech factors) - # leave blank to retain all - # if using PCA, which solver to use. Default == 'arpack' + dimred: LSI #PCA or LSI + n_comps: 50 solver: default - # what covariate to use to color the dimensionality reduction color_by: sample_id - # whether to remove the component(s) associated to technical effects, common to remove 1 for LSI - # leave blank to avoid removing any dim_remove: - -