diff --git a/docs/yaml_docs/pipeline_preprocess_yml.md b/docs/yaml_docs/pipeline_preprocess_yml.md
index ffaa581f..819375e9 100644
--- a/docs/yaml_docs/pipeline_preprocess_yml.md
+++ b/docs/yaml_docs/pipeline_preprocess_yml.md
@@ -17,11 +17,12 @@ When running the preprocess workflow, panpipes provides a basic `pipeline.yml` f
To run the workflow on your own data, you need to specify the parameters described below in the `pipeline.yml` file to meet the requirements of your data.
However, we do provide pre-filled versions of the `pipeline.yml` file for individual [tutorials](https://panpipes-pipelines.readthedocs.io/en/latest/tutorials/index.html).
-For more information on functionalities implemented in `panpipes` to read the configuration files, such as reading blocks of parameters and reusing blocks with `&anchors` and `*scalars`, please check [our documentation](./useful_info_on_yml.md)
+For more information on functionalities implemented in `panpipes` to read the configuration files, such as reading blocks of parameters and reusing blocks with `&anchors` and `*scalars`, please check [our documentation](./useful_info_on_yml.md).
You can download the different preprocess `pipeline.yml` files here:
- Basic `pipeline.yml` file (not prefilled) that is generated when calling `panpipes preprocess config: [Download here]
+
## Compute resources options
@@ -362,13 +363,72 @@ Whether applying scaling or not is still a matter of debate, as stated in the [L
- color_by `String`, Default: sample_id
Column to be fetched from the protein layer .obs to color the PCA plot by.
-## ATAC steps preprocessing steps
+## ATAC preprocessing steps
atac
Parameters for the preprocessing of the ATAC modality.
+ - binarize `Boolean`, Default: False
+ If set to True, the data will be binarized.
+ - normalize `String`, Default: TFIDF
+ What normalisation method to use. Available options are "log1p" or "TFIDF".
+ - TFIDF_flavour `String`, Default: signac
+ TFIDF normalisation flavor. Leave blank if you don't use TFIDF normalisation.
+ Available options are: "signac", "logTF" or "logIDF".
+ - feature_selection_flavour `String`, Default: signac
+ Flavor for selecting highly variable features (HVF).
+ HVF selection is performed either with scanpy's `pp.highly_variable_genes()` function or with a `pseudo-FindTopFeatures()` function of the signac package.
+ Accordingly, available options are: "signac" or "scanpy".
+
+ - min_mean `Float`, Default: 0.05
+ Applicable if `feature_selection_flavour` is set to "scanpy".
+ You can leave this parameter blank if you want to use the default value.
+
+ - max_mean `Float`, Default: 1.5
+ Applicable if `feature_selection_flavour` is set to "scanpy".
+ You can leave this parameter blank if you want to use the default value.
+
+ - min_disp `Float`, Default: 0.5
+ Applicable if `feature_selection_flavour` is set to "scanpy".
+ You can leave this parameter blank if you want to use the default value.
+
+ - n_top_features `Integer`
+ Applicable if `feature_selection_flavour` is set to "scanpy".
+ Number of highly-variable features to keep.
+ If specified, overwrites previous defaults for HVF selection.
+
+ - filter_by_hvf `Boolean`, Default: False
+ Applicable if `feature_selection_flavour` is set to "scanpy".
+ Set to True if you want to filter the ATAC layer to retain only HVFs.
+
+ - min_cutoff `String`, Default: q5
+ Applicable if `feature_selection_flavour` is set to "signac".
+ Can be specified as follows:
+ - "q[x]": "q" followed by the minimum percentile, e.g. q5 will set the top 95% most common features as highly variable.
+ - "c[x]": "c" followed by a minimum cell count, e.g. c100 will set features present in > 100 cells as highly variable.
+ - "tc[x]": "tc" followed by a minimum total count, e.g. tc100 will set features with total counts > 100 as highly variable.
+ - "NULL": All features are assigned as highly variable.
+ - "NA": Highly variable features won't be changed.
+
+ - dimred `String`, Default: LSI
+ Available options are: PCA or LSI.
+ LSI will only be computed if TFIDF normalisation was used.
+
+ - n_comps `Integer`, Default: 50
+ Number of components to compute.
+
+ - solver `String`, Default: default
+ If using PCA, which solver to use. Setting this parameter to "default" will use the 'arpack' solver.
+
+ - color_by `String`, Default: sample_id
+ Specify the covariate you want to use to color the dimensionality reduction plot.
+
+ - dim_remove `Integer`, Default: None
+ Whether to remove the component(s) associated with technical artifacts.
+ For instance, it is common to remove the first LSI component, as it is often associated with batch effects.
+ Leave blank to avoid removing any.
diff --git a/panpipes/panpipes/pipeline_preprocess/pipeline.yml b/panpipes/panpipes/pipeline_preprocess/pipeline.yml
index 0bac851a..4d920fc4 100644
--- a/panpipes/panpipes/pipeline_preprocess/pipeline.yml
+++ b/panpipes/panpipes/pipeline_preprocess/pipeline.yml
@@ -168,53 +168,36 @@ prot:
store_as_X:
save_norm_prot_mtx: False
- #----------------------------
+ #---------------------------------
# Protein Dimensionality reduction
pca: False
n_pcs: 50
solver: default
color_by: sample_id
-# ------------------------------
-# ATAC steps preprocessing steps
-# ------------------------------
+# ------------------------
+# ATAC preprocessing steps
+# ------------------------
atac:
binarize: False
- normalize: TFIDF # "log1p" or "TFIDF"
- # if normalize = "TFIDF", else leave blank:
- TFIDF_flavour: signac # "signac", "logTF" or "logIDF"
- # highly variable feature selection:
- # HVF selection either with scanpy's pp.highly_variable_genes() function or a pseudo-FindTopFeatures() function of the signac package
- feature_selection_flavour: signac # "signac" or "scanpy"
- # parameters for HVF flavour == "scanpy", leave the below blank to use defaults
- min_mean: #default 0.05
- max_mean: #default 1.5
- min_disp: #default 0.5
- # if n_top_features is specified, it overwrites previous defaults for HVF selection
- n_top_features:
- # Filter the atac layer to retain only HVF
+ normalize: TFIDF #"log1p" or "TFIDF"
+ TFIDF_flavour: signac #"signac", "logTF" or "logIDF"
+ feature_selection_flavour: signac #"signac" or "scanpy"
+
+ # parameters for feature_selection_flavour == "scanpy", leave blank to use defaults
+ min_mean: #default 0.05
+ max_mean: #default 1.5
+ min_disp: #default 0.5
+ n_top_features: #if specified, overwrites previous defaults for HVF selection
filter_by_hvf: False
- # parameter for HVF flavour == "signac"
+
+ # parameter for feature_selection_flavour == "signac"
min_cutoff: q5
- # min_cutoff can be specified as follows:
- # "q[x]": "q" followed by the minimum percentile, e.g. q5 will set the top 95% most common features as higly variable
- # "c[x]": "c" followed by a minimum cell count, e.g. c100 will set features present in > 100 cells as highly variable
- # "tc[x]": "tc" followed by a minimum total count, e.g. tc100 will set features with total counts > 100 as highly variable
- # "NULL": All features are assigned as highly variable
- # "NA": Highly variable features won't be changed
- #----------------------------
+
+ #------------------------------
# ATAC Dimensionality reduction
- #----------------------------
- dimred: LSI #PCA or LSI (LSI will only be computed if the normalize param is set to TFIDF)
- n_comps: 50 # how many components to compute
- # which dimension to exclude from further processing (sometimes useful to remove PC/LSI_1 if it's associated to tech factors)
- # leave blank to retain all
- # if using PCA, which solver to use. Default == 'arpack'
+ dimred: LSI #PCA or LSI
+ n_comps: 50
solver: default
- # what covariate to use to color the dimensionality reduction
color_by: sample_id
- # whether to remove the component(s) associated to technical effects, common to remove 1 for LSI
- # leave blank to avoid removing any
dim_remove:
-
-