Merge pull request #256 from DendrouLab/clean_clustering

Cleaned pipeline.yml for clustering
DendrouLab · Apr 23, 2024 · 97fed7d · 97fed7d
2 parents 10cff65 + 134911a
commit 97fed7d
Showing 1 changed file with 9 additions and 62 deletions.
diff --git a/panpipes/panpipes/pipeline_clustering/pipeline.yml b/panpipes/panpipes/pipeline_clustering/pipeline.yml
@@ -7,16 +7,11 @@
 # compute resource options
 # ------------------------
 resources:
-  # Number of threads used for parallel jobs
-  # this must be enough memory to load your mudata and do computationally intensive tasks
   threads_high: 2
-  # this must be enough memory to load your mudata and do computationally light tasks
   threads_medium: 2
-# this must be enough memory to load text files and do plotting, required much less memory than the other two
   threads_low: 2
 
   fewer_jobs: True
-# path to conda env, leave blank if running native or your cluster automatically inherits the login node environment
 condaenv:
 
 # --------------------------
@@ -25,10 +20,6 @@ condaenv:
 
 sample_prefix: mdata
 scaled_obj: mdata_scaled.h5mu
-# full obj only applicable if you have filtered your scaled object by hvgs
-# in this case panpipes will use the full obj to calculate rank_gene_groups and for plotting those genes
-# it should contain all the genes you want to include in rank_gene_groups, plus logged_counts as a layer
-# if your scaled_obj contains all the genes then leave full_obj blank
 full_obj: 
 
 # run clustering on each individual modality:
@@ -41,7 +32,6 @@ modalities:
 # if True, will look for WNN, or totalVI output
 multimodal:
   run_clustering: True
-  #WNN, mofa, totalVI # this will tell us where to look for 
   integration_method: 
 
 
@@ -50,68 +40,41 @@ multimodal:
 # ---------------------------------------
 # 
 # -----------------------------
-# number of neighbors to use when calculating the graph for clustering and umap.
 neighbors:
   rna:
     use_existing: True
-    # which representation in .obsm to use for nearest neighbors
-    # if dim_red=X_pca and X_pca not in .obsm, will be computed with default parameters
     dim_red: X_pca
-    #how many components to use for clustering
     n_dim_red: 30
-    # number of neighbours
     k: 30
-    # metric: euclidean | cosine
     metric: euclidean
-    # scanpy | hnsw (from scvelo)
     method: scanpy
   prot:
     use_existing: True
-    # which representation in .obsm to use for nearest neighbors
-    # if dim_red=X_pca and X_pca not in .obsm, will be computed with default parameters
     dim_red: X_pca
-    #how many components to use for clustering
     n_dim_red: 30
-    # number of neighbours
     k: 30
-    # metric: euclidean | cosine
     metric: euclidean
-    # scanpy | hnsw (from scvelo)
     method: scanpy
   atac:
     use_existing: True
-    # which representation in .obsm to use for nearest neighbors
-    # if dim_red=X_lsi/X_pca and X_lsi/X_pca not in .obsm, will be computed with default parameters
     dim_red: X_lsi
-    # if dim_red=X_lsi/X_pca and X_lsi/X_pca not in .obsm, which dimension to remove
     dim_remove: 1
-    #how many components to use for clustering
     n_dim_red: 30
-    # number of neighbours
     k: 30
-    # metric: euclidean | cosine
     metric: euclidean
-    # scanpy | hnsw (from scvelo)
     method: scanpy
   spatial:
     use_existing: False
-    # which representation in .obsm to use for nearest neighbors
-    # if dim_red=X_pca and X_pca not in .obsm, will be computed with default parameters
     dim_red: X_pca
-    #how many components to use for clustering
     n_dim_red: 30
-    # number of neighbours
     k: 30
-    # metric: euclidean | cosine
     metric: euclidean
-    # scanpy | hnsw (from scvelo)
     method: scanpy
 
 # ---------------------------------------
 # parameters for umap calculation
 # ---------------------------------------
 umap:
-# set run to False if you are happy with the existing umap from integration
   run: True
   rna:
     mindist:
@@ -141,19 +104,19 @@ clusterspecs:
      - 0.2
      - 0.6
      - 1 
-    algorithm: leiden # (louvain or leiden)
+    algorithm: leiden 
   prot:
     resolutions:
      - 0.2
      - 0.6
      - 1 
-    algorithm: leiden # (louvain or leiden)
+    algorithm: leiden 
   atac:
     resolutions:
      - 0.2
      - 0.6
      - 1 
-    algorithm: leiden # (louvain or leiden)
+    algorithm: leiden 
   multimodal:
     resolutions:
       - 0.5
@@ -168,31 +131,21 @@ clusterspecs:
 # ---------------------------------------
 # parameters for finding marker genes
 # ---------------------------------------
-# where pseudo_suerat is set to False 
-# we run https://scanpy.readthedocs.io/en/stable/generated/scanpy.tl.rank_genes_groups.html
-# where pseudo_seurat is set to True, we run a python implementation of Seurat::FindMarkers (written by CRG)
-
+# args https://scanpy.readthedocs.io/en/stable/generated/scanpy.tl.rank_genes_groups.html
 markerspecs:
   rna:
     run: True
     layer: logged_counts
-    # method options: [‘logreg’, ‘t-test’, ‘wilcoxon’, ‘t-test_overestim_var’]]
     method: t-test_overestim_var
-    mincells: 10 # if a cluster contains less than n cells then do not bother doing marker analysis
-    # where pseudo_suerat is set to False 
-    # we run https://scanpy.readthedocs.io/en/stable/generated/scanpy.tl.rank_genes_groups.html
-    # where pseudo_seurat is set to True, we run a python implementation of Seurat::FindMarkers (written by CRG),
+    mincells: 10 
     pseudo_seurat: False
-    # these next two settings do not matter unless pseudo_seurat is set to True,
-    # If applicable look at Seurat documentation for FindMarkers for details
     minpct: 0.1
     threshuse: 0.25
 
   prot:
     run:
-    layer: clr #options clr,dsb
-    mincells: 10 # if a cluster contains less than n cells then do not bother doing marker analysis
-    # method options: [‘logreg’, ‘t-test’, ‘wilcoxon’, ‘t-test_overestim_var’]]
+    layer: clr 
+    mincells: 10 
     method: wilcoxon
     pseudo_seurat: False
     minpct: 0.1
@@ -202,7 +155,6 @@ markerspecs:
     run:
     layer: logged_counts #options logged_counts, signac_norm , logTF_norm,logIDF_norm
     mincells: 10
-    # method options: [‘logreg’, ‘t-test’, ‘wilcoxon’, ‘t-test_overestim_var’]]
     method: wilcoxon
     pseudo_seurat: False
     minpct: 0.1
@@ -217,17 +169,12 @@ markerspecs:
     threshuse: 0.25
 
   spatial:
+    # args for spatial https://scanpy.readthedocs.io/en/stable/generated/scanpy.tl.rank_genes_groups.html
     run: True
     layer: norm_pearson_resid
-    # method options: [‘logreg’, ‘t-test’, ‘wilcoxon’, ‘t-test_overestim_var’]]
     method: t-test_overestim_var
-    mincells: 10 # if a cluster contains less than n cells then do not bother doing marker analysis
-    # where pseudo_suerat is set to False 
-    # we run https://scanpy.readthedocs.io/en/stable/generated/scanpy.tl.rank_genes_groups.html
-    # where pseudo_seurat is set to True, we run a python implementation of Seurat::FindMarkers (written by CRG),
+    mincells: 10 
     pseudo_seurat: False
-    # these next two settings do not matter unless pseudo_seurat is set to True,
-    # If applicable look at Seurat documentation for FindMarkers for details
     minpct: 0.1
     threshuse: 0.25