diff --git a/.github/workflows/integration03-ci.yml b/.github/workflows/integration03-ci.yml
new file mode 100644
index 00000000..7d79325f
--- /dev/null
+++ b/.github/workflows/integration03-ci.yml
@@ -0,0 +1,99 @@
+name: Run integration03
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+env:
+  debug: 'true'
+
+jobs:
+  integration:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        os: ["ubuntu-latest"] # , "macos-latest", "windows-latest"
+        python-version: ["3.10"]
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: File tree
+        if: env.debug == 'true'
+        run: tree
+
+      - uses: conda-incubator/setup-miniconda@v3
+        with:
+          miniforge-version: latest
+          auto-activate-base: true
+          auto-update-conda: true
+          channels: conda-forge
+          channel-priority: strict
+          activate-environment: pipeline_env
+          environment-file: pipeline_env.yaml
+# important: this patch is only to test whether MultiVI integration works;
+# the issues are not related to panpipes: https://discourse.scverse.org/t/error-when-training-model-on-m3-max-mps/1896/2
+# https://discourse.scverse.org/t/macbook-m1-m2-mps-acceleration-with-scvi/2075/4
+      - name: Install Panpipes
+        shell: bash -el {0}
+        run: |
+          pip install -e .
+          conda list
+
+      - name: Conda info
+        if: env.debug == 'true'
+        shell: bash -el {0}
+        run: conda info
+
+      - name: Conda list
+        if: env.debug == 'true'
+        shell: pwsh
+        run: conda list
+
+      # Note: the file is renamed during the download to trim the "subsample_" prefix
+      - name: Preparing the data
+        run: |
+          mkdir -p teaseq/integration && cd teaseq/integration
+          curl -L -o teaseq.h5mu https://figshare.com/ndownloader/files/44796985
+
+      # Note: we run the following to test that the command works;
+      # the next task will replace the generated file anyway
+      - name: Preparing the configuration file
+        shell: bash -el {0}
+        run: |
+          cd teaseq/integration
+          panpipes integration config
+
+      - name: Edit the submission file
+        run: |
+          cd teaseq/integration
+          curl -o pipeline.yml https://raw.githubusercontent.com/DendrouLab/panpipes/1849a8c65aa67702f423da2c3b2d1d9238adac6d/tests/integration_3/pipeline.yml
+
+      - name: Replace template contents in configuration file
+        run: |
+          cd teaseq/integration
+          sed -i 's+/Users/fabiola.curion/Documents/devel/miniconda3/envs/pipeline_env+pipeline_env+g' pipeline.yml
+
+      - name: File tree
+        if: env.debug == 'true'
+        run: tree teaseq
+
+      - name: Review pipeline tasks
+        shell: bash -el {0}
+        run: |
+          cd teaseq/integration
+          panpipes integration show full --local
+
+      - name: Run pipeline tasks
+        shell: bash -el {0}
+        run: |
+          cd teaseq/integration
+          panpipes integration make full --local
+
+      - name: File tree
+        if: env.debug == 'true'
+        run: tree teaseq
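The `sed -i 's+...+...+g'` step above rewrites the template conda path baked into the downloaded `pipeline.yml`. For anyone reproducing this outside GNU sed (e.g. on macOS, where `sed -i` requires a suffix argument), a hedged Python equivalent, assuming the same checkout layout as the workflow:

```python
# Rewrites the template condaenv path in pipeline.yml, mirroring the CI's sed step.
from pathlib import Path

config = Path("teaseq/integration/pipeline.yml")
text = config.read_text()
text = text.replace(
    "/Users/fabiola.curion/Documents/devel/miniconda3/envs/pipeline_env",
    "pipeline_env",  # name of the conda env created by setup-miniconda
)
config.write_text(text)
```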
diff --git a/README.md b/README.md
index d4e9338b..35ca265b 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,9 @@
 Panpipes is a set of computational workflows designed to automate multimodal single-cell and spatial transcriptomic analyses by incorporating widely-used Python-based tools to perform quality control, preprocessing, integration, clustering, and reference mapping at scale. Panpipes allows reliable and customisable analysis and evaluation of individual and integrated modalities, thereby empowering decision-making before downstream investigations.
 
-**See our [documentation](https://panpipes-pipelines.readthedocs.io/en/latest/) and our [preprint](https://www.biorxiv.org/content/10.1101/2023.03.11.532085v2)**
+**See our [documentation](https://panpipes-pipelines.readthedocs.io/en/latest/)**
+
+**Panpipes is published in [Genome Biology!](https://link.springer.com/article/10.1186/s13059-024-03322-7)**
 
 These workflows make use of [cgat-core](https://github.com/cgat-developers/cgat-core)
 
@@ -53,9 +55,9 @@ Check the example [submission file](https://github.com/DendrouLab/panpipes/blob/
 
 ## Citation
 
-[Panpipes: a pipeline for multiomic single-cell and spatial transcriptomic data analysis
-Fabiola Curion, Charlotte Rich-Griffin, Devika Agarwal, Sarah Ouologuem, Tom Thomas, Fabian J. Theis, Calliope A. Dendrou
-bioRxiv 2023.03.11.532085; doi: https://doi.org/10.1101/2023.03.11.532085](https://www.biorxiv.org/content/10.1101/2023.03.11.532085v2)
+[Curion, F., Rich-Griffin, C., Agarwal, D. et al. Panpipes: a pipeline for multiomic single-cell and spatial transcriptomic data analysis. Genome Biol 25, 181 (2024).
+doi: https://doi.org/10.1186/s13059-024-03322-7](https://link.springer.com/article/10.1186/s13059-024-03322-7)
+
 
 ## Contributors
diff --git a/panpipes/panpipes/pipeline_integration.py b/panpipes/panpipes/pipeline_integration.py
index ba9e071e..916ff047 100644
--- a/panpipes/panpipes/pipeline_integration.py
+++ b/panpipes/panpipes/pipeline_integration.py
@@ -545,8 +545,7 @@ def run_bbknn_atac(outfile):
         cmd += " --neighbors_within_batch %i" % PARAMS['atac']['bbknn']['neighbors_within_batch']
     if PARAMS['atac']['neighbors']['npcs'] is not None:
         cmd += " --neighbors_n_pcs %s" % PARAMS['atac']['neighbors']['npcs']
-    #Forcing bbknn to run on PCA in case of atac
-    cmd += " --dimred PCA"
+    #cmd += " --dimred PCA"
     cmd += " > logs/3_atac_bbknn.log "
     if PARAMS['queues_long'] is not None:
        job_kwargs["job_queue"] = PARAMS['queues_long']
diff --git a/panpipes/python_scripts/batch_correct_bbknn.py b/panpipes/python_scripts/batch_correct_bbknn.py
index 23b802ba..8d8a0919 100644
--- a/panpipes/python_scripts/batch_correct_bbknn.py
+++ b/panpipes/python_scripts/batch_correct_bbknn.py
@@ -56,19 +56,14 @@
 # bbknn can't integrate on 2+ variables, so create a fake column with combined information
 columns = [x.strip() for x in args.integration_col.split(",")]
 
-if args.modality =="atac":
-    if "scaled_counts" in adata.layers.keys():
-        pass
-    else:
-        L.info("To run BBKNN on ATAC, PCA is needed. Computing PCA now.")
-        L.info("Scaling data and saving scaled counts to .layers['scaled_counts']")
-        sc.pp.scale(adata)
-        adata.layers["scaled_counts"] = adata.X.copy()
-        L.info("Computing PCA")
-        sc.tl.pca(adata, n_comps=min(50,adata.var.shape[0]-1), svd_solver='arpack', random_state=0)
-
-if "X_pca" not in adata.obsm:
-    L.warning("X_pca could not be found in adata.obsm. Computing PCA with default parameters.")
+if args.dimred == "PCA":
+    dimred = "X_pca"
+elif args.dimred == "LSI":
+    dimred = "X_lsi"
+else:
+    dimred = "X_pca"  # guard: unknown --dimred values fall back to PCA
+
+if dimred not in adata.obsm:
+    L.warning("Dimred '%s' could not be found in adata.obsm. Computing PCA with default parameters." % dimred)
+    dimred = "X_pca"
     n_pcs = 50
     if adata.var.shape[0] < n_pcs or adata.obs.shape[0] < n_pcs:
         L.info("You have less features/samples than number of PCs you intend to calculate")
@@ -81,7 +76,6 @@
             svd_solver='arpack',
             random_state=0)
 
-
 L.info("Preparing for integration")
 
 if len(columns) > 1:
@@ -99,6 +93,7 @@
 # run bbknn
 L.info("Running BBKNN")
 adata = sc.external.pp.bbknn(adata,
+                use_rep=dimred,
                 batch_key=args.integration_col,
                 copy=True,
                 n_pcs = int(args.neighbors_n_pcs),
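The `--dimred` handling added above selects which embedding BBKNN receives through `use_rep`. A minimal self-contained sketch of the same pattern on toy data (`requested` stands in for `args.dimred`; not the script's actual MuData input):

```python
import numpy as np
import anndata as ad
import scanpy as sc

# toy AnnData with a batch column; "LSI" is requested but X_lsi is absent,
# so the script-style fallback to PCA kicks in
adata = ad.AnnData(np.random.poisson(1.0, (200, 100)).astype(np.float32))
adata.obs["dataset"] = ["a"] * 100 + ["b"] * 100

requested = "LSI"  # stand-in for args.dimred
dimred = {"PCA": "X_pca", "LSI": "X_lsi"}.get(requested, "X_pca")
if dimred not in adata.obsm:
    sc.tl.pca(adata, n_comps=min(50, adata.var.shape[0] - 1),
              svd_solver="arpack", random_state=0)
    dimred = "X_pca"

adata = sc.external.pp.bbknn(adata, batch_key="dataset", use_rep=dimred, copy=True)
```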
diff --git a/panpipes/python_scripts/run_scib.py b/panpipes/python_scripts/run_scib.py
index 12a2e053..96f6c379 100644
--- a/panpipes/python_scripts/run_scib.py
+++ b/panpipes/python_scripts/run_scib.py
@@ -124,3 +124,5 @@
     df.to_csv(os.path.join(args.fig_dir, modality, "scib_metrics_min_max_scaled.csv"))
 else:
     df.to_csv(os.path.join(args.fig_dir, modality, "scib_metrics.csv"))
+
+L.info("Done")
diff --git a/tests/integration_1/pipeline.yml b/tests/integration_1/pipeline.yml
index 1092e866..dba8ee58 100644
--- a/tests/integration_1/pipeline.yml
+++ b/tests/integration_1/pipeline.yml
@@ -154,7 +154,7 @@ atac:
   # True or false depending on whether you want to run batch correction
   run: True
   # which dimensionality reduction to expect, LSI or PCA
-  dimred: LSI
+  dimred: PCA
   # what method(s) to use to run batch correction, you can specify multiple
   # (comma-seprated string, no spaces)
   # choices: harmony,bbknn,combat
diff --git a/tests/integration_3/pipeline.yml b/tests/integration_3/pipeline.yml
new file mode 100644
index 00000000..f1bb3366
--- /dev/null
+++ b/tests/integration_3/pipeline.yml
@@ -0,0 +1,394 @@
+# Pipeline pipeline_integration.py configuration file
+# ==============================================
+
+# compute resource options
+# ------------------------
+resources:
+  # Number of threads used for parallel jobs
+  # this job type must have enough memory to load your mudata and do computationally intensive tasks
+  threads_high: 1
+  # this job type must have enough memory to load your mudata and do computationally light tasks
+  threads_medium: 1
+  # this job type loads text files and does plotting; it requires much less memory than the other two
+  threads_low: 1
+  # if you have access to a gpu-specific queue, how many gpu threads to request; make sure to edit the queues section below
+  # so that panpipes can find your gpu queue
+  threads_gpu: 2
+# path to conda env, leave blank if running native or your cluster automatically inherits the login node environment
+condaenv: /Users/fabiola.curion/Documents/devel/miniconda3/envs/pipeline_env
+
+# allows for tweaking which queues jobs get submitted to,
+# in case there is a special queue for long jobs or you have access to a gpu-specific queue
+# the default queue should be specified in your .cgat.yml file
+# leave blank if you do not want to use the alternative queues
+queues:
+  long:
+  gpu:
+
+# Start
+# --------------------------
+# either one that exists already with
+sample_prefix: teaseq
+# this is what comes out of the filtering/preprocessing
+preprocessed_obj: teaseq.h5mu
+# contains layers: raw_counts, logged_counts, and has scaled or logged counts in X
+
+
+#--------------------------
+# Batch correction
+# -------------------------
+# unimodal: correct each modality independently
+rna:
+  # True or false depending on whether you want to run batch correction
+  run: True
+  # what method(s) to use to run batch correction, you can specify multiple
+  # choices: harmony,bbknn,scanorama,scvi (comma-separated string, no spaces)
+  tools: harmony,scvi
+  # this is the column you want to batch correct on. if you specify a comma-separated list,
+  # they will all be used simultaneously.
+  # Specifically, all columns specified will be merged into one 'batch' column.
+  # if you want to test correction for one at a time,
+  # specify one at a time and run the pipeline in different folders, i.e. integration_by_sample,
+  # integration_by_tissue ...
+  column: dataset
+  #-----------------------------
+  # Harmony args
+  #-----------------------------
+  harmony:
+    # sigma value, used by Harmony
+    sigma: 0.1
+    # theta value used by Harmony, default is 1
+    theta: 1.0
+    # number of pcs, used by Harmony
+    npcs: 30
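+  # A hedged sketch of how the harmony settings above (sigma, theta, npcs)
+  # can be exercised through scanpy's harmonypy wrapper; panpipes drives
+  # Harmony through its own scripts, so this is an illustration on toy data,
+  # not the pipeline's exact call (extra kwargs go to harmonypy.run_harmony):
+  #   import numpy as np, anndata as ad, scanpy as sc
+  #   adata = ad.AnnData(np.random.poisson(1.0, (200, 100)).astype(np.float32))
+  #   adata.obs["dataset"] = ["a"] * 100 + ["b"] * 100
+  #   sc.pp.normalize_total(adata); sc.pp.log1p(adata)
+  #   sc.pp.pca(adata, n_comps=30)                                # npcs: 30
+  #   sc.external.pp.harmony_integrate(adata, key="dataset", theta=1.0, sigma=0.1)
+  #   adata.obsm["X_pca_harmony"]                                 # corrected embedding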
+  #----------------------------
+  # BBKNN args # https://bbknn.readthedocs.io/en/latest/
+  #-----------------------------
+  bbknn:
+    neighbors_within_batch: 20
+  #-----------------------------
+  # SCVI args
+  #-----------------------------
+  scvi:
+    seed: 1492
+    exclude_mt_genes: True
+    mt_column: mt
+    model_args:
+      n_layers:
+      n_latent:
+      gene_likelihood: zinb
+    training_args:
+      max_epochs: 40
+      train_size: 0.9
+      early_stopping: True
+    training_plan:
+      lr: 0.001
+      n_epochs_kl_warmup: 40
+      reduce_lr_on_plateau: True
+      lr_scheduler_metric:
+      lr_patience: 8
+      lr_factor: 0.1
+  #----------------------------
+  # find neighbour parameters
+  #-----------------------------
+  # to reuse these params (for example for WNN), please use anchors (&) and aliases (*) in the relevant place,
+  # i.e. &rna_neighbors will be called by *rna_neighbors where referenced
+  neighbors: &rna_neighbors
+    # number of Principal Components to calculate for neighbours and umap:
+    # - if no correction is applied, PCA will be calculated and used to run UMAP and clustering on
+    # - if Harmony is the method of choice, it will use these components to create a corrected dim red
+    # the maximum number of dims for neighbors calculation can only be lower than or equal to the total number of dims for PCA or Harmony
+    # note: scvelo default is 30
+    npcs: 30
+    # number of neighbours
+    k: 30
+    # metric: euclidean | cosine
+    metric: euclidean
+    # scanpy | hnsw (from scvelo)
+    method: scanpy
+
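+  # A hedged sketch of how the scvi block above maps onto scvi-tools
+  # (model_args -> the SCVI constructor, training_args -> train(),
+  # training_plan -> plan_kwargs); toy data, illustration only, not
+  # panpipes' internal call:
+  #   import numpy as np, anndata as ad, scvi
+  #   adata = ad.AnnData(np.random.poisson(1.0, (200, 100)).astype(np.float32))
+  #   adata.obs["dataset"] = ["a"] * 100 + ["b"] * 100
+  #   scvi.settings.seed = 1492                                   # seed
+  #   scvi.model.SCVI.setup_anndata(adata, batch_key="dataset")
+  #   model = scvi.model.SCVI(adata, gene_likelihood="zinb")      # model_args
+  #   model.train(max_epochs=40, train_size=0.9, early_stopping=True,
+  #               plan_kwargs={"lr": 0.001, "n_epochs_kl_warmup": 40,
+  #                            "reduce_lr_on_plateau": True,
+  #                            "lr_patience": 8, "lr_factor": 0.1})
+  #   latent = model.get_latent_representation()
+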
+#--------------------------
+prot:
+  # True or false depending on whether you want to run batch correction
+  run: False
+  # what method(s) to use to run batch correction, you can specify multiple
+  # choices: harmony,bbknn,combat
+  tools: harmony
+  # this is the column you want to batch correct on. if you specify a comma-separated list (no spaces),
+  # they will all be used simultaneously. if you want to test correction for one at a time,
+  # specify one at a time and run the pipeline in different folders, i.e. integration_by_sample,
+  # integration_by_tissue ...
+  column: orig.ident
+  #----------------------------
+  # Harmony args
+  #-----------------------------
+  harmony:
+    # sigma value, used by Harmony
+    sigma: 0.1
+    # theta value used by Harmony, default is 1
+    theta: 1.0
+    # number of pcs, used by Harmony
+    npcs: 30
+  #----------------------------
+  # BBKNN args # https://bbknn.readthedocs.io/en/latest/
+  #-----------------------------
+  bbknn:
+    neighbors_within_batch: 20
+  #----------------------------
+  # find neighbour parameters
+  #-----------------------------
+  neighbors: &prot_neighbors
+    # number of Principal Components to calculate for neighbours and umap:
+    # - if no correction is applied, PCA will be calculated and used to run UMAP and clustering on
+    # - if Harmony is the method of choice, it will use these components to create a corrected dim red
+    # note: scvelo default is 30
+    npcs: 30
+    # number of neighbours
+    k: 30
+    # metric: euclidean | cosine
+    metric: euclidean
+    # scanpy | hnsw (from scvelo)
+    method: scanpy
+#--------------------------
+atac:
+  # True or false depending on whether you want to run batch correction
+  run: False
+  # which dimensionality reduction to expect, LSI or PCA
+  dimred: LSI
+  # what method(s) to use to run batch correction, you can specify multiple
+  # (comma-separated string, no spaces)
+  # choices: harmony,bbknn,combat
+  tools: harmony,bbknn
+  # this is the column you want to batch correct on. if you specify a comma-separated list,
+  # they will all be used simultaneously. if you want to test correction for one at a time,
+  # specify one at a time and run the pipeline in different folders, i.e. integration_by_sample,
+  # integration_by_tissue ...
+  column: dataset
+  #----------------------------
+  # Harmony args
+  #-----------------------------
+  harmony:
+    # sigma value, used by Harmony
+    sigma: 0.1
+    # theta value used by Harmony, default is 1
+    theta: 1.0
+    # number of pcs, used by Harmony
+    npcs: 30
+  #----------------------------
+  # BBKNN args # https://bbknn.readthedocs.io/en/latest/
+  #-----------------------------
+  bbknn:
+    neighbors_within_batch: 30
+  #----------------------------
+  # find neighbour parameters
+  #-----------------------------
+  neighbors: &atac_neighbors
+    # number of Principal Components to calculate for neighbours and umap:
+    # - if no correction is applied, PCA will be calculated and used to run UMAP and clustering
+    # - if Harmony is the method of choice, it will use these components to create a corrected dim red
+    # note: scvelo default is 30
+    npcs: 30
+    # number of neighbours
+    k: 30
+    # metric: euclidean | cosine
+    metric: euclidean
+    # scanpy | hnsw (from scvelo)
+    method: scanpy
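+  # dimred: LSI pairs with the new X_lsi support in batch_correct_bbknn.py.
+  # A hedged sketch of producing that embedding with muon's ATAC module
+  # (whether panpipes' own LSI step matches these defaults is an assumption):
+  #   import muon as mu
+  #   from muon import atac as ac
+  #   mdata = mu.read("teaseq.h5mu")
+  #   ac.pp.tfidf(mdata["atac"])              # TF-IDF normalisation
+  #   ac.tl.lsi(mdata["atac"], n_comps=30)    # writes .obsm["X_lsi"]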
+#----------------------------------------------
+# multimodal integration
+# remember to specify knn graph params in the section "neighbors"
+#----------------------------------------------
+multimodal:
+  # True or false depending on whether you want to run multimodal integration
+  run: True
+  # what method(s) to use, you can specify multiple
+  # choices: totalvi, mofa, MultiVI, WNN
+  # list e.g. below
+  tools:
+    - totalvi
+
+  # this is the column you want to batch correct on. if you specify a comma-separated list,
+  # they will all be used simultaneously. if you want to test correction for one at a time,
+  # specify one at a time and run the pipeline in different folders, i.e. integration_by_sample,
+  # integration_by_tissue ...
+  column_continuous:
+  column_categorical: dataset
+  # extra params:
+  totalvi:
+    # this is a minimal set of parameters that will be expected;
+    # you can add any other param from the tutorials and they will
+    # be parsed alongside the others
+    seed: 1492
+    # totalvi will run on rna and prot
+    modalities: rna,prot
+    exclude_mt_genes: True
+    mt_column: mt
+    # to filter outliers manually, create a column called adt_outliers in mdata['prot'].obs
+    filter_by_hvg: True
+    filter_prot_outliers: False
+    model_args:
+      latent_distribution: "normal"
+    training_args:
+      max_epochs: 40
+      train_size: 0.9
+      early_stopping: True
+    training_plan: None
+  MultiVI:
+    # this is a minimal set of parameters that will be expected;
+    # you can add any other param from the tutorials and they will
+    # be parsed alongside the others
+    # leave arguments blank for default
+    seed: 1492
+    lowmem: True
+    # Setting lowmem to True will subset the atac to the top 25k HVF.
+    # This is to deal with concatenation of atac,rna on large datasets, which at the moment is suboptimally required by scvi-tools.
+    # >100GB of RAM are required to concatenate atac,rna with 15k cells and 120k total features (union rna,atac)
+    model_args:
+      # (default: None)
+      n_hidden:
+      # (default: None)
+      n_latent:
+      # (bool, default: True)
+      region_factors: True
+      # {'normal', 'ln'} (default: 'normal')
+      latent_distribution: 'normal'
+      # (bool, default: False)
+      deeply_inject_covariates: False
+      # (bool, default: False)
+      fully_paired: False
+    training_args:
+      # (default: 500)
+      max_epochs: 500
+      # float (default: 0.0001)
+      lr: 1.0e-05
+      # leave blank for default; str | int | bool | None (default: None)
+      use_gpu:
+      # float (default: 0.9)
+      train_size: 0.9
+      # leave blank for default; float | None (default: None)
+      validation_size:
+      # int (default: 128)
+      batch_size: 128
+      # float (default: 0.001)
+      weight_decay: 0.001
+      # float (default: 1.0e-08)
+      eps: 1.0e-08
+      # bool (default: True)
+      early_stopping: True
+      # bool (default: True)
+      save_best: True
+      # leave blank for default; int | None (default: None)
+      check_val_every_n_epoch:
+      # leave blank for default; int | None (default: None)
+      n_steps_kl_warmup:
+      # int | None (default: 50)
+      n_epochs_kl_warmup: 50
+      # bool (default: True)
+      adversarial_mixing: True
+      # leave blank for default; dict | None (default: None)
+      training_plan:
+  mofa:
+    # this is a minimal set of parameters that will be expected;
+    # you can add any other param from the tutorials and they will
+    # be parsed alongside the others
+    # (comma-separated string, no spaces)
+    modalities:
+    filter_by_hvg: True
+    n_factors: 10
+    n_iterations: 1000
+    # pick one among fast, medium, slow
+    convergence_mode: fast
+    save_parameters: False
+    # if save_parameters is True, set the following; otherwise leave blank
+    outfile:
+  WNN:
+    # muon implementation of WNN
+    modalities: rna,prot,atac
+    # run wnn on batch-corrected unimodal data: set each of the modalities you want to use to calc WNN to ONE method.
+    # leave as None and it will default to de novo calculation of neighbours on non-corrected data for that modality using the specified params
+    batch_corrected:
+      # options are: "bbknn", "scVI", "harmony", "scanorama"
+      rna: None
+      # options are: "harmony", "bbknn"
+      prot: None
+      # options are: "harmony"
+      atac: None
+    # please use anchors (&) and aliases (*) in the relevant place,
+    # i.e. &rna_neighbors will be called by *rna_neighbors where referenced
+    knn:
+      rna: *rna_neighbors
+      prot: *prot_neighbors
+      atac: *atac_neighbors
+    # WNN has its own neighbors search, specify here
+    n_neighbors: # leave blank and it will default to the arithmetic mean across modalities' neighbors
+    n_bandwidth_neighbors: 20
+    n_multineighbors: 200
+    metric: 'euclidean'
+    low_memory: True
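+  # A hedged sketch of the WNN step configured above via muon, whose
+  # mu.pp.neighbors exposes the same n_bandwidth_neighbors / n_multineighbors /
+  # metric / low_memory parameters (n_neighbors=None defaults to the arithmetic
+  # mean, as noted above); per-modality graphs are built toy-style here, while
+  # panpipes reuses the graphs configured via the *_neighbors anchors:
+  #   import muon as mu
+  #   import scanpy as sc
+  #   mdata = mu.read("teaseq.h5mu")
+  #   for mod in ("rna", "prot", "atac"):     # WNN needs per-modality KNN first
+  #       sc.pp.pca(mdata[mod])
+  #       sc.pp.neighbors(mdata[mod], n_neighbors=30)   # k: 30
+  #   mu.pp.neighbors(mdata, n_bandwidth_neighbors=20, n_multineighbors=200,
+  #                   metric="euclidean", low_memory=True)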
+
+  ###
+  # neighbours knn calculation for multimodal analysis
+  ###
+  neighbors:
+    # number of Principal Components to calculate for neighbours and umap:
+    # - if no correction is applied, PCA will be calculated and used to run UMAP and clustering on
+    # - if Harmony is the method of choice, it will use these components to create a corrected dim red
+    # note: scvelo default is 30
+    npcs: 30
+    # number of neighbours
+    k: 30
+    # metric: euclidean | cosine
+    metric: euclidean
+    # scanpy | hnsw (from scvelo)
+    method: scanpy
+
+
+#-----------------------------
+# Plot
+#-----------------------------
+plotqc:
+  # grouping var must be a categorical variable
+  # (comma-separated strings, no spaces)
+  # umaps comparing the integration (one plot per value in the group),
+  # for each batch correction column plus any extras in grouping_var
+  grouping_var: dataset,sample_id
+  # what other metrics you want to plot on each modality's embedding (one plot per group);
+  # use mod:variable notation,
+  # any metrics that you want to plot on all modality umaps go under "all"
+  # these can be categorical or numeric
+  all: rep:receptor_subtype
+  rna: rna:total_counts
+  prot: prot:total_counts
+  atac:
+  multimodal: rna:total_counts
+  # if you want to add any additional plots, just remove the log file logs/plot_batch_corrected_umaps.log
+  # and run panpipes integration make plot_umaps
+
+# ----------------
+# Make final object
+# ----------------
+# Final choices: leave blank until you have reviewed the results from running
+#   panpipes integration make full
+# This step will produce a mudata object with one layer per modality, with
+# one correction per modality and one multimodal layer.
+# Choose the integration results you want to merge in the final object.
+# For unimodal integration: to pick the uncorrected version, use "no_correction",
+# then run
+#   panpipes integration make merge_integration
+final_obj:
+  rna:
+    include: True
+    bc_choice: no_correction
+  prot:
+    include: True
+    bc_choice: harmony
+  atac:
+    include: False
+    bc_choice: bbknn
+  multimodal:
+    include: True
+    bc_choice: WNN
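One last note on the anchor (`&`) / alias (`*`) reuse that this configuration leans on (e.g. `&rna_neighbors` / `*rna_neighbors` in the WNN `knn` block): when the YAML is loaded, each alias resolves to the anchored mapping, as this quick check shows:

```python
import yaml

doc = """
rna:
  neighbors: &rna_neighbors
    npcs: 30
    k: 30
multimodal:
  WNN:
    knn:
      rna: *rna_neighbors
"""
cfg = yaml.safe_load(doc)
# the alias points at the very mapping defined under rna.neighbors
assert cfg["multimodal"]["WNN"]["knn"]["rna"] == cfg["rna"]["neighbors"]
```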