From 2a3bff259616b74ae30e53aeeb5fa4ab319e335a Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 3 Sep 2024 14:33:04 +0200 Subject: [PATCH 1/6] set n_pcs to none --- panpipes/python_scripts/batch_correct_none.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/panpipes/python_scripts/batch_correct_none.py b/panpipes/python_scripts/batch_correct_none.py index 6593876d..9a5202b5 100644 --- a/panpipes/python_scripts/batch_correct_none.py +++ b/panpipes/python_scripts/batch_correct_none.py @@ -92,19 +92,18 @@ if dimred not in adata.obsm: L.warning("Dimred '%s' could not be found in adata.obsm. Computing PCA with default parameters." % dimred) - n_pcs = 50 - if adata.var.shape[0] < n_pcs: - L.info("You have less features than number of PCs you intend to calculate") - n_pcs = adata.var.shape[0] - 1 - L.info("Setting n PCS to %i" % int(n_pcs)) + #n_pcs = 50 + #if adata.var.shape[0] < n_pcs: + # L.info("You have less features than number of PCs you intend to calculate") + # n_pcs = adata.var.shape[0] - 1 + # L.info("Setting n PCS to %i" % int(n_pcs)) L.info("Scaling data") sc.pp.scale(adata) L.info("Computing PCA") - sc.tl.pca(adata, n_comps=n_pcs, - svd_solver='arpack', + sc.tl.pca(adata, svd_solver='arpack', random_state=0) pc_kwargs['use_rep'] = "X_pca" - pc_kwargs['n_pcs'] = n_pcs + #pc_kwargs['n_pcs'] = n_pcs L.info("Computing neighbors") From c373adc65c1bbb95889c1d20f75094e9a0bb7a67 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 3 Sep 2024 16:26:41 +0200 Subject: [PATCH 2/6] remove comments --- panpipes/python_scripts/batch_correct_none.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/panpipes/python_scripts/batch_correct_none.py b/panpipes/python_scripts/batch_correct_none.py index 9a5202b5..e637d60b 100644 --- a/panpipes/python_scripts/batch_correct_none.py +++ b/panpipes/python_scripts/batch_correct_none.py @@ -91,19 +91,13 @@ if dimred not in adata.obsm: - L.warning("Dimred '%s' could not 
be found in adata.obsm. Computing PCA with default parameters." % dimred) - #n_pcs = 50 - #if adata.var.shape[0] < n_pcs: - # L.info("You have less features than number of PCs you intend to calculate") - # n_pcs = adata.var.shape[0] - 1 - # L.info("Setting n PCS to %i" % int(n_pcs)) + L.warning("Dimred '%s' could not be found in adata.obsm. Computing PCA with default parameters." % dimred) L.info("Scaling data") sc.pp.scale(adata) L.info("Computing PCA") sc.tl.pca(adata, svd_solver='arpack', random_state=0) pc_kwargs['use_rep'] = "X_pca" - #pc_kwargs['n_pcs'] = n_pcs L.info("Computing neighbors") From d9c421d678507a7b8086f67116205688b90f64d8 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 3 Sep 2024 16:28:16 +0200 Subject: [PATCH 3/6] temporarily pin anndata version --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 2dc39d84..bae51e51 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,7 @@ dependencies = [ "gevent", "harmonypy", "mofapy2", + "anndata<0.10.9", "muon", "matplotlib", "openpyxl", From 97c11ae3edc4ac8e4766fd91090ef714642cd382 Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Wed, 11 Sep 2024 09:56:59 +0200 Subject: [PATCH 4/6] add check pca n_samples --- panpipes/python_scripts/batch_correct_bbknn.py | 8 ++++---- panpipes/python_scripts/batch_correct_harmony.py | 8 ++++---- panpipes/python_scripts/batch_correct_none.py | 10 ++++++++-- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/panpipes/python_scripts/batch_correct_bbknn.py b/panpipes/python_scripts/batch_correct_bbknn.py index d9001287..23b802ba 100644 --- a/panpipes/python_scripts/batch_correct_bbknn.py +++ b/panpipes/python_scripts/batch_correct_bbknn.py @@ -70,10 +70,10 @@ if "X_pca" not in adata.obsm: L.warning("X_pca could not be found in adata.obsm. 
Computing PCA with default parameters.") n_pcs = 50 - if adata.var.shape[0] < n_pcs: - L.info("You have less features than number of PCs you intend to calculate") - n_pcs = adata.var.shape[0] - 1 - L.info("Setting n PCS to %i" % int(n_pcs)) + if adata.var.shape[0] < n_pcs or adata.obs.shape[0] < n_pcs: + L.info("You have less features/samples than number of PCs you intend to calculate") + n_pcs = min(adata.var.shape[0], adata.obs.shape[0]) - 1 + L.info("Setting n PCS to %i" % int(n_pcs)) L.info("Scaling data") sc.pp.scale(adata) L.info("Computing PCA") diff --git a/panpipes/python_scripts/batch_correct_harmony.py b/panpipes/python_scripts/batch_correct_harmony.py index 56219ee9..244a5214 100644 --- a/panpipes/python_scripts/batch_correct_harmony.py +++ b/panpipes/python_scripts/batch_correct_harmony.py @@ -77,10 +77,10 @@ L.warning("Dimred '%s' could not be found in adata.obsm. Computing PCA with default parameters." % dimred) dimred = "X_pca" n_pcs = 50 - if adata.var.shape[0] < n_pcs: - L.info("You have less features than number of PCs you intend to calculate") - n_pcs = adata.var.shape[0] - 1 - L.info("Setting n PCS to %i" % int(n_pcs)) + if adata.var.shape[0] < n_pcs or adata.obs.shape[0] < n_pcs: + L.info("You have less features/samples than number of PCs you intend to calculate") + n_pcs = min(adata.var.shape[0], adata.obs.shape[0]) - 1 + L.info("Setting n PCS to %i" % int(n_pcs)) L.info("Scaling data") sc.pp.scale(adata) L.info("Computing PCA") diff --git a/panpipes/python_scripts/batch_correct_none.py b/panpipes/python_scripts/batch_correct_none.py index e637d60b..b1bfeb33 100644 --- a/panpipes/python_scripts/batch_correct_none.py +++ b/panpipes/python_scripts/batch_correct_none.py @@ -91,13 +91,19 @@ if dimred not in adata.obsm: - L.warning("Dimred '%s' could not be found in adata.obsm. Computing PCA with default parameters." % dimred) + L.warning("Dimred '%s' could not be found in adata.obsm. Computing PCA with default parameters." 
% dimred) + n_pcs = 50 + if adata.var.shape[0] < n_pcs or adata.obs.shape[0] < n_pcs: + L.info("You have less features/samples than number of PCs you intend to calculate") + n_pcs = min(adata.var.shape[0], adata.obs.shape[0]) - 1 + L.info("Setting n PCS to %i" % int(n_pcs)) L.info("Scaling data") sc.pp.scale(adata) L.info("Computing PCA") - sc.tl.pca(adata, svd_solver='arpack', + sc.tl.pca(adata, n_comps=n_pcs, svd_solver='arpack', random_state=0) pc_kwargs['use_rep'] = "X_pca" + pc_kwargs['n_pcs'] = n_pcs L.info("Computing neighbors") From b3edb2ee733a9d951cbf7df3af68448bc2e5f4db Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Wed, 11 Sep 2024 09:57:11 +0200 Subject: [PATCH 5/6] unpin anndata --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index bae51e51..2dc39d84 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,6 @@ dependencies = [ "gevent", "harmonypy", "mofapy2", - "anndata<0.10.9", "muon", "matplotlib", "openpyxl", From f9e06826511e56461d42c8bc32ab1d36965d29ad Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Wed, 11 Sep 2024 10:14:20 +0200 Subject: [PATCH 6/6] add note pca protein integration --- docs/yaml_docs/pipeline_preprocess_yml.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/yaml_docs/pipeline_preprocess_yml.md b/docs/yaml_docs/pipeline_preprocess_yml.md index 634fbc34..b9fdfce4 100644 --- a/docs/yaml_docs/pipeline_preprocess_yml.md +++ b/docs/yaml_docs/pipeline_preprocess_yml.md @@ -351,7 +351,7 @@ Whether applying scaling or not is still a matter of debate, as stated in the [L Specify if you want to save the prot normalised assay additionally as a txt file. - pca `Boolean`, Default: False
- Specify if you want to run PCA on the normalised protein data. This might be useful, when you have more than 50 features in your protein assay. + Specify if you want to run PCA on the normalised protein data. This might be useful when you have more than 50 features in your protein assay. The resulting dimensionality reduction can also be used in the integration workflow. - n_pcs `Integer`, Default: 50
Number of principal components to compute. Specify at least n_pcs <= number of features -1.