From 559f97768212dfe9e3b427c36b8223ee5d4b9651 Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt
Date: Fri, 11 Oct 2024 15:25:58 +0200
Subject: [PATCH] move tasks to separate repositories (#910)

* move tasks to separate repositories

* disable broken components for now [ci force]

* use numpy<2 in tfidf
---
 CHANGELOG.md | 219 ++++-
 src/common/create_component/config.vsh.yaml | 1 +
 src/common/create_task_readme/config.vsh.yaml | 1 +
 .../normalization/atac_tfidf/config.vsh.yaml | 1 +
 src/tasks/batch_integration/README.md | 570 +--------
 .../api/comp_control_method_embedding.yaml | 26 -
 .../api/comp_control_method_feature.yaml | 26 -
 .../api/comp_control_method_graph.yaml | 26 -
 .../api/comp_method_embedding.yaml | 29 -
 .../api/comp_method_feature.yaml | 29 -
 .../api/comp_method_graph.yaml | 29 -
 .../api/comp_metric_embedding.yaml | 38 -
 .../api/comp_metric_feature.yaml | 31 -
 .../api/comp_metric_graph.yaml | 31 -
 .../api/comp_process_dataset.yaml | 45 --
 .../comp_transformer_embedding_to_graph.yaml | 25 -
 ...comp_transformer_feature_to_embedding.yaml | 25 -
 .../api/file_common_dataset.yaml | 92 ---
 .../batch_integration/api/file_dataset.yaml | 69 --
 .../api/file_integrated_embedding.yaml | 29 -
 .../api/file_integrated_feature.yaml | 29 -
 .../api/file_integrated_graph.yaml | 37 -
 .../batch_integration/api/file_score.yaml | 29 -
 .../batch_integration/api/file_solution.yaml | 89 ---
 .../batch_integration/api/task_info.yaml | 41 --
 src/tasks/batch_integration/api/thumbnail.svg | 1 -
 .../batch_embed/config.vsh.yaml | 24 -
 .../no_integration/batch_embed/script.py | 49 --
 .../global_embed/config.vsh.yaml | 24 -
 .../no_integration/global_embed/script.py | 36 -
 .../global_feature/config.vsh.yaml | 24 -
 .../no_integration/global_feature/script.py | 38 -
 .../global_graph/config.vsh.yaml | 25 -
 .../no_integration/global_graph/script.py | 41 --
 .../celltype_embed/config.vsh.yaml | 25 -
 .../celltype_embed/script.py | 34 -
 .../celltype_jitter_embed/config.vsh.yaml | 29 -
 .../celltype_jitter_embed/script.py | 38 -
 .../batch_embed/config.vsh.yaml | 25 -
 .../random_integration/batch_embed/script.py | 40 --
 .../batch_feature/config.vsh.yaml | 25 -
 .../batch_feature/script.py | 41 --
 .../batch_graph/config.vsh.yaml | 25 -
 .../random_integration/batch_graph/script.py | 41 --
 .../celltype_embed/config.vsh.yaml | 25 -
 .../celltype_embed/script.py | 38 -
 .../celltype_feature/config.vsh.yaml | 25 -
 .../celltype_feature/script.py | 42 --
 .../celltype_graph/config.vsh.yaml | 25 -
 .../celltype_graph/script.py | 41 --
 .../global_embed/config.vsh.yaml | 25 -
 .../random_integration/global_embed/script.py | 37 -
 .../global_feature/config.vsh.yaml | 25 -
 .../global_feature/script.py | 37 -
 .../global_graph/config.vsh.yaml | 25 -
 .../random_integration/global_graph/script.py | 37 -
 .../control_methods/utils.py | 56 --
 .../methods/bbknn/config.vsh.yaml | 51 --
 .../batch_integration/methods/bbknn/script.py | 63 --
 .../methods/combat/config.vsh.yaml | 42 --
 .../methods/combat/script.py | 57 --
 .../methods/fastmnn_embedding/config.vsh.yaml | 36 -
 .../methods/fastmnn_feature/config.vsh.yaml | 34 -
 .../methods/fastmnn_feature/script.R | 51 --
 .../methods/liger/config.vsh.yaml | 31 -
 .../batch_integration/methods/liger/script.R | 108 ---
 .../methods/mnn_correct/config.vsh.yaml | 27 -
 .../methods/mnn_correct/script.R | 47 --
 .../methods/mnnpy/config.vsh.yaml | 52 --
 .../batch_integration/methods/mnnpy/script.py | 55 --
 .../methods/pyliger/config.vsh.yaml | 37 -
 .../methods/pyliger/script.py | 86 ---
.../methods/scalex_embed/config.vsh.yaml | 41 -- .../methods/scalex_embed/script.py | 70 -- .../methods/scalex_feature/config.vsh.yaml | 41 -- .../methods/scanorama_embed/config.vsh.yaml | 41 -- .../methods/scanorama_embed/script.py | 87 --- .../methods/scanorama_feature/config.vsh.yaml | 41 -- .../methods/scanvi/config.vsh.yaml | 61 -- .../methods/scanvi/script.py | 76 -- .../methods/scvi/config.vsh.yaml | 59 -- .../batch_integration/methods/scvi/script.py | 66 -- .../metrics/asw_batch/config.vsh.yaml | 50 -- .../metrics/asw_batch/script.py | 44 -- .../metrics/asw_label/config.vsh.yaml | 38 - .../metrics/asw_label/script.py | 44 -- .../cell_cycle_conservation/config.vsh.yaml | 47 -- .../metrics/cell_cycle_conservation/script.py | 69 -- .../clustering_overlap/config.vsh.yaml | 61 -- .../metrics/clustering_overlap/script.py | 53 -- .../graph_connectivity/config.vsh.yaml | 47 -- .../metrics/graph_connectivity/script.py | 42 -- .../metrics/hvg_overlap/config.vsh.yaml | 46 -- .../metrics/hvg_overlap/script.py | 55 -- .../isolated_label_asw/config.vsh.yaml | 40 -- .../metrics/isolated_label_asw/script.py | 49 -- .../metrics/isolated_label_f1/config.vsh.yaml | 52 -- .../metrics/isolated_label_f1/script.py | 48 -- .../metrics/kbet/config.vsh.yaml | 57 -- .../batch_integration/metrics/kbet/script.py | 49 -- .../metrics/lisi/config.vsh.yaml | 56 -- .../batch_integration/metrics/lisi/script.py | 64 -- .../metrics/pcr/config.vsh.yaml | 44 -- .../batch_integration/metrics/pcr/script.py | 59 -- .../process_dataset/config.vsh.yaml | 18 - .../process_dataset/script.py | 66 -- .../resources_scripts/process_datasets.sh | 33 - .../resources_scripts/run_benchmark.sh | 22 - .../resources_scripts/run_benchmark_test.sh | 25 - .../resources_test_scripts/process.sh | 49 -- .../embed_to_graph/config.vsh.yaml | 19 - .../transformers/embed_to_graph/script.py | 33 - .../feature_to_embed/config.vsh.yaml | 20 - .../transformers/feature_to_embed/script.py | 41 -- .../process_datasets/config.vsh.yaml | 30 - .../workflows/process_datasets/main.nf | 54 -- .../process_datasets/run_nextflow.sh | 25 - .../workflows/run_benchmark/config.vsh.yaml | 115 --- .../workflows/run_benchmark/main.nf | 258 ------- .../workflows/run_benchmark/run_test.sh | 31 - src/tasks/denoising/README.md | 356 +--------- .../denoising/api/comp_control_method.yaml | 33 - src/tasks/denoising/api/comp_method.yaml | 26 - src/tasks/denoising/api/comp_metric.yaml | 31 - .../denoising/api/comp_process_dataset.yaml | 27 - .../denoising/api/file_common_dataset.yaml | 40 -- src/tasks/denoising/api/file_denoised.yaml | 21 - src/tasks/denoising/api/file_score.yaml | 21 - src/tasks/denoising/api/file_test.yaml | 44 -- src/tasks/denoising/api/file_train.yaml | 16 - src/tasks/denoising/api/task_info.yaml | 54 -- src/tasks/denoising/api/thumbnail.svg | 1 - .../no_denoising/config.vsh.yaml | 22 - .../control_methods/no_denoising/script.py | 22 - .../perfect_denoising/config.vsh.yaml | 22 - .../perfect_denoising/script.py | 24 - .../denoising/methods/alra/config.vsh.yaml | 43 -- src/tasks/denoising/methods/alra/script.R | 53 -- .../denoising/methods/dca/config.vsh.yaml | 45 -- src/tasks/denoising/methods/dca/script.py | 39 - .../methods/knn_smoothing/config.vsh.yaml | 41 -- .../denoising/methods/knn_smoothing/script.py | 39 - .../denoising/methods/magic/config.vsh.yaml | 63 -- src/tasks/denoising/methods/magic/script.py | 76 -- .../denoising/methods/saver/config.vsh.yaml | 32 - src/tasks/denoising/methods/saver/script.R | 39 - .../denoising/metrics/mse/config.vsh.yaml 
| 30 - src/tasks/denoising/metrics/mse/script.py | 51 -- .../denoising/metrics/poisson/config.vsh.yaml | 28 - src/tasks/denoising/metrics/poisson/script.py | 46 -- .../denoising/process_dataset/config.vsh.yaml | 37 - src/tasks/denoising/process_dataset/helper.py | 55 -- src/tasks/denoising/process_dataset/script.py | 75 -- .../resources_scripts/process_datasets.sh | 34 - .../resources_scripts/run_benchmark.sh | 23 - .../resources_scripts/run_benchmark_test.sh | 25 - .../resources_test_scripts/pancreas.sh | 51 -- .../process_datasets/config.vsh.yaml | 30 - .../workflows/process_datasets/main.nf | 54 -- .../workflows/process_datasets/run_test.sh | 25 - .../workflows/run_benchmark/config.vsh.yaml | 67 -- .../denoising/workflows/run_benchmark/main.nf | 184 ----- .../workflows/run_benchmark/run_test.sh | 29 - src/tasks/dimensionality_reduction/README.md | 375 +--------- .../api/comp_control_method.yaml | 33 - .../api/comp_method.yaml | 27 - .../api/comp_metric.yaml | 30 - .../api/comp_process_dataset.yaml | 27 - .../api/file_common_dataset.yaml | 58 -- .../api/file_dataset.yaml | 29 - .../api/file_embedding.yaml | 25 - .../api/file_score.yaml | 29 - .../api/file_solution.yaml | 58 -- .../api/task_info.yaml | 73 -- .../api/thumbnail.svg | 1 - .../random_features/config.vsh.yaml | 22 - .../control_methods/random_features/script.py | 34 - .../spectral_features/config.vsh.yaml | 41 -- .../spectral_features/script.py | 77 -- .../true_features/config.vsh.yaml | 22 - .../control_methods/true_features/script.py | 33 - .../methods/densmap/config.vsh.yaml | 45 -- .../methods/densmap/script.py | 54 -- .../methods/diffusion_map/config.vsh.yaml | 31 - .../methods/diffusion_map/script.R | 37 - .../methods/ivis/config.vsh.yaml | 44 -- .../methods/ivis/script.py | 57 -- .../methods/lmds/config.vsh.yaml | 44 -- .../methods/lmds/script.R | 39 - .../methods/neuralee/config.vsh.yaml | 55 -- .../methods/neuralee/script.py | 78 -- .../methods/pca/config.vsh.yaml | 40 -- .../methods/pca/script.py | 41 -- .../methods/phate/config.vsh.yaml | 58 -- .../methods/phate/script.py | 45 -- .../methods/pymde/config.vsh.yaml | 41 -- .../methods/pymde/script.py | 59 -- .../methods/simlr/config.vsh.yaml | 57 -- .../methods/simlr/script.R | 69 -- .../methods/tsne/config.vsh.yaml | 49 -- .../methods/tsne/script.py | 47 -- .../methods/umap/config.vsh.yaml | 50 -- .../methods/umap/script.py | 54 -- .../clustering_performance/config.vsh.yaml | 61 -- .../metrics/clustering_performance/script.py | 63 -- .../metrics/coranking/config.vsh.yaml | 166 ----- .../metrics/coranking/script.R | 101 --- .../density_preservation/config.vsh.yaml | 43 -- .../metrics/density_preservation/script.py | 132 ---- .../distance_correlation/config.vsh.yaml | 50 -- .../metrics/distance_correlation/script.py | 59 -- .../metrics/trustworthiness/config.vsh.yaml | 31 - .../metrics/trustworthiness/script.py | 37 - .../process_dataset/config.vsh.yaml | 13 - .../process_dataset/script.py | 34 - .../resources_scripts/process_datasets.sh | 34 - .../resources_scripts/run_benchmark.sh | 22 - .../resources_scripts/run_benchmark_test.sh | 25 - .../resources_test_scripts/pancreas.sh | 55 -- .../process_datasets/config.vsh.yaml | 30 - .../workflows/process_datasets/main.nf | 54 -- .../workflows/process_datasets/run_test.sh | 25 - .../workflows/run_benchmark/config.vsh.yaml | 82 --- .../workflows/run_benchmark/main.nf | 210 ------ .../workflows/run_benchmark/run_test.sh | 29 - src/tasks/label_projection/README.md | 369 +--------- .../api/comp_control_method.yaml | 38 - 
.../label_projection/api/comp_method.yaml | 31 - .../label_projection/api/comp_metric.yaml | 31 - .../api/comp_process_dataset.yaml | 32 - .../api/file_common_dataset.yaml | 72 -- .../label_projection/api/file_prediction.yaml | 24 - .../label_projection/api/file_score.yaml | 29 - .../label_projection/api/file_solution.yaml | 71 -- src/tasks/label_projection/api/file_test.yaml | 43 -- .../label_projection/api/file_train.yaml | 47 -- src/tasks/label_projection/api/task_info.yaml | 46 -- src/tasks/label_projection/api/thumbnail.svg | 1 - .../majority_vote/config.vsh.yaml | 22 - .../control_methods/majority_vote/script.py | 26 - .../random_labels/config.vsh.yaml | 25 - .../control_methods/random_labels/script.py | 33 - .../true_labels/config.vsh.yaml | 22 - .../control_methods/true_labels/script.py | 25 - .../methods/knn/config.vsh.yaml | 37 - .../label_projection/methods/knn/script.py | 28 - .../logistic_regression/config.vsh.yaml | 34 - .../methods/logistic_regression/script.py | 28 - .../methods/mlp/config.vsh.yaml | 47 -- .../label_projection/methods/mlp/script.py | 31 - .../methods/naive_bayes/config.vsh.yaml | 33 - .../methods/naive_bayes/script.py | 28 - .../methods/scanvi/config.vsh.yaml | 46 -- .../label_projection/methods/scanvi/script.py | 78 -- .../methods/scanvi_scarches/config.vsh.yaml | 53 -- .../methods/scanvi_scarches/script.py | 61 -- .../seurat_transferdata/config.vsh.yaml | 36 - .../methods/seurat_transferdata/script.R | 81 --- .../methods/xgboost/config.vsh.yaml | 34 - .../methods/xgboost/script.py | 39 - .../metrics/accuracy/config.vsh.yaml | 28 - .../metrics/accuracy/script.py | 36 - .../metrics/f1/config.vsh.yaml | 50 -- .../label_projection/metrics/f1/script.py | 43 -- .../process_dataset/config.vsh.yaml | 31 - .../process_dataset/script.py | 78 -- .../resources_scripts/process_datasets.sh | 34 - .../resources_scripts/run_benchmark.sh | 23 - .../resources_scripts/run_benchmark_test.sh | 25 - .../resources_test_scripts/pancreas.sh | 39 - .../process_datasets/config.vsh.yaml | 34 - .../workflows/process_datasets/main.nf | 55 -- .../workflows/run_benchmark/config.vsh.yaml | 77 -- .../workflows/run_benchmark/main.nf | 200 ------ .../workflows/run_benchmark/run_test.sh | 31 - src/tasks/match_modalities/README.md | 498 +------------ .../api/comp_control_method.yaml | 47 -- .../match_modalities/api/comp_method.yaml | 34 - .../match_modalities/api/comp_metric.yaml | 39 - .../api/comp_process_dataset.yaml | 40 -- .../api/file_common_dataset_mod1.yaml | 56 -- .../api/file_common_dataset_mod2.yaml | 56 -- .../api/file_dataset_mod1.yaml | 29 - .../api/file_dataset_mod2.yaml | 29 - .../api/file_integrated_mod1.yaml | 24 - .../api/file_integrated_mod2.yaml | 24 - .../match_modalities/api/file_score.yaml | 29 - .../api/file_solution_mod1.yaml | 58 -- .../api/file_solution_mod2.yaml | 58 -- src/tasks/match_modalities/api/task_info.yaml | 47 -- src/tasks/match_modalities/api/thumbnail.svg | 1 - .../random_features/config.vsh.yaml | 25 - .../control_methods/random_features/script.py | 32 - .../true_features/config.vsh.yaml | 21 - .../control_methods/true_features/script.py | 59 -- .../methods/fastmnn/config.vsh.yaml | 29 - .../match_modalities/methods/fastmnn/script.R | 37 - .../harmonic_alignment/config.vsh.yaml | 38 - .../methods/harmonic_alignment/script.py | 48 -- .../methods/procrustes/config.vsh.yaml | 29 - .../methods/procrustes/script.py | 34 - .../methods/scot/config.vsh.yaml | 30 - .../match_modalities/methods/scot/script.py | 45 -- .../metrics/knn_auc/config.vsh.yaml | 36 - 
.../metrics/knn_auc/script.py | 75 -- .../metrics/mse/config.vsh.yaml | 32 - .../match_modalities/metrics/mse/script.py | 56 -- .../process_dataset/config.vsh.yaml | 18 - .../process_dataset/script.py | 64 -- .../resources_scripts/process_datasets.sh | 34 - .../resources_scripts/run_benchmark.sh | 23 - .../scicar_cell_lines.sh | 34 - .../process_datasets/config.vsh.yaml | 42 -- .../workflows/process_datasets/main.nf | 82 --- .../workflows/run_benchmark/config.vsh.yaml | 75 -- .../workflows/run_benchmark/main.nf | 202 ------ .../workflows/run_benchmark/run_test.sh | 31 - src/tasks/predict_modality/README.md | 485 +------------ .../api/comp_control_method.yaml | 42 -- .../predict_modality/api/comp_method.yaml | 34 - .../api/comp_method_predict.yaml | 30 - .../api/comp_method_train.yaml | 26 - .../predict_modality/api/comp_metric.yaml | 30 - .../api/comp_process_dataset.yaml | 43 -- .../api/file_common_dataset_mod1.yaml | 98 --- .../api/file_common_dataset_mod2.yaml | 98 --- .../predict_modality/api/file_prediction.yaml | 20 - .../api/file_pretrained_model.yaml | 4 - .../predict_modality/api/file_score.yaml | 25 - .../predict_modality/api/file_test_mod1.yaml | 85 --- .../predict_modality/api/file_test_mod2.yaml | 81 --- .../predict_modality/api/file_train_mod1.yaml | 65 -- .../predict_modality/api/file_train_mod2.yaml | 65 -- src/tasks/predict_modality/api/task_info.yaml | 67 -- src/tasks/predict_modality/api/thumbnail.svg | 666 ------------------ .../meanpergene/config.vsh.yaml | 17 - .../control_methods/meanpergene/script.py | 37 - .../random_predict/config.vsh.yaml | 16 - .../control_methods/random_predict/script.R | 34 - .../control_methods/solution/config.vsh.yaml | 16 - .../control_methods/solution/script.R | 20 - .../control_methods/zeros/config.vsh.yaml | 16 - .../control_methods/zeros/script.py | 37 - .../methods/guanlab_dengkw_pm/config.vsh.yaml | 43 -- .../methods/guanlab_dengkw_pm/script.py | 136 ---- .../methods/knnr_py/config.vsh.yaml | 33 - .../methods/knnr_py/script.py | 67 -- .../methods/knnr_r/config.vsh.yaml | 36 - .../predict_modality/methods/knnr_r/script.R | 81 --- .../methods/lm/config.vsh.yaml | 32 - .../predict_modality/methods/lm/script.R | 74 -- .../methods/lmds_irlba_rf/config.vsh.yaml | 37 - .../methods/lmds_irlba_rf/script.R | 93 --- .../methods/newwave_knnr/config.vsh.yaml | 42 -- .../methods/newwave_knnr/script.R | 107 --- .../methods/novel/helper_functions.py | 247 ------- .../methods/novel/predict/config.vsh.yaml | 25 - .../methods/novel/predict/run_test.sh | 8 - .../methods/novel/predict/script.py | 119 ---- .../methods/novel/run/config.vsh.yaml | 21 - .../methods/novel/run/main.nf | 25 - .../methods/novel/run/run_test.sh | 15 - .../methods/novel/train/config.vsh.yaml | 31 - .../methods/novel/train/run_test.sh | 29 - .../methods/novel/train/script.py | 148 ---- .../methods/random_forest/config.vsh.yaml | 37 - .../methods/random_forest/script.R | 83 --- .../simple_mlp/predict/config.vsh.yaml | 21 - .../methods/simple_mlp/predict/script.py | 104 --- .../methods/simple_mlp/resources/models.py | 68 -- .../methods/simple_mlp/resources/utils.py | 37 - .../resources/yaml/mlp_ADT2GEX.yaml | 28 - .../resources/yaml/mlp_ATAC2GEX.yaml | 28 - .../resources/yaml/mlp_GEX2ADT.yaml | 28 - .../methods/simple_mlp/run/config.vsh.yaml | 26 - .../methods/simple_mlp/run/main.nf | 21 - .../methods/simple_mlp/run/run_test.sh | 15 - .../methods/simple_mlp/test.sh | 14 - .../methods/simple_mlp/train/config.vsh.yaml | 21 - .../methods/simple_mlp/train/script.py | 155 ---- 
.../metrics/correlation/config.vsh.yaml | 65 -- .../metrics/correlation/script.R | 84 --- .../metrics/mse/config.vsh.yaml | 30 - .../predict_modality/metrics/mse/script.py | 43 -- .../process_dataset/config.vsh.yaml | 21 - .../predict_modality/process_dataset/script.R | 158 ----- .../resources_scripts/process_datasets.sh | 22 - .../resources_scripts/run_benchmark.sh | 24 - .../neurips2021_bmmc.sh | 52 -- .../process_datasets/config.vsh.yaml | 43 -- .../workflows/process_datasets/main.nf | 128 ---- .../workflows/process_datasets/run_test.sh | 28 - .../workflows/run_benchmark/config.vsh.yaml | 82 --- .../workflows/run_benchmark/main.nf | 220 ------ .../workflows/run_benchmark/run_test.sh | 31 - src/tasks/spatial_decomposition/README.md | 361 +--------- .../api/comp_control_method.yaml | 38 - .../api/comp_method.yaml | 29 - .../api/comp_metric.yaml | 31 - .../api/comp_process_dataset.yaml | 33 - .../api/file_common_dataset.yaml | 75 -- .../api/file_output.yaml | 35 - .../spatial_decomposition/api/file_score.yaml | 25 - .../api/file_single_cell.yaml | 29 - .../api/file_solution.yaml | 57 -- .../api/file_spatial_masked.yaml | 25 - .../spatial_decomposition/api/task_info.yaml | 23 - .../random_proportions/config.vsh.yaml | 25 - .../random_proportions/script.py | 42 -- .../true_proportions/config.vsh.yaml | 22 - .../true_proportions/script.py | 40 -- .../dataset_simulator/config.vsh.yaml | 201 ------ .../dataset_simulator/script.py | 208 ------ .../methods/cell2location/config.vsh.yaml | 87 --- .../methods/cell2location/script.py | 152 ---- .../methods/destvi/config.vsh.yaml | 43 -- .../methods/destvi/script.py | 62 -- .../methods/nmfreg/config.vsh.yaml | 37 - .../methods/nmfreg/script.py | 91 --- .../methods/nnls/config.vsh.yaml | 30 - .../methods/nnls/script.py | 63 -- .../methods/rctd/config.vsh.yaml | 39 - .../methods/rctd/script.R | 94 --- .../methods/seurat/config.vsh.yaml | 39 - .../methods/seurat/script.R | 99 --- .../methods/stereoscope/config.vsh.yaml | 43 -- .../methods/stereoscope/script.py | 61 -- .../methods/tangram/config.vsh.yaml | 38 - .../methods/tangram/script.py | 84 --- .../methods/vanillanmf/config.vsh.yaml | 37 - .../methods/vanillanmf/script.py | 77 -- .../metrics/r2/config.vsh.yaml | 32 - .../metrics/r2/script.py | 39 - .../process_dataset/config.vsh.yaml | 13 - .../process_dataset/script.py | 46 -- .../resources_scripts/process_datasets.sh | 36 - .../resources_scripts/run_benchmark.sh | 22 - .../cxg_mouse_pancreas_atlas.sh | 39 - .../resources_test_scripts/pancreas.sh | 43 -- .../process_datasets/config.vsh.yaml | 43 -- .../workflows/process_datasets/main.nf | 65 -- .../workflows/process_datasets/run_test.sh | 23 - .../workflows/run_benchmark/config.vsh.yaml | 69 -- .../workflows/run_benchmark/main.nf | 198 ------ .../workflows/run_benchmark/run_test.sh | 28 - src/tasks/spatially_variable_genes/README.md | 334 +-------- .../api/comp_control_method.yaml | 34 - .../api/comp_method.yaml | 25 - .../api/comp_metric.yaml | 31 - .../api/comp_process_dataset.yaml | 27 - .../api/file_common_dataset.yaml | 58 -- .../api/file_dataset.yaml | 40 -- .../api/file_output.yaml | 30 - .../api/file_score.yaml | 25 - .../api/file_simulated_dataset.yaml | 66 -- .../api/file_solution.yaml | 57 -- .../api/task_info.yaml | 47 -- .../random_ranking/config.vsh.yaml | 25 - .../control_methods/random_ranking/script.py | 28 - .../true_ranking/config.vsh.yaml | 25 - .../control_methods/true_ranking/script.py | 25 - .../methods/boostgp/config.vsh.yaml | 48 -- .../methods/boostgp/script.R | 50 -- 
.../methods/gpcounts/config.vsh.yaml | 56 -- .../methods/gpcounts/script.py | 92 --- .../methods/moran_i/config.vsh.yaml | 40 -- .../methods/moran_i/script.py | 44 -- .../methods/nnsvg/config.vsh.yaml | 31 - .../methods/nnsvg/script.R | 71 -- .../methods/scgco/config.vsh.yaml | 69 -- .../methods/scgco/script.py | 63 -- .../methods/sepal/config.vsh.yaml | 46 -- .../methods/sepal/script.py | 40 -- .../methods/somde/config.vsh.yaml | 37 - .../methods/somde/script.py | 53 -- .../methods/spagcn/config.vsh.yaml | 47 -- .../methods/spagcn/script.py | 132 ---- .../methods/spagft/config.vsh.yaml | 59 -- .../methods/spagft/script.py | 44 -- .../methods/spanve/config.vsh.yaml | 45 -- .../methods/spanve/script.py | 33 - .../methods/spark/config.vsh.yaml | 30 - .../methods/spark/script.R | 75 -- .../methods/spark_x/config.vsh.yaml | 35 - .../methods/spark_x/script.R | 57 -- .../methods/spatialde/config.vsh.yaml | 39 - .../methods/spatialde/script.py | 53 -- .../methods/spatialde2/config.vsh.yaml | 53 -- .../methods/spatialde2/script.py | 51 -- .../metrics/correlation/config.vsh.yaml | 32 - .../metrics/correlation/script.py | 37 - .../select_reference/config.vsh.yaml | 51 -- .../select_reference/script.py | 36 - .../simulate_svg/config.vsh.yaml | 46 -- .../process_dataset/simulate_svg/script.R | 196 ------ .../split_dataset/config.vsh.yaml | 38 - .../process_dataset/split_dataset/script.py | 34 - .../resources_scripts/process_datasets.sh | 114 --- .../resources_scripts/run_benchmark.sh | 62 -- .../mouse_brain_coronal_section1.sh | 43 -- .../process_datasets/config.vsh.yaml | 67 -- .../workflows/process_datasets/main.nf | 86 --- .../workflows/process_datasets/run_test.sh | 32 - .../workflows/run_benchmark/config.vsh.yaml | 87 --- .../workflows/run_benchmark/main.nf | 197 ------ .../workflows/run_benchmark/run_test.sh | 28 - 506 files changed, 162 insertions(+), 28074 deletions(-) delete mode 100644 src/tasks/batch_integration/api/comp_control_method_embedding.yaml delete mode 100644 src/tasks/batch_integration/api/comp_control_method_feature.yaml delete mode 100644 src/tasks/batch_integration/api/comp_control_method_graph.yaml delete mode 100644 src/tasks/batch_integration/api/comp_method_embedding.yaml delete mode 100644 src/tasks/batch_integration/api/comp_method_feature.yaml delete mode 100644 src/tasks/batch_integration/api/comp_method_graph.yaml delete mode 100644 src/tasks/batch_integration/api/comp_metric_embedding.yaml delete mode 100644 src/tasks/batch_integration/api/comp_metric_feature.yaml delete mode 100644 src/tasks/batch_integration/api/comp_metric_graph.yaml delete mode 100644 src/tasks/batch_integration/api/comp_process_dataset.yaml delete mode 100644 src/tasks/batch_integration/api/comp_transformer_embedding_to_graph.yaml delete mode 100644 src/tasks/batch_integration/api/comp_transformer_feature_to_embedding.yaml delete mode 100644 src/tasks/batch_integration/api/file_common_dataset.yaml delete mode 100644 src/tasks/batch_integration/api/file_dataset.yaml delete mode 100644 src/tasks/batch_integration/api/file_integrated_embedding.yaml delete mode 100644 src/tasks/batch_integration/api/file_integrated_feature.yaml delete mode 100644 src/tasks/batch_integration/api/file_integrated_graph.yaml delete mode 100644 src/tasks/batch_integration/api/file_score.yaml delete mode 100644 src/tasks/batch_integration/api/file_solution.yaml delete mode 100644 src/tasks/batch_integration/api/task_info.yaml delete mode 100644 src/tasks/batch_integration/api/thumbnail.svg delete mode 100644 
src/tasks/batch_integration/control_methods/no_integration/batch_embed/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/control_methods/no_integration/batch_embed/script.py delete mode 100644 src/tasks/batch_integration/control_methods/no_integration/global_embed/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/control_methods/no_integration/global_embed/script.py delete mode 100644 src/tasks/batch_integration/control_methods/no_integration/global_feature/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/control_methods/no_integration/global_feature/script.py delete mode 100644 src/tasks/batch_integration/control_methods/no_integration/global_graph/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/control_methods/no_integration/global_graph/script.py delete mode 100644 src/tasks/batch_integration/control_methods/perfect_integration/celltype_embed/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/control_methods/perfect_integration/celltype_embed/script.py delete mode 100644 src/tasks/batch_integration/control_methods/perfect_integration/celltype_jitter_embed/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/control_methods/perfect_integration/celltype_jitter_embed/script.py delete mode 100644 src/tasks/batch_integration/control_methods/random_integration/batch_embed/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/control_methods/random_integration/batch_embed/script.py delete mode 100644 src/tasks/batch_integration/control_methods/random_integration/batch_feature/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/control_methods/random_integration/batch_feature/script.py delete mode 100644 src/tasks/batch_integration/control_methods/random_integration/batch_graph/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/control_methods/random_integration/batch_graph/script.py delete mode 100644 src/tasks/batch_integration/control_methods/random_integration/celltype_embed/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/control_methods/random_integration/celltype_embed/script.py delete mode 100644 src/tasks/batch_integration/control_methods/random_integration/celltype_feature/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/control_methods/random_integration/celltype_feature/script.py delete mode 100644 src/tasks/batch_integration/control_methods/random_integration/celltype_graph/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/control_methods/random_integration/celltype_graph/script.py delete mode 100644 src/tasks/batch_integration/control_methods/random_integration/global_embed/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/control_methods/random_integration/global_embed/script.py delete mode 100644 src/tasks/batch_integration/control_methods/random_integration/global_feature/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/control_methods/random_integration/global_feature/script.py delete mode 100644 src/tasks/batch_integration/control_methods/random_integration/global_graph/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/control_methods/random_integration/global_graph/script.py delete mode 100644 src/tasks/batch_integration/control_methods/utils.py delete mode 100644 src/tasks/batch_integration/methods/bbknn/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/methods/bbknn/script.py delete mode 100644 src/tasks/batch_integration/methods/combat/config.vsh.yaml delete mode 100644 
src/tasks/batch_integration/methods/combat/script.py delete mode 100644 src/tasks/batch_integration/methods/fastmnn_embedding/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/methods/fastmnn_feature/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/methods/fastmnn_feature/script.R delete mode 100644 src/tasks/batch_integration/methods/liger/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/methods/liger/script.R delete mode 100644 src/tasks/batch_integration/methods/mnn_correct/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/methods/mnn_correct/script.R delete mode 100644 src/tasks/batch_integration/methods/mnnpy/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/methods/mnnpy/script.py delete mode 100644 src/tasks/batch_integration/methods/pyliger/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/methods/pyliger/script.py delete mode 100644 src/tasks/batch_integration/methods/scalex_embed/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/methods/scalex_embed/script.py delete mode 100644 src/tasks/batch_integration/methods/scalex_feature/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/methods/scanorama_embed/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/methods/scanorama_embed/script.py delete mode 100644 src/tasks/batch_integration/methods/scanorama_feature/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/methods/scanvi/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/methods/scanvi/script.py delete mode 100644 src/tasks/batch_integration/methods/scvi/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/methods/scvi/script.py delete mode 100644 src/tasks/batch_integration/metrics/asw_batch/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/metrics/asw_batch/script.py delete mode 100644 src/tasks/batch_integration/metrics/asw_label/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/metrics/asw_label/script.py delete mode 100644 src/tasks/batch_integration/metrics/cell_cycle_conservation/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/metrics/cell_cycle_conservation/script.py delete mode 100644 src/tasks/batch_integration/metrics/clustering_overlap/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/metrics/clustering_overlap/script.py delete mode 100644 src/tasks/batch_integration/metrics/graph_connectivity/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/metrics/graph_connectivity/script.py delete mode 100644 src/tasks/batch_integration/metrics/hvg_overlap/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/metrics/hvg_overlap/script.py delete mode 100644 src/tasks/batch_integration/metrics/isolated_label_asw/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/metrics/isolated_label_asw/script.py delete mode 100644 src/tasks/batch_integration/metrics/isolated_label_f1/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/metrics/isolated_label_f1/script.py delete mode 100644 src/tasks/batch_integration/metrics/kbet/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/metrics/kbet/script.py delete mode 100644 src/tasks/batch_integration/metrics/lisi/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/metrics/lisi/script.py delete mode 100644 src/tasks/batch_integration/metrics/pcr/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/metrics/pcr/script.py delete mode 100644 
src/tasks/batch_integration/process_dataset/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/process_dataset/script.py delete mode 100755 src/tasks/batch_integration/resources_scripts/process_datasets.sh delete mode 100755 src/tasks/batch_integration/resources_scripts/run_benchmark.sh delete mode 100755 src/tasks/batch_integration/resources_scripts/run_benchmark_test.sh delete mode 100755 src/tasks/batch_integration/resources_test_scripts/process.sh delete mode 100644 src/tasks/batch_integration/transformers/embed_to_graph/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/transformers/embed_to_graph/script.py delete mode 100644 src/tasks/batch_integration/transformers/feature_to_embed/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/transformers/feature_to_embed/script.py delete mode 100644 src/tasks/batch_integration/workflows/process_datasets/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/workflows/process_datasets/main.nf delete mode 100755 src/tasks/batch_integration/workflows/process_datasets/run_nextflow.sh delete mode 100644 src/tasks/batch_integration/workflows/run_benchmark/config.vsh.yaml delete mode 100644 src/tasks/batch_integration/workflows/run_benchmark/main.nf delete mode 100755 src/tasks/batch_integration/workflows/run_benchmark/run_test.sh delete mode 100644 src/tasks/denoising/api/comp_control_method.yaml delete mode 100644 src/tasks/denoising/api/comp_method.yaml delete mode 100644 src/tasks/denoising/api/comp_metric.yaml delete mode 100644 src/tasks/denoising/api/comp_process_dataset.yaml delete mode 100644 src/tasks/denoising/api/file_common_dataset.yaml delete mode 100644 src/tasks/denoising/api/file_denoised.yaml delete mode 100644 src/tasks/denoising/api/file_score.yaml delete mode 100644 src/tasks/denoising/api/file_test.yaml delete mode 100644 src/tasks/denoising/api/file_train.yaml delete mode 100644 src/tasks/denoising/api/task_info.yaml delete mode 100644 src/tasks/denoising/api/thumbnail.svg delete mode 100644 src/tasks/denoising/control_methods/no_denoising/config.vsh.yaml delete mode 100644 src/tasks/denoising/control_methods/no_denoising/script.py delete mode 100644 src/tasks/denoising/control_methods/perfect_denoising/config.vsh.yaml delete mode 100644 src/tasks/denoising/control_methods/perfect_denoising/script.py delete mode 100644 src/tasks/denoising/methods/alra/config.vsh.yaml delete mode 100644 src/tasks/denoising/methods/alra/script.R delete mode 100644 src/tasks/denoising/methods/dca/config.vsh.yaml delete mode 100644 src/tasks/denoising/methods/dca/script.py delete mode 100644 src/tasks/denoising/methods/knn_smoothing/config.vsh.yaml delete mode 100644 src/tasks/denoising/methods/knn_smoothing/script.py delete mode 100644 src/tasks/denoising/methods/magic/config.vsh.yaml delete mode 100644 src/tasks/denoising/methods/magic/script.py delete mode 100644 src/tasks/denoising/methods/saver/config.vsh.yaml delete mode 100644 src/tasks/denoising/methods/saver/script.R delete mode 100644 src/tasks/denoising/metrics/mse/config.vsh.yaml delete mode 100644 src/tasks/denoising/metrics/mse/script.py delete mode 100644 src/tasks/denoising/metrics/poisson/config.vsh.yaml delete mode 100644 src/tasks/denoising/metrics/poisson/script.py delete mode 100644 src/tasks/denoising/process_dataset/config.vsh.yaml delete mode 100644 src/tasks/denoising/process_dataset/helper.py delete mode 100644 src/tasks/denoising/process_dataset/script.py delete mode 100755 
src/tasks/denoising/resources_scripts/process_datasets.sh delete mode 100755 src/tasks/denoising/resources_scripts/run_benchmark.sh delete mode 100755 src/tasks/denoising/resources_scripts/run_benchmark_test.sh delete mode 100755 src/tasks/denoising/resources_test_scripts/pancreas.sh delete mode 100644 src/tasks/denoising/workflows/process_datasets/config.vsh.yaml delete mode 100644 src/tasks/denoising/workflows/process_datasets/main.nf delete mode 100755 src/tasks/denoising/workflows/process_datasets/run_test.sh delete mode 100644 src/tasks/denoising/workflows/run_benchmark/config.vsh.yaml delete mode 100644 src/tasks/denoising/workflows/run_benchmark/main.nf delete mode 100755 src/tasks/denoising/workflows/run_benchmark/run_test.sh delete mode 100644 src/tasks/dimensionality_reduction/api/comp_control_method.yaml delete mode 100644 src/tasks/dimensionality_reduction/api/comp_method.yaml delete mode 100644 src/tasks/dimensionality_reduction/api/comp_metric.yaml delete mode 100644 src/tasks/dimensionality_reduction/api/comp_process_dataset.yaml delete mode 100644 src/tasks/dimensionality_reduction/api/file_common_dataset.yaml delete mode 100644 src/tasks/dimensionality_reduction/api/file_dataset.yaml delete mode 100644 src/tasks/dimensionality_reduction/api/file_embedding.yaml delete mode 100644 src/tasks/dimensionality_reduction/api/file_score.yaml delete mode 100644 src/tasks/dimensionality_reduction/api/file_solution.yaml delete mode 100644 src/tasks/dimensionality_reduction/api/task_info.yaml delete mode 100644 src/tasks/dimensionality_reduction/api/thumbnail.svg delete mode 100644 src/tasks/dimensionality_reduction/control_methods/random_features/config.vsh.yaml delete mode 100644 src/tasks/dimensionality_reduction/control_methods/random_features/script.py delete mode 100644 src/tasks/dimensionality_reduction/control_methods/spectral_features/config.vsh.yaml delete mode 100644 src/tasks/dimensionality_reduction/control_methods/spectral_features/script.py delete mode 100644 src/tasks/dimensionality_reduction/control_methods/true_features/config.vsh.yaml delete mode 100644 src/tasks/dimensionality_reduction/control_methods/true_features/script.py delete mode 100644 src/tasks/dimensionality_reduction/methods/densmap/config.vsh.yaml delete mode 100644 src/tasks/dimensionality_reduction/methods/densmap/script.py delete mode 100644 src/tasks/dimensionality_reduction/methods/diffusion_map/config.vsh.yaml delete mode 100644 src/tasks/dimensionality_reduction/methods/diffusion_map/script.R delete mode 100644 src/tasks/dimensionality_reduction/methods/ivis/config.vsh.yaml delete mode 100644 src/tasks/dimensionality_reduction/methods/ivis/script.py delete mode 100644 src/tasks/dimensionality_reduction/methods/lmds/config.vsh.yaml delete mode 100644 src/tasks/dimensionality_reduction/methods/lmds/script.R delete mode 100644 src/tasks/dimensionality_reduction/methods/neuralee/config.vsh.yaml delete mode 100644 src/tasks/dimensionality_reduction/methods/neuralee/script.py delete mode 100644 src/tasks/dimensionality_reduction/methods/pca/config.vsh.yaml delete mode 100644 src/tasks/dimensionality_reduction/methods/pca/script.py delete mode 100644 src/tasks/dimensionality_reduction/methods/phate/config.vsh.yaml delete mode 100644 src/tasks/dimensionality_reduction/methods/phate/script.py delete mode 100644 src/tasks/dimensionality_reduction/methods/pymde/config.vsh.yaml delete mode 100644 src/tasks/dimensionality_reduction/methods/pymde/script.py delete mode 100644 
src/tasks/dimensionality_reduction/methods/simlr/config.vsh.yaml delete mode 100644 src/tasks/dimensionality_reduction/methods/simlr/script.R delete mode 100644 src/tasks/dimensionality_reduction/methods/tsne/config.vsh.yaml delete mode 100644 src/tasks/dimensionality_reduction/methods/tsne/script.py delete mode 100644 src/tasks/dimensionality_reduction/methods/umap/config.vsh.yaml delete mode 100644 src/tasks/dimensionality_reduction/methods/umap/script.py delete mode 100644 src/tasks/dimensionality_reduction/metrics/clustering_performance/config.vsh.yaml delete mode 100644 src/tasks/dimensionality_reduction/metrics/clustering_performance/script.py delete mode 100644 src/tasks/dimensionality_reduction/metrics/coranking/config.vsh.yaml delete mode 100644 src/tasks/dimensionality_reduction/metrics/coranking/script.R delete mode 100644 src/tasks/dimensionality_reduction/metrics/density_preservation/config.vsh.yaml delete mode 100644 src/tasks/dimensionality_reduction/metrics/density_preservation/script.py delete mode 100644 src/tasks/dimensionality_reduction/metrics/distance_correlation/config.vsh.yaml delete mode 100644 src/tasks/dimensionality_reduction/metrics/distance_correlation/script.py delete mode 100644 src/tasks/dimensionality_reduction/metrics/trustworthiness/config.vsh.yaml delete mode 100644 src/tasks/dimensionality_reduction/metrics/trustworthiness/script.py delete mode 100644 src/tasks/dimensionality_reduction/process_dataset/config.vsh.yaml delete mode 100644 src/tasks/dimensionality_reduction/process_dataset/script.py delete mode 100755 src/tasks/dimensionality_reduction/resources_scripts/process_datasets.sh delete mode 100755 src/tasks/dimensionality_reduction/resources_scripts/run_benchmark.sh delete mode 100755 src/tasks/dimensionality_reduction/resources_scripts/run_benchmark_test.sh delete mode 100755 src/tasks/dimensionality_reduction/resources_test_scripts/pancreas.sh delete mode 100644 src/tasks/dimensionality_reduction/workflows/process_datasets/config.vsh.yaml delete mode 100644 src/tasks/dimensionality_reduction/workflows/process_datasets/main.nf delete mode 100644 src/tasks/dimensionality_reduction/workflows/process_datasets/run_test.sh delete mode 100644 src/tasks/dimensionality_reduction/workflows/run_benchmark/config.vsh.yaml delete mode 100644 src/tasks/dimensionality_reduction/workflows/run_benchmark/main.nf delete mode 100755 src/tasks/dimensionality_reduction/workflows/run_benchmark/run_test.sh delete mode 100644 src/tasks/label_projection/api/comp_control_method.yaml delete mode 100644 src/tasks/label_projection/api/comp_method.yaml delete mode 100644 src/tasks/label_projection/api/comp_metric.yaml delete mode 100644 src/tasks/label_projection/api/comp_process_dataset.yaml delete mode 100644 src/tasks/label_projection/api/file_common_dataset.yaml delete mode 100644 src/tasks/label_projection/api/file_prediction.yaml delete mode 100644 src/tasks/label_projection/api/file_score.yaml delete mode 100644 src/tasks/label_projection/api/file_solution.yaml delete mode 100644 src/tasks/label_projection/api/file_test.yaml delete mode 100644 src/tasks/label_projection/api/file_train.yaml delete mode 100644 src/tasks/label_projection/api/task_info.yaml delete mode 100644 src/tasks/label_projection/api/thumbnail.svg delete mode 100644 src/tasks/label_projection/control_methods/majority_vote/config.vsh.yaml delete mode 100644 src/tasks/label_projection/control_methods/majority_vote/script.py delete mode 100644 
src/tasks/label_projection/control_methods/random_labels/config.vsh.yaml delete mode 100644 src/tasks/label_projection/control_methods/random_labels/script.py delete mode 100644 src/tasks/label_projection/control_methods/true_labels/config.vsh.yaml delete mode 100644 src/tasks/label_projection/control_methods/true_labels/script.py delete mode 100644 src/tasks/label_projection/methods/knn/config.vsh.yaml delete mode 100644 src/tasks/label_projection/methods/knn/script.py delete mode 100644 src/tasks/label_projection/methods/logistic_regression/config.vsh.yaml delete mode 100644 src/tasks/label_projection/methods/logistic_regression/script.py delete mode 100644 src/tasks/label_projection/methods/mlp/config.vsh.yaml delete mode 100644 src/tasks/label_projection/methods/mlp/script.py delete mode 100644 src/tasks/label_projection/methods/naive_bayes/config.vsh.yaml delete mode 100644 src/tasks/label_projection/methods/naive_bayes/script.py delete mode 100644 src/tasks/label_projection/methods/scanvi/config.vsh.yaml delete mode 100644 src/tasks/label_projection/methods/scanvi/script.py delete mode 100644 src/tasks/label_projection/methods/scanvi_scarches/config.vsh.yaml delete mode 100644 src/tasks/label_projection/methods/scanvi_scarches/script.py delete mode 100644 src/tasks/label_projection/methods/seurat_transferdata/config.vsh.yaml delete mode 100644 src/tasks/label_projection/methods/seurat_transferdata/script.R delete mode 100644 src/tasks/label_projection/methods/xgboost/config.vsh.yaml delete mode 100644 src/tasks/label_projection/methods/xgboost/script.py delete mode 100644 src/tasks/label_projection/metrics/accuracy/config.vsh.yaml delete mode 100644 src/tasks/label_projection/metrics/accuracy/script.py delete mode 100644 src/tasks/label_projection/metrics/f1/config.vsh.yaml delete mode 100644 src/tasks/label_projection/metrics/f1/script.py delete mode 100644 src/tasks/label_projection/process_dataset/config.vsh.yaml delete mode 100644 src/tasks/label_projection/process_dataset/script.py delete mode 100755 src/tasks/label_projection/resources_scripts/process_datasets.sh delete mode 100755 src/tasks/label_projection/resources_scripts/run_benchmark.sh delete mode 100755 src/tasks/label_projection/resources_scripts/run_benchmark_test.sh delete mode 100755 src/tasks/label_projection/resources_test_scripts/pancreas.sh delete mode 100644 src/tasks/label_projection/workflows/process_datasets/config.vsh.yaml delete mode 100644 src/tasks/label_projection/workflows/process_datasets/main.nf delete mode 100644 src/tasks/label_projection/workflows/run_benchmark/config.vsh.yaml delete mode 100644 src/tasks/label_projection/workflows/run_benchmark/main.nf delete mode 100755 src/tasks/label_projection/workflows/run_benchmark/run_test.sh delete mode 100644 src/tasks/match_modalities/api/comp_control_method.yaml delete mode 100644 src/tasks/match_modalities/api/comp_method.yaml delete mode 100644 src/tasks/match_modalities/api/comp_metric.yaml delete mode 100644 src/tasks/match_modalities/api/comp_process_dataset.yaml delete mode 100644 src/tasks/match_modalities/api/file_common_dataset_mod1.yaml delete mode 100644 src/tasks/match_modalities/api/file_common_dataset_mod2.yaml delete mode 100644 src/tasks/match_modalities/api/file_dataset_mod1.yaml delete mode 100644 src/tasks/match_modalities/api/file_dataset_mod2.yaml delete mode 100644 src/tasks/match_modalities/api/file_integrated_mod1.yaml delete mode 100644 src/tasks/match_modalities/api/file_integrated_mod2.yaml delete mode 100644 
src/tasks/match_modalities/api/file_score.yaml delete mode 100644 src/tasks/match_modalities/api/file_solution_mod1.yaml delete mode 100644 src/tasks/match_modalities/api/file_solution_mod2.yaml delete mode 100644 src/tasks/match_modalities/api/task_info.yaml delete mode 100644 src/tasks/match_modalities/api/thumbnail.svg delete mode 100644 src/tasks/match_modalities/control_methods/random_features/config.vsh.yaml delete mode 100644 src/tasks/match_modalities/control_methods/random_features/script.py delete mode 100644 src/tasks/match_modalities/control_methods/true_features/config.vsh.yaml delete mode 100644 src/tasks/match_modalities/control_methods/true_features/script.py delete mode 100644 src/tasks/match_modalities/methods/fastmnn/config.vsh.yaml delete mode 100644 src/tasks/match_modalities/methods/fastmnn/script.R delete mode 100644 src/tasks/match_modalities/methods/harmonic_alignment/config.vsh.yaml delete mode 100644 src/tasks/match_modalities/methods/harmonic_alignment/script.py delete mode 100644 src/tasks/match_modalities/methods/procrustes/config.vsh.yaml delete mode 100644 src/tasks/match_modalities/methods/procrustes/script.py delete mode 100644 src/tasks/match_modalities/methods/scot/config.vsh.yaml delete mode 100644 src/tasks/match_modalities/methods/scot/script.py delete mode 100644 src/tasks/match_modalities/metrics/knn_auc/config.vsh.yaml delete mode 100644 src/tasks/match_modalities/metrics/knn_auc/script.py delete mode 100644 src/tasks/match_modalities/metrics/mse/config.vsh.yaml delete mode 100644 src/tasks/match_modalities/metrics/mse/script.py delete mode 100644 src/tasks/match_modalities/process_dataset/config.vsh.yaml delete mode 100644 src/tasks/match_modalities/process_dataset/script.py delete mode 100755 src/tasks/match_modalities/resources_scripts/process_datasets.sh delete mode 100755 src/tasks/match_modalities/resources_scripts/run_benchmark.sh delete mode 100755 src/tasks/match_modalities/resources_test_scripts/scicar_cell_lines.sh delete mode 100644 src/tasks/match_modalities/workflows/process_datasets/config.vsh.yaml delete mode 100644 src/tasks/match_modalities/workflows/process_datasets/main.nf delete mode 100644 src/tasks/match_modalities/workflows/run_benchmark/config.vsh.yaml delete mode 100644 src/tasks/match_modalities/workflows/run_benchmark/main.nf delete mode 100644 src/tasks/match_modalities/workflows/run_benchmark/run_test.sh delete mode 100644 src/tasks/predict_modality/api/comp_control_method.yaml delete mode 100644 src/tasks/predict_modality/api/comp_method.yaml delete mode 100644 src/tasks/predict_modality/api/comp_method_predict.yaml delete mode 100644 src/tasks/predict_modality/api/comp_method_train.yaml delete mode 100644 src/tasks/predict_modality/api/comp_metric.yaml delete mode 100644 src/tasks/predict_modality/api/comp_process_dataset.yaml delete mode 100644 src/tasks/predict_modality/api/file_common_dataset_mod1.yaml delete mode 100644 src/tasks/predict_modality/api/file_common_dataset_mod2.yaml delete mode 100644 src/tasks/predict_modality/api/file_prediction.yaml delete mode 100644 src/tasks/predict_modality/api/file_pretrained_model.yaml delete mode 100644 src/tasks/predict_modality/api/file_score.yaml delete mode 100644 src/tasks/predict_modality/api/file_test_mod1.yaml delete mode 100644 src/tasks/predict_modality/api/file_test_mod2.yaml delete mode 100644 src/tasks/predict_modality/api/file_train_mod1.yaml delete mode 100644 src/tasks/predict_modality/api/file_train_mod2.yaml delete mode 100644 
src/tasks/predict_modality/api/task_info.yaml delete mode 100644 src/tasks/predict_modality/api/thumbnail.svg delete mode 100644 src/tasks/predict_modality/control_methods/meanpergene/config.vsh.yaml delete mode 100644 src/tasks/predict_modality/control_methods/meanpergene/script.py delete mode 100644 src/tasks/predict_modality/control_methods/random_predict/config.vsh.yaml delete mode 100644 src/tasks/predict_modality/control_methods/random_predict/script.R delete mode 100644 src/tasks/predict_modality/control_methods/solution/config.vsh.yaml delete mode 100644 src/tasks/predict_modality/control_methods/solution/script.R delete mode 100644 src/tasks/predict_modality/control_methods/zeros/config.vsh.yaml delete mode 100644 src/tasks/predict_modality/control_methods/zeros/script.py delete mode 100644 src/tasks/predict_modality/methods/guanlab_dengkw_pm/config.vsh.yaml delete mode 100644 src/tasks/predict_modality/methods/guanlab_dengkw_pm/script.py delete mode 100644 src/tasks/predict_modality/methods/knnr_py/config.vsh.yaml delete mode 100644 src/tasks/predict_modality/methods/knnr_py/script.py delete mode 100644 src/tasks/predict_modality/methods/knnr_r/config.vsh.yaml delete mode 100644 src/tasks/predict_modality/methods/knnr_r/script.R delete mode 100644 src/tasks/predict_modality/methods/lm/config.vsh.yaml delete mode 100644 src/tasks/predict_modality/methods/lm/script.R delete mode 100644 src/tasks/predict_modality/methods/lmds_irlba_rf/config.vsh.yaml delete mode 100644 src/tasks/predict_modality/methods/lmds_irlba_rf/script.R delete mode 100644 src/tasks/predict_modality/methods/newwave_knnr/config.vsh.yaml delete mode 100644 src/tasks/predict_modality/methods/newwave_knnr/script.R delete mode 100644 src/tasks/predict_modality/methods/novel/helper_functions.py delete mode 100644 src/tasks/predict_modality/methods/novel/predict/config.vsh.yaml delete mode 100644 src/tasks/predict_modality/methods/novel/predict/run_test.sh delete mode 100644 src/tasks/predict_modality/methods/novel/predict/script.py delete mode 100644 src/tasks/predict_modality/methods/novel/run/config.vsh.yaml delete mode 100644 src/tasks/predict_modality/methods/novel/run/main.nf delete mode 100644 src/tasks/predict_modality/methods/novel/run/run_test.sh delete mode 100644 src/tasks/predict_modality/methods/novel/train/config.vsh.yaml delete mode 100644 src/tasks/predict_modality/methods/novel/train/run_test.sh delete mode 100644 src/tasks/predict_modality/methods/novel/train/script.py delete mode 100644 src/tasks/predict_modality/methods/random_forest/config.vsh.yaml delete mode 100644 src/tasks/predict_modality/methods/random_forest/script.R delete mode 100644 src/tasks/predict_modality/methods/simple_mlp/predict/config.vsh.yaml delete mode 100644 src/tasks/predict_modality/methods/simple_mlp/predict/script.py delete mode 100644 src/tasks/predict_modality/methods/simple_mlp/resources/models.py delete mode 100644 src/tasks/predict_modality/methods/simple_mlp/resources/utils.py delete mode 100644 src/tasks/predict_modality/methods/simple_mlp/resources/yaml/mlp_ADT2GEX.yaml delete mode 100644 src/tasks/predict_modality/methods/simple_mlp/resources/yaml/mlp_ATAC2GEX.yaml delete mode 100644 src/tasks/predict_modality/methods/simple_mlp/resources/yaml/mlp_GEX2ADT.yaml delete mode 100644 src/tasks/predict_modality/methods/simple_mlp/run/config.vsh.yaml delete mode 100644 src/tasks/predict_modality/methods/simple_mlp/run/main.nf delete mode 100644 src/tasks/predict_modality/methods/simple_mlp/run/run_test.sh delete mode 
100755 src/tasks/predict_modality/methods/simple_mlp/test.sh delete mode 100644 src/tasks/predict_modality/methods/simple_mlp/train/config.vsh.yaml delete mode 100644 src/tasks/predict_modality/methods/simple_mlp/train/script.py delete mode 100644 src/tasks/predict_modality/metrics/correlation/config.vsh.yaml delete mode 100644 src/tasks/predict_modality/metrics/correlation/script.R delete mode 100644 src/tasks/predict_modality/metrics/mse/config.vsh.yaml delete mode 100644 src/tasks/predict_modality/metrics/mse/script.py delete mode 100644 src/tasks/predict_modality/process_dataset/config.vsh.yaml delete mode 100644 src/tasks/predict_modality/process_dataset/script.R delete mode 100755 src/tasks/predict_modality/resources_scripts/process_datasets.sh delete mode 100755 src/tasks/predict_modality/resources_scripts/run_benchmark.sh delete mode 100755 src/tasks/predict_modality/resources_test_scripts/neurips2021_bmmc.sh delete mode 100644 src/tasks/predict_modality/workflows/process_datasets/config.vsh.yaml delete mode 100644 src/tasks/predict_modality/workflows/process_datasets/main.nf delete mode 100755 src/tasks/predict_modality/workflows/process_datasets/run_test.sh delete mode 100644 src/tasks/predict_modality/workflows/run_benchmark/config.vsh.yaml delete mode 100644 src/tasks/predict_modality/workflows/run_benchmark/main.nf delete mode 100755 src/tasks/predict_modality/workflows/run_benchmark/run_test.sh delete mode 100644 src/tasks/spatial_decomposition/api/comp_control_method.yaml delete mode 100644 src/tasks/spatial_decomposition/api/comp_method.yaml delete mode 100644 src/tasks/spatial_decomposition/api/comp_metric.yaml delete mode 100644 src/tasks/spatial_decomposition/api/comp_process_dataset.yaml delete mode 100644 src/tasks/spatial_decomposition/api/file_common_dataset.yaml delete mode 100644 src/tasks/spatial_decomposition/api/file_output.yaml delete mode 100644 src/tasks/spatial_decomposition/api/file_score.yaml delete mode 100644 src/tasks/spatial_decomposition/api/file_single_cell.yaml delete mode 100644 src/tasks/spatial_decomposition/api/file_solution.yaml delete mode 100644 src/tasks/spatial_decomposition/api/file_spatial_masked.yaml delete mode 100644 src/tasks/spatial_decomposition/api/task_info.yaml delete mode 100644 src/tasks/spatial_decomposition/control_methods/random_proportions/config.vsh.yaml delete mode 100644 src/tasks/spatial_decomposition/control_methods/random_proportions/script.py delete mode 100644 src/tasks/spatial_decomposition/control_methods/true_proportions/config.vsh.yaml delete mode 100644 src/tasks/spatial_decomposition/control_methods/true_proportions/script.py delete mode 100644 src/tasks/spatial_decomposition/dataset_simulator/config.vsh.yaml delete mode 100644 src/tasks/spatial_decomposition/dataset_simulator/script.py delete mode 100644 src/tasks/spatial_decomposition/methods/cell2location/config.vsh.yaml delete mode 100644 src/tasks/spatial_decomposition/methods/cell2location/script.py delete mode 100644 src/tasks/spatial_decomposition/methods/destvi/config.vsh.yaml delete mode 100644 src/tasks/spatial_decomposition/methods/destvi/script.py delete mode 100644 src/tasks/spatial_decomposition/methods/nmfreg/config.vsh.yaml delete mode 100644 src/tasks/spatial_decomposition/methods/nmfreg/script.py delete mode 100644 src/tasks/spatial_decomposition/methods/nnls/config.vsh.yaml delete mode 100644 src/tasks/spatial_decomposition/methods/nnls/script.py delete mode 100644 src/tasks/spatial_decomposition/methods/rctd/config.vsh.yaml delete mode 
100644 src/tasks/spatial_decomposition/methods/rctd/script.R delete mode 100644 src/tasks/spatial_decomposition/methods/seurat/config.vsh.yaml delete mode 100644 src/tasks/spatial_decomposition/methods/seurat/script.R delete mode 100644 src/tasks/spatial_decomposition/methods/stereoscope/config.vsh.yaml delete mode 100644 src/tasks/spatial_decomposition/methods/stereoscope/script.py delete mode 100644 src/tasks/spatial_decomposition/methods/tangram/config.vsh.yaml delete mode 100644 src/tasks/spatial_decomposition/methods/tangram/script.py delete mode 100644 src/tasks/spatial_decomposition/methods/vanillanmf/config.vsh.yaml delete mode 100644 src/tasks/spatial_decomposition/methods/vanillanmf/script.py delete mode 100644 src/tasks/spatial_decomposition/metrics/r2/config.vsh.yaml delete mode 100644 src/tasks/spatial_decomposition/metrics/r2/script.py delete mode 100644 src/tasks/spatial_decomposition/process_dataset/config.vsh.yaml delete mode 100644 src/tasks/spatial_decomposition/process_dataset/script.py delete mode 100755 src/tasks/spatial_decomposition/resources_scripts/process_datasets.sh delete mode 100755 src/tasks/spatial_decomposition/resources_scripts/run_benchmark.sh delete mode 100755 src/tasks/spatial_decomposition/resources_test_scripts/cxg_mouse_pancreas_atlas.sh delete mode 100755 src/tasks/spatial_decomposition/resources_test_scripts/pancreas.sh delete mode 100644 src/tasks/spatial_decomposition/workflows/process_datasets/config.vsh.yaml delete mode 100644 src/tasks/spatial_decomposition/workflows/process_datasets/main.nf delete mode 100644 src/tasks/spatial_decomposition/workflows/process_datasets/run_test.sh delete mode 100644 src/tasks/spatial_decomposition/workflows/run_benchmark/config.vsh.yaml delete mode 100644 src/tasks/spatial_decomposition/workflows/run_benchmark/main.nf delete mode 100755 src/tasks/spatial_decomposition/workflows/run_benchmark/run_test.sh delete mode 100644 src/tasks/spatially_variable_genes/api/comp_control_method.yaml delete mode 100644 src/tasks/spatially_variable_genes/api/comp_method.yaml delete mode 100644 src/tasks/spatially_variable_genes/api/comp_metric.yaml delete mode 100644 src/tasks/spatially_variable_genes/api/comp_process_dataset.yaml delete mode 100644 src/tasks/spatially_variable_genes/api/file_common_dataset.yaml delete mode 100644 src/tasks/spatially_variable_genes/api/file_dataset.yaml delete mode 100644 src/tasks/spatially_variable_genes/api/file_output.yaml delete mode 100644 src/tasks/spatially_variable_genes/api/file_score.yaml delete mode 100644 src/tasks/spatially_variable_genes/api/file_simulated_dataset.yaml delete mode 100644 src/tasks/spatially_variable_genes/api/file_solution.yaml delete mode 100644 src/tasks/spatially_variable_genes/api/task_info.yaml delete mode 100644 src/tasks/spatially_variable_genes/control_methods/random_ranking/config.vsh.yaml delete mode 100644 src/tasks/spatially_variable_genes/control_methods/random_ranking/script.py delete mode 100644 src/tasks/spatially_variable_genes/control_methods/true_ranking/config.vsh.yaml delete mode 100644 src/tasks/spatially_variable_genes/control_methods/true_ranking/script.py delete mode 100644 src/tasks/spatially_variable_genes/methods/boostgp/config.vsh.yaml delete mode 100644 src/tasks/spatially_variable_genes/methods/boostgp/script.R delete mode 100644 src/tasks/spatially_variable_genes/methods/gpcounts/config.vsh.yaml delete mode 100644 src/tasks/spatially_variable_genes/methods/gpcounts/script.py delete mode 100644 
src/tasks/spatially_variable_genes/methods/moran_i/config.vsh.yaml delete mode 100644 src/tasks/spatially_variable_genes/methods/moran_i/script.py delete mode 100644 src/tasks/spatially_variable_genes/methods/nnsvg/config.vsh.yaml delete mode 100644 src/tasks/spatially_variable_genes/methods/nnsvg/script.R delete mode 100644 src/tasks/spatially_variable_genes/methods/scgco/config.vsh.yaml delete mode 100644 src/tasks/spatially_variable_genes/methods/scgco/script.py delete mode 100644 src/tasks/spatially_variable_genes/methods/sepal/config.vsh.yaml delete mode 100644 src/tasks/spatially_variable_genes/methods/sepal/script.py delete mode 100644 src/tasks/spatially_variable_genes/methods/somde/config.vsh.yaml delete mode 100644 src/tasks/spatially_variable_genes/methods/somde/script.py delete mode 100644 src/tasks/spatially_variable_genes/methods/spagcn/config.vsh.yaml delete mode 100644 src/tasks/spatially_variable_genes/methods/spagcn/script.py delete mode 100644 src/tasks/spatially_variable_genes/methods/spagft/config.vsh.yaml delete mode 100644 src/tasks/spatially_variable_genes/methods/spagft/script.py delete mode 100644 src/tasks/spatially_variable_genes/methods/spanve/config.vsh.yaml delete mode 100644 src/tasks/spatially_variable_genes/methods/spanve/script.py delete mode 100644 src/tasks/spatially_variable_genes/methods/spark/config.vsh.yaml delete mode 100644 src/tasks/spatially_variable_genes/methods/spark/script.R delete mode 100644 src/tasks/spatially_variable_genes/methods/spark_x/config.vsh.yaml delete mode 100644 src/tasks/spatially_variable_genes/methods/spark_x/script.R delete mode 100644 src/tasks/spatially_variable_genes/methods/spatialde/config.vsh.yaml delete mode 100644 src/tasks/spatially_variable_genes/methods/spatialde/script.py delete mode 100644 src/tasks/spatially_variable_genes/methods/spatialde2/config.vsh.yaml delete mode 100644 src/tasks/spatially_variable_genes/methods/spatialde2/script.py delete mode 100644 src/tasks/spatially_variable_genes/metrics/correlation/config.vsh.yaml delete mode 100644 src/tasks/spatially_variable_genes/metrics/correlation/script.py delete mode 100644 src/tasks/spatially_variable_genes/process_dataset/select_reference/config.vsh.yaml delete mode 100644 src/tasks/spatially_variable_genes/process_dataset/select_reference/script.py delete mode 100644 src/tasks/spatially_variable_genes/process_dataset/simulate_svg/config.vsh.yaml delete mode 100644 src/tasks/spatially_variable_genes/process_dataset/simulate_svg/script.R delete mode 100644 src/tasks/spatially_variable_genes/process_dataset/split_dataset/config.vsh.yaml delete mode 100644 src/tasks/spatially_variable_genes/process_dataset/split_dataset/script.py delete mode 100755 src/tasks/spatially_variable_genes/resources_scripts/process_datasets.sh delete mode 100755 src/tasks/spatially_variable_genes/resources_scripts/run_benchmark.sh delete mode 100755 src/tasks/spatially_variable_genes/resources_test_scripts/mouse_brain_coronal_section1.sh delete mode 100644 src/tasks/spatially_variable_genes/workflows/process_datasets/config.vsh.yaml delete mode 100644 src/tasks/spatially_variable_genes/workflows/process_datasets/main.nf delete mode 100644 src/tasks/spatially_variable_genes/workflows/process_datasets/run_test.sh delete mode 100644 src/tasks/spatially_variable_genes/workflows/run_benchmark/config.vsh.yaml delete mode 100644 src/tasks/spatially_variable_genes/workflows/run_benchmark/main.nf delete mode 100755 src/tasks/spatially_variable_genes/workflows/run_benchmark/run_test.sh 
diff --git a/CHANGELOG.md b/CHANGELOG.md index e54ad71d10..9d1d8a62ea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,28 @@ -# openproblems v2.1.0 +# openproblems [next release] + +## Breaking changes + +- Moved `src/tasks/batch_integration` to [`task_batch_integration`](https://github.com/openproblems-bio/task_batch_integration). + +- Moved `src/tasks/denoising` to [`task_denoising`](https://github.com/openproblems-bio/task_denoising). + +- Moved `src/tasks/dimensionality_reduction` to [`task_dimensionality_reduction`](https://github.com/openproblems-bio/task_dimensionality_reduction). + +- Moved `src/tasks/label_projection` to [`task_label_projection`](https://github.com/openproblems-bio/task_label_projection). + +- Moved `src/tasks/match_modalities` to [`task_match_modalities`](https://github.com/openproblems-bio/task_match_modalities). + +- Moved `src/tasks/predict_modality` to [`task_predict_modality`](https://github.com/openproblems-bio/task_predict_modality). + +- Moved `src/tasks/spatial_decomposition` to [`task_spatial_decomposition`](https://github.com/openproblems-bio/task_spatial_decomposition). + +- Moved `src/tasks/spatially_variable_genes` to [`task_spatially_variable_genes`](https://github.com/openproblems-bio/task_spatially_variable_genes). ## Minor changes -- Add the CELLxGENE immune cell atlas dataset as a common test resource (PR #907) -- Update `dataset_id` for `tenx_visium`, `zenodo_spatial`, `zenodo_spatial_slidetags` datasets and use `mouse_brain_coronal` as a test resource in the `spatially_variable_genes` task (PR #908) +- Add the CELLxGENE immune cell atlas dataset as a common test resource (PR #907). + +- Update `dataset_id` for `tenx_visium`, `zenodo_spatial`, `zenodo_spatial_slidetags` datasets and use `mouse_brain_coronal` as a test resource in the `spatially_variable_genes` task (PR #908). # openproblems v2.0.0 @@ -11,7 +30,8 @@ A major update to the OpenProblems framework, switching from a Python-based fram Most relevant parts of the overall structure: -* `src/tasks`: Benchmarking tasks: +- `src/tasks`: Benchmarking tasks: + - `batch_integration`: Batch integration - `denoising`: Denoising - `dimensionality_reduction`: Dimensionality reduction @@ -20,7 +40,8 @@ Most relevant parts of the overall structure: - `spatial_decomposition`: Spatial decomposition - `spatially_variable_genes`: Spatially variable genes -* `src/datasets`: Components for creating common datasets. Loaders: +- `src/datasets`: Components for creating common datasets. Loaders: + - `cellxgene_census`: Query cells from a CellxGene Census - `openproblems_neurips2021_bmmc`: Fetch a dataset from the OpenProblems NeurIPS2021 competition - `openproblems_neurips2022_pbmc`: Fetch a dataset from the OpenProblems NeurIPS2022 competition @@ -30,7 +51,7 @@ Most relevant parts of the overall structure: - `zenodo_spatial`: Fetch and process an Anndata file containing DBiT seq, MERFISH, seqFISH, Slide-seq v2, STARmap, and Stereo-seq data from Zenodo. - `zenodo_spatial_slidetags`: Download a compressed file containing gene expression matrix and spatial locations from zenodo. -* `src/common`: Common components used by all tasks. +- `src/common`: Common components used by all tasks. 
- `check_dataset_schema`: Check whether an h5ad dataset adheres to a dataset schema - `check_yaml_schema`: Check whether a YAML adheres to a JSON schema - `comp_tests`: Reusable component unit tests @@ -44,12 +65,12 @@ Most relevant parts of the overall structure: For more information related to the structure of this repository, see the [documentation](https://openproblems.bio/documentation/reference/openproblems/). - # openproblems v1.0.0 Note: This changelog was automatically generated from the git log. ## New functionality + - Added `cell2location` to the `spatial_decomposition` task. - Added nearest-neighbor ranking matrix computation to `_utils`. - Datasets now store nearest-neighbor ranking matrix in `adata.obsm["X_ranking"]`. @@ -61,6 +82,7 @@ Note: This changelog was automatically generated from the git log. - Added `obsm` parameter to `_xgboost` function to allow specifying the embedding space for XGBoost training. ## Major changes + - Updated `scvi-tools` to version `0.20` in both Python and R environments. - Updated datasets to include nearest-neighbor ranking matrix. - Modified dimensionality reduction task to include nearest-neighbor ranking matrix computation in dataset generation. @@ -71,6 +93,7 @@ Note: This changelog was automatically generated from the git log. - Removed the redundant computation and storage of the nearest-neighbor ranking matrix in datasets. ## Minor changes + - Updated method names to be shorter and more consistent across tasks. - Improved method summaries for clarity. - Updated JAX and JAXlib versions to 0.4.6. @@ -101,11 +124,13 @@ Note: This changelog was automatically generated from the git log. - Modified `_scanvi_scarches` to allow for specifying `prediction_method` and handle `unlabeled_category` consistently. ## Documentation + - Improved the documentation of the `auprc` metric. - Improved the documentation of the `cell2location` methods. - Document sub-stub task behaviour ## Bug fixes + - Fixed an error in `neuralee_default` where the `subsample_genes` argument could be too small. - Fixed an error in `knn_naive` where the `is_baseline` argument was set to `False`. - Fixed calculation of ranking matrix in `_utils` to include ties. @@ -130,14 +155,14 @@ Note: This changelog was automatically generated from the git log. - Updated the `run_tests` workflow to skip testing on the `test_process` branch. - Updated the `create-pull-request` step to set the author for the pull request. - Updated the `run_tests` workflow to skip testing on pull request reviews. -- Updated the `update_website_ - +- Updated the `update*website* # openproblems v0.8.0 Note: This changelog was automatically generated from the git log. ## New functionality + - Added the zebrafish_labs dataset to the dimensionality reduction task. - Added the `diffusion_map` method to the dimensionality reduction task. - Added the `spectral_features` method to the dimensionality reduction task, which uses diffusion maps to create embedding features. @@ -147,6 +172,7 @@ Note: This changelog was automatically generated from the git log. - Added `celltype_random_embedding_jitter` method to randomize embedding with jitter. ## Minor changes + - Improved the `density_preservation` metric calculation. - Updated the `distance_correlation` metric to use the new `diffusion_map` method. - Increased the default number of components used for `distance_correlation_spectral` to 1000. @@ -169,6 +195,7 @@ Note: This changelog was automatically generated from the git log. 
- Added permission to write packages to the `run_tests` workflow. ## Bug fixes + - Fixed a bug in `density_preservation` that caused it to return 0 when there were NaN values in the embedding. - Removed unused `true_features_log_cp10k` and `true_features_log_cp10k_hvg` methods. - Removed unnecessary imports in metrics. @@ -178,69 +205,76 @@ Note: This changelog was automatically generated from the git log. - Added `get_split` function for metrics that require splitting data into training and testing sets. - Added `feature_to_embedding` function for embedding-based metrics. - Fixed issue where baseline methods were not properly documented. + * Increased default maximum epochs for spatial models to improve performance. * Improved training parameters for both spatial and single-cell models to improve stability and performance. * Updated validation metric used for early stopping in spatial model to improve training quality. ## Documentation + - Updated documentation to clarify that the AnnData object passed to metric functions is a copy. - Updated the documentation for batch integration tasks to reflect the change in the expected format of the dataset objects. ## Major changes + - Moved baseline methods from individual task modules to a common module. - Removed redundant baseline methods from individual task modules. - Increased default values for `max_epochs_sp` and `max_epochs_sc` in `destvi` method. - # openproblems v0.7.4 Note: This changelog was automatically generated from the git log. ## New functionality + - Added metadata for all datasets, methods, and metrics. ## Major changes + - Updated nf-openproblems to v1.10. ## Minor changes + - Added a new `docker_pull` rule to the Snakemake workflow to pull Docker images. - Added a new `docker` rule to the Snakemake workflow to build Docker images. - Changed the `pytest` command to include coverage for the `test` directory. - Added new environment variables for the TOWER_TEST_ACTION_ID and TOWER_FULL_ACTION_ID to the Snakemake workflow. - Updated the `scripts/install_renv.R` script to increase the number of retry attempts. - # openproblems v0.7.3 Note: This changelog was automatically generated from the git log. ## Minor changes + - Updated `scib` version to `1.1.3` in `docker/openproblems-r-extras/requirements.txt` and `docker/openproblems-r-pytorch/requirements.txt`. + ## Bug fixes -- Added `pytest-timestamper` to test dependencies for better debugging. +- Added `pytest-timestamper` to test dependencies for better debugging. # openproblems v0.7.2 Note: This changelog was automatically generated from the git log. ## Bug fixes -- Fixed an issue where pymde did not work on sparse data. +- Fixed an issue where pymde did not work on sparse data. # openproblems v0.7.1 Note: This changelog was automatically generated from the git log. ## Minor changes -- Added `hvg_unint` and `n_genes_pre` to the lung batch. +- Added `hvg_unint` and `n_genes_pre` to the lung batch. # openproblems v0.7.0 Note: This changelog was automatically generated from the git log. ## New functionality + - Added a bibtex file `main.bib` for storing all references cited in the repository. - Added a section on adding paper references to `CONTRIBUTING.md` explaining how to add entries to `main.bib` and link to them in markdown documents. - Added new baseline methods for dimensionality reduction: "True Features (logCPM)", "True Features (logCPM, 1kHVG)". @@ -255,6 +289,7 @@ Note: This changelog was automatically generated from the git log. 
- Added a new workflow to comment on pull request status. ## Major changes + - Updated the `openproblems` repository to cite papers using bibtex references. - Renamed `alra` method to `alra_sqrt`. - Updated `spacexr` to latest version. @@ -263,6 +298,7 @@ Note: This changelog was automatically generated from the git log. - Bumped version to 0.7.0. ## Minor changes + - Added BibTex references to all data loaders in `openproblems/data`. - Added BibTex references to all methods in `openproblems/tasks`. - Added BibTex references to all metrics in `openproblems/tasks`. @@ -279,12 +315,12 @@ Note: This changelog was automatically generated from the git log. - Added PyMDE dependency to requirements.txt - Updated the API to specify that datasets should provide log CPM-normalized counts in `adata.X - # openproblems v0.6.1 Note: This changelog was automatically generated from the git log. ## New functionality + - Added `cell2location_detection_alpha_1` method, which uses `detection_alpha=1` and a hard-coded reference. - Added a new parameter `hard_coded_reference` to `cell2location_detection_alpha_1` method. - Added a new baseline method for dimensionality reduction using high-dimensional Laplacian Eigenmaps. @@ -299,6 +335,7 @@ Note: This changelog was automatically generated from the git log. - Added `top_prop` parameter to `odds_ratio` metric to allow specifying the proportion of interactions to consider for calculating the odds ratio. ## Major changes + - Removed unused `openproblems-python-batch-integration` docker image. - Moved `scanorama`, `bbknn`, `scVI`, `mnnpy` and `scib` from `openproblems-python-batch-integration` to `openproblems-r-pytorch`. - Moved `cell2location`, `molecular-cross-validation`, `neuralee`, `tangram` and `phate` from `openproblems-python-extras` to `openproblems-python-pytorch`. @@ -324,6 +361,7 @@ Note: This changelog was automatically generated from the git log. - Renamed the `nbt2022-reproducibility` to `website-experimental` ## Minor changes + - Updated `numpy` and `scipy` dependencies in setup.py. - Updated `scikit-learn`, `louvain`, `python-igraph`, `decorator` and `colorama` dependencies in setup.py. - Improved Docker image caching. @@ -348,22 +386,22 @@ Note: This changelog was automatically generated from the git log. - Added environment variable to track changes. - Removed unused git command. - Decreased number of samples for testing. -- Updated `igraph` to 0.10.* in `setup.py`. -- Updated `anndata2ri` to 1.1.* in `openproblems-r-base/README.md`. +- Updated `igraph` to 0.10.\* in `setup.py`. +- Updated `anndata2ri` to 1.1.\* in `openproblems-r-base/README.md`. - Updated `kBET` to `a10ffea` in `openproblems-r-extras/r_requirements.txt`. - Updated `scib` to `f0be826` in `openproblems-r-extras/requirements.txt`. -- Updated `harmony-pytorch` to 0.1.* in `openproblems-r-pytorch/requirements.txt`. -- Updated `torch` to 1.13.* in `openproblems-r-pytorch/requirements.txt`. +- Updated `harmony-pytorch` to 0.1.\* in `openproblems-r-pytorch/requirements.txt`. +- Updated `torch` to 1.13.\* in `openproblems-r-pytorch/requirements.txt`. - Updated `scanorama` to 1.7.0 in `openproblems-r-pytorch/requirements.txt`. -- Updated `scvi-tools` to 0.16.* in `openproblems-r-pytorch/requirements.txt`. +- Updated `scvi-tools` to 0.16.\* in `openproblems-r-pytorch/requirements.txt`. - Updated the `regulatory_effect_prediction` task to use - # openproblems v0.6.0 Note: This changelog was automatically generated from the git log. 
## New functionality + - Added a new dataset: "Pancreas (inDrop)" - Added a new function: "pancreas" - Added a new utility function: "utils.split_data" @@ -381,6 +419,7 @@ Note: This changelog was automatically generated from the git log. - Added support for uploading docker images to ECR. ## Minor changes + - Added `tabula_muris_senis` dataset to `openproblems/tasks/denoising/datasets/__init__.py`. - Updated `styler` to version 1.8.1. - Updated the method for normalizing scores to correctly account for baseline method scores. @@ -411,12 +450,12 @@ Note: This changelog was automatically generated from the git log. - Updated the workflow to include a new `test_full_benchmark` branch. - Removed redundant code from the workflow. - # openproblems v0.5.21 Note: This changelog was automatically generated from the git log. ## New functionality + - Added a new metric, AUPRC, for evaluating cell-cell communication predictions. - Added support for aggregating method scores using "max" and "sum" operations. - Implemented a new method, true events, which predicts all possible interactions. @@ -426,152 +465,158 @@ Note: This changelog was automatically generated from the git log. - Added LIANA, CellPhoneDB, Connectome, Log2FC, NATMI, and SingleCellSignalR methods to the cell-cell communication source-target task. ## Bug fixes + - Fixed a bug where the odds ratio metric was not handling cases where the numerator or denominator was zero. ## Minor changes + - Updated the IRkernel package version in the R base docker image to 1.3.1. - Updated the saezlab/liana package version in the R extras docker image to 0.1.7. -- Updated the boto3 package version in the main docker image to 1.26.*. +- Updated the boto3 package version in the main docker image to 1.26.\*. - Added a check to the cell-cell communication dataset validation to ensure that there are no duplicate entries in the target data. - Updated the documentation for the cell-cell communication ligand-target task. - Updated the documentation for the cell-cell communication source-target task. - # openproblems v0.5.20 Note: This changelog was automatically generated from the git log. ## Bug fixes + - Fixed an issue where a sparse matrix was not being converted to CSR format. - Fixed a bug in `docker_run.sh` where pip check was not being executed. ## Minor changes -- Updated `pkgload` to version 1.3.1. +- Updated `pkgload` to version 1.3.1. # openproblems v0.5.19 Note: This changelog was automatically generated from the git log. ## Minor changes -- Converted sparse matrix to csr format. +- Converted sparse matrix to csr format. # openproblems v0.5.18 Note: This changelog was automatically generated from the git log. ## Minor changes -- Converted sparse matrices to CSR format. +- Converted sparse matrices to CSR format. # openproblems v0.5.17 Note: This changelog was automatically generated from the git log. - - - # openproblems v0.5.16 Note: This changelog was automatically generated from the git log. ## Bug fixes + - Fixed a bug where the bioconductor version was incorrect. - Fixed a bug where the matrix in obs was incorrect. + ## Minor changes + - Updated the scran package to version 1.24.1. - Updated the batchelor and scuttle packages. - # openproblems v0.5.15 Note: This changelog was automatically generated from the git log. - - - # openproblems v0.5.14 Note: This changelog was automatically generated from the git log. ## Major changes -- Updated workflow to run tests against `prod` branch. +- Updated workflow to run tests against `prod` branch. 
# openproblems v0.5.13 Note: This changelog was automatically generated from the git log. ## Bug fixes -- Skip benchmark if tester fails. +- Skip benchmark if tester fails. # openproblems v0.5.12 Note: This changelog was automatically generated from the git log. ## New functionality + - Explicitly push prod images on tag ## Documentation + - Added short metric descriptions to README ## Minor changes -- Added labels tests +- Added labels tests # openproblems v0.5.11 Note: This changelog was automatically generated from the git log. ## Bug fixes + - Reverted bump of louvain to 0.8, which caused issues. ## Minor changes -- Updated torch requirement to 1.13 in the openproblems-r-pytorch docker. +- Updated torch requirement to 1.13 in the openproblems-r-pytorch docker. # openproblems v0.5.10 Note: This changelog was automatically generated from the git log. ## New functionality + - Added support for SCALEX version 1.0.2. ## Minor changes + - Updated RcppAnnoy to version 0.0.20. -- Updated SageMaker requirement to version 2.116.*. +- Updated SageMaker requirement to version 2.116.\*. ## Bug fixes + - Fixed a bug in the `docker_hash` function, which now returns a string instead of an integer. - Fixed a bug in the `scalex` method, which now correctly handles the `outdir` parameter. - # openproblems v0.5.9 Note: This changelog was automatically generated from the git log. ## Minor changes + - Update rpy2 requirement from <3.5.5 to <3.5.6 - Update ragg to 1.2.4 + ## Bug fixes -- Don't fail job if hash fails +- Don't fail job if hash fails # openproblems v0.5.8 Note: This changelog was automatically generated from the git log. ## Minor changes -- Updated scIB to 77ab015. +- Updated scIB to 77ab015. # openproblems v0.5.7 Note: This changelog was automatically generated from the git log. ## New functionality + - Added a new batch integration subtask for corrected feature matrices. - Added a new sub-task for batch integration, "batch integration embed", which includes all methods that output a joint embedding of cells across batches. - Added a new sub-task for batch integration, "batch integration graph", which includes all methods that output a cell-cell similarity graph (e.g., a kNN graph). @@ -581,78 +626,88 @@ Note: This changelog was automatically generated from the git log. Note: This changelog was automatically generated from the git log. ## Bug fixes + - Fixed an issue where the `::` in branch names would cause problems. - Fixed an issue where the `check_r_dependencies.yml` workflow was not properly handling branch names with `::`. + ## Minor changes + - Updated the `caret` package to version 6.0-93. - Updated the README to include information about the Open Problems team and task leaders. - Replaced the `NuSVR` method with a faster alternative, improving performance. + ## New functionality + - Added a new method for running Seuratv3 from a fork, allowing for more efficient use of resources. - Added a new requirement to the `r_requirements.txt` file for the `bslib` package. - Added a new requirement to the `r_requirements.txt` file for the `caret` package. + ## Documentation + - Added a new section to the README to document the process of running Seuratv3 from a fork. - Updated the README to include a list of all contributors to the Open Problems project. - # openproblems v0.5.5 Note: This changelog was automatically generated from the git log. 
## Bug fixes + - Fix sampling and reindexing - Fix docker unavailable error to include image name ## New functionality + - Require minimum celltype count for `spatial_decomposition` ## Minor changes + - Update Rcpp to 1.0.9 - Update to nf-openproblems v1.7 - # openproblems v0.5.4 Note: This changelog was automatically generated from the git log. ## Bug fixes -- Fixed an issue where some cell types were missing from the output. +- Fixed an issue where some cell types were missing from the output. # openproblems v0.5.3 Note: This changelog was automatically generated from the git log. ## Bug fixes -- Fixed a bug in the rctd method where cell types with fewer than 25 cells were not being used. +- Fixed a bug in the rctd method where cell types with fewer than 25 cells were not being used. # openproblems v0.5.2 Note: This changelog was automatically generated from the git log. ## Bug fixes -- Handle missing function error by catching FileNotFoundError and NoSuchFunctionError instead of just RuntimeError. +- Handle missing function error by catching FileNotFoundError and NoSuchFunctionError instead of just RuntimeError. # openproblems v0.5.1 Note: This changelog was automatically generated from the git log. ## Major changes + - Updated `scipy` requirement from `==1.8.*` to `>=1.8,<1.10`. - Updated `igraph` to version `1.3.4`. ## Minor changes + - Changed the mnnpy dependency to use a patch version instead of a specific commit hash. ## Bug fixes + - Changed `docker_hash` to use the Docker API if `docker` is not available. - Use `curl` to retrieve the Docker hash if `docker` fails. - Fixed an issue with using `git+https` for `mnnpy`. - # openproblems v0.5.0 Note: This changelog was automatically generated from the git log. @@ -699,70 +754,76 @@ Note: This changelog was automatically generated from the git log. - Updated issue templates to reflect the `main` branch as the default branch. - Updated pull request template to reflect the `main` branch as the default branch. - # openproblems v0.4.4 Note: This changelog was automatically generated from the git log. ## New functionality + - Added a new docker image `openproblems-r-pytorch` for running Harmony in Python ## Major changes + - Moved `harmony` to Python-based `harmony-pytorch` ## Bug fixes + - Fixed an issue where `adata.var` was not being correctly handled in `_utils.py` - Updated the documentation for the `openproblems-r-extras` docker image - # openproblems v0.4.3 Note: This changelog was automatically generated from the git log. ## New functionality + - Added PHATE with sqrt potential ## Bug fixes + - Fixed path to R_HOME - Fixed Dockerfile to use R 4.2 - Minor CI fixes - # openproblems v0.4.2 Note: This changelog was automatically generated from the git log. ## Minor changes -- Run scran pooling in series, not in parallel. +- Run scran pooling in series, not in parallel. # openproblems v0.4.1 Note: This changelog was automatically generated from the git log. ## New functionality + - Added `FastMNN`, `Harmony`, and `Liger` methods for batch integration. - Added `bbknn_full_unscaled` method. - Added Dependabot configuration for pip and GitHub Actions dependencies. ## Minor changes + - Updated dependencies: `scib`, `bbknn`, `scanorama`, `annoy`, and `mnnpy`. - Improved the performance of several methods by pre-processing the data before running them. ## Bug fixes -- Fixed bugs in `fastMNN`, `harmony`, `liger`, `scanorama`, `scanvi`, `scvi`, `mnn`, and `combat` that caused incorrect embedding. 
+- Fixed bugs in `fastMNN`, `harmony`, `liger`, `scanorama`, `scanvi`, `scvi`, `mnn`, and `combat` that caused incorrect embedding. # openproblems v0.4.0 Note: This changelog was automatically generated from the git log. ## New functionality + - Added a new file `workflow/generate_website_markdown.py` to generate website markdown files for all tasks and datasets. - Updated Nextflow version to v1.5. - Updated Nextflow version to v1.6. ## Major changes + - Added code version to the output of each method. - Updated `nextflow` version to `v1.3`. - Updated `nextflow` version to `v1.4`. @@ -771,6 +832,7 @@ Note: This changelog was automatically generated from the git log. - Updated Python version to 3.8.13. ## Minor changes + - Updated dependencies for the Docker images. - Updated pre-commit hooks to include `requirements-txt-fixer`. - Updated Nextflow workflow to version 1.4. @@ -778,19 +840,20 @@ Note: This changelog was automatically generated from the git log. - Updated the Tower action ID. ## Bug fixes + - Fixed a bug where Docker images were not properly pushed to Docker Hub. - Updated `requirements.txt` files to fix dependency conflicts. - Removed unnecessary dependencies from CI workflows to reduce disk space usage on GitHub runners. - # openproblems v0.3.5 Note: This changelog was automatically generated from the git log. ## New functionality + - Added new integration methods: BBKNN, Combat, FastMNN feature, FastMNN embed, Harmony, Liger, MNN, Scanorama feature, Scanorama embed, Scanvi, Scvi - Added new metrics: graph_connectivity, iso_label_f1, nmi -- Added _utils.py with functions: hvg_batch, scale_batch +- Added \_utils.py with functions: hvg_batch, scale_batch - Added `run_bbknn` function. - Added a test for the trustworthiness metric, which now passes for sparse matrices. - Added a test for the density preservation metric, which now passes against densmap for a reasonable degree of similarity. @@ -807,6 +870,7 @@ Note: This changelog was automatically generated from the git log. - Added a new `invite-contributors.yml` file to the repository. ## Major changes + - The `test_methods.py` file has been simplified by removing unused arguments. - The `test_metrics.py` file has been simplified by removing unused arguments. - The `test_utils/docker.py` file has been modified to allow specifying the docker image as a decorator argument. @@ -816,6 +880,7 @@ Note: This changelog was automatically generated from the git log. - Modified `.github/workflows/run_tests.yml` to cancel previous runs when a new commit is pushed. ## Minor changes + - Removed `.nextflow`, `scratch/`, `openproblems/results/` and `openproblems/work/` from `.gitignore`. - Updated `CONTRIBUTING.md` - Methods should not edit `adata.obsm["train"]` or `adata.obsm["test"]`. @@ -830,6 +895,7 @@ Note: This changelog was automatically generated from the git log. Note: This changelog was automatically generated from the git log. ## New functionality + - Added CeNGEN, Tabula Muris Senis, and Pancreas datasets to the label_projection task. - Added scANVI and scArches+scANVI methods to the label_projection task. - Added majority_vote and random_labels baseline methods to the label_projection task. @@ -858,18 +924,20 @@ Note: This changelog was automatically generated from the git log. - Added support for running benchmarks from forks - Added `openproblems-cli` command to run test-hash - # openproblems v0.3.3 Note: This changelog was automatically generated from the git log. 
## New functionality + - Added support for balanced SCOT alignment. ## Minor changes + - Updated the workflow to store benchmark results in `/tmp`. ## Bug fixes + - Fixed the parsing and committing of benchmark results on tag. - Fixed the Github Actions badge link. - Fixed the coverage badge. @@ -877,12 +945,12 @@ Note: This changelog was automatically generated from the git log. - Ignored AWS warning and cleaned up S3 properly. - Updated the workflow to continue on error for forks. - # openproblems v0.3.2 Note: This changelog was automatically generated from the git log. ## New functionality + - Added trustworthiness metric to the dimensionality reduction task. - Added density preservation metric. - Added several metrics based on nearest neighbor ranking: continuity, co-KNN size, co-KNN AUC, local continuity meta criterion, local property, global property. @@ -893,10 +961,12 @@ Note: This changelog was automatically generated from the git log. - Added support for the Single Cell Optimal Transport (SCOT) method for multimodal data integration. - SCOT implements Gromov-Wasserstein optimal transport to align single-cell multi-omics data. - Added four variations of SCOT: -+ - sqrt CPM unbalanced -+ - sqrt CPM balanced -+ - log scran unbalanced -+ - log scran balanced + +* - sqrt CPM unbalanced +* - sqrt CPM balanced +* - log scran unbalanced +* - log scran balanced + - Each variation implements different normalization strategies for the input data. - Added `scot` method to `openproblems.tasks.multimodal_data_integration.methods`. - Added pre-processing to the `dimensionality_reduction` task. @@ -914,12 +984,12 @@ Note: This changelog was automatically generated from the git log. - Added support for running benchmark tests on tags. - Added a test directory for use in the workflow. - # openproblems v0.3.1 Note: This changelog was automatically generated from the git log. ## New functionality + - Added chromatin potential task - Added PHATE to the dimensional_reduction task. - Added support for testing docker builds on a separate branch. @@ -953,21 +1023,23 @@ Note: This changelog was automatically generated from the git log. - Added 10x PBMC dataset - Added `load_10x_5k_pbmc` function to load the 10x 5k PBMC dataset. - # openproblems v0.2.1 Note: This changelog was automatically generated from the git log. ## New functionality + - Added MLP method for label projection task. - Added pancreas data loading to label projection task. ## Minor changes + - Updated black. - Updated test version of pancreas_batch to have test data. - Added random pancreas train data. ## Bug fixes + - Fixed zebrafish code duplication. - Fixed pancreas import location. - Fixed bug in zebrafish data. @@ -976,22 +1048,25 @@ Note: This changelog was automatically generated from the git log. - Removed dummy and cheat metrics/datasets. - Removed excess covariates from pancreas dataset. - # openproblems v0.2 Note: This changelog was automatically generated from the git log. ## New functionality + - Added zebrafish label projection task ## Major changes + - Moved scIB, rpy2, harmonicalignment, and mnnpy to optional dependencies ## Minor changes + - Improved n_components fix - Moved URL into function for neater namespace ## Bug fixes + - Fixed n_svd for truncatedSVD - Fixed data loader - Fixed n_pca problem @@ -999,23 +1074,26 @@ Note: This changelog was automatically generated from the git log. 
- Scaled data for regression - Added check to ensure that data has nonzero size - # openproblems v0.1 Note: This changelog was automatically generated from the git log. ## New functionality + - Added a results page to the website. - Added a new zebrafish dataset to the openproblems library. - Added netlify.toml to deploy website. ## Documentation + - Updated documentation to reflect new features and datasets. ## Major changes + - Bumped version to 0.1. ## Minor changes + - Improved the website's home menu link. - Improved website links. - Updated website's hero and social links. @@ -1029,14 +1107,15 @@ Note: This changelog was automatically generated from the git log. - Updated the Travis CI configuration to exclude website from black. ## Bug fixes -- Fixed zebrafish data loader. +- Fixed zebrafish data loader. # openproblems v0.0.3 Note: This changelog was automatically generated from the git log. ## New functionality + - Added harmonic alignment method. - Added scicar datasets. - Added logistic regression methods. @@ -1045,9 +1124,11 @@ Note: This changelog was automatically generated from the git log. - Added normalization tools. ## Documentation + - Updated documentation to reflect normalization changes. ## Major changes + - Migrated normalizations to openproblems.tools.normalize. - Updated dataset specification to require normalization in methods. - Removed zebrafish dataset. @@ -1058,6 +1139,7 @@ Note: This changelog was automatically generated from the git log. - Migrated references to github repo. ## Minor changes + - Improved sparse array equality test. - Improved sparse inequality check. - Increased test data size. @@ -1078,6 +1160,7 @@ Note: This changelog was automatically generated from the git log. - Increased test coverage. ## Bug fixes + - Bugfix harmonic_alignment, closes #4. - Bugfix harmonic alignment import. - Normalized data inside methods, closes #19. @@ -1088,7 +1171,6 @@ Note: This changelog was automatically generated from the git log. - Fix cheat method. - Don't check for raw data -- we are no longer normalizing. - # openproblems v0.0.2 Note: This changelog was automatically generated from the git log. @@ -1123,4 +1205,5 @@ Note: This changelog was automatically generated from the git log. First release of OpenProblems. methods, 1 metric) -* Multimodal data integration (2 datasets, 2 methods, 2 metrics) + +- Multimodal data integration (2 datasets, 2 methods, 2 metrics) diff --git a/src/common/create_component/config.vsh.yaml b/src/common/create_component/config.vsh.yaml index b8dc748fb6..58303a1ca8 100644 --- a/src/common/create_component/config.vsh.yaml +++ b/src/common/create_component/config.vsh.yaml @@ -1,5 +1,6 @@ functionality: name: create_component + status: disabled namespace: common description: | Create a component Viash component. diff --git a/src/common/create_task_readme/config.vsh.yaml b/src/common/create_task_readme/config.vsh.yaml index cff0917b0d..273e196ffb 100644 --- a/src/common/create_task_readme/config.vsh.yaml +++ b/src/common/create_task_readme/config.vsh.yaml @@ -1,5 +1,6 @@ functionality: name: create_task_readme + status: disabled namespace: common description: | Create a README for the task. 
diff --git a/src/datasets/normalization/atac_tfidf/config.vsh.yaml b/src/datasets/normalization/atac_tfidf/config.vsh.yaml index 5a8f56306a..31319f0958 100644 --- a/src/datasets/normalization/atac_tfidf/config.vsh.yaml +++ b/src/datasets/normalization/atac_tfidf/config.vsh.yaml @@ -17,6 +17,7 @@ platforms: - type: python packages: - muon + - numpy<2 - type: nextflow directives: label: [midtime, midmem, midcpu] diff --git a/src/tasks/batch_integration/README.md b/src/tasks/batch_integration/README.md index 073a654508..8f95a4bcd2 100644 --- a/src/tasks/batch_integration/README.md +++ b/src/tasks/batch_integration/README.md @@ -1,571 +1,3 @@ # Batch Integration - -Remove unwanted batch effects from scRNA data while retaining -biologically meaningful variation. - -Path: -[`src/tasks/batch_integration`](https://github.com/openproblems-bio/openproblems/tree/main/src/tasks/batch_integration) - -## Motivation - -As single-cell technologies advance, single-cell datasets are growing -both in size and complexity. Especially in consortia such as the Human -Cell Atlas, individual studies combine data from multiple labs, each -sequencing multiple individuals possibly with different technologies. -This gives rise to complex batch effects in the data that must be -computationally removed to perform a joint analysis. Batch -integration methods must remove the batch effect while not removing -relevant biological information. Currently, over 200 tools exist that -aim to remove batch effects from scRNA-seq datasets \[@zappia2018exploring\]. -These methods balance the removal of batch effects with the conservation -of nuanced biological information in different ways. This abundance of -tools has complicated batch integration method choice, leading to -several benchmarks on this topic \[@luecken2020benchmarking; -@tran2020benchmark; @chazarragil2021flexible; @mereu2020benchmarking\]. -Yet, benchmarks use different metrics, method implementations and -datasets. Here we build a living benchmarking task for batch integration -methods with the vision of improving the consistency of method -evaluation. - -## Description - -In this task we evaluate batch integration methods on their ability to -remove batch effects in the data while conserving variation attributed -to biological effects. As input, methods require either normalised or -unnormalised data with multiple batches and consistent cell type labels. -The batch-integrated output can be a feature matrix, a low dimensional -embedding and/or a neighbourhood graph. The respective batch-integrated -representation is then evaluated using sets of metrics that capture how -well batch effects are removed and whether biological variance is -conserved. We have based this particular task on the latest and most -extensive benchmark of single-cell data integration methods. 
- -## Authors & contributors - -| name | roles | -|:------------------|:-------------------| -| Michaela Mueller | maintainer, author | -| Kai Waldrant | contributor | -| Robrecht Cannoodt | contributor | -| Daniel Strobl | author | - -## API - -``` mermaid -flowchart LR - file_common_dataset("Common Dataset") - comp_process_dataset[/"Data processor"/] - file_dataset("Dataset") - file_solution("Solution") - comp_control_method_embedding[/"Control method (embedding)"/] - comp_control_method_graaf[/"Control method (graph)"/] - comp_method_embedding[/"Method (embedding)"/] - comp_method_feature[/"Method (feature)"/] - comp_method_graaf[/"Method (graph)"/] - comp_metric_embedding[/"Metric (embedding)"/] - comp_metric_feature[/"Metric (feature)"/] - comp_metric_graaf[/"Metric (graph)"/] - file_integrated_embedding("Integrated embedding") - file_integrated_graaf("Integrated Graph") - file_integrated_feature("Integrated Feature") - file_score("Score") - comp_transformer_embedding_to_graaf[/"Embedding to Graph"/] - comp_transformer_feature_to_embedding[/"Feature to Embedding"/] - file_common_dataset---comp_process_dataset - comp_process_dataset-->file_dataset - comp_process_dataset-->file_solution - file_dataset---comp_control_method_embedding - file_dataset---comp_control_method_graaf - file_dataset---comp_method_embedding - file_dataset---comp_method_feature - file_dataset---comp_method_graaf - file_solution---comp_metric_embedding - file_solution---comp_metric_feature - file_solution---comp_metric_graaf - comp_control_method_embedding-->file_integrated_embedding - comp_control_method_graaf-->file_integrated_graaf - comp_method_embedding-->file_integrated_embedding - comp_method_feature-->file_integrated_feature - comp_method_graaf-->file_integrated_graaf - comp_metric_embedding-->file_score - comp_metric_feature-->file_score - comp_metric_graaf-->file_score - file_integrated_embedding---comp_metric_embedding - file_integrated_embedding---comp_transformer_embedding_to_graaf - file_integrated_graaf---comp_metric_graaf - file_integrated_feature---comp_metric_feature - file_integrated_feature---comp_transformer_feature_to_embedding - comp_transformer_embedding_to_graaf-->file_integrated_graaf - comp_transformer_feature_to_embedding-->file_integrated_embedding -``` - -## File format: Common Dataset - -A subset of the common dataset. - -Example file: `resources_test/common/pancreas/dataset.h5ad` - -Format: - -
- - AnnData object - obs: 'cell_type', 'batch' - var: 'hvg', 'hvg_score', 'feature_name' - obsm: 'X_pca' - obsp: 'knn_distances', 'knn_connectivities' - layers: 'counts', 'normalized' - uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'normalization_id', 'knn' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:-----------------------------|:----------|:-------------------------------------------------------------------------------| -| `obs["cell_type"]` | `string` | Cell type information. | -| `obs["batch"]` | `string` | Batch information. | -| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. | -| `var["hvg_score"]` | `double` | A ranking of the features by hvg. | -| `var["feature_name"]` | `string` | A human-readable name for the feature, usually a gene symbol. | -| `obsm["X_pca"]` | `double` | The resulting PCA embedding. | -| `obsp["knn_distances"]` | `double` | K nearest neighbors distance matrix. | -| `obsp["knn_connectivities"]` | `double` | K nearest neighbors connectivities matrix. | -| `layers["counts"]` | `integer` | Raw counts. | -| `layers["normalized"]` | `double` | Normalized expression values. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["dataset_name"]` | `string` | Nicely formatted name. | -| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | -| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | -| `uns["dataset_summary"]` | `string` | Short description of the dataset. | -| `uns["dataset_description"]` | `string` | Long description of the dataset. | -| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | -| `uns["normalization_id"]` | `string` | Which normalization was used. | -| `uns["knn"]` | `object` | Supplementary K nearest neighbors data. | - -

 - -## Component type: Data processor - -Path: -[`src/batch_integration`](https://github.com/openproblems-bio/openproblems/tree/main/src/batch_integration) - -A batch integration dataset processor. - -Arguments: -
- -| Name | Type | Description | -|:--------------------|:----------|:---------------------------------------------------------------------------| -| `--input` | `file` | A subset of the common dataset. | -| `--output_dataset` | `file` | (*Output*) Unintegrated AnnData HDF5 file. | -| `--output_solution` | `file` | (*Output*) Solution dataset. | -| `--obs_label` | `string` | (*Optional*) Which .obs slot to use as label. Default: `cell_type`. | -| `--obs_batch` | `string` | (*Optional*) Which .obs slot to use as batch covariate. Default: `batch`. | -| `--hvgs` | `integer` | (*Optional*) Number of highly variable genes. Default: `2000`. | -| `--subset_hvg` | `boolean` | (*Optional*) Whether to subset to highly variable genes. Default: `FALSE`. | - -
- -## File format: Dataset - -Unintegrated AnnData HDF5 file. - -Example file: `resources_test/batch_integration/pancreas/dataset.h5ad` - -Format: - -
- - AnnData object - obs: 'batch', 'label' - var: 'hvg', 'hvg_score', 'feature_name' - obsm: 'X_pca' - obsp: 'knn_distances', 'knn_connectivities' - layers: 'counts', 'normalized' - uns: 'dataset_id', 'normalization_id', 'dataset_organism', 'knn' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:-----------------------------|:----------|:-------------------------------------------------------------------------| -| `obs["batch"]` | `string` | Batch information. | -| `obs["label"]` | `string` | label information. | -| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. | -| `var["hvg_score"]` | `double` | A ranking of the features by hvg. | -| `var["feature_name"]` | `string` | A human-readable name for the feature, usually a gene symbol. | -| `obsm["X_pca"]` | `double` | The resulting PCA embedding. | -| `obsp["knn_distances"]` | `double` | K nearest neighbors distance matrix. | -| `obsp["knn_connectivities"]` | `double` | K nearest neighbors connectivities matrix. | -| `layers["counts"]` | `integer` | Raw counts. | -| `layers["normalized"]` | `double` | Normalized expression values. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["normalization_id"]` | `string` | Which normalization was used. | -| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | -| `uns["knn"]` | `object` | Supplementary K nearest neighbors data. | - -
- -## File format: Solution - -Solution dataset - -Example file: `resources_test/batch_integration/pancreas/solution.h5ad` - -Format: - -
- - AnnData object - obs: 'batch', 'label' - var: 'hvg', 'hvg_score', 'feature_name' - obsm: 'X_pca' - obsp: 'knn_distances', 'knn_connectivities' - layers: 'counts', 'normalized' - uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'normalization_id', 'knn' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:-----------------------------|:----------|:-------------------------------------------------------------------------------| -| `obs["batch"]` | `string` | Batch information. | -| `obs["label"]` | `string` | label information. | -| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. | -| `var["hvg_score"]` | `double` | A ranking of the features by hvg. | -| `var["feature_name"]` | `string` | A human-readable name for the feature, usually a gene symbol. | -| `obsm["X_pca"]` | `double` | The resulting PCA embedding. | -| `obsp["knn_distances"]` | `double` | K nearest neighbors distance matrix. | -| `obsp["knn_connectivities"]` | `double` | K nearest neighbors connectivities matrix. | -| `layers["counts"]` | `integer` | Raw counts. | -| `layers["normalized"]` | `double` | Normalized expression values. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["dataset_name"]` | `string` | Nicely formatted name. | -| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | -| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | -| `uns["dataset_summary"]` | `string` | Short description of the dataset. | -| `uns["dataset_description"]` | `string` | Long description of the dataset. | -| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | -| `uns["normalization_id"]` | `string` | Which normalization was used. | -| `uns["knn"]` | `object` | Supplementary K nearest neighbors data. | - -
- -## Component type: Control method (embedding) - -Path: -[`src/batch_integration/control_methods`](https://github.com/openproblems-bio/openproblems/tree/main/src/batch_integration/control_methods) - -A batch integration embedding control method. - -Arguments: - -
- -| Name | Type | Description | -|:-----------|:-------|:--------------------------------------------| -| `--input` | `file` | Unintegrated AnnData HDF5 file. | -| `--output` | `file` | (*Output*) An integrated AnnData HDF5 file. | - -
- -## Component type: Control method (graph) - -Path: -[`src/batch_integration/control_methods`](https://github.com/openproblems-bio/openproblems/tree/main/src/batch_integration/control_methods) - -A batch integration graph control method. - -Arguments: - -
- -| Name | Type | Description | -|:-----------|:-------|:-----------------------------------------| -| `--input` | `file` | Unintegrated AnnData HDF5 file. | -| `--output` | `file` | (*Output*) Integrated AnnData HDF5 file. | - -
- -## Component type: Method (embedding) - -Path: -[`src/batch_integration/methods`](https://github.com/openproblems-bio/openproblems/tree/main/src/batch_integration/methods) - -A batch integration embedding method. - -Arguments: - -
- -| Name | Type | Description | -|:-----------|:-------|:--------------------------------------------| -| `--input` | `file` | Unintegrated AnnData HDF5 file. | -| `--output` | `file` | (*Output*) An integrated AnnData HDF5 file. | - -
- -## Component type: Method (feature) - -Path: -[`src/batch_integration/methods`](https://github.com/openproblems-bio/openproblems/tree/main/src/batch_integration/methods) - -A batch integration feature method. - -Arguments: - -
- -| Name | Type | Description | -|:-----------|:-------|:-----------------------------------------| -| `--input` | `file` | Unintegrated AnnData HDF5 file. | -| `--output` | `file` | (*Output*) Integrated AnnData HDF5 file. | - -
- -## Component type: Method (graph) - -Path: -[`src/batch_integration/methods`](https://github.com/openproblems-bio/openproblems/tree/main/src/batch_integration/methods) - -A batch integration graph method. - -Arguments: - -
- -| Name | Type | Description | -|:-----------|:-------|:-----------------------------------------| -| `--input` | `file` | Unintegrated AnnData HDF5 file. | -| `--output` | `file` | (*Output*) Integrated AnnData HDF5 file. | - -
- -## Component type: Metric (embedding) - -Path: -[`src/batch_integration/metrics`](https://github.com/openproblems-bio/openproblems/tree/main/src/batch_integration/metrics) - -A batch integration embedding metric. - -Arguments: - -
- -| Name | Type | Description | -|:---------------------|:-------|:---------------------------------| -| `--input_integrated` | `file` | An integrated AnnData HDF5 file. | -| `--input_solution` | `file` | Solution dataset. | -| `--output` | `file` | (*Output*) Metric score file. | - -
- -## Component type: Metric (feature) - -Path: -[`src/batch_integration/metrics`](https://github.com/openproblems-bio/openproblems/tree/main/src/batch_integration/metrics) - -A batch integration feature metric. - -Arguments: - -
- -| Name | Type | Description | -|:---------------------|:-------|:------------------------------| -| `--input_integrated` | `file` | Integrated AnnData HDF5 file. | -| `--input_solution` | `file` | Solution dataset. | -| `--output` | `file` | (*Output*) Metric score file. | - -
- -## Component type: Metric (graph) - -Path: -[`src/batch_integration/metrics`](https://github.com/openproblems-bio/openproblems/tree/main/src/batch_integration/metrics) - -A batch integration graph metric. - -Arguments: - -
- -| Name | Type | Description | -|:---------------------|:-------|:------------------------------| -| `--input_integrated` | `file` | Integrated AnnData HDF5 file. | -| `--input_solution` | `file` | Solution dataset. | -| `--output` | `file` | (*Output*) Metric score file. | - -
- -## File format: Integrated embedding - -An integrated AnnData HDF5 file. - -Example file: -`resources_test/batch_integration/pancreas/integrated_embedding.h5ad` - -Format: - -
- - AnnData object - obsm: 'X_emb' - uns: 'dataset_id', 'normalization_id', 'dataset_organism', 'method_id' - -
- -Slot description: - -

 - -| Slot | Type | Description | -|:--------------------------|:---------|:--------------------------------------------------------| -| `obsm["X_emb"]` | `double` | Integration embedding prediction. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["normalization_id"]` | `string` | Which normalization was used. | -| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | -| `uns["method_id"]` | `string` | A unique identifier for the method. | -
- -## File format: Integrated Graph - -Integrated AnnData HDF5 file. - -Example file: -`resources_test/batch_integration/pancreas/integrated_graph.h5ad` - -Format: - -
- - AnnData object - obsp: 'connectivities', 'distances' - uns: 'dataset_id', 'normalization_id', 'dataset_organism', 'method_id', 'neighbors' - -
- -Slot description: - -

 - -| Slot | Type | Description | -|:--------------------------|:---------|:--------------------------------------------------------| -| `obsp["connectivities"]` | `double` | Neighbors connectivities matrix. | -| `obsp["distances"]` | `double` | Neighbors distances matrix. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["normalization_id"]` | `string` | Which normalization was used. | -| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | -| `uns["method_id"]` | `string` | A unique identifier for the method. | -| `uns["neighbors"]` | `object` | Supplementary K nearest neighbors data. | -
- -## File format: Integrated Feature - -Integrated AnnData HDF5 file. - -Example file: -`resources_test/batch_integration/pancreas/integrated_feature.h5ad` - -Format: - -
- - AnnData object - layers: 'corrected_counts' - uns: 'dataset_id', 'normalization_id', 'dataset_organism', 'method_id' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:-----------------------------|:---------|:--------------------------------------------------------| -| `layers["corrected_counts"]` | `double` | Corrected counts after integration. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["normalization_id"]` | `string` | Which normalization was used. | -| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | -| `uns["method_id"]` | `string` | A unique identifier for the method. | - -
- -## File format: Score - -Metric score file - -Example file: `score.h5ad` - -Format: - -
- - AnnData object - uns: 'dataset_id', 'normalization_id', 'method_id', 'metric_ids', 'metric_values' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:--------------------------|:---------|:---------------------------------------------------------------------------------------------| -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["normalization_id"]` | `string` | Which normalization was used. | -| `uns["method_id"]` | `string` | A unique identifier for the method. | -| `uns["metric_ids"]` | `string` | One or more unique metric identifiers. | -| `uns["metric_values"]` | `double` | The metric values obtained for the given prediction. Must be of same length as ‘metric_ids’. | - -
- -## Component type: Embedding to Graph - -Path: -[`src/batch_integration/transformers`](https://github.com/openproblems-bio/openproblems/tree/main/src/batch_integration/transformers) - -Transform an embedding to a graph output. - -Arguments: - -
- -| Name | Type | Description | -|:-----------|:-------|:-----------------------------------------| -| `--input` | `file` | An integrated AnnData HDF5 file. | -| `--output` | `file` | (*Output*) Integrated AnnData HDF5 file. | - -
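A rough sketch of this transformation, assuming scanpy's k-nearest-neighbours routine is used and with placeholder file names:

```python
import anndata as ad
import scanpy as sc

adata = ad.read_h5ad("integrated_embedding.h5ad")  # placeholder path
# build a kNN graph on the integrated embedding; this fills
# obsp['connectivities'], obsp['distances'] and uns['neighbors']
sc.pp.neighbors(adata, use_rep="X_emb")
adata.write_h5ad("integrated_graph.h5ad", compression="gzip")
```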
- -## Component type: Feature to Embedding - -Path: -[`src/batch_integration/transformers`](https://github.com/openproblems-bio/openproblems/tree/main/src/batch_integration/transformers) - -Transform a feature output to an embedding. - -Arguments: - -
- -| Name | Type | Description | -|:-----------|:-------|:--------------------------------------------| -| `--input` | `file` | Integrated AnnData HDF5 file. | -| `--output` | `file` | (*Output*) An integrated AnnData HDF5 file. | - -
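A rough sketch of this transformation, assuming scanpy is used for the PCA; the file names and the number of components below are placeholders:

```python
import anndata as ad
import scanpy as sc

adata = ad.read_h5ad("integrated_feature.h5ad")  # placeholder path
adata.X = adata.layers["corrected_counts"]       # run PCA on the corrected counts
sc.pp.pca(adata, n_comps=50)                     # fills obsm['X_pca']
adata.obsm["X_emb"] = adata.obsm["X_pca"]
adata.write_h5ad("integrated_embedding.h5ad", compression="gzip")
```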
- +# This task has been moved to [https://github.com/openproblems-bio/task_batch_integration](https://github.com/openproblems-bio/task_batch_integration)! diff --git a/src/tasks/batch_integration/api/comp_control_method_embedding.yaml b/src/tasks/batch_integration/api/comp_control_method_embedding.yaml deleted file mode 100644 index 9c4bc65ce5..0000000000 --- a/src/tasks/batch_integration/api/comp_control_method_embedding.yaml +++ /dev/null @@ -1,26 +0,0 @@ -functionality: - namespace: batch_integration/control_methods - info: - type: control_method - subtype: embedding - type_info: - label: Control method (embedding) - summary: A batch integration embedding control method. - description: | - A batch integration control method which outputs a batch-corrected embedding. - arguments: - - name: --input - __merge__: file_dataset.yaml - direction: input - required: true - - name: --output - direction: output - __merge__: file_integrated_embedding.yaml - required: true - test_resources: - - type: python_script - path: /src/common/comp_tests/check_method_config.py - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py - - path: /resources_test/batch_integration/pancreas - dest: resources_test/batch_integration/pancreas diff --git a/src/tasks/batch_integration/api/comp_control_method_feature.yaml b/src/tasks/batch_integration/api/comp_control_method_feature.yaml deleted file mode 100644 index 3d2ac9853d..0000000000 --- a/src/tasks/batch_integration/api/comp_control_method_feature.yaml +++ /dev/null @@ -1,26 +0,0 @@ -functionality: - namespace: batch_integration/control_methods - info: - type: control_method - subtype: feature - type_info: - label: Control method (feature) - summary: A batch integration feature control method. - description: | - A batch integration control method which outputs a batch-corrected feature space. - arguments: - - name: --input - __merge__: file_dataset.yaml - direction: input - required: true - - name: --output - direction: output - __merge__: file_integrated_feature.yaml - required: true - test_resources: - - type: python_script - path: /src/common/comp_tests/check_method_config.py - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py - - path: /resources_test/batch_integration/pancreas - dest: resources_test/batch_integration/pancreas diff --git a/src/tasks/batch_integration/api/comp_control_method_graph.yaml b/src/tasks/batch_integration/api/comp_control_method_graph.yaml deleted file mode 100644 index cba6f48f7a..0000000000 --- a/src/tasks/batch_integration/api/comp_control_method_graph.yaml +++ /dev/null @@ -1,26 +0,0 @@ -functionality: - namespace: batch_integration/control_methods - info: - type: control_method - subtype: graph - type_info: - label: Control method (graph) - summary: A batch integration graph control method. - description: | - A batch integration control method which outputs a batch-corrected cell graphs. 
- arguments: - - __merge__: file_dataset.yaml - name: --input - direction: input - required: true - - __merge__: file_integrated_graph.yaml - name: --output - direction: output - required: true - test_resources: - - type: python_script - path: /src/common/comp_tests/check_method_config.py - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py - - path: /resources_test/batch_integration/pancreas - dest: resources_test/batch_integration/pancreas diff --git a/src/tasks/batch_integration/api/comp_method_embedding.yaml b/src/tasks/batch_integration/api/comp_method_embedding.yaml deleted file mode 100644 index 86e7d7caf3..0000000000 --- a/src/tasks/batch_integration/api/comp_method_embedding.yaml +++ /dev/null @@ -1,29 +0,0 @@ -functionality: - namespace: batch_integration/methods - info: - type: method - subtype: embedding - type_info: - label: Method (embedding) - summary: A batch integration embedding method. - description: | - A batch integration method which outputs a batch-corrected embedding. - arguments: - - name: --input - __merge__: file_dataset.yaml - direction: input - required: true - - name: --output - __merge__: file_integrated_embedding.yaml - direction: output - required: true - test_resources: - # check method component - - type: python_script - path: /src/common/comp_tests/check_method_config.py - - path: /src/common/library.bib - # auto-run component - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py - - path: /resources_test/batch_integration/pancreas - dest: resources_test/batch_integration/pancreas diff --git a/src/tasks/batch_integration/api/comp_method_feature.yaml b/src/tasks/batch_integration/api/comp_method_feature.yaml deleted file mode 100644 index d609c2dd5b..0000000000 --- a/src/tasks/batch_integration/api/comp_method_feature.yaml +++ /dev/null @@ -1,29 +0,0 @@ -functionality: - namespace: batch_integration/methods - info: - type: method - subtype: feature - type_info: - label: Method (feature) - summary: A batch integration feature method. - description: | - A batch integration method which outputs a batch-corrected feature-space. - arguments: - - name: --input - __merge__: file_dataset.yaml - direction: input - required: true - - name: --output - __merge__: file_integrated_feature.yaml - direction: output - required: true - test_resources: - # check method component - - type: python_script - path: /src/common/comp_tests/check_method_config.py - - path: /src/common/library.bib - # auto-run component - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py - - path: /resources_test/batch_integration/pancreas - dest: resources_test/batch_integration/pancreas diff --git a/src/tasks/batch_integration/api/comp_method_graph.yaml b/src/tasks/batch_integration/api/comp_method_graph.yaml deleted file mode 100644 index 2f37146e24..0000000000 --- a/src/tasks/batch_integration/api/comp_method_graph.yaml +++ /dev/null @@ -1,29 +0,0 @@ -functionality: - namespace: batch_integration/methods - info: - type: method - subtype: graph - type_info: - label: Method (graph) - summary: A batch integration graph method. - description: | - A batch integration method which outputs a batch-corrected cell graphs. 
- arguments: - - name: --input - __merge__: file_dataset.yaml - direction: input - required: true - - name: --output - __merge__: file_integrated_graph.yaml - direction: output - required: true - test_resources: - # check method component - - type: python_script - path: /src/common/comp_tests/check_method_config.py - - path: /src/common/library.bib - # auto-run component - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py - - path: /resources_test/batch_integration/pancreas - dest: resources_test/batch_integration/pancreas diff --git a/src/tasks/batch_integration/api/comp_metric_embedding.yaml b/src/tasks/batch_integration/api/comp_metric_embedding.yaml deleted file mode 100644 index 7443fca8b4..0000000000 --- a/src/tasks/batch_integration/api/comp_metric_embedding.yaml +++ /dev/null @@ -1,38 +0,0 @@ -functionality: - namespace: batch_integration/metrics - info: - type: metric - subtype: embedding - type_info: - label: Metric (embedding) - summary: A batch integration embedding metric. - description: | - A metric for evaluating batch corrected embeddings. - test_setup: - pancreas: - input_integrated: resources_test/batch_integration/pancreas/integrated_embedding.h5ad - input_solution: resources_test/batch_integration/pancreas/solution.h5ad - cellxgene_census: - input_integrated: resources_test/batch_integration/cxg_mouse_pancreas_atlas/integrated_embedding.h5ad - input_solution: resources_test/batch_integration/cxg_mouse_pancreas_atlas/solution.h5ad - arguments: - - name: --input_integrated - __merge__: file_integrated_embedding.yaml - direction: input - required: true - - name: --input_solution - __merge__: file_solution.yaml - direction: input - required: true - - name: --output - __merge__: file_score.yaml - direction: output - required: true - test_resources: - - path: /resources_test/batch_integration/ - dest: resources_test/batch_integration/ - # - type: python_script - # path: /src/common/comp_tests/check_metric_config.py - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py - - path: /src/common/library.bib diff --git a/src/tasks/batch_integration/api/comp_metric_feature.yaml b/src/tasks/batch_integration/api/comp_metric_feature.yaml deleted file mode 100644 index 2f741d0aa2..0000000000 --- a/src/tasks/batch_integration/api/comp_metric_feature.yaml +++ /dev/null @@ -1,31 +0,0 @@ -functionality: - namespace: batch_integration/metrics - info: - type: metric - subtype: feature - type_info: - label: Metric (feature) - summary: A batch integration feature metric. - description: | - A metric for evaluating batch corrected feature spaces. 
- arguments: - - name: --input_integrated - __merge__: file_integrated_feature.yaml - direction: input - required: true - - name: --input_solution - __merge__: file_solution.yaml - direction: input - required: true - - name: --output - __merge__: file_score.yaml - direction: output - required: true - test_resources: - - path: /resources_test/batch_integration/pancreas - dest: resources_test/batch_integration/pancreas - - type: python_script - path: /src/common/comp_tests/check_metric_config.py - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py - - path: /src/common/library.bib diff --git a/src/tasks/batch_integration/api/comp_metric_graph.yaml b/src/tasks/batch_integration/api/comp_metric_graph.yaml deleted file mode 100644 index 66935b9663..0000000000 --- a/src/tasks/batch_integration/api/comp_metric_graph.yaml +++ /dev/null @@ -1,31 +0,0 @@ -functionality: - namespace: batch_integration/metrics - info: - type: metric - subtype: graph - type_info: - label: Metric (graph) - summary: A batch integration graph metric. - description: | - A metric for evaluating batch corrected cell graphs. - arguments: - - name: --input_integrated - __merge__: file_integrated_graph.yaml - direction: input - required: true - - name: --input_solution - __merge__: file_solution.yaml - direction: input - required: true - - name: --output - __merge__: file_score.yaml - direction: output - required: true - test_resources: - - path: /resources_test/batch_integration/pancreas - dest: resources_test/batch_integration/pancreas - - type: python_script - path: /src/common/comp_tests/check_metric_config.py - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py - - path: /src/common/library.bib diff --git a/src/tasks/batch_integration/api/comp_process_dataset.yaml b/src/tasks/batch_integration/api/comp_process_dataset.yaml deleted file mode 100644 index 715ef6d3c3..0000000000 --- a/src/tasks/batch_integration/api/comp_process_dataset.yaml +++ /dev/null @@ -1,45 +0,0 @@ -functionality: - namespace: batch_integration - info: - type: process_dataset - type_info: - label: Data processor - summary: A label projection dataset processor. - description: | - A component for processing a Common Dataset into a task-specific dataset. - arguments: - - name: "--input" - __merge__: file_common_dataset.yaml - direction: input - required: true - - name: "--output_dataset" - __merge__: file_dataset.yaml - direction: output - required: true - - name: "--output_solution" - __merge__: file_solution.yaml - direction: output - required: true - - name: "--obs_label" - type: "string" - description: "Which .obs slot to use as label." - default: "cell_type" - - name: "--obs_batch" - type: "string" - description: "Which .obs slot to use as batch covariate." 
- default: "batch" - - name: --hvgs - type: integer - description: Number of highly variable genes - default: 2000 - required: false - - name: --subset_hvg - type: boolean - description: Whether to subset to highly variable genes - default: false - required: false - test_resources: - - path: /resources_test/common/pancreas/ - dest: resources_test/common/pancreas/ - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py \ No newline at end of file diff --git a/src/tasks/batch_integration/api/comp_transformer_embedding_to_graph.yaml b/src/tasks/batch_integration/api/comp_transformer_embedding_to_graph.yaml deleted file mode 100644 index d8e815dad5..0000000000 --- a/src/tasks/batch_integration/api/comp_transformer_embedding_to_graph.yaml +++ /dev/null @@ -1,25 +0,0 @@ -functionality: - namespace: batch_integration/transformers - info: - type: transformer - subtype: graph - type_info: - label: Embedding to Graph - summary: Transform an embedding to a graph output. - description: | - Transform an embedding to a graph output by applying the k nearest neighbors algorithm. - arguments: - - name: --input - __merge__: file_integrated_embedding.yaml - direction: input - required: true - - name: --output - __merge__: file_integrated_graph.yaml - direction: output - required: true - test_resources: - # auto-run component - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py - - path: /resources_test/batch_integration/pancreas - dest: resources_test/batch_integration/pancreas \ No newline at end of file diff --git a/src/tasks/batch_integration/api/comp_transformer_feature_to_embedding.yaml b/src/tasks/batch_integration/api/comp_transformer_feature_to_embedding.yaml deleted file mode 100644 index 788e4b965a..0000000000 --- a/src/tasks/batch_integration/api/comp_transformer_feature_to_embedding.yaml +++ /dev/null @@ -1,25 +0,0 @@ -functionality: - namespace: batch_integration/transformers - info: - type: transformer - subtype: embedding - type_info: - label: Feature to Embedding - summary: Transform a feature output to an embedding. - description: | - Transform a feature output to an embedding by computing a PCA on the corrected counts. - arguments: - - name: --input - __merge__: file_integrated_feature.yaml - direction: input - required: true - - name: --output - __merge__: file_integrated_embedding.yaml - direction: output - required: true - test_resources: - # auto-run component - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py - - path: /resources_test/batch_integration/pancreas - dest: resources_test/batch_integration/pancreas \ No newline at end of file diff --git a/src/tasks/batch_integration/api/file_common_dataset.yaml b/src/tasks/batch_integration/api/file_common_dataset.yaml deleted file mode 100644 index 097a6794a1..0000000000 --- a/src/tasks/batch_integration/api/file_common_dataset.yaml +++ /dev/null @@ -1,92 +0,0 @@ -# This file is based on the spec of the common dataset located at -# `src/datasets/api/file_common_dataset.yaml`. However, some fields -# such as obs.cell_type and obs.batch are now required -type: file -example: "resources_test/common/pancreas/dataset.h5ad" -info: - label: "Common Dataset" - summary: A subset of the common dataset. 
- slots: - layers: - - type: integer - name: counts - description: Raw counts - required: true - - type: double - name: normalized - description: Normalized expression values - required: true - obs: - - type: string - name: cell_type - description: Cell type information - required: true - - type: string - name: batch - description: Batch information - required: true - var: - - type: boolean - name: hvg - description: Whether or not the feature is considered to be a 'highly variable gene' - required: true - - type: double - name: hvg_score - description: A ranking of the features by hvg. - required: true - - type: string - name: feature_name - description: A human-readable name for the feature, usually a gene symbol. - required: true - obsm: - - type: double - name: X_pca - description: The resulting PCA embedding. - required: true - obsp: - - type: double - name: knn_distances - description: K nearest neighbors distance matrix. - required: true - - type: double - name: knn_connectivities - description: K nearest neighbors connectivities matrix. - required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - name: dataset_name - type: string - description: Nicely formatted name. - required: true - - type: string - name: dataset_url - description: Link to the original source of the dataset. - required: false - - name: dataset_reference - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: dataset_summary - type: string - description: Short description of the dataset. - required: true - - name: dataset_description - type: string - description: Long description of the dataset. - required: true - - name: dataset_organism - type: string - description: The organism of the sample in the dataset. - required: false - - type: string - name: normalization_id - description: "Which normalization was used" - required: true - - type: object - name: knn - description: Supplementary K nearest neighbors data. - required: true - diff --git a/src/tasks/batch_integration/api/file_dataset.yaml b/src/tasks/batch_integration/api/file_dataset.yaml deleted file mode 100644 index 6d1eb928d8..0000000000 --- a/src/tasks/batch_integration/api/file_dataset.yaml +++ /dev/null @@ -1,69 +0,0 @@ -type: file -example: "resources_test/batch_integration/pancreas/dataset.h5ad" -info: - label: "Dataset" - summary: Unintegrated AnnData HDF5 file. - slots: - layers: - - type: integer - name: counts - description: Raw counts - required: true - - type: double - name: normalized - description: Normalized expression values - required: true - obs: - - type: string - name: batch - description: Batch information - required: true - - type: string - name: label - description: label information - required: true - var: - - type: boolean - name: hvg - description: Whether or not the feature is considered to be a 'highly variable gene' - required: true - - type: double - name: hvg_score - description: A ranking of the features by hvg. - required: true - - type: string - name: feature_name - description: A human-readable name for the feature, usually a gene symbol. - required: true - obsm: - - type: double - name: X_pca - description: The resulting PCA embedding. - required: true - obsp: - - type: double - name: knn_distances - description: K nearest neighbors distance matrix. - required: true - - type: double - name: knn_connectivities - description: K nearest neighbors connectivities matrix. 
- required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - type: string - name: normalization_id - description: "Which normalization was used" - required: true - - name: dataset_organism - type: string - description: The organism of the sample in the dataset. - required: false - - type: object - name: knn - description: Supplementary K nearest neighbors data. - required: true - diff --git a/src/tasks/batch_integration/api/file_integrated_embedding.yaml b/src/tasks/batch_integration/api/file_integrated_embedding.yaml deleted file mode 100644 index aa526abe71..0000000000 --- a/src/tasks/batch_integration/api/file_integrated_embedding.yaml +++ /dev/null @@ -1,29 +0,0 @@ -type: file -example: "resources_test/batch_integration/pancreas/integrated_embedding.h5ad" -info: - prediction_type: embedding - label: "Integrated embedding" - summary: An integrated AnnData HDF5 file. - slots: - obsm: - - type: double - name: X_emb - description: integration embedding prediction - required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - type: string - name: normalization_id - description: "Which normalization was used" - required: true - - name: dataset_organism - type: string - description: The organism of the sample in the dataset. - required: false - - type: string - name: method_id - description: "A unique identifier for the method" - required: true diff --git a/src/tasks/batch_integration/api/file_integrated_feature.yaml b/src/tasks/batch_integration/api/file_integrated_feature.yaml deleted file mode 100644 index b89e16f907..0000000000 --- a/src/tasks/batch_integration/api/file_integrated_feature.yaml +++ /dev/null @@ -1,29 +0,0 @@ -type: file -example: "resources_test/batch_integration/pancreas/integrated_feature.h5ad" -info: - prediction_type: feature - label: "Integrated Feature" - summary: Integrated AnnData HDF5 file. - slots: - layers: - - type: double - name: corrected_counts - description: Corrected counts after integration - required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - type: string - name: normalization_id - description: "Which normalization was used" - required: true - - name: dataset_organism - type: string - description: The organism of the sample in the dataset. - required: false - - type: string - name: method_id - description: "A unique identifier for the method" - required: true \ No newline at end of file diff --git a/src/tasks/batch_integration/api/file_integrated_graph.yaml b/src/tasks/batch_integration/api/file_integrated_graph.yaml deleted file mode 100644 index 8c09147d0d..0000000000 --- a/src/tasks/batch_integration/api/file_integrated_graph.yaml +++ /dev/null @@ -1,37 +0,0 @@ -type: file -example: "resources_test/batch_integration/pancreas/integrated_graph.h5ad" -info: - prediction_type: graph - label: "Integrated Graph" - summary: Integrated AnnData HDF5 file. - slots: - obsp: - - type: double - name: connectivities - description: Neighbors connectivities matrix. - required: true - - type: double - name: distances - description: Neighbors connectivities matrix. 
- required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - type: string - name: normalization_id - description: "Which normalization was used" - required: true - - name: dataset_organism - type: string - description: The organism of the sample in the dataset. - required: false - - type: string - name: method_id - description: "A unique identifier for the method" - required: true - - type: object - name: neighbors - description: Supplementary K nearest neighbors data. - required: true diff --git a/src/tasks/batch_integration/api/file_score.yaml b/src/tasks/batch_integration/api/file_score.yaml deleted file mode 100644 index 9b4dac654f..0000000000 --- a/src/tasks/batch_integration/api/file_score.yaml +++ /dev/null @@ -1,29 +0,0 @@ -type: file -example: "score.h5ad" -info: - label: "Score" - summary: "Metric score file" - slots: - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - type: string - name: normalization_id - description: "Which normalization was used" - required: true - - type: string - name: method_id - description: "A unique identifier for the method" - required: true - - type: string - name: metric_ids - description: "One or more unique metric identifiers" - multiple: true - required: true - - type: double - name: metric_values - description: "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'." - multiple: true - required: true \ No newline at end of file diff --git a/src/tasks/batch_integration/api/file_solution.yaml b/src/tasks/batch_integration/api/file_solution.yaml deleted file mode 100644 index 7e8b07ea4c..0000000000 --- a/src/tasks/batch_integration/api/file_solution.yaml +++ /dev/null @@ -1,89 +0,0 @@ -type: file -example: "resources_test/batch_integration/pancreas/solution.h5ad" -info: - label: "Solution" - summary: Solution dataset - slots: - layers: - - type: integer - name: counts - description: Raw counts - required: true - - type: double - name: normalized - description: Normalized expression values - required: true - obs: - - type: string - name: batch - description: Batch information - required: true - - type: string - name: label - description: label information - required: true - var: - - type: boolean - name: hvg - description: Whether or not the feature is considered to be a 'highly variable gene' - required: true - - type: double - name: hvg_score - description: A ranking of the features by hvg. - required: true - - type: string - name: feature_name - description: A human-readable name for the feature, usually a gene symbol. - required: true - obsm: - - type: double - name: X_pca - description: The resulting PCA embedding. - required: true - obsp: - - type: double - name: knn_distances - description: K nearest neighbors distance matrix. - required: true - - type: double - name: knn_connectivities - description: K nearest neighbors connectivities matrix. - required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - name: dataset_name - type: string - description: Nicely formatted name. - required: true - - type: string - name: dataset_url - description: Link to the original source of the dataset. - required: false - - name: dataset_reference - type: string - description: Bibtex reference of the paper in which the dataset was published. 
- required: false - - name: dataset_summary - type: string - description: Short description of the dataset. - required: true - - name: dataset_description - type: string - description: Long description of the dataset. - required: true - - name: dataset_organism - type: string - description: The organism of the sample in the dataset. - required: false - - type: string - name: normalization_id - description: "Which normalization was used" - required: true - - type: object - name: knn - description: Supplementary K nearest neighbors data. - required: true - diff --git a/src/tasks/batch_integration/api/task_info.yaml b/src/tasks/batch_integration/api/task_info.yaml deleted file mode 100644 index bc3a575029..0000000000 --- a/src/tasks/batch_integration/api/task_info.yaml +++ /dev/null @@ -1,41 +0,0 @@ -name: batch_integration -label: Batch Integration -v1: - path: openproblems/tasks/batch_integration/README.md - commit: 637163fba7d74ab5393c2adbee5354dcf4d46f85 -summary: Remove unwanted batch effects from scRNA data while retaining biologically meaningful variation. -image: thumbnail.svg -motivation: | - As single-cell technologies advance, single-cell datasets are growing both in size and complexity. - Especially in consortia such as the Human Cell Atlas, individual studies combine data from multiple labs, each sequencing multiple individuals possibly with different technologies. - This gives rise to complex batch effects in the data that must be computationally removed to perform a joint analysis. - These batch integration methods must remove the batch effect while not removing relevant biological information. - Currently, over 200 tools exist that aim to remove batch effects scRNA-seq datasets [@zappia2018exploring]. - These methods balance the removal of batch effects with the conservation of nuanced biological information in different ways. - This abundance of tools has complicated batch integration method choice, leading to several benchmarks on this topic [@luecken2020benchmarking; @tran2020benchmark; @chazarragil2021flexible; @mereu2020benchmarking]. - Yet, benchmarks use different metrics, method implementations and datasets. Here we build a living benchmarking task for batch integration methods with the vision of improving the consistency of method evaluation. -description: | - In this task we evaluate batch integration methods on their ability to remove batch effects in the data while conserving variation attributed to biological effects. - As input, methods require either normalised or unnormalised data with multiple batches and consistent cell type labels. - The batch integrated output can be a feature matrix, a low dimensional embedding and/or a neighbourhood graph. - The respective batch-integrated representation is then evaluated using sets of metrics that capture how well batch effects are removed and whether biological variance is conserved. - We have based this particular task on the latest, and most extensive benchmark of single-cell data integration methods. 
-authors: - - name: Michaela Mueller - roles: [ maintainer, author ] - info: - github: mumichae - - name: Kai Waldrant - roles: [ contributor ] - info: - github: KaiWaldrant - orcid: "0009-0003-8555-1361" - - name: Robrecht Cannoodt - roles: [ contributor ] - info: - github: rcannood - orcid: "0000-0003-3641-729X" - - name: Daniel Strobl - roles: [ author ] - info: - github: danielStrobl diff --git a/src/tasks/batch_integration/api/thumbnail.svg b/src/tasks/batch_integration/api/thumbnail.svg deleted file mode 100644 index 77626c5bfb..0000000000 --- a/src/tasks/batch_integration/api/thumbnail.svg +++ /dev/null @@ -1 +0,0 @@ -Batch 1Batch 2dim-2dim-1dim-2dim-1 \ No newline at end of file diff --git a/src/tasks/batch_integration/control_methods/no_integration/batch_embed/config.vsh.yaml b/src/tasks/batch_integration/control_methods/no_integration/batch_embed/config.vsh.yaml deleted file mode 100644 index c2484fbaa2..0000000000 --- a/src/tasks/batch_integration/control_methods/no_integration/batch_embed/config.vsh.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# use method api spec -__merge__: ../../../api/comp_control_method_embedding.yaml -functionality: - name: batch_embed - namespace: batch_integration/control_methods/no_integration - info: - label: No integration by Batch - summary: "Cells are embedded by computing PCA independently on each batch" - description: "Cells are embedded by computing PCA independently on each batch" - v1: - path: openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: log_cp10k - resources: - - type: python_script - path: script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [midtime, lowmem, lowcpu] diff --git a/src/tasks/batch_integration/control_methods/no_integration/batch_embed/script.py b/src/tasks/batch_integration/control_methods/no_integration/batch_embed/script.py deleted file mode 100644 index 801440ce65..0000000000 --- a/src/tasks/batch_integration/control_methods/no_integration/batch_embed/script.py +++ /dev/null @@ -1,49 +0,0 @@ -import sys -import scanpy as sc -import numpy as np - -## VIASH START - -par = { - 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', - 'output': 'output.h5ad', -} - -meta = { - 'functionality': 'foo', - 'config': 'bar' -} - -## VIASH END - -sys.path.append(meta["resources_dir"]) -from read_anndata_partial import read_anndata - - -print('Read input', flush=True) -adata = read_anndata( - par['input'], - X='layers/normalized', - obs='obs', - var='var', - uns='uns' -) -adata.var["highly_variable"] = adata.var["hvg"] - -print("Process dataset", flush=True) -adata.obsm["X_emb"] = np.zeros((adata.shape[0], 50), dtype=float) -for batch in adata.obs["batch"].unique(): - batch_idx = adata.obs["batch"] == batch - n_comps = min(50, np.sum(batch_idx)) - solver = "full" if n_comps == np.sum(batch_idx) else "arpack" - adata.obsm["X_emb"][batch_idx, :n_comps] = sc.tl.pca( - adata[batch_idx].copy(), - n_comps=n_comps, - use_highly_variable=True, - svd_solver=solver, - copy=True, - ).obsm["X_pca"] - -print("Store outputs", flush=True) -adata.uns['method_id'] = meta['functionality_name'] -adata.write_h5ad(par['output'], compression='gzip') \ No newline at end of file diff --git a/src/tasks/batch_integration/control_methods/no_integration/global_embed/config.vsh.yaml 
b/src/tasks/batch_integration/control_methods/no_integration/global_embed/config.vsh.yaml deleted file mode 100644 index 95212518c5..0000000000 --- a/src/tasks/batch_integration/control_methods/no_integration/global_embed/config.vsh.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# use method api spec -__merge__: ../../../api/comp_control_method_embedding.yaml -functionality: - name: global_embed - namespace: batch_integration/control_methods/no_integration - info: - label: No integration - summary: "Cells are embedded by PCA on the unintegrated data" - description: "Cells are embedded by PCA on the unintegrated data" - v1: - path: openproblems/tasks/_batch_integration/_common/methods/baseline.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: log_cp10k - resources: - - type: python_script - path: script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/no_integration/global_embed/script.py b/src/tasks/batch_integration/control_methods/no_integration/global_embed/script.py deleted file mode 100644 index f45038806b..0000000000 --- a/src/tasks/batch_integration/control_methods/no_integration/global_embed/script.py +++ /dev/null @@ -1,36 +0,0 @@ -import sys -import scanpy as sc - -## VIASH START - -par = { - 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', - 'output': 'output.h5ad', -} - -meta = { - 'functionality': 'foo', - 'config': 'bar', - "resources_dir": "src/tasks/batch_integration/control_methods/" -} - -## VIASH END - -sys.path.append(meta["resources_dir"]) -from read_anndata_partial import read_anndata - - -print('Read input', flush=True) -adata = read_anndata( - par['input'], - obs='obs', - obsm='obsm', - uns='uns' -) - -print("process dataset", flush=True) -adata.obsm["X_emb"] = adata.obsm["X_pca"] - -print("Store outputs", flush=True) -adata.uns['method_id'] = meta['functionality_name'] -adata.write_h5ad(par['output'], compression='gzip') \ No newline at end of file diff --git a/src/tasks/batch_integration/control_methods/no_integration/global_feature/config.vsh.yaml b/src/tasks/batch_integration/control_methods/no_integration/global_feature/config.vsh.yaml deleted file mode 100644 index b20701c8f1..0000000000 --- a/src/tasks/batch_integration/control_methods/no_integration/global_feature/config.vsh.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# use method api spec -__merge__: ../../../api/comp_control_method_feature.yaml -functionality: - name: global_feature - namespace: batch_integration/control_methods/no_integration - info: - label: No integration - summary: "Original feature space is not modified" - description: "Original feature space is not modified" - v1: - path: openproblems/tasks/_batch_integration/_common/methods/baseline.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: log_cp10k - resources: - - type: python_script - path: script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/no_integration/global_feature/script.py b/src/tasks/batch_integration/control_methods/no_integration/global_feature/script.py deleted file mode 100644 index 
2acdbf9b7a..0000000000 --- a/src/tasks/batch_integration/control_methods/no_integration/global_feature/script.py +++ /dev/null @@ -1,38 +0,0 @@ -import sys -import scanpy as sc - -## VIASH START - -par = { - 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', - 'output': 'output.h5ad', -} - -meta = { - 'functionality': 'foo', - 'config': 'bar', - "resources_dir": "src/tasks/batch_integration/control_methods/" -} - -## VIASH END - -sys.path.append(meta["resources_dir"]) -from read_anndata_partial import read_anndata - - -print('Read input', flush=True) -adata = read_anndata( - par['input'], - X='layers/normalized', - obs='obs', - var='var', - uns='uns' -) - -# no processing, subset matrix to highly variable genes -adata_hvg = adata[:, adata.var["hvg"]].copy() -adata.layers['corrected_counts'] = adata_hvg.X.copy() - -print("Store outputs", flush=True) -adata.uns['method_id'] = meta['functionality_name'] -adata.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/control_methods/no_integration/global_graph/config.vsh.yaml b/src/tasks/batch_integration/control_methods/no_integration/global_graph/config.vsh.yaml deleted file mode 100644 index 86886ce263..0000000000 --- a/src/tasks/batch_integration/control_methods/no_integration/global_graph/config.vsh.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# use method api spec -__merge__: ../../../api/comp_control_method_graph.yaml -functionality: - name: global_graph - namespace: batch_integration/control_methods/no_integration - info: - label: No integration - summary: "kNN graph is built on the PCA of the unintegrated data" - description: "Cells are embedded by PCA on the unintegrated data. A kNN graph is built on this PCA." - v1: - path: openproblems/tasks/_batch_integration/_common/methods/baseline.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: log_cp10k - resources: - - type: python_script - path: script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py - - path: ../../utils.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/no_integration/global_graph/script.py b/src/tasks/batch_integration/control_methods/no_integration/global_graph/script.py deleted file mode 100644 index 4824c8f443..0000000000 --- a/src/tasks/batch_integration/control_methods/no_integration/global_graph/script.py +++ /dev/null @@ -1,41 +0,0 @@ -import scanpy as sc -import sys - -## VIASH START - -par = { - 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', - 'output': 'output.h5ad', -} - -meta = { - 'functionality': 'foo', - 'config': 'bar', - "resources_dir": "src/tasks/batch_integration/control_methods/" -} - -## VIASH END - -# add helper scripts to path -sys.path.append(meta["resources_dir"]) -from utils import _set_uns -from read_anndata_partial import read_anndata - - -print('Read input', flush=True) -adata = read_anndata( - par['input'], - obs='obs', - obsp='obsp', - uns='uns' -) - -print("process dataset", flush=True) -neighbors_map = adata.uns['knn'] -adata.obsp['connectivities'] = adata.obsp[neighbors_map['connectivities_key']] -adata.obsp['distances'] = adata.obsp[neighbors_map['distances_key']] -_set_uns(adata, neighbors_key='knn') - -print("Store outputs", flush=True) -adata.uns['method_id'] = meta['functionality_name'] -adata.write_h5ad(par['output'], compression='gzip') \ No 
newline at end of file diff --git a/src/tasks/batch_integration/control_methods/perfect_integration/celltype_embed/config.vsh.yaml b/src/tasks/batch_integration/control_methods/perfect_integration/celltype_embed/config.vsh.yaml deleted file mode 100644 index 6c853a7719..0000000000 --- a/src/tasks/batch_integration/control_methods/perfect_integration/celltype_embed/config.vsh.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# use method api spec -__merge__: ../../../api/comp_control_method_embedding.yaml -functionality: - name: celltype_embed - namespace: batch_integration/control_methods/perfect_integration - info: - label: Perfect embedding by cell type - summary: "Cells are embedded as a one-hot encoding of celltype labels" - description: "Cells are embedded as a one-hot encoding of celltype labels" - v1: - path: openproblems/tasks/_batch_integration/_common/methods/baseline.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: log_cp10k - resources: - - type: python_script - path: script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py - - path: ../../utils.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [midtime, lowmem, lowcpu] \ No newline at end of file diff --git a/src/tasks/batch_integration/control_methods/perfect_integration/celltype_embed/script.py b/src/tasks/batch_integration/control_methods/perfect_integration/celltype_embed/script.py deleted file mode 100644 index ca16a60ab2..0000000000 --- a/src/tasks/batch_integration/control_methods/perfect_integration/celltype_embed/script.py +++ /dev/null @@ -1,34 +0,0 @@ -import anndata as ad -import sys - -## VIASH START - -par = { - 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', - 'output': 'output.h5ad', -} - -meta = { - 'functionality': 'foo', - 'config': 'bar' -} - -## VIASH END -sys.path.append(meta["resources_dir"]) -from utils import _perfect_embedding -from read_anndata_partial import read_anndata - - -print('Read input', flush=True) -adata = read_anndata( - par['input'], - obs='obs', - uns='uns' -) - -print('Process data...', flush=True) -adata.obsm["X_emb"] = _perfect_embedding(partition=adata.obs["label"]) - -print("Store outputs", flush=True) -adata.uns['method_id'] = meta['functionality_name'] -adata.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/control_methods/perfect_integration/celltype_jitter_embed/config.vsh.yaml b/src/tasks/batch_integration/control_methods/perfect_integration/celltype_jitter_embed/config.vsh.yaml deleted file mode 100644 index e945e3bc58..0000000000 --- a/src/tasks/batch_integration/control_methods/perfect_integration/celltype_jitter_embed/config.vsh.yaml +++ /dev/null @@ -1,29 +0,0 @@ -# use method api spec -__merge__: ../../../api/comp_control_method_embedding.yaml -functionality: - name: celltype_jitter_embed - namespace: batch_integration/control_methods/perfect_integration - info: - label: Perfect embedding by celltype with jitter - summary: "Cells are embedded as a one-hot encoding of celltype labels, with a small amount of random noise added to the embedding" - description: "Cells are embedded as a one-hot encoding of celltype labels, with a small amount of random noise added to the embedding" - v1: - path: openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: log_cp10k - arguments: - - name: "--jitter" 
- type: double - default: 0.01 - resources: - - type: python_script - path: script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py - - path: ../../utils.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [midtime, lowmem, lowcpu] \ No newline at end of file diff --git a/src/tasks/batch_integration/control_methods/perfect_integration/celltype_jitter_embed/script.py b/src/tasks/batch_integration/control_methods/perfect_integration/celltype_jitter_embed/script.py deleted file mode 100644 index 8f88f77472..0000000000 --- a/src/tasks/batch_integration/control_methods/perfect_integration/celltype_jitter_embed/script.py +++ /dev/null @@ -1,38 +0,0 @@ -import anndata as ad -import sys - -## VIASH START - -par = { - 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', - 'output': 'output.h5ad', - 'jitter': 0.01, -} - -meta = { - 'functionality': 'foo', - 'config': 'bar' -} - -## VIASH END -sys.path.append(meta["resources_dir"]) -from utils import _perfect_embedding -from read_anndata_partial import read_anndata - - -print('Read input', flush=True) -adata = read_anndata( - par['input'], - obs='obs', - uns='uns' -) - -print('Process data...', flush=True) -adata.obsm["X_emb"] = _perfect_embedding( - partition=adata.obs["label"], - jitter=par["jitter"] -) - -print("Store outputs", flush=True) -adata.uns['method_id'] = meta['functionality_name'] -adata.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/control_methods/random_integration/batch_embed/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/batch_embed/config.vsh.yaml deleted file mode 100644 index d8bcee01d4..0000000000 --- a/src/tasks/batch_integration/control_methods/random_integration/batch_embed/config.vsh.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# use method api spec -__merge__: ../../../api/comp_control_method_embedding.yaml -functionality: - name: batch_embed - namespace: batch_integration/control_methods/random_integration - info: - label: Random integration by batch - summary: "Embedding coordinates are randomly permuted within each batch" - description: "Embedding coordinates are randomly permuted within each batch" - v1: - path: openproblems/tasks/_batch_integration/_common/methods/baseline.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: log_cp10k - resources: - - type: python_script - path: script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py - - path: ../../utils.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/random_integration/batch_embed/script.py b/src/tasks/batch_integration/control_methods/random_integration/batch_embed/script.py deleted file mode 100644 index 175a449a49..0000000000 --- a/src/tasks/batch_integration/control_methods/random_integration/batch_embed/script.py +++ /dev/null @@ -1,40 +0,0 @@ -import sys -import scanpy as sc - -## VIASH START - -par = { - 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', - 'output': 'output.h5ad', -} - -meta = { - 'functionality': 'foo', - 'config': 'bar', - "resources_dir": "src/tasks/batch_integration/control_methods/" -} - -## VIASH END - -# add helper scripts to path -sys.path.append(meta["resources_dir"]) -from utils import _randomize_features -from 
read_anndata_partial import read_anndata - -print('Read input', flush=True) -adata = read_anndata( - par['input'], - obs='obs', - obsm='obsm', - uns='uns' -) - -print("process dataset", flush=True) -adata.obsm["X_emb"] = _randomize_features( - adata.obsm["X_pca"], - partition=adata.obs["batch"], -) - -print("Store outputs", flush=True) -adata.uns['method_id'] = meta['functionality_name'] -adata.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/control_methods/random_integration/batch_feature/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/batch_feature/config.vsh.yaml deleted file mode 100644 index 5f98284bb9..0000000000 --- a/src/tasks/batch_integration/control_methods/random_integration/batch_feature/config.vsh.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# use method api spec -__merge__: ../../../api/comp_control_method_feature.yaml -functionality: - name: batch_feature - namespace: batch_integration/control_methods/random_integration - info: - label: Random integration by batch - summary: "Feature values are randomly permuted within each batch" - description: "Feature values are randomly permuted within each batch" - v1: - path: openproblems/tasks/_batch_integration/_common/methods/baseline.py - commit: acf5c95a7306b819c4a13972783433d0a48f769b - preferred_normalization: log_cp10k - resources: - - type: python_script - path: script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py - - path: ../../utils.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [ "midtime", "lowmem", "lowcpu"] \ No newline at end of file diff --git a/src/tasks/batch_integration/control_methods/random_integration/batch_feature/script.py b/src/tasks/batch_integration/control_methods/random_integration/batch_feature/script.py deleted file mode 100644 index 630871e780..0000000000 --- a/src/tasks/batch_integration/control_methods/random_integration/batch_feature/script.py +++ /dev/null @@ -1,41 +0,0 @@ -import anndata as ad -import sys - - -## VIASH START - -par = { - 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', - 'output': 'output.h5ad' -} - -meta = { - 'functionality_name': 'foo', - 'config': 'bar', -} - -## VIASH END - -# add helper scripts to path -sys.path.append(meta["resources_dir"]) -from utils import _randomize_features -from read_anndata_partial import read_anndata - - -print('Read input', flush=True) -adata = read_anndata( - par['input'], - X='layers/normalized', - obs='obs', - var='var', - uns='uns' -) - -adata.layers['corrected_counts'] = _randomize_features( - adata.X, - partition=adata.obs["batch"], -) - -print("Store outputs", flush=True) -adata.uns['method_id'] = meta['functionality_name'] -adata.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/control_methods/random_integration/batch_graph/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/batch_graph/config.vsh.yaml deleted file mode 100644 index 72a12c5031..0000000000 --- a/src/tasks/batch_integration/control_methods/random_integration/batch_graph/config.vsh.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# use method api spec -__merge__: ../../../api/comp_control_method_graph.yaml -functionality: - name: batch_graph - namespace: batch_integration/control_methods/random_integration - info: - label: Random integration - summary: "Graph connectivity values are randomly permuted within each batch" - 
description: "Graph connectivity values are randomly permuted within each batch" - v1: - path: openproblems/tasks/_batch_integration/_common/methods/baseline.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: log_cp10k - resources: - - type: python_script - path: script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py - - path: ../../utils.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/random_integration/batch_graph/script.py b/src/tasks/batch_integration/control_methods/random_integration/batch_graph/script.py deleted file mode 100644 index d5c20aa185..0000000000 --- a/src/tasks/batch_integration/control_methods/random_integration/batch_graph/script.py +++ /dev/null @@ -1,41 +0,0 @@ -import anndata as ad -import sys - -## VIASH START - -par = { - 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', - 'output': 'output.h5ad' -} - -meta = { - 'functionality_name': 'foo', - 'config': 'bar', -} - -## VIASH END - -# add helper scripts to path -sys.path.append(meta["resources_dir"]) -from utils import _randomize_graph -from read_anndata_partial import read_anndata - - -print('Read input', flush=True) -adata = read_anndata( - par['input'], - obs='obs', - obsp='obsp', - uns='uns' -) - -print('Randomize graph...', flush=True) -adata = _randomize_graph( - adata, - neighbors_key="knn", - partition=adata.obs["batch"], -) - -print("Store outputs", flush=True) -adata.uns['method_id'] = meta['functionality_name'] -adata.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/control_methods/random_integration/celltype_embed/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/celltype_embed/config.vsh.yaml deleted file mode 100644 index b4457498c9..0000000000 --- a/src/tasks/batch_integration/control_methods/random_integration/celltype_embed/config.vsh.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# use method api spec -__merge__: ../../../api/comp_control_method_embedding.yaml -functionality: - name: celltype_embed - namespace: batch_integration/control_methods/random_integration - info: - label: Random embedding by cell type - summary: "Embedding coordinates are randomized within celltype labels" - description: "Embedding coordinates are randomized within celltype labels" - v1: - path: openproblems/tasks/_batch_integration/_common/methods/baseline.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: log_cp10k - resources: - - type: python_script - path: script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py - - path: ../../utils.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/random_integration/celltype_embed/script.py b/src/tasks/batch_integration/control_methods/random_integration/celltype_embed/script.py deleted file mode 100644 index bf26568079..0000000000 --- a/src/tasks/batch_integration/control_methods/random_integration/celltype_embed/script.py +++ /dev/null @@ -1,38 +0,0 @@ -import anndata as ad -import sys - -## VIASH START - -par = { - 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', - 'output': 'output.h5ad', -} - -meta = { - 'functionality': 'foo', - 
'config': 'bar' -} - -## VIASH END -sys.path.append(meta["resources_dir"]) -from utils import _randomize_features -from read_anndata_partial import read_anndata - - -print('Read input', flush=True) -adata = read_anndata( - par['input'], - obs='obs', - obsm='obsm', - uns='uns' -) - -print('Process data...', flush=True) -adata.obsm["X_emb"] = _randomize_features( - adata.obsm["X_pca"], - partition=adata.obs["label"] -) - -print("Store outputs", flush=True) -adata.uns['method_id'] = meta['functionality_name'] -adata.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/control_methods/random_integration/celltype_feature/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/celltype_feature/config.vsh.yaml deleted file mode 100644 index 7c483739c2..0000000000 --- a/src/tasks/batch_integration/control_methods/random_integration/celltype_feature/config.vsh.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# use method api spec -__merge__: ../../../api/comp_control_method_feature.yaml -functionality: - name: celltype_feature - namespace: batch_integration/control_methods/random_integration - info: - label: Random feature by cell type - summary: "Features are randomized within celltype labels" - description: "Features are randomized within celltype labels" - v1: - path: openproblems/tasks/_batch_integration/_common/methods/baseline.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: log_cp10k - resources: - - type: python_script - path: script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py - - path: ../../utils.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/random_integration/celltype_feature/script.py b/src/tasks/batch_integration/control_methods/random_integration/celltype_feature/script.py deleted file mode 100644 index 9f1302df0d..0000000000 --- a/src/tasks/batch_integration/control_methods/random_integration/celltype_feature/script.py +++ /dev/null @@ -1,42 +0,0 @@ -import sys -import scanpy as sc - -## VIASH START - -par = { - 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', - 'output': 'output.h5ad', -} - -meta = { - 'functionality': 'foo', - 'config': 'bar', - "resources_dir": "src/tasks/batch_integration/control_methods/" -} - -## VIASH END - -# add helper scripts to path -sys.path.append(meta["resources_dir"]) -from utils import _randomize_features -from read_anndata_partial import read_anndata - - -print('Read input', flush=True) -adata = read_anndata( - par['input'], - X='layers/normalized', - obs='obs', - var='var', - uns='uns' -) - -print("Process data...", flush=True) -adata.layers['corrected_counts'] = _randomize_features( - adata.X, - partition=adata.obs["label"] -) - -print("Store outputs", flush=True) -adata.uns['method_id'] = meta['functionality_name'] -adata.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/control_methods/random_integration/celltype_graph/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/celltype_graph/config.vsh.yaml deleted file mode 100644 index 6015185616..0000000000 --- a/src/tasks/batch_integration/control_methods/random_integration/celltype_graph/config.vsh.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# use method api spec -__merge__: ../../../api/comp_control_method_graph.yaml -functionality: - name: 
celltype_graph - namespace: batch_integration/control_methods/random_integration - info: - label: Random graph by cell type - summary: "Graph connectivities are randomized within celltype labels" - description: "Graph connectivities are randomized within celltype labels" - v1: - path: openproblems/tasks/_batch_integration/_common/methods/baseline.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: log_cp10k - resources: - - type: python_script - path: script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py - - path: ../../utils.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/random_integration/celltype_graph/script.py b/src/tasks/batch_integration/control_methods/random_integration/celltype_graph/script.py deleted file mode 100644 index 3634d55dbd..0000000000 --- a/src/tasks/batch_integration/control_methods/random_integration/celltype_graph/script.py +++ /dev/null @@ -1,41 +0,0 @@ -import sys -import scanpy as sc - -## VIASH START - -par = { - 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', - 'output': 'output.h5ad', -} - -meta = { - 'functionality': 'foo', - 'config': 'bar', - "resources_dir": "src/tasks/batch_integration/control_methods/" -} - -## VIASH END - -# add helper scripts to path -sys.path.append(meta["resources_dir"]) -from utils import _randomize_graph -from read_anndata_partial import read_anndata - -print('Read input', flush=True) -adata = read_anndata( - par['input'], - obs='obs', - obsp='obsp', - uns='uns' -) - -print("Process data...", flush=True) -adata = _randomize_graph( - adata, - neighbors_key="knn", - partition=adata.obs["label"], -) - -print("Store outputs", flush=True) -adata.uns['method_id'] = meta['functionality_name'] -adata.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/control_methods/random_integration/global_embed/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/global_embed/config.vsh.yaml deleted file mode 100644 index 0343c37817..0000000000 --- a/src/tasks/batch_integration/control_methods/random_integration/global_embed/config.vsh.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# use method api spec -__merge__: ../../../api/comp_control_method_embedding.yaml -functionality: - name: global_embed - namespace: batch_integration/control_methods/random_integration - info: - label: Random integration - summary: "Embedding coordinates are randomly permuted" - description: "Embedding coordinates are randomly permuted" - v1: - path: openproblems/tasks/_batch_integration/_common/methods/baseline.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: log_cp10k - resources: - - type: python_script - path: script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py - - path: ../../utils.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/random_integration/global_embed/script.py b/src/tasks/batch_integration/control_methods/random_integration/global_embed/script.py deleted file mode 100644 index ca626600b8..0000000000 --- a/src/tasks/batch_integration/control_methods/random_integration/global_embed/script.py +++ /dev/null @@ -1,37 +0,0 @@ -import sys 
-import scanpy as sc - -## VIASH START - -par = { - 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', - 'output': 'output.h5ad', -} - -meta = { - 'functionality': 'foo', - 'config': 'bar', - "resources_dir": "src/tasks/batch_integration/control_methods/" -} - -## VIASH END - -# add helper scripts to path -sys.path.append(meta["resources_dir"]) -from utils import _randomize_features -from read_anndata_partial import read_anndata - -print('Read input', flush=True) -adata = read_anndata( - par['input'], - obs='obs', - obsm='obsm', - uns='uns' -) - -print("process dataset", flush=True) -adata.obsm["X_emb"] = _randomize_features(adata.obsm["X_pca"]) - -print("Store outputs", flush=True) -adata.uns['method_id'] = meta['functionality_name'] -adata.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/control_methods/random_integration/global_feature/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/global_feature/config.vsh.yaml deleted file mode 100644 index f49ee146a1..0000000000 --- a/src/tasks/batch_integration/control_methods/random_integration/global_feature/config.vsh.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# use method api spec -__merge__: ../../../api/comp_control_method_feature.yaml -functionality: - name: global_feature - namespace: batch_integration/control_methods/random_integration - info: - label: Random integration - summary: "Feature values are randomly permuted" - description: "Feature values are randomly permuted" - v1: - path: openproblems/tasks/_batch_integration/_common/methods/baseline.py - commit: acf5c95a7306b819c4a13972783433d0a48f769b - preferred_normalization: log_cp10k - resources: - - type: python_script - path: script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py - - path: ../../utils.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [ "midtime", "lowmem", "lowcpu"] \ No newline at end of file diff --git a/src/tasks/batch_integration/control_methods/random_integration/global_feature/script.py b/src/tasks/batch_integration/control_methods/random_integration/global_feature/script.py deleted file mode 100644 index c74c7d2a5e..0000000000 --- a/src/tasks/batch_integration/control_methods/random_integration/global_feature/script.py +++ /dev/null @@ -1,37 +0,0 @@ -import anndata as ad -import sys - - -## VIASH START - -par = { - 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', - 'output': 'output.h5ad' -} - -meta = { - 'functionality_name': 'foo', - 'config': 'bar', -} - -## VIASH END - -# add helper scripts to path -sys.path.append(meta["resources_dir"]) -from utils import _randomize_features -from read_anndata_partial import read_anndata - -print('Read input', flush=True) -adata = read_anndata( - par['input'], - X='layers/normalized', - obs='obs', - var='var', - uns='uns' -) - -adata.layers['corrected_counts'] = _randomize_features(adata.X) - -print("Store outputs", flush=True) -adata.uns['method_id'] = meta['functionality_name'] -adata.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/control_methods/random_integration/global_graph/config.vsh.yaml b/src/tasks/batch_integration/control_methods/random_integration/global_graph/config.vsh.yaml deleted file mode 100644 index 1b92cbc70a..0000000000 --- a/src/tasks/batch_integration/control_methods/random_integration/global_graph/config.vsh.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# use 
method api spec -__merge__: ../../../api/comp_control_method_graph.yaml -functionality: - name: global_graph - namespace: batch_integration/control_methods/random_integration - info: - label: Random integration - summary: "Graph connectivity values are randomly permuted" - description: "Graph connectivity values are randomly permuted" - v1: - path: openproblems/tasks/_batch_integration/_common/methods/baseline.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: log_cp10k - resources: - - type: python_script - path: script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py - - path: ../../utils.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [ "midtime", "lowmem", "lowcpu"] diff --git a/src/tasks/batch_integration/control_methods/random_integration/global_graph/script.py b/src/tasks/batch_integration/control_methods/random_integration/global_graph/script.py deleted file mode 100644 index cd4d64f043..0000000000 --- a/src/tasks/batch_integration/control_methods/random_integration/global_graph/script.py +++ /dev/null @@ -1,37 +0,0 @@ -import anndata as ad -import sys - -## VIASH START - -par = { - 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', - 'output': 'output.h5ad' -} - -meta = { - 'functionality_name': 'foo', - 'config': 'bar', -} - -## VIASH END - -# add helper scripts to path -sys.path.append(meta["resources_dir"]) -from utils import _randomize_graph -from read_anndata_partial import read_anndata - - -print('Read input', flush=True) -adata = read_anndata( - par['input'], - obs='obs', - obsp='obsp', - uns='uns' -) - -print('Randomize graph...', flush=True) -adata = _randomize_graph(adata, neighbors_key="knn") - -print("Store outputs", flush=True) -adata.uns['method_id'] = meta['functionality_name'] -adata.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/control_methods/utils.py b/src/tasks/batch_integration/control_methods/utils.py deleted file mode 100644 index 954e24af26..0000000000 --- a/src/tasks/batch_integration/control_methods/utils.py +++ /dev/null @@ -1,56 +0,0 @@ -import numpy as np - - -def _set_uns(adata, neighbors_key): - adata.uns["neighbors"] = adata.uns[neighbors_key] - adata.uns["neighbors"]["connectivities_key"] = "connectivities" - adata.uns["neighbors"]["distances_key"] = "distances" - - -def _randomize_features(X, partition=None): - """ - Taken and adapted from opsca-v1: - https://github.com/openproblems-bio/openproblems/blob/acf5c95a7306b819c4a13972783433d0a48f769b/openproblems/tasks/_batch_integration/_common/methods/baseline.py#L13 - """ - X_out = X.copy() - if partition is None: - partition = np.full(X.shape[0], 0) - else: - partition = np.asarray(partition) - for partition_name in np.unique(partition): - partition_idx = np.argwhere(partition == partition_name).flatten() - X_out[partition_idx] = X[np.random.permutation(partition_idx)] - return X_out - - -def _randomize_graph(adata, partition=None, neighbors_key="neighbors"): - """ - Taken and adapted from opsca-v1: - https://github.com/openproblems-bio/openproblems/blob/acf5c95a7306b819c4a13972783433d0a48f769b/openproblems/tasks/_batch_integration/_common/methods/baseline.py#L25 - """ - knn_map = adata.uns[neighbors_key] - distances, connectivities = ( - adata.obsp[knn_map["distances_key"]], - adata.obsp[knn_map["connectivities_key"]], - ) - new_idx = _randomize_features(np.arange(distances.shape[0]), partition=partition) 
- adata.obsp["distances"] = distances[new_idx][:, new_idx] - adata.obsp["connectivities"] = connectivities[new_idx][:, new_idx] - _set_uns(adata, neighbors_key) - return adata - - -def _perfect_embedding(partition, jitter=0.01): - """ - Taken and adapted from opsca-v1: - https://github.com/openproblems-bio/openproblems/blob/acf5c95a7306b819c4a13972783433d0a48f769b/openproblems/tasks/_batch_integration/_common/methods/baseline.py#L37 - """ - from sklearn.preprocessing import LabelEncoder - from sklearn.preprocessing import OneHotEncoder - - embedding = OneHotEncoder().fit_transform( - LabelEncoder().fit_transform(partition)[:, None] - ) - if jitter is not None: - embedding = embedding + np.random.uniform(-1 * jitter, jitter, embedding.shape) - return np.asarray(embedding) diff --git a/src/tasks/batch_integration/methods/bbknn/config.vsh.yaml b/src/tasks/batch_integration/methods/bbknn/config.vsh.yaml deleted file mode 100644 index 8eff37339f..0000000000 --- a/src/tasks/batch_integration/methods/bbknn/config.vsh.yaml +++ /dev/null @@ -1,51 +0,0 @@ -# use method api spec -__merge__: ../../api/comp_method_graph.yaml -functionality: - name: bbknn - info: - label: BBKNN - summary: "BBKNN creates k nearest neighbours graph by identifying neighbours within batches, then combining and processing them with UMAP for visualization." - description: | - "BBKNN or batch balanced k nearest neighbours graph is built for each cell by - identifying its k nearest neighbours within each defined batch separately, - creating independent neighbour sets for each cell in each batch. These sets - are then combined and processed with the UMAP algorithm for visualisation." - reference: "polanski2020bbknn" - repository_url: "https://github.com/Teichlab/bbknn" - documentation_url: "https://github.com/Teichlab/bbknn#readme" - v1: - path: openproblems/tasks/_batch_integration/batch_integration_graph/methods/bbknn.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: log_cp10k - variants: - bbknn_full_unscaled: - bbknn_full_scaled: - preferred_normalization: log_cp10k_scaled - arguments: - - name: --annoy_n_trees - type: integer - default: 10 - description: Number of trees to use in the annoy forrest. - - name: --neighbors_within_batch - type: integer - default: 3 - description: Number of neighbors to report within each batch. - - name: --n_hvg - type: integer - default: 2000 - description: Number of highly variable genes to use. 
- resources: - - type: python_script - path: script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - pypi: - - bbknn - - type: nextflow - directives: - label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/methods/bbknn/script.py b/src/tasks/batch_integration/methods/bbknn/script.py deleted file mode 100644 index 1496fda0bb..0000000000 --- a/src/tasks/batch_integration/methods/bbknn/script.py +++ /dev/null @@ -1,63 +0,0 @@ -import sys -import anndata as ad -import scanpy as sc -import bbknn - -## VIASH START -par = { - 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', - 'output': 'output.h5ad', - 'annoy_n_trees': 10, - 'neighbors_within_batch': 3, - 'n_hvg': 2000, -} -meta = { - 'functionality_name': 'foo', - 'config': 'bar' -} -## VIASH END - -sys.path.append(meta["resources_dir"]) -from read_anndata_partial import read_anndata - - -print('Read input', flush=True) -adata = read_anndata( - par['input'], - X='layers/normalized', - obs='obs', - var='var', - uns='uns' -) - -if par['n_hvg']: - print(f"Select top {par['n_hvg']} high variable genes", flush=True) - idx = adata.var['hvg_score'].to_numpy().argsort()[::-1][:par['n_hvg']] - adata = adata[:, idx].copy() - sc.pp.pca(adata) - -print('Run BBKNN', flush=True) -kwargs = dict(batch_key='batch', copy=True) -kwargs['annoy_n_trees'] = par['annoy_n_trees'] -kwargs['neighbors_within_batch'] = par['neighbors_within_batch'] - -ad_bbknn = bbknn.bbknn(adata, **kwargs) - -print("Store output", flush=True) -output = ad.AnnData( - obs=adata.obs[[]], - var=adata.var[[]], - obsp={ - 'connectivities': ad_bbknn.obsp['connectivities'], - 'distances': ad_bbknn.obsp['distances'], - }, - uns={ - 'dataset_id': adata.uns['dataset_id'], - 'normalization_id': adata.uns['normalization_id'], - 'method_id': meta['functionality_name'], - 'neighbors': ad_bbknn.uns['neighbors'] - } -) - -print("Store outputs", flush=True) -output.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/methods/combat/config.vsh.yaml b/src/tasks/batch_integration/methods/combat/config.vsh.yaml deleted file mode 100644 index f94333627d..0000000000 --- a/src/tasks/batch_integration/methods/combat/config.vsh.yaml +++ /dev/null @@ -1,42 +0,0 @@ -# use method api spec -__merge__: ../../api/comp_method_feature.yaml -functionality: - name: combat - info: - label: Combat - summary: "Adjusting batch effects in microarray expression data using - empirical Bayes methods" - description: | - "An Empirical Bayes (EB) approach to correct for batch effects. It - estimates batch-specific parameters by pooling information across genes in - each batch and shrinks the estimates towards the overall mean of the batch - effect estimates across all genes. These parameters are then used to adjust - the data for batch effects, leading to more accurate and reproducible - results." 
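The ComBat component described above follows the same pattern as most feature-level methods in this task: optionally subset to the top `--n_hvg` genes ranked by the precomputed `hvg_score`, then correct the normalized matrix. A minimal sketch of those two steps, assuming an AnnData whose `X` holds normalized values, with `var['hvg_score']` and `obs['batch']` as in the deleted scripts:

```python
import scanpy as sc
from scipy.sparse import csr_matrix

def run_combat(adata, n_hvg=2000):
    """Subset to the top highly variable genes and batch-correct with ComBat,
    following the same steps as the deleted combat component."""
    if n_hvg:
        # rank genes by the precomputed hvg_score and keep the top n_hvg
        idx = adata.var["hvg_score"].to_numpy().argsort()[::-1][:n_hvg]
        adata = adata[:, idx].copy()
    # scanpy's ComBat; with inplace=False the corrected matrix is returned
    corrected = sc.pp.combat(adata, key="batch", inplace=False)
    adata.layers["corrected_counts"] = csr_matrix(corrected)
    return adata
```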
- reference: "hansen2012removing" - repository_url: "https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.combat.html" - documentation_url: "https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.combat.html" - v1: - path: openproblems/tasks/_batch_integration/batch_integration_graph/methods/combat.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: log_cp10k - variants: - combat_full_unscaled: - combat_full_scaled: - preferred_normalization: log_cp10k_scaled - arguments: - - name: --n_hvg - type: integer - default: 2000 - description: Number of highly variable genes to use. - resources: - - type: python_script - path: script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [midtime, highmem, lowcpu] diff --git a/src/tasks/batch_integration/methods/combat/script.py b/src/tasks/batch_integration/methods/combat/script.py deleted file mode 100644 index 9f282efb9c..0000000000 --- a/src/tasks/batch_integration/methods/combat/script.py +++ /dev/null @@ -1,57 +0,0 @@ -import sys -import scanpy as sc -from scipy.sparse import csr_matrix - -## VIASH START -par = { - 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', - 'output': 'output.h5ad', - 'n_hvg': 2000, -} - -meta = { - 'functionality_name': 'foo', - 'config': 'bar' -} - -## VIASH END - -sys.path.append(meta["resources_dir"]) -from read_anndata_partial import read_anndata - - -print('Read input', flush=True) -adata = read_anndata( - par['input'], - X='layers/normalized', - obs='obs', - var='var', - uns='uns' -) - -if par['n_hvg']: - print(f"Select top {par['n_hvg']} high variable genes", flush=True) - idx = adata.var['hvg_score'].to_numpy().argsort()[::-1][:par['n_hvg']] - adata = adata[:, idx].copy() - - -print('Run Combat', flush=True) -adata.X = sc.pp.combat(adata, key='batch', inplace=False) - - -print("Store output", flush=True) -output = sc.AnnData( - obs=adata.obs[[]], - var=adata.var[[]], - uns={ - 'dataset_id': adata.uns['dataset_id'], - 'normalization_id': adata.uns['normalization_id'], - 'method_id': meta['functionality_name'], - }, - layers={ - 'corrected_counts': csr_matrix(adata.X), - } -) - -print("Store outputs", flush=True) -output.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/methods/fastmnn_embedding/config.vsh.yaml b/src/tasks/batch_integration/methods/fastmnn_embedding/config.vsh.yaml deleted file mode 100644 index cd885da3cd..0000000000 --- a/src/tasks/batch_integration/methods/fastmnn_embedding/config.vsh.yaml +++ /dev/null @@ -1,36 +0,0 @@ -# use method api spec -__merge__: ../../api/comp_method_embedding.yaml -functionality: - name: fastmnn_embedding - info: - label: fastMnn (embedding) - summary: "A simpler version of the original mnnCorrect algorithm." - description: | - The fastMNN() approach is much simpler than the original mnnCorrect() algorithm, and proceeds in several steps. - - 1. Perform a multi-sample PCA on the (cosine-)normalized expression values to reduce dimensionality. - 2. Identify MNN pairs in the low-dimensional space between a reference batch and a target batch. - 3. Remove variation along the average batch vector in both reference and target batches. - 4. Correct the cells in the target batch towards the reference, using locally weighted correction vectors. - 5. Merge the corrected target batch with the reference, and repeat with the next target batch. 
- - reference: "haghverdi2018batch" - repository_url: "https://code.bioconductor.org/browse/batchelor/" - documentation_url: "https://bioconductor.org/packages/batchelor/" - preferred_normalization: log_cp10k - v1: - path: openproblems/tasks/_batch_integration/batch_integration_graph/methods/fastmnn.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - resources: - - type: r_script - path: ../fastmnn_feature/script.R -platforms: - - type: docker - image: openproblems/base_r:1.0.0 - setup: - - type: r - bioc: - - batchelor - - type: nextflow - directives: - label: [midtime, lowcpu, highmem] diff --git a/src/tasks/batch_integration/methods/fastmnn_feature/config.vsh.yaml b/src/tasks/batch_integration/methods/fastmnn_feature/config.vsh.yaml deleted file mode 100644 index e28406eb54..0000000000 --- a/src/tasks/batch_integration/methods/fastmnn_feature/config.vsh.yaml +++ /dev/null @@ -1,34 +0,0 @@ -__merge__: ../../api/comp_method_feature.yaml -functionality: - name: fastmnn_feature - info: - label: fastMnn (feature) - summary: "A simpler version of the original mnnCorrect algorithm." - description: | - The fastMNN() approach is much simpler than the original mnnCorrect() algorithm, and proceeds in several steps. - - 1. Perform a multi-sample PCA on the (cosine-)normalized expression values to reduce dimensionality. - 2. Identify MNN pairs in the low-dimensional space between a reference batch and a target batch. - 3. Remove variation along the average batch vector in both reference and target batches. - 4. Correct the cells in the target batch towards the reference, using locally weighted correction vectors. - 5. Merge the corrected target batch with the reference, and repeat with the next target batch. - - reference: "haghverdi2018batch" - repository_url: "https://code.bioconductor.org/browse/batchelor/" - documentation_url: "https://bioconductor.org/packages/batchelor/" - preferred_normalization: log_cp10k - v1: - path: openproblems/tasks/_batch_integration/batch_integration_graph/methods/fastmnn.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - resources: - - type: r_script - path: script.R -platforms: - - type: docker - image: openproblems/base_r:1.0.0 - setup: - - type: r - bioc: batchelor - - type: nextflow - directives: - label: [midtime, lowcpu, highmem] diff --git a/src/tasks/batch_integration/methods/fastmnn_feature/script.R b/src/tasks/batch_integration/methods/fastmnn_feature/script.R deleted file mode 100644 index dbccd52d29..0000000000 --- a/src/tasks/batch_integration/methods/fastmnn_feature/script.R +++ /dev/null @@ -1,51 +0,0 @@ -cat("Loading dependencies\n") -suppressPackageStartupMessages({ - requireNamespace("anndata", quietly = TRUE) - library(Matrix, warn.conflicts = FALSE) - requireNamespace("batchelor", quietly = TRUE) - library(SingleCellExperiment, warn.conflicts = FALSE) -}) -## VIASH START -par <- list( - input = 'resources_test/batch_integration/pancreas/unintegrated.h5ad', - output = 'output.h5ad' -) -meta <- list( - functionality_name = "mnn_correct_feature" -) -## VIASH END - -cat("Read input\n") -adata <- anndata::read_h5ad(par$input) - -# TODO: pass output of 'multiBatchNorm' to fastMNN - -cat("Run mnn\n") -out <- suppressWarnings(batchelor::fastMNN( - t(adata$layers[["normalized"]]), - batch = adata$obs[["batch"]] -)) - -cat("Reformat output\n") -# reusing the same script for fastmnn_embed and fastmnn_feature -return_type <- gsub("fastmnn_", "", meta[["functionality_name"]]) - -output <- anndata::AnnData( - shape = adata$shape, - uns = list( - 
dataset_id = adata$uns[["dataset_id"]], - normalization_id = adata$uns[["normalization_id"]], - method_id = meta$functionality_name - ) -) - -if (return_type == "feature") { - layer <- as(SummarizedExperiment::assay(out, "reconstructed"), "sparseMatrix") - output$layers[["corrected_counts"]] <- t(layer) -} else if (return_type == "embedding") { - obsm <- SingleCellExperiment::reducedDim(out, "corrected") - output$obsm[["X_emb"]] <- obsm -} - -cat("Write output to file\n") -zzz <- output$write_h5ad(par$output, compression = "gzip") diff --git a/src/tasks/batch_integration/methods/liger/config.vsh.yaml b/src/tasks/batch_integration/methods/liger/config.vsh.yaml deleted file mode 100644 index 4c638d467b..0000000000 --- a/src/tasks/batch_integration/methods/liger/config.vsh.yaml +++ /dev/null @@ -1,31 +0,0 @@ -# use method api spec -__merge__: ../../api/comp_method_embedding.yaml -functionality: - name: liger - info: - label: LIGER - summary: Linked Inference of Genomic Experimental Relationships - description: | - LIGER or linked inference of genomic experimental relationships uses iNMF - deriving and implementing a novel coordinate descent algorithm to efficiently - do the factorization. Joint clustering is performed and factor loadings are - normalised. - reference: welch2019single - repository_url: https://github.com/welch-lab/liger - documentation_url: https://github.com/welch-lab/liger - preferred_normalization: log_cp10k - resources: - - type: r_script - path: script.R -platforms: - - type: docker - image: openproblems/base_r:1.0.0 - setup: - - type: apt - packages: cmake - - type: r - cran: rliger - github: welch-lab/RcppPlanc - - type: nextflow - directives: - label: [lowcpu, highmem, midtime] diff --git a/src/tasks/batch_integration/methods/liger/script.R b/src/tasks/batch_integration/methods/liger/script.R deleted file mode 100644 index b7159063ff..0000000000 --- a/src/tasks/batch_integration/methods/liger/script.R +++ /dev/null @@ -1,108 +0,0 @@ -cat(">> Load dependencies\n") -requireNamespace("anndata", quietly = TRUE) -requireNamespace("rliger", quietly = TRUE) - -## VIASH START -par <- list( - input = "resources_test/batch_integration/pancreas/dataset.h5ad", - output = "output.h5ad" -) -meta <- list( - functionality_name = "liger" -) -## VIASH END - -cat("Read input\n") -adata <- anndata::read_h5ad(par$input) - -anndataToLiger <- function(adata) { - # fetch batch names - batch <- adata$obs$batch - batch_names <- as.character(unique(batch)) - - # restructure data - raw_data <- lapply(batch_names, function(batch_name) { - Matrix::t(adata$layers[["counts"]][batch == batch_name, , drop = FALSE]) - }) - names(raw_data) <- batch_names - - rliger::createLiger(rawData = raw_data, removeMissing = FALSE) -} - -addNormalizedDataToLiger <- function(adata, lobj) { - norm_data <- lapply(names(rliger::rawData(lobj)), function(name) { - norm <- adata$layers[["normalized"]] - - # subset - col_names <- colnames(rliger::rawData(lobj)[[name]]) - row_names <- rownames(rliger::rawData(lobj)[[name]]) - prefix <- paste0(name, "_") - col_names <- sub(prefix, "", col_names) - - norm <- norm[ - col_names, - row_names, - drop = FALSE - ] - - # add prefix - rownames(norm) <- paste0(prefix, rownames(norm)) - - # transpose - norm <- Matrix::t(norm) - - # turn into dgcMatrix - as(as(norm, "denseMatrix"), "CsparseMatrix") - }) - names(norm_data) <- names(rliger::rawData(lobj)) - - for (name in names(rliger::rawData(lobj))) { - lobj@datasets[[name]]@normData <- norm_data[[name]] - } - - lobj -} - -cat(">> Create 
Liger Data object\n") -lobj <- anndataToLiger(adata) - -cat(">> Normalize data\n") -lobj <- addNormalizedDataToLiger(adata, lobj) - -# could also use the rliger normalization instead -# lobj <- rliger::normalize(lobj) - -cat(">> Select genes\n") -# lobj <- rliger::selectGenes(lobj) -# overwrite gene selection to include all genes -lobj@varFeatures <- adata$var_names - -cat(">> Perform scaling\n") -lobj <- rliger::scaleNotCenter(lobj, removeMissing = FALSE) - -cat(">> Joint Matrix Factorization\n") -lobj <- rliger::runIntegration(lobj, k = 20) - -cat(">> Quantile normalization\n") -lobj <- rliger::quantileNorm(lobj) - -cat(">> Store output\n") -# remove dataset names from rownames -for (name in names(rliger::rawData(lobj))) { - rownames(lobj@H.norm) <- sub(paste0(name, "_"), "", rownames(lobj@H.norm)) -} - -output <- anndata::AnnData( - uns = list( - dataset_id = adata$uns[["dataset_id"]], - normalization_id = adata$uns[["normalization_id"]], - method_id = meta$functionality_name - ), - obsm = list( - X_emb = lobj@H.norm[rownames(adata), , drop = FALSE] - ), - shape = adata$shape -) - -cat(">> Write AnnData to file\n") -zzz <- output$write_h5ad(par$output, compression = "gzip") diff --git a/src/tasks/batch_integration/methods/mnn_correct/config.vsh.yaml b/src/tasks/batch_integration/methods/mnn_correct/config.vsh.yaml deleted file mode 100644 index 1c999fa540..0000000000 --- a/src/tasks/batch_integration/methods/mnn_correct/config.vsh.yaml +++ /dev/null @@ -1,27 +0,0 @@ -# use method api spec -__merge__: ../../api/comp_method_feature.yaml -functionality: - name: mnn_correct - info: - label: mnnCorrect - summary: "Correct for batch effects in single-cell expression data using the mutual nearest neighbors method." - description: | - We present a strategy for batch correction based on the detection of mutual nearest neighbors (MNNs) in the high-dimensional expression space. - Our approach does not rely on predefined or equal population compositions across batches; instead, it requires only that a subset of the population be shared between batches. 
- reference: "haghverdi2018batch" - repository_url: "https://code.bioconductor.org/browse/batchelor/" - documentation_url: "https://bioconductor.org/packages/batchelor/" - preferred_normalization: log_cp10k - resources: - - type: r_script - path: script.R -platforms: - - type: docker - image: openproblems/base_r:1.0.0 - setup: - - type: r - bioc: - - batchelor - - type: nextflow - directives: - label: [midtime, lowcpu, highmem] diff --git a/src/tasks/batch_integration/methods/mnn_correct/script.R b/src/tasks/batch_integration/methods/mnn_correct/script.R deleted file mode 100644 index 0e6dfa2606..0000000000 --- a/src/tasks/batch_integration/methods/mnn_correct/script.R +++ /dev/null @@ -1,47 +0,0 @@ -cat("Loading dependencies\n") -suppressPackageStartupMessages({ - requireNamespace("anndata", quietly = TRUE) - library(Matrix, warn.conflicts = FALSE) - requireNamespace("batchelor", quietly = TRUE) - library(SingleCellExperiment, warn.conflicts = FALSE) -}) -## VIASH START -par <- list( - input = 'resources_test/batch_integration/pancreas/dataset.h5ad', - output = 'output.h5ad' -) -meta <- list( - functionality_name = "mnn_correct_feature" -) -## VIASH END - -cat("Read input\n") -adata <- anndata::read_h5ad(par$input) - -cat("Run mnn\n") -out <- suppressWarnings(batchelor::mnnCorrect( - t(adata$layers[["normalized"]]), - batch = adata$obs[["batch"]] -)) - -cat("Reformat output\n") -layer <- SummarizedExperiment::assay(out, "corrected") -as(t(layer), "sparseMatrix") - - - -cat("Store outputs\n") -output <- anndata::AnnData( - uns = list( - dataset_id = adata$uns[["dataset_id"]], - normalization_id = adata$uns[["normalization_id"]], - method_id = meta$functionality_name - ), - layers = list( - corrected_counts = as(t(layer), "sparseMatrix") - ), - shape = adata$shape -) - -cat("Write output to file\n") -zzz <- output$write_h5ad(par$output, compression = "gzip") diff --git a/src/tasks/batch_integration/methods/mnnpy/config.vsh.yaml b/src/tasks/batch_integration/methods/mnnpy/config.vsh.yaml deleted file mode 100644 index 2c5075534b..0000000000 --- a/src/tasks/batch_integration/methods/mnnpy/config.vsh.yaml +++ /dev/null @@ -1,52 +0,0 @@ -# use method api spec -__merge__: ../../api/comp_method_feature.yaml -functionality: - name: mnnpy - info: - label: mnnpy - summary: "Batch effect correction by matching mutual nearest neighbors, Python implementation." - description: | - An implementation of MNN correct in python featuring low memory usage, full multicore support and compatibility with the scanpy framework. - - Batch effect correction by matching mutual nearest neighbors (Haghverdi et al, 2018) has been implemented as a function 'mnnCorrect' in the R package scran. Sadly it's extremely slow for big datasets and doesn't make full use of the parallel architecture of modern CPUs. - - This project is a python implementation of the MNN correct algorithm which takes advantage of python's extendability and hackability. It seamlessly integrates with the scanpy framework and has multicore support in its bones. 
- reference: "hie2019efficient" - repository_url: "https://github.com/chriscainx/mnnpy" - documentation_url: "https://github.com/chriscainx/mnnpy#readme" - v1: - path: openproblems/tasks/_batch_integration/batch_integration_graph/methods/mnn.py - commit: 29803b95c88b4ec5921df2eec7111fd5d1a95daf - preferred_normalization: log_cp10k - variants: - mnn_full_unscaled: - mnn_full_scaled: - preferred_normalization: log_cp10k_scaled - arguments: - - name: --n_hvg - type: integer - default: 2000 - description: Number of highly variable genes to use. - resources: - - type: python_script - path: script.py -platforms: - # Due to a [ gcc-8 ] dependency in the mnnpy package, we need to use a python:3.8 image - - type: docker - image: python:3.8 - setup: - - type: apt - packages: - - procps - - type: python - pypi: - - anndata~=0.8.0 - - scanpy - - pyyaml - - requests - - jsonschema - github: - - chriscainx/mnnpy - - type: nextflow - directives: - label: [ midtime, lowcpu, lowmem ] diff --git a/src/tasks/batch_integration/methods/mnnpy/script.py b/src/tasks/batch_integration/methods/mnnpy/script.py deleted file mode 100644 index 1551573650..0000000000 --- a/src/tasks/batch_integration/methods/mnnpy/script.py +++ /dev/null @@ -1,55 +0,0 @@ -import anndata as ad -import mnnpy - -## VIASH START -par = { - 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', - 'output': 'output.h5ad', - 'n_hvg': 2000, -} -meta = { - 'functionality_name': 'foo', - 'config': 'bar' -} -## VIASH END - -print('Read input', flush=True) -adata = ad.read_h5ad(par['input']) -adata.X = adata.layers['normalized'] -del adata.layers['normalized'] -del adata.layers['counts'] - -if par['n_hvg']: - print(f"Select top {par['n_hvg']} high variable genes", flush=True) - idx = adata.var['hvg_score'].to_numpy().argsort()[::-1][:par['n_hvg']] - adata = adata[:, idx].copy() - -print('Run mnn', flush=True) -split = [] -batch_categories = adata.obs['batch'].cat.categories -for i in batch_categories: - split.append(adata[adata.obs['batch'] == i].copy()) -corrected, _, _ = mnnpy.mnn_correct( - *split, - batch_key='batch', - batch_categories=batch_categories, - index_unique=None - ) - -print("Store outputs", flush=True) -output = ad.AnnData( - obs=adata.obs[[]], - var=adata.var[[]], - uns={ - 'dataset_id': adata.uns['dataset_id'], - 'normalization_id': adata.uns['normalization_id'], - 'method_id': meta['functionality_name'], - }, - layers={ - 'corrected_counts': corrected.X, - } -) - - -print("Store outputs", flush=True) -output.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/methods/pyliger/config.vsh.yaml b/src/tasks/batch_integration/methods/pyliger/config.vsh.yaml deleted file mode 100644 index cf16b2e684..0000000000 --- a/src/tasks/batch_integration/methods/pyliger/config.vsh.yaml +++ /dev/null @@ -1,37 +0,0 @@ -# use method api spec -__merge__: ../../api/comp_method_embedding.yaml -functionality: - name: pyliger - info: - label: pyliger - summary: Python implementation of LIGER (Linked Inference of Genomic Experimental Relationships - description: | - LIGER (installed as rliger) is a package for integrating and analyzing multiple - single-cell datasets, developed by the Macosko lab and maintained/extended by the - Welch lab. It relies on integrative non-negative matrix factorization to identify - shared and dataset-specific factors. 
- reference: welch2019single - repository_url: https://github.com/welch-lab/pyliger - documentation_url: https://github.com/welch-lab/pyliger - preferred_normalization: log_cp10k - variants: - liger_unscaled: - liger_scaled: - preferred_normalization: log_cp10k_scaled - resources: - - type: python_script - path: script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - pypi: - - umap-learn[plot] - - pyliger - - dask-expr - - type: nextflow - directives: - label: [lowcpu, highmem, midtime] diff --git a/src/tasks/batch_integration/methods/pyliger/script.py b/src/tasks/batch_integration/methods/pyliger/script.py deleted file mode 100644 index 2066e6965b..0000000000 --- a/src/tasks/batch_integration/methods/pyliger/script.py +++ /dev/null @@ -1,86 +0,0 @@ -import sys -import anndata as ad -import numpy as np -import pyliger - -## VIASH START -par = { - 'input': 'resources_test/batch_integration/pancreas/dataset.h5ad', - 'output': 'output.h5ad' -} -meta = { - 'functionality_name': 'pyliger' -} -## VIASH END - -sys.path.append(meta["resources_dir"]) -from read_anndata_partial import read_anndata - - -print('>> Read input', flush=True) -adata = read_anndata( - par['input'], - X='layers/counts', - obs='obs', - var='var', - uns='uns' -) -adata.layers['norm_data'] = read_anndata(par['input'], X='layers/normalized').X - -print('>> Prepare data', flush=True) -adata_per_batch = [] -for batch in adata.obs['batch'].unique(): - adb = adata[adata.obs['batch'] == batch].copy() - - # save row sum and sum of squares for further use - norm_sum = np.ravel(np.sum(adb.layers["norm_data"], axis=0)) - norm_sum_sq = np.ravel(np.sum(adb.layers["norm_data"].power(2), axis=0)) - adb.var["norm_sum"] = norm_sum - adb.var["norm_sum_sq"] = norm_sum_sq - adb.var["norm_mean"] = norm_sum / adb.shape[0] - - # set more metadata - adb.obs.index.name = 'cell_barcode' - adb.var.index.name = 'gene_id' - adb.uns['sample_name'] = batch - - # append to list - adata_per_batch.append(adb) - -print('Create liger object', flush=True) -lobj = pyliger.create_liger( - adata_per_batch, - remove_missing=False -) - -# do not select genes -lobj.var_genes = adata.var_names - -print('>> Scaling', flush=True) -pyliger.scale_not_center(lobj, remove_missing=False) - -print('>> Optimize ALS', flush=True) -pyliger.optimize_ALS(lobj, k=20) - -print('>> Quantile normalization', flush=True) -pyliger.quantile_norm(lobj) - -print('>> Concatenate outputs', flush=True) -ad_out = ad.concat(lobj.adata_list) - -print('Store output', flush=True) -output = ad.AnnData( - obs=adata.obs[[]], - var=adata.var[[]], - obsm={ - 'X_emb': ad_out[adata.obs_names, :].obsm['H_norm'] - }, - uns={ - 'dataset_id': adata.uns['dataset_id'], - 'normalization_id': adata.uns['normalization_id'], - 'method_id': meta['functionality_name'], - } -) - -print("Write output to file", flush=True) -output.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/methods/scalex_embed/config.vsh.yaml b/src/tasks/batch_integration/methods/scalex_embed/config.vsh.yaml deleted file mode 100644 index 3437df19c9..0000000000 --- a/src/tasks/batch_integration/methods/scalex_embed/config.vsh.yaml +++ /dev/null @@ -1,41 +0,0 @@ -__merge__: ../../api/comp_method_embedding.yaml -functionality: - name: scalex_embed - info: - label: SCALEX (embedding) - summary: Online single-cell data integration through projecting heterogeneous datasets into a 
common cell-embedding space - description : | - SCALEX is a method for integrating heterogeneous single-cell data online using a VAE framework. Its generalised encoder disentangles batch-related components from batch-invariant biological components, which are then projected into a common cell-embedding space. - reference: xiong2021online - repository_url: https://github.com/jsxlei/SCALEX - documentation_url: https://scalex.readthedocs.io - v1: - path: openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: log_cp10k - variants: - scalex_feature_unscaled: - scanorama_feature_scaled: - preferred_normalization: log_cp10k_scaled - arguments: - - name: --n_hvg - type: integer - default: 2000 - description: Number of highly variable genes to use. - resources: - - type: python_script - path: script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - pypi: - - scalex - - numpy<1.24 - - torch<2.1 - - type: nextflow - directives: - label: [lowmem, lowcpu, midtime] diff --git a/src/tasks/batch_integration/methods/scalex_embed/script.py b/src/tasks/batch_integration/methods/scalex_embed/script.py deleted file mode 100644 index 9974eba4b3..0000000000 --- a/src/tasks/batch_integration/methods/scalex_embed/script.py +++ /dev/null @@ -1,70 +0,0 @@ -import sys -import anndata as ad -import scalex - -## VIASH START -par = { - 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', - 'output': 'output.h5ad', - 'hvg': True, -} -meta = { - 'functionality_name' : 'foo', - 'config': 'bar' -} -## VIASH END - -sys.path.append(meta["resources_dir"]) -from read_anndata_partial import read_anndata - - -print('Read input', flush=True) -adata = read_anndata( - par['input'], - X='layers/normalized', - obs='obs', - var='var', - uns='uns' -) - - -if par['n_hvg']: - print(f"Select top {par['n_hvg']} high variable genes", flush=True) - idx = adata.var['hvg_score'].to_numpy().argsort()[::-1][:par['n_hvg']] - adata = adata[:, idx].copy() - -print('Run SCALEX', flush=True) -adata = scalex.SCALEX( - adata, - batch_key="batch", - ignore_umap=True, - impute=adata.obs["batch"].cat.categories[0], - processed=True, - max_iteration=40, - min_features=None, - min_cells=None, - n_top_features=0, - outdir=None, - gpu=0, -) -adata.obsm["X_emb"] = adata.obsm["latent"] - -print("Store outputs", flush=True) -output = ad.AnnData( - obs=adata.obs[[]], - var=adata.var[[]], - layers={ - 'corrected_counts': adata.layers["impute"], - }, - obsm={ - 'X_emb': adata.obsm['latent'], - }, - uns={ - 'dataset_id': adata.uns['dataset_id'], - 'normalization_id': adata.uns['normalization_id'], - 'method_id': meta['functionality_name'], - } -) - -print("Write output to file", flush=True) -output.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/methods/scalex_feature/config.vsh.yaml b/src/tasks/batch_integration/methods/scalex_feature/config.vsh.yaml deleted file mode 100644 index 1874bc190e..0000000000 --- a/src/tasks/batch_integration/methods/scalex_feature/config.vsh.yaml +++ /dev/null @@ -1,41 +0,0 @@ -__merge__: ../../api/comp_method_feature.yaml -functionality: - name: scalex_feature - info: - label: SCALEX (feature) - summary: Online single-cell data integration through projecting heterogeneous datasets into a common cell-embedding space - description : | - SCALEX is a 
method for integrating heterogeneous single-cell data online using a VAE framework. Its generalised encoder disentangles batch-related components from batch-invariant biological components, which are then projected into a common cell-embedding space. - reference: xiong2021online - repository_url: https://github.com/jsxlei/SCALEX - documentation_url: https://scalex.readthedocs.io - v1: - path: openproblems/tasks/_batch_integration/batch_integration_graph/methods/scalex.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: log_cp10k - variants: - scalex_feature_unscaled: - scanorama_feature_scaled: - preferred_normalization: log_cp10k_scaled - arguments: - - name: --n_hvg - type: integer - default: 2000 - description: Number of highly variable genes to use. - resources: - - type: python_script - path: ../scalex_embed/script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - pypi: - - scalex - - numpy<1.24 - - torch<2.1 - - type: nextflow - directives: - label: [lowmem, lowcpu, midtime] diff --git a/src/tasks/batch_integration/methods/scanorama_embed/config.vsh.yaml b/src/tasks/batch_integration/methods/scanorama_embed/config.vsh.yaml deleted file mode 100644 index b5dcd8f54a..0000000000 --- a/src/tasks/batch_integration/methods/scanorama_embed/config.vsh.yaml +++ /dev/null @@ -1,41 +0,0 @@ -# use method api spec -__merge__: ../../api/comp_method_embedding.yaml -functionality: - name: scanorama_embed - info: - label: Scanorama (embedding) - summary: "Efficient integration of heterogeneous single-cell - transcriptomes using Scanorama" - description: | - "Scanorama is an extension of the MNN method. Other then MNN, it finds mutual nearest neighbours over all batches and embeds observations into a joint hyperplane." - reference: "hie2019efficient" - repository_url: "https://github.com/brianhie/scanorama" - documentation_url: "https://github.com/brianhie/scanorama#readme" - v1: - path: openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: log_cp10k - variants: - scanorama_embed_full_unscaled: - scanorama_embed_full_scaled: - preferred_normalization: log_cp10k_scaled - arguments: - - name: --n_hvg - type: integer - default: 2000 - description: Number of highly variable genes to use. 
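Like mnnpy, the Scanorama components split the data by batch before integrating. A condensed sketch of the calls used by the deleted scanorama script, assuming an AnnData with normalized values in `X` and a categorical `batch` column; corrected features end up in `.X` and the joint embedding in `.obsm['X_scanorama']`:

```python
import anndata as ad
import scanorama

def run_scanorama(adata):
    """Split an AnnData by batch and integrate it with Scanorama,
    as in the deleted scanorama components."""
    batch_categories = adata.obs["batch"].cat.categories
    split = [adata[adata.obs["batch"] == b].copy() for b in batch_categories]
    # returns one corrected AnnData per batch, with X_scanorama in obsm
    corrected = scanorama.correct_scanpy(split, return_dimred=True)
    merged = ad.concat(corrected, index_unique=None)
    # restore the original cell order
    return merged[adata.obs_names].copy()
```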
- resources: - - type: python_script - path: script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - pypi: - - scanorama - - type: nextflow - directives: - label: [midtime, midmem, lowcpu] \ No newline at end of file diff --git a/src/tasks/batch_integration/methods/scanorama_embed/script.py b/src/tasks/batch_integration/methods/scanorama_embed/script.py deleted file mode 100644 index db12b458d5..0000000000 --- a/src/tasks/batch_integration/methods/scanorama_embed/script.py +++ /dev/null @@ -1,87 +0,0 @@ -import sys -import anndata as ad -import scanorama - -## VIASH START -par = { - 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', - 'output': 'output.h5ad', - 'n_hvg': 2000, -} -meta = { - 'functionality_name': 'foo', - 'config': 'bar' -} -## VIASH END - -sys.path.append(meta["resources_dir"]) -from read_anndata_partial import read_anndata - - -# based on scib -# -> https://github.com/theislab/scib/blob/59ae6eee5e611d9d3db067685ec96c28804e9127/scib/utils.py#L51C1-L72C62 -def merge_adata(*adata_list, **kwargs): - """Merge adatas from list while remove duplicated ``obs`` and ``var`` columns - - :param adata_list: ``anndata`` objects to be concatenated - :param kwargs: arguments to be passed to ``anndata.AnnData.concatenate`` - """ - - if len(adata_list) == 1: - return adata_list[0] - - # Make sure that adatas do not contain duplicate columns - for _adata in adata_list: - for attr in ("obs", "var"): - df = getattr(_adata, attr) - dup_mask = df.columns.duplicated() - if dup_mask.any(): - print( - f"Deleting duplicated keys `{list(df.columns[dup_mask].unique())}` from `adata.{attr}`." - ) - setattr(_adata, attr, df.loc[:, ~dup_mask]) - - return ad.AnnData.concatenate(*adata_list, **kwargs) - - -print('Read input', flush=True) -adata = read_anndata( - par['input'], - X='layers/normalized', - obs='obs', - var='var', - uns='uns' -) - -if par['n_hvg']: - print(f"Select top {par['n_hvg']} high variable genes", flush=True) - idx = adata.var['hvg_score'].to_numpy().argsort()[::-1][:par['n_hvg']] - adata = adata[:, idx].copy() - -print('Run scanorama', flush=True) -split = [] -batch_categories = adata.obs['batch'].cat.categories -for i in batch_categories: - split.append(adata[adata.obs['batch'] == i].copy()) -corrected = scanorama.correct_scanpy(split, return_dimred=True) -corrected = merge_adata(*corrected, batch_key='batch', batch_categories=batch_categories, index_unique=None) - -print("Store output", flush=True) -output = ad.AnnData( - obs=adata.obs[[]], - var=adata.var[[]], - uns={ - 'dataset_id': adata.uns['dataset_id'], - 'normalization_id': adata.uns['normalization_id'], - 'method_id': meta['functionality_name'], - }, - layers={ - 'corrected_counts': corrected.X, - }, - obsm={ - 'X_emb': corrected.obsm["X_scanorama"], - } -) - -print("Write output to file", flush=True) -output.write(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/methods/scanorama_feature/config.vsh.yaml b/src/tasks/batch_integration/methods/scanorama_feature/config.vsh.yaml deleted file mode 100644 index 3f735ddffd..0000000000 --- a/src/tasks/batch_integration/methods/scanorama_feature/config.vsh.yaml +++ /dev/null @@ -1,41 +0,0 @@ -# use method api spec -__merge__: ../../api/comp_method_feature.yaml -functionality: - name: scanorama_feature - info: - label: Scanorama (feature) - summary: "Efficient integration of heterogeneous single-cell - 
transcriptomes using Scanorama" - description: | - "Scanorama is an extension of the MNN method. Other then MNN, it finds mutual nearest neighbours over all batches and embeds observations into a joint hyperplane." - reference: "hie2019efficient" - repository_url: "https://github.com/brianhie/scanorama" - documentation_url: "https://github.com/brianhie/scanorama#readme" - v1: - path: openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanorama.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: log_cp10k - variants: - scanorama_feature_full_unscaled: - scanorama_feature_full_scaled: - preferred_normalization: log_cp10k_scaled - arguments: - - name: --n_hvg - type: integer - default: 2000 - description: Number of highly variable genes to use. - resources: - - type: python_script - path: ../scanorama_embed/script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - pypi: - - scanorama - - type: nextflow - directives: - label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/methods/scanvi/config.vsh.yaml b/src/tasks/batch_integration/methods/scanvi/config.vsh.yaml deleted file mode 100644 index 5615fd72cd..0000000000 --- a/src/tasks/batch_integration/methods/scanvi/config.vsh.yaml +++ /dev/null @@ -1,61 +0,0 @@ -__merge__: ../../api/comp_method_embedding.yaml - -functionality: - name: scanvi - info: - label: scANVI - summary: "scANVI is a deep learning method that considers cell type labels." - description : | - scANVI (single-cell ANnotation using Variational Inference; Python class SCANVI) is a semi-supervised model for single-cell transcriptomics data. In a sense, it can be seen as a scVI extension that can leverage the cell type knowledge for a subset of the cells present in the data sets to infer the states of the rest of the cells. - reference: "lopez2018deep" - repository_url: "https://github.com/scverse/scvi-tools" - documentation_url: "https://docs.scvi-tools.org/en/stable/user_guide/models/scanvi.html" - v1: - path: openproblems/tasks/_batch_integration/batch_integration_graph/methods/scanvi.py - commit: 29803b95c88b4ec5921df2eec7111fd5d1a95daf - preferred_normalization: counts - variants: - scanvi_full_unscaled: - arguments: - - name: --n_hvg - type: integer - default: 2000 - description: Number of highly variable genes to use. - - name: --n_latent - type: integer - default: 30 - description: Number of latent dimensions. - - name: --n_hidden - type: integer - default: 128 - description: Number of hidden units. - - name: --n_layers - type: integer - default: 2 - description: Number of layers. - - name: --max_epochs_scvi - type: integer - example: 400 - description: Maximum number of training epochs for scVI. - - name: --max_epochs_scanvi - type: integer - example: 10 - description: Maximum number of training epochs for scANVI. 
- resources: - - type: python_script - path: script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - pypi: - - scvi-tools>=1.1.0 - - type: docker - run: | - pip install -U "jax[cuda12_pip]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html - - type: nextflow - directives: - label: [midtime, lowmem, lowcpu, gpu] diff --git a/src/tasks/batch_integration/methods/scanvi/script.py b/src/tasks/batch_integration/methods/scanvi/script.py deleted file mode 100644 index 35d5b80f32..0000000000 --- a/src/tasks/batch_integration/methods/scanvi/script.py +++ /dev/null @@ -1,76 +0,0 @@ -import sys -import anndata as ad -from scvi.model import SCVI, SCANVI - -## VIASH START -par = { - 'input': 'resources_test/batch_integration/pancreas/dataset.h5ad', - 'output': 'output.h5ad', - 'n_hvg': 2000, - 'n_latent': 30, - 'n_hidden': 128, - 'n_layers': 2, - 'max_epochs_scvi': 20, - 'max_epochs_scanvi': 20 -} -meta = { - 'functionality_name' : 'scanvi', -} -## VIASH END - -sys.path.append(meta["resources_dir"]) -from read_anndata_partial import read_anndata - - -print('Read input', flush=True) -adata = read_anndata( - par['input'], - X='layers/counts', - obs='obs', - var='var', - uns='uns' -) - -if par["n_hvg"]: - print(f"Select top {par['n_hvg']} high variable genes", flush=True) - idx = adata.var["hvg_score"].to_numpy().argsort()[::-1][:par["n_hvg"]] - adata = adata[:, idx].copy() - -print("Processing data", flush=True) -SCVI.setup_anndata(adata, batch_key="batch") - -print("Run scVI", flush=True) -model_kwargs = { - key: par[key] - for key in ["n_latent", "n_hidden", "n_layers"] - if par[key] is not None -} - -vae = SCVI(adata, **model_kwargs) - -vae.train(max_epochs=par["max_epochs_scvi"], train_size=1.0) - -print('Run SCANVI', flush=True) -scanvae = SCANVI.from_scvi_model( - scvi_model=vae, - labels_key="label", - unlabeled_category="UnknownUnknown", # pick anything definitely not in a dataset -) -scanvae.train(max_epochs=par["max_epochs_scanvi"], train_size=1.0) - -print("Store outputs", flush=True) -output = ad.AnnData( - obs=adata.obs[[]], - var=adata.var[[]], - obsm={ - "X_emb": scanvae.get_latent_representation(), - }, - uns={ - "dataset_id": adata.uns["dataset_id"], - "normalization_id": adata.uns["normalization_id"], - "method_id": meta["functionality_name"], - }, -) - -print("Write output to file", flush=True) -output.write_h5ad(par["output"], compression="gzip") diff --git a/src/tasks/batch_integration/methods/scvi/config.vsh.yaml b/src/tasks/batch_integration/methods/scvi/config.vsh.yaml deleted file mode 100644 index 45eb09d5cf..0000000000 --- a/src/tasks/batch_integration/methods/scvi/config.vsh.yaml +++ /dev/null @@ -1,59 +0,0 @@ -# use method api spec -__merge__: ../../api/comp_method_embedding.yaml -functionality: - name: scvi - info: - label: scVI - summary: "scVI combines a variational autoencoder with a hierarchical Bayesian model." - description: | - scVI combines a variational autoencoder with a hierarchical Bayesian model. It uses the negative binomial distribution to describe gene expression of each cell, conditioned on unobserved factors and the batch variable. ScVI is run as implemented in Luecken et al. 
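The scANVI and scVI components above share one scvi-tools workflow: train scVI on raw counts, then (for scANVI) refine the embedding using the cell type labels. A condensed sketch of the calls used by the deleted scripts, assuming an AnnData with raw counts in `X` and `batch`/`label` columns in `obs`:

```python
from scvi.model import SCVI, SCANVI

def run_scvi_scanvi(adata, n_latent=30, max_epochs_scvi=400, max_epochs_scanvi=10):
    """Train scVI, then refine the latent space with scANVI using labels,
    as in the deleted scvi and scanvi components."""
    SCVI.setup_anndata(adata, batch_key="batch")
    vae = SCVI(adata, n_latent=n_latent)
    vae.train(max_epochs=max_epochs_scvi, train_size=1.0)
    adata.obsm["X_scvi"] = vae.get_latent_representation()

    scanvae = SCANVI.from_scvi_model(
        scvi_model=vae,
        labels_key="label",
        unlabeled_category="UnknownUnknown",  # any value absent from the labels
    )
    scanvae.train(max_epochs=max_epochs_scanvi, train_size=1.0)
    adata.obsm["X_scanvi"] = scanvae.get_latent_representation()
    return adata
```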
- reference: "lopez2018deep" - repository_url: "https://github.com/scverse/scvi-tools" - documentation_url: "https://docs.scvi-tools.org/en/stable/user_guide/models/scvi.html" - v1: - path: openproblems/tasks/_batch_integration/batch_integration_graph/methods/scvi.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: counts - variants: - scvi_full_unscaled: - # defaults are derived from te scvi tutorial: - # https://docs.scvi-tools.org/en/stable/tutorials/notebooks/scrna/harmonization.html - arguments: - - name: --n_hvg - type: integer - default: 2000 - description: Number of highly variable genes to use. - - name: --n_latent - type: integer - default: 30 - description: Number of latent dimensions. - - name: --n_hidden - type: integer - default: 128 - description: Number of hidden units. - - name: --n_layers - type: integer - default: 2 - description: Number of layers. - - name: --max_epochs - type: integer - example: 400 - description: Maximum number of epochs. - resources: - - type: python_script - path: script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - pypi: - - scvi-tools>=1.1.0 - - type: docker - run: | - pip install -U "jax[cuda12_pip]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html - - type: nextflow - directives: - label: [midtime, midmem, lowcpu, gpu] diff --git a/src/tasks/batch_integration/methods/scvi/script.py b/src/tasks/batch_integration/methods/scvi/script.py deleted file mode 100644 index 26490737a5..0000000000 --- a/src/tasks/batch_integration/methods/scvi/script.py +++ /dev/null @@ -1,66 +0,0 @@ -import sys -import anndata as ad -from scvi.model import SCVI - -## VIASH START -par = { - 'input': 'resources_test/batch_integration/pancreas/dataset.h5ad', - 'output': 'output.h5ad', - 'n_hvg': 2000, - 'n_latent': 30, - 'n_hidden': 128, - 'n_layers': 2, - 'max_epochs': 400 -} -meta = { - 'functionality_name' : 'scvi', -} -## VIASH END - -sys.path.append(meta["resources_dir"]) -from read_anndata_partial import read_anndata - -print('Read input', flush=True) -adata = read_anndata( - par['input'], - X='layers/counts', - obs='obs', - var='var', - uns='uns' -) - -if par["n_hvg"]: - print(f"Select top {par['n_hvg']} high variable genes", flush=True) - idx = adata.var["hvg_score"].to_numpy().argsort()[::-1][:par["n_hvg"]] - adata = adata[:, idx].copy() - -print("Processing data", flush=True) -SCVI.setup_anndata(adata, batch_key="batch") - -print("Run scVI", flush=True) -model_kwargs = { - key: par[key] - for key in ["n_latent", "n_hidden", "n_layers"] - if par[key] is not None -} - -vae = SCVI(adata, **model_kwargs) - -vae.train(max_epochs=par["max_epochs"], train_size=1.0) - -print("Store outputs", flush=True) -output = ad.AnnData( - obs=adata.obs[[]], - var=adata.var[[]], - obsm={ - "X_emb": vae.get_latent_representation(), - }, - uns={ - "dataset_id": adata.uns["dataset_id"], - "normalization_id": adata.uns["normalization_id"], - "method_id": meta["functionality_name"], - }, -) - -print("Write output to file", flush=True) -output.write_h5ad(par["output"], compression="gzip") diff --git a/src/tasks/batch_integration/metrics/asw_batch/config.vsh.yaml b/src/tasks/batch_integration/metrics/asw_batch/config.vsh.yaml deleted file mode 100644 index be6567271c..0000000000 --- a/src/tasks/batch_integration/metrics/asw_batch/config.vsh.yaml +++ /dev/null @@ -1,50 +0,0 @@ -# use metric api spec 
-__merge__: ../../api/comp_metric_embedding.yaml -functionality: - name: asw_batch - info: - metrics: - - name: asw_batch - label: ASW batch - summary: Average silhouette of batches per cell identity label (cell type) - description: | - We consider the absolute silhouette width, s(i), on - batch labels per cell i. Here, 0 indicates that batches are well mixed, and any - deviation from 0 indicates a batch effect: - 𝑠batch(𝑖)=|𝑠(𝑖)|. - - To ensure higher scores indicate better batch mixing, these scores are scaled by - subtracting them from 1. As we expect batches to integrate within cell identity - clusters, we compute the batchASWj score for each cell label j separately, - using the equation: - batchASW𝑗=1|𝐶𝑗|∑𝑖∈𝐶𝑗1−𝑠batch(𝑖), - - where Cj is the set of cells with the cell label j and |Cj| denotes the number of cells - in that set. - - To obtain the final batchASW score, the label-specific batchASWj scores are averaged: - batchASW=1|𝑀|∑𝑗∈𝑀batchASW𝑗. - - Here, M is the set of unique cell labels. - reference: luecken2022benchmarking - min: 0 - max: 1 - maximize: true - v1: - path: openproblems/tasks/_batch_integration/batch_integration_embed/metrics/sil_batch.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - resources: - - type: python_script - path: script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - pypi: - - scib==1.1.5 - - type: nextflow - directives: - label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/metrics/asw_batch/script.py b/src/tasks/batch_integration/metrics/asw_batch/script.py deleted file mode 100644 index 35b110b895..0000000000 --- a/src/tasks/batch_integration/metrics/asw_batch/script.py +++ /dev/null @@ -1,44 +0,0 @@ -import sys -import anndata as ad -from scib.metrics import silhouette_batch - -## VIASH START -par = { - 'input_integrated': 'resources_test/batch_integration/pancreas/integrated_embedding.h5ad', - 'output': 'output.h5ad', -} -meta = { - 'functionality_name': 'foo', -} -## VIASH END - -sys.path.append(meta["resources_dir"]) -from read_anndata_partial import read_anndata - - -print('Read input', flush=True) -adata = read_anndata(par['input_integrated'], obs='obs', obsm='obsm', uns='uns') -adata.obs = read_anndata(par['input_solution'], obs='obs').obs -adata.uns |= read_anndata(par['input_solution'], uns='uns').uns - -print('compute score', flush=True) -score = silhouette_batch( - adata, - batch_key='batch', - label_key='label', - embed='X_emb', -) - -print('Create output AnnData object', flush=True) -output = ad.AnnData( - uns={ - 'dataset_id': adata.uns['dataset_id'], - 'normalization_id': adata.uns['normalization_id'], - 'method_id': adata.uns['method_id'], - 'metric_ids': [ meta['functionality_name'] ], - 'metric_values': [ score ] - } -) - -print('Write data to file', flush=True) -output.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/metrics/asw_label/config.vsh.yaml b/src/tasks/batch_integration/metrics/asw_label/config.vsh.yaml deleted file mode 100644 index 068381b9e3..0000000000 --- a/src/tasks/batch_integration/metrics/asw_label/config.vsh.yaml +++ /dev/null @@ -1,38 +0,0 @@ -# use metric api spec -__merge__: ../../api/comp_metric_embedding.yaml -functionality: - name: asw_label - info: - metrics: - - name: asw_label - label: ASW Label - summary: Average silhouette of cell identity labels (cell types) - description: | - For the bio-conservation 
score, the ASW was computed on cell identity labels and - scaled to a value between 0 and 1 using the equation: - celltypeASW=(ASW_C+1)/2, - - where C denotes the set of all cell identity labels. - For information about the batch silhouette score, check sil_batch. - reference: luecken2022benchmarking - min: 0 - max: 1 - maximize: true - v1: - path: openproblems/tasks/_batch_integration/batch_integration_embed/metrics/silhouette.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - resources: - - type: python_script - path: script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - pypi: - - scib==1.1.5 - - type: nextflow - directives: - label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/metrics/asw_label/script.py b/src/tasks/batch_integration/metrics/asw_label/script.py deleted file mode 100644 index 01a7a2ad41..0000000000 --- a/src/tasks/batch_integration/metrics/asw_label/script.py +++ /dev/null @@ -1,44 +0,0 @@ -import sys -import anndata as ad -from scib.metrics import silhouette - -## VIASH START -par = { - 'input_integrated': 'resources_test/batch_integration/pancreas/integrated_embedding.h5ad', - 'output': 'output.h5ad', -} - -meta = { - 'functionality_name': 'foo', -} -## VIASH END - -sys.path.append(meta["resources_dir"]) -from read_anndata_partial import read_anndata - - -print('Read input', flush=True) -adata = read_anndata(par['input_integrated'], obs='obs', obsm='obsm', uns='uns') -adata.obs = read_anndata(par['input_solution'], obs='obs').obs -adata.uns |= read_anndata(par['input_solution'], uns='uns').uns - -print('compute score', flush=True) -score = silhouette( - adata, - label_key='label', - embed='X_emb' -) - -print("Create output AnnData object", flush=True) -output = ad.AnnData( - uns={ - "dataset_id": adata.uns['dataset_id'], - 'normalization_id': adata.uns['normalization_id'], - "method_id": adata.uns['method_id'], - "metric_ids": [meta['functionality_name']], - "metric_values": [score] - } -) - -print("Write data to file", flush=True) -output.write_h5ad(par["output"], compression="gzip") diff --git a/src/tasks/batch_integration/metrics/cell_cycle_conservation/config.vsh.yaml b/src/tasks/batch_integration/metrics/cell_cycle_conservation/config.vsh.yaml deleted file mode 100644 index 3852029a60..0000000000 --- a/src/tasks/batch_integration/metrics/cell_cycle_conservation/config.vsh.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# use metric api spec -__merge__: ../../api/comp_metric_embedding.yaml -functionality: - name: cell_cycle_conservation - info: - metrics: - - name: cell_cycle_conservation - label: Cell Cycle Conservation - summary: Cell cycle conservation score based on principle component regression on cell cycle gene scores - description: | - The cell-cycle conservation score evaluates how well the cell-cycle effect can be - captured before and after integration. We computed cell-cycle scores using Scanpy’s - score_cell_cycle function with a reference gene set from Tirosh et al for the - respective cell-cycle phases. We used the same set of cell-cycle genes for mouse and - human data (using capitalization to convert between the gene symbols). We then computed - the variance contribution of the resulting S and G2/M phase scores using principal - component regression (Principal component regression), which was performed for each - batch separately. 
The differences in variance before, Varbefore, and after, Varafter, - integration were aggregated into a final score between 0 and 1, using the equation: - CCconservation=1−|Varafter−Varbefore|/Varbefore. - - In this equation, values close to 0 indicate lower conservation and 1 indicates complete - conservation of the variance explained by cell cycle. In other words, the variance - remains unchanged within each batch for complete conservation, while any deviation from - the preintegration variance contribution reduces the score. - reference: luecken2022benchmarking - min: 0 - max: 1 - maximize: true - v1: - path: openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - resources: - - type: python_script - path: script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - pypi: - - scib==1.1.5 - - type: nextflow - directives: - label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/metrics/cell_cycle_conservation/script.py b/src/tasks/batch_integration/metrics/cell_cycle_conservation/script.py deleted file mode 100644 index fa432a21c6..0000000000 --- a/src/tasks/batch_integration/metrics/cell_cycle_conservation/script.py +++ /dev/null @@ -1,69 +0,0 @@ -import sys -import anndata as ad -from scib.metrics import cell_cycle -import numpy as np - -## VIASH START -par = { - 'input_integrated': 'resources_test/batch_integration/pancreas/integrated_embedding.h5ad', - 'output': 'output.h5ad' -} - -meta = { - 'functionality_name': 'foo' -} -## VIASH END -sys.path.append(meta["resources_dir"]) -from read_anndata_partial import read_anndata - - -print('Read input', flush=True) -adata_solution = read_anndata( - par['input_solution'], - X='layers/normalized', - obs='obs', - var='var', - uns='uns' -) -adata_integrated = read_anndata( - par['input_integrated'], - obs='obs', - obsm='obsm', - uns='uns' -) - -print('Use gene symbols for features', flush=True) -adata_solution.var_names = adata_solution.var['feature_name'] - -translator = { - "homo_sapiens": "human", - "mus_musculus": "mouse", -} - -print('Compute score', flush=True) -if adata_solution.uns['dataset_organism'] not in translator: - score = np.nan -else: - organism = translator[adata_solution.uns['dataset_organism']] - score = cell_cycle( - adata_solution, - adata_integrated, - batch_key='batch', - embed='X_emb', - organism=organism, - ) - -print('Create output AnnData object', flush=True) -output = ad.AnnData( - uns={ - 'dataset_id': adata_solution.uns['dataset_id'], - 'normalization_id': adata_solution.uns['normalization_id'], - 'method_id': adata_integrated.uns['method_id'], - 'metric_ids': [ meta['functionality_name'] ], - 'metric_values': [ score ] - } -) - - -print('Write data to file', flush=True) -output.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/metrics/clustering_overlap/config.vsh.yaml b/src/tasks/batch_integration/metrics/clustering_overlap/config.vsh.yaml deleted file mode 100644 index 8d92033e40..0000000000 --- a/src/tasks/batch_integration/metrics/clustering_overlap/config.vsh.yaml +++ /dev/null @@ -1,61 +0,0 @@ -# use metric api spec -__merge__: ../../api/comp_metric_graph.yaml -functionality: - name: clustering_overlap - info: - metrics: - - name: ari - label: ARI - summary: Adjusted Rand Index compares clustering overlap, correcting for random labels and 
considering correct overlaps and disagreements. - description: | - The Adjusted Rand Index (ARI) compares the overlap of two clusterings; - it considers both correct clustering overlaps while also counting correct - disagreements between two clusterings. - We compared the cell-type labels with the NMI-optimized - Louvain clustering computed on the integrated dataset. - The adjustment of the Rand index corrects for randomly correct labels. - An ARI of 0 or 1 corresponds to random labeling or a perfect match, - respectively. - reference: - - hubert1985comparing - - luecken2022benchmarking - min: 0 - max: 1 - maximize: true - v1: - path: openproblems/tasks/_batch_integration/batch_integration_graph/metrics/ari.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - - name: nmi - label: NMI - summary: "NMI compares overlap by scaling using mean entropy terms and optimizing Louvain clustering to obtain the best match between clusters and labels." - description: | - Normalized Mutual Information (NMI) compares the overlap of two clusterings. - We used NMI to compare the cell-type labels with Louvain clusters computed on - the integrated dataset. The overlap was scaled using the mean of the entropy terms - for cell-type and cluster labels. Thus, NMI scores of 0 or 1 correspond to uncorrelated - clustering or a perfect match, respectively. We performed optimized Louvain clustering - for this metric to obtain the best match between clusters and labels. - reference: - - amelio2015normalized - - luecken2022benchmarking - min: 0 - max: 1 - maximize: true - v1: - path: openproblems/tasks/_batch_integration/batch_integration_graph/metrics/nmi.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - resources: - - type: python_script - path: script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - pypi: - - scib==1.1.5 - - type: nextflow - directives: - label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/metrics/clustering_overlap/script.py b/src/tasks/batch_integration/metrics/clustering_overlap/script.py deleted file mode 100644 index 7bb9e533c8..0000000000 --- a/src/tasks/batch_integration/metrics/clustering_overlap/script.py +++ /dev/null @@ -1,53 +0,0 @@ -import sys -import anndata as ad -import scanpy as sc -from scib.metrics.clustering import cluster_optimal_resolution -from scib.metrics import ari, nmi - -## VIASH START -par = { - 'adata_integrated': 'resources_test/batch_integration/pancreas/integrated_graph.h5ad', - 'output': 'output.h5ad', -} - -meta = { - 'functionality_name': 'foo' -} -## VIASH END - -sys.path.append(meta["resources_dir"]) -from read_anndata_partial import read_anndata - - -print('Read input', flush=True) -adata = read_anndata(par['input_integrated'], obs='obs', obsp='obsp', uns='uns') -adata.obs = read_anndata(par['input_solution'], obs='obs').obs -adata.uns |= read_anndata(par['input_solution'], uns='uns').uns - -print('Run optimal Leiden clustering', flush=True) -cluster_optimal_resolution( - adata=adata, - label_key='label', - cluster_key='cluster', - cluster_function=sc.tl.leiden, -) - -print('Compute ARI score', flush=True) -ari_score = ari(adata, cluster_key='cluster', label_key='label') - -print('Compute NMI score', flush=True) -nmi_score = nmi(adata, cluster_key='cluster', label_key='label') - -print("Create output AnnData object", flush=True) -output = ad.AnnData( - uns={ - "dataset_id": adata.uns['dataset_id'], 
- 'normalization_id': adata.uns['normalization_id'], - "method_id": adata.uns['method_id'], - "metric_ids": [ "ari", "nmi" ], - "metric_values": [ ari_score, nmi_score ] - } -) - -print("Write data to file", flush=True) -output.write_h5ad(par["output"], compression="gzip") \ No newline at end of file diff --git a/src/tasks/batch_integration/metrics/graph_connectivity/config.vsh.yaml b/src/tasks/batch_integration/metrics/graph_connectivity/config.vsh.yaml deleted file mode 100644 index 6384feca62..0000000000 --- a/src/tasks/batch_integration/metrics/graph_connectivity/config.vsh.yaml +++ /dev/null @@ -1,47 +0,0 @@ -# use metric api spec -__merge__: ../../api/comp_metric_graph.yaml -functionality: - name: graph_connectivity - info: - metrics: - - name: graph_connectivity - label: Graph Connectivity - summary: Connectivity of the subgraph per cell type label - description: | - The graph connectivity metric assesses whether the kNN graph representation, - G, of the integrated data directly connects all cells with the same cell - identity label. For each cell identity label c, we created the subset kNN - graph G(Nc;Ec) to contain only cells from a given label. Using these subset - kNN graphs, we computed the graph connectivity score using the equation: - - gc =1/|C| Σc∈C |LCC(G(Nc;Ec))|/|Nc|. - - Here, C represents the set of cell identity labels, |LCC()| is the number - of nodes in the largest connected component of the graph, and |Nc| is the - number of nodes with cell identity c. The resultant score has a range - of (0;1], where 1 indicates that all cells with the same cell identity - are connected in the integrated kNN graph, and the lowest possible score - indicates a graph where no cell is connected. As this score is computed - on the kNN graph, it can be used to evaluate all integration outputs. 
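(Editorial aside, not part of the removed component: a minimal sketch of the graph-connectivity formula described above, assuming an AnnData object with a kNN graph in .obsp["connectivities"] and cell identity labels in .obs["label"]; the deleted script itself delegates this computation to scib.metrics.graph_connectivity.)

import numpy as np
from scipy.sparse.csgraph import connected_components

def graph_connectivity_sketch(adata, label_key="label"):
    scores = []
    for label in adata.obs[label_key].unique():
        mask = (adata.obs[label_key] == label).to_numpy()
        # subset kNN graph G(Nc; Ec) to the cells carrying this label
        sub = adata.obsp["connectivities"][mask][:, mask]
        # |LCC(G(Nc; Ec))|: size of the largest connected component
        _, components = connected_components(sub, directed=False)
        lcc_size = np.bincount(components).max()
        scores.append(lcc_size / mask.sum())   # divide by |Nc|
    return float(np.mean(scores))              # average over the label set C
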
- reference: luecken2022benchmarking - min: 0 - max: 1 - maximize: true - v1: - path: https://github.com/openproblems-bio/openproblems/blob/main/openproblems/tasks/_batch_integration/batch_integration_graph/metrics/graph_connectivity.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - resources: - - type: python_script - path: script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - pypi: - - scib==1.1.5 - - type: nextflow - directives: - label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/metrics/graph_connectivity/script.py b/src/tasks/batch_integration/metrics/graph_connectivity/script.py deleted file mode 100644 index ead8f146bc..0000000000 --- a/src/tasks/batch_integration/metrics/graph_connectivity/script.py +++ /dev/null @@ -1,42 +0,0 @@ -import sys -import anndata as ad -import scib - -## VIASH START -par = { - 'input_integrated': 'resources_test/batch_integration/pancreas/integrated_embedding.h5ad', - 'output': 'output.h5ad', -} -meta = { - 'functionality_name': 'foo', -} -## VIASH END - -sys.path.append(meta["resources_dir"]) -from read_anndata_partial import read_anndata - - -print('Read input', flush=True) -adata = read_anndata(par['input_integrated'], obs='obs', obsp='obsp', uns='uns') -adata.obs = read_anndata(par['input_solution'], obs='obs').obs -adata.uns |= read_anndata(par['input_solution'], uns='uns').uns - -print('compute score', flush=True) -score = scib.metrics.graph_connectivity( - adata, - label_key='label' -) - -print('Create output AnnData object', flush=True) -output = ad.AnnData( - uns={ - 'dataset_id': adata.uns['dataset_id'], - 'normalization_id': adata.uns['normalization_id'], - 'method_id': adata.uns['method_id'], - 'metric_ids': [ meta['functionality_name'] ], - 'metric_values': [ score ] - } -) - -print('Write data to file', flush=True) -output.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/metrics/hvg_overlap/config.vsh.yaml b/src/tasks/batch_integration/metrics/hvg_overlap/config.vsh.yaml deleted file mode 100644 index a8025783d6..0000000000 --- a/src/tasks/batch_integration/metrics/hvg_overlap/config.vsh.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# use metric api spec -__merge__: ../../api/comp_metric_feature.yaml -functionality: - name: hvg_overlap - info: - metrics: - - name: hvg_overlap - label: HVG overlap - summary: Overlap of highly variable genes per batch before and after integration. - description: | - The HVG conservation score is a proxy for the preservation of - the biological signal. If the data integration method returned - a corrected data matrix, we computed the number of HVGs before - and after correction for each batch via Scanpy’s - highly_variable_genes function (using the ‘cell ranger’ flavor). - If available, we computed 500 HVGs per batch. If fewer than 500 - genes were present in the integrated object for a batch, - the number of HVGs was set to half the total genes in that batch. - The overlap coefficient is as follows: - overlap(𝑋,𝑌)=|𝑋∩𝑌|/min(|𝑋|,|𝑌|), - - where X and Y denote the fraction of preserved informative genes. - The overall HVG score is the mean of the per-batch HVG overlap - coefficients. 
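(Editorial aside: the overlap coefficient and its per-batch averaging written out as a minimal sketch; hvgs_before_per_batch and hvgs_after_per_batch are hypothetical dicts mapping each batch to a set of HVG names, whereas the deleted script obtains the full score from scib.metrics.hvg_overlap.)

def overlap_coefficient(hvg_before, hvg_after):
    # overlap(X, Y) = |X intersection Y| / min(|X|, |Y|)
    x, y = set(hvg_before), set(hvg_after)
    return len(x & y) / min(len(x), len(y))

def hvg_overlap_sketch(hvgs_before_per_batch, hvgs_after_per_batch):
    # final score: mean of the per-batch overlap coefficients
    scores = [
        overlap_coefficient(hvgs_before_per_batch[batch], hvgs_after_per_batch[batch])
        for batch in hvgs_before_per_batch
    ]
    return sum(scores) / len(scores)
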
- reference: luecken2022benchmarking - min: 0 - max: 1 - maximize: true - v1: - path: openproblems/tasks/_batch_integration/batch_integration_feature/metrics/hvg_conservation.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - resources: - - type: python_script - path: script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - pypi: - - scib==1.1.5 - - type: nextflow - directives: - label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/metrics/hvg_overlap/script.py b/src/tasks/batch_integration/metrics/hvg_overlap/script.py deleted file mode 100644 index b7d177e991..0000000000 --- a/src/tasks/batch_integration/metrics/hvg_overlap/script.py +++ /dev/null @@ -1,55 +0,0 @@ -import sys -import anndata as ad -from scib.metrics import hvg_overlap - -## VIASH START -par = { - 'input_integrated': 'resources_test/batch_integration/pancreas/integrated_embedding.h5ad', - 'output': 'output.h5ad', -} - -meta = { - 'functionality_name': 'foo', -} -## VIASH END - -sys.path.append(meta["resources_dir"]) -from read_anndata_partial import read_anndata - - -print('Read input', flush=True) -adata_solution = read_anndata( - par['input_solution'], - X='layers/normalized', - obs='obs', - var='var', - uns='uns' -) -adata_integrated = read_anndata( - par['input_integrated'], - X='layers/corrected_counts', - obs='obs', - var='var', - uns='uns' -) - -print('compute score', flush=True) -score = hvg_overlap( - adata_solution, - adata_integrated, - batch_key="batch" -) - -print("Create output AnnData object", flush=True) -output = ad.AnnData( - uns={ - "dataset_id": adata_solution.uns['dataset_id'], - 'normalization_id': adata_solution.uns['normalization_id'], - "method_id": adata_integrated.uns['method_id'], - "metric_ids": [meta['functionality_name']], - "metric_values": [score] - } -) - -print("Write data to file", flush=True) -output.write_h5ad(par["output"], compression="gzip") diff --git a/src/tasks/batch_integration/metrics/isolated_label_asw/config.vsh.yaml b/src/tasks/batch_integration/metrics/isolated_label_asw/config.vsh.yaml deleted file mode 100644 index 65e1970c4f..0000000000 --- a/src/tasks/batch_integration/metrics/isolated_label_asw/config.vsh.yaml +++ /dev/null @@ -1,40 +0,0 @@ -# use metric api spec -__merge__: ../../api/comp_metric_embedding.yaml -functionality: - name: isolated_label_asw - info: - metrics: - - name: isolated_label_asw - label: Isolated label ASW - summary: Evaluate how well isolated labels separate by average silhouette width - description: | - Isolated cell labels are defined as the labels present in the least number - of batches in the integration task. The score evaluates how well these isolated labels - separate from other cell identities. - - The isolated label ASW score is obtained by computing the - ASW of isolated versus non-isolated labels on the PCA embedding (ASW metric above) and - scaling this score to be between 0 and 1. The final score for each metric version - consists of the mean isolated score of all isolated labels. 
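(Editorial aside: one plausible reading of the two steps above, sketched under the assumption that emb is an embedding matrix and labels/batches are per-cell vectors; the deleted script uses scib.metrics.isolated_labels_asw, whose exact procedure may differ in details such as the isolation threshold.)

import numpy as np
import pandas as pd
from sklearn.metrics import silhouette_samples

def isolated_label_asw_sketch(emb, labels, batches):
    df = pd.DataFrame({"label": labels, "batch": batches})
    # isolated labels: present in the smallest number of batches
    batches_per_label = df.groupby("label")["batch"].nunique()
    isolated = batches_per_label[batches_per_label == batches_per_label.min()].index

    scores = []
    for lab in isolated:
        # score each isolated label against all remaining cells
        binary = np.where(df["label"] == lab, str(lab), "rest")
        per_cell = silhouette_samples(emb, binary)            # silhouette in [-1, 1]
        asw = per_cell[df["label"].to_numpy() == lab].mean()
        scores.append((asw + 1) / 2)                          # rescale to [0, 1]
    return float(np.mean(scores))
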
- reference: luecken2022benchmarking - v1: - path: openproblems/tasks/_batch_integration/batch_integration_graph/metrics/iso_label_sil.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - min: 0 - max: 1 - maximize: true - resources: - - type: python_script - path: script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - pypi: - - scib==1.1.5 - - type: nextflow - directives: - label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/metrics/isolated_label_asw/script.py b/src/tasks/batch_integration/metrics/isolated_label_asw/script.py deleted file mode 100644 index 094937e687..0000000000 --- a/src/tasks/batch_integration/metrics/isolated_label_asw/script.py +++ /dev/null @@ -1,49 +0,0 @@ -import sys -import anndata as ad -from scib.metrics import isolated_labels_asw - -## VIASH START -par = { - 'input_integrated': 'resources_test/batch_integration/pancreas/integrated_embedding.h5ad', - 'output': 'output.h5ad', -} - -meta = { - 'functionality_name': 'foo', -} -## VIASH END - -sys.path.append(meta["resources_dir"]) -from read_anndata_partial import read_anndata - - -print('Read input', flush=True) -adata = read_anndata(par['input_integrated'], obs='obs', obsm='obsm', uns='uns') -adata.obs = read_anndata(par['input_solution'], obs='obs').obs -adata.uns |= read_anndata(par['input_solution'], uns='uns').uns - -print('compute score', flush=True) - -score = isolated_labels_asw( - adata, - label_key='label', - batch_key='batch', - embed='X_emb', - iso_threshold=None, - verbose=True, -) -print(score, flush=True) - -print('Create output AnnData object', flush=True) -output = ad.AnnData( - uns={ - 'dataset_id': adata.uns['dataset_id'], - 'normalization_id': adata.uns['normalization_id'], - 'method_id': adata.uns['method_id'], - 'metric_ids': [ meta['functionality_name'] ], - 'metric_values': [ score ] - } -) - -print('Write data to file', flush=True) -output.write_h5ad(par['output'], compression='gzip') \ No newline at end of file diff --git a/src/tasks/batch_integration/metrics/isolated_label_f1/config.vsh.yaml b/src/tasks/batch_integration/metrics/isolated_label_f1/config.vsh.yaml deleted file mode 100644 index 6b8f0703bf..0000000000 --- a/src/tasks/batch_integration/metrics/isolated_label_f1/config.vsh.yaml +++ /dev/null @@ -1,52 +0,0 @@ -# use metric api spec -__merge__: ../../api/comp_metric_graph.yaml -functionality: - name: isolated_label_f1 - info: - metrics: - - name: isolated_label_f1 - label: Isolated label F1 score - summary: Evaluate how well isolated labels coincide with clusters - description: | - We developed two isolated label scores to evaluate how well the data integration methods - dealt with cell identity labels shared by few batches. Specifically, we identified - isolated cell labels as the labels present in the least number of batches in the - integration task. - The score evaluates how well these isolated labels separate from other cell identities. - We implemented the isolated label metric in two versions: - (1) the best clustering of the isolated label (F1 score) and - (2) the global ASW of the isolated label. For the cluster-based score, - we first optimize the cluster assignment of the isolated label using the F1 score˚ - across louvain clustering resolutions ranging from 0.1 to 2 in resolution steps of 0.1. - The optimal F1 score for the isolated label is then used as the metric score. 
- The F1 score is a weighted mean of precision and recall given by the equation: - 𝐹1=2×(precision×recall)/(precision+recall). - - It returns a value between 0 and 1, - where 1 shows that all of the isolated label cells and no others are captured in - the cluster. For the isolated label ASW score, we compute the ASW of isolated - versus nonisolated labels on the PCA embedding (ASW metric above) and scale this - score to be between 0 and 1. The final score for each metric version consists of - the mean isolated score of all isolated labels. - reference: luecken2022benchmarking - v1: - path: openproblems/tasks/_batch_integration/batch_integration_graph/metrics/iso_label_f1.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - min: 0 - max: 1 - maximize: true - resources: - - type: python_script - path: script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - pypi: - - scib==1.1.5 - - type: nextflow - directives: - label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/metrics/isolated_label_f1/script.py b/src/tasks/batch_integration/metrics/isolated_label_f1/script.py deleted file mode 100644 index 30fe25bccf..0000000000 --- a/src/tasks/batch_integration/metrics/isolated_label_f1/script.py +++ /dev/null @@ -1,48 +0,0 @@ -import sys -import anndata as ad -from scib.metrics import isolated_labels_f1 - -## VIASH START -par = { - 'input_integrated': 'resources_test/batch_integration/pancreas/integrated_embedding.h5ad', - 'output': 'output.h5ad', -} - -meta = { - 'functionality_name': 'foo', -} -## VIASH END - -sys.path.append(meta["resources_dir"]) -from read_anndata_partial import read_anndata - - -print('Read input', flush=True) -adata = read_anndata(par['input_integrated'], obs='obs', obsp='obsp', uns='uns') -adata.obs = read_anndata(par['input_solution'], obs='obs').obs -adata.uns |= read_anndata(par['input_solution'], uns='uns').uns - -print('compute score', flush=True) -score = isolated_labels_f1( - adata, - label_key='label', - batch_key='batch', - embed=None, - iso_threshold=None, - verbose=True, -) -print(score, flush=True) - -print('Create output AnnData object', flush=True) -output = ad.AnnData( - uns={ - 'dataset_id': adata.uns['dataset_id'], - 'normalization_id': adata.uns['normalization_id'], - 'method_id': adata.uns['method_id'], - 'metric_ids': [ meta['functionality_name'] ], - 'metric_values': [ score ] - } -) - -print('Write data to file', flush=True) -output.write_h5ad(par['output'], compression='gzip') \ No newline at end of file diff --git a/src/tasks/batch_integration/metrics/kbet/config.vsh.yaml b/src/tasks/batch_integration/metrics/kbet/config.vsh.yaml deleted file mode 100644 index aca556a8fc..0000000000 --- a/src/tasks/batch_integration/metrics/kbet/config.vsh.yaml +++ /dev/null @@ -1,57 +0,0 @@ -# use metric api spec -__merge__: ../../api/comp_metric_embedding.yaml -functionality: - name: kbet - info: - metrics: - - name: kbet - label: kBET - summary: kBET algorithm to determine how well batches are mixed within a cell type - description: | - The kBET algorithm (v.0.99.6, release 4c9dafa) determines whether the label composition - of a k nearest neighborhood of a cell is similar to the expected (global) label - composition (Buettner et al., Nat Meth 2019). The test is repeated for a random subset - of cells, and the results are summarized as a rejection rate over all tested - neighborhoods. 
Thus, kBET works on a kNN graph. - - We compute kNN graphs where k = 50 for joint embeddings and corrected feature outputs - via Scanpy preprocessing steps. To test for technical effects and to account for - cell-type frequency shifts across datasets, we applied kBET - separately on the batch variable for each cell identity label. Using the kBET defaults, - a k equal to the median of the number of cells per batch within each label is used for - this computation. Additionally, we set the minimum and maximum thresholds of k to 10 and - 100, respectively. As kNN graphs that have been subset by cell identity labels may no - longer be connected, we compute kBET per connected component. If >25% of cells were - assigned to connected components too small for kBET computation (smaller than k × 3), - we assigned a kBET score of 1 to denote poor batch removal. Subsequently, kBET scores - for each label were averaged and subtracted from 1 to give a final kBET score. - - In Open Problems we do not run kBET on graph outputs to avoid computation-intensive - diffusion processes being run. - reference: luecken2022benchmarking - v1: - path: openproblems/tasks/_batch_integration/batch_integration_embed/metrics/kBET.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - min: 0 - max: 1 - maximize: true - resources: - - type: python_script - path: script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py -platforms: - - type: docker - image: openproblems/base_r:1.0.0 - setup: - - type: r - github: theislab/kBET - - type: python - pypi: - - scib==1.1.5 - - rpy2>=3 - - anndata2ri - - scipy<=1.13 - - type: nextflow - directives: - label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/metrics/kbet/script.py b/src/tasks/batch_integration/metrics/kbet/script.py deleted file mode 100644 index 9834f525d5..0000000000 --- a/src/tasks/batch_integration/metrics/kbet/script.py +++ /dev/null @@ -1,49 +0,0 @@ -import sys -import anndata as ad -from scib.metrics import kBET - -## VIASH START -par = { - 'input_integrated': 'resources_test/batch_integration/pancreas/integrated_embedding.h5ad', - 'output': 'output.h5ad', -} - -meta = { - 'functionality_name': 'foo', -} -## VIASH END - -sys.path.append(meta["resources_dir"]) -from read_anndata_partial import read_anndata - - -print('Read input', flush=True) -adata = read_anndata(par['input_integrated'], obs='obs', obsm='obsm', uns='uns') -adata.obs = read_anndata(par['input_solution'], obs='obs').obs -adata.uns |= read_anndata(par['input_solution'], uns='uns').uns - -print('compute score', flush=True) -score = kBET( - adata, - batch_key="batch", - label_key="label", - type_="embed", - embed="X_emb", - scaled=True, - verbose=False, -) -print(score, flush=True) - -print('Create output AnnData object', flush=True) -output = ad.AnnData( - uns={ - 'dataset_id': adata.uns['dataset_id'], - 'normalization_id': adata.uns['normalization_id'], - 'method_id': adata.uns['method_id'], - 'metric_ids': [ meta['functionality_name'] ], - 'metric_values': [ score ] - } -) - -print('Write data to file', flush=True) -output.write_h5ad(par['output'], compression='gzip') \ No newline at end of file diff --git a/src/tasks/batch_integration/metrics/lisi/config.vsh.yaml b/src/tasks/batch_integration/metrics/lisi/config.vsh.yaml deleted file mode 100644 index 750574f84a..0000000000 --- a/src/tasks/batch_integration/metrics/lisi/config.vsh.yaml +++ /dev/null @@ -1,56 +0,0 @@ -# use metric api spec -__merge__: ../../api/comp_metric_graph.yaml 
-functionality: - status: disabled - name: lisi - info: - metrics: - - name: ilisi - label: iLISI - summary: Local inverse Simpson's Index - description: | - Local Inverse Simpson's Index metrics adapted from Korsunsky et al. 2019 to run on - all full feature, embedding and kNN integration outputs via shortest path-based - distance computation on single-cell kNN graphs. The metric assesses whether clusters - of cells in a single-cell RNA-seq dataset are well-mixed across a categorical batch - variable. - - The original LISI score ranges from 0 to the number of categories, with the latter - indicating good cell mixing. This is rescaled to a score between 0 and 1. - reference: luecken2022benchmarking - min: 0 - max: 1 - maximize: true - repository_url: https://github.com/theislab/scib/blob/ed3e2846414ca1e3dc07552c0eef1e68d82230d4/scib/metrics/lisi.py - documentation_url: https://scib.readthedocs.io/en/latest/api/scib.metrics.ilisi_graph.html - - name: clisi - label: cLISI - summary: Local inverse Simpson's Index - description: | - Local Inverse Simpson's Index metrics adapted from Korsunsky et al. 2019 to run on - all full feature, embedding and kNN integration outputs via shortest path-based - distance computation on single-cell kNN graphs. The metric assesses whether clusters - of cells in a single-cell RNA-seq dataset are well-mixed across a categorical cell type variable. - - The original LISI score ranges from 0 to the number of categories, with the latter indicating good cell mixing. This is rescaled to a score between 0 and 1. - reference: luecken2022benchmarking - min: 0 - max: 1 - maximize: true - repository_url: https://github.com/theislab/scib/blob/ed3e2846414ca1e3dc07552c0eef1e68d82230d4/scib/metrics/lisi.py - documentation_url: https://scib.readthedocs.io/en/latest/api/scib.metrics.clisi_graph.html - resources: - - type: python_script - path: script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - pypi: - - git+https://github.com/theislab/scib.git@v1.1.5 - - type: nextflow - directives: - label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/metrics/lisi/script.py b/src/tasks/batch_integration/metrics/lisi/script.py deleted file mode 100644 index 44181dab71..0000000000 --- a/src/tasks/batch_integration/metrics/lisi/script.py +++ /dev/null @@ -1,64 +0,0 @@ -import sys -import numpy as np -import anndata as ad -from scib.metrics.lisi import lisi_graph_py - -## VIASH START -par = { - 'input_integrated': 'resources_test/batch_integration/pancreas/integrated_embedding.h5ad', - 'output': 'output.h5ad', -} -meta = { - 'functionality_name': 'foo', -} -## VIASH END - -sys.path.append(meta["resources_dir"]) -from read_anndata_partial import read_anndata - - -print('Read input', flush=True) -adata = read_anndata(par['input_integrated'], obs='obs', obsp='obsp', uns='uns') -adata.obs = read_anndata(par['input_solution'], obs='obs').obs -adata.uns |= read_anndata(par['input_solution'], uns='uns').uns - -print('compute iLISI score...', flush=True) -ilisi_scores = lisi_graph_py( - adata=adata, - obs_key='batch', - n_neighbors=90, - perplexity=None, - subsample=None, - n_cores=1, - verbose=False, -) -ilisi = np.nanmedian(ilisi_scores) -ilisi = (ilisi - 1) / (adata.obs['batch'].nunique() - 1) - -print('compute cLISI scores...', flush=True) -clisi_scores = lisi_graph_py( - adata=adata, - obs_key='label', - n_neighbors=90, - perplexity=None, - 
subsample=None, - n_cores=1, - verbose=False, -) -clisi = np.nanmedian(clisi_scores) -nlabs = adata.obs['label'].nunique() -clisi = (nlabs - clisi) / (nlabs - 1) - -print('Create output AnnData object', flush=True) -output = ad.AnnData( - uns={ - 'dataset_id': adata.uns['dataset_id'], - 'normalization_id': adata.uns['normalization_id'], - 'method_id': adata.uns['method_id'], - 'metric_ids': [ 'ilisi', 'clisi' ], - 'metric_values': [ ilisi, clisi ] - } -) - -print('Write data to file', flush=True) -output.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/batch_integration/metrics/pcr/config.vsh.yaml b/src/tasks/batch_integration/metrics/pcr/config.vsh.yaml deleted file mode 100644 index d3391fb528..0000000000 --- a/src/tasks/batch_integration/metrics/pcr/config.vsh.yaml +++ /dev/null @@ -1,44 +0,0 @@ -# use metric api spec -__merge__: ../../api/comp_metric_embedding.yaml -functionality: - name: pcr - info: - metrics: - - name: pcr - label: PCR - summary: Compare explained variance by batch before and after integration - description: | - Principal component regression, derived from PCA, has previously been used to quantify - batch removal. Briefly, the R2 was calculated from a linear regression of the - covariate of interest (for example, the batch variable B) onto each principal component. - The variance contribution of the batch effect per principal component was then - calculated as the product of the variance explained by the ith principal component (PC) - and the corresponding R2(PCi|B). The sum across all variance contributions by the batch - effects in all principal components gives the total variance explained by the batch - variable as follows: - Var(𝐶|𝐵)=∑𝑖=1𝐺Var(𝐶|PC𝑖)×𝑅2(PC𝑖|𝐵), - - where Var(C|PCi) is the variance of the data matrix C explained by the ith principal - component. 
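(Editorial aside: an illustrative sketch of the Var(C|B) quantity described above, assuming X is a dense data matrix and batch a per-cell batch vector; the normalization by total PC variance is an assumption on my part, and the deleted script relies on scib.metrics.pcr_comparison to compare this value before and after integration.)

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression

def variance_explained_by_batch(X, batch, n_comps=50):
    n_comps = min(n_comps, X.shape[0], X.shape[1])
    pcs = PCA(n_components=n_comps).fit_transform(X)
    # one-hot encode the batch variable B
    covariate = pd.get_dummies(pd.Series(batch)).to_numpy().astype(float)
    total = 0.0
    for i in range(pcs.shape[1]):
        pc = pcs[:, [i]]
        # R^2(PC_i | B): regress the ith principal component onto the batch variable
        r2 = LinearRegression().fit(covariate, pc).score(covariate, pc)
        total += pc.var() * r2                   # Var(C | PC_i) * R^2(PC_i | B)
    return float(total / pcs.var(axis=0).sum())  # fraction of PC variance explained by B
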
- reference: luecken2022benchmarking - v1: - path: openproblems/tasks/_batch_integration/batch_integration_embed/metrics/pcr.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - min: 0 - max: 1 - maximize: true - resources: - - type: python_script - path: script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - pypi: - - scib==1.1.5 - - type: nextflow - directives: - label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/metrics/pcr/script.py b/src/tasks/batch_integration/metrics/pcr/script.py deleted file mode 100644 index 512b3dff6b..0000000000 --- a/src/tasks/batch_integration/metrics/pcr/script.py +++ /dev/null @@ -1,59 +0,0 @@ -import sys -import anndata as ad -from scib.metrics import pcr_comparison - -## VIASH START -par = { - 'input_integrated': 'resources_test/batch_integration/pancreas/integrated_embedding.h5ad', - 'output': 'output.h5ad', -} - -meta = { - 'functionality_name': 'foo', -} -## VIASH END - -sys.path.append(meta["resources_dir"]) -from read_anndata_partial import read_anndata - - -print('Read input', flush=True) -adata_solution = read_anndata( - par['input_solution'], - X='layers/normalized', - obs='obs', - var='var', - # obsm='obsm', - # varm='varm', - uns='uns' -) -adata_integrated = read_anndata( - par['input_integrated'], - obs='obs', - obsm='obsm', - uns='uns' -) - -print('compute score', flush=True) -score = pcr_comparison( - adata_solution, - adata_integrated, - embed='X_emb', - covariate='batch', - verbose=False -) - -print('Create output AnnData object', flush=True) -output = ad.AnnData( - uns={ - 'dataset_id': adata_solution.uns['dataset_id'], - 'normalization_id': adata_solution.uns['normalization_id'], - 'method_id': adata_integrated.uns['method_id'], - 'metric_ids': [ meta['functionality_name'] ], - 'metric_values': [ score ] - } -) - - -print('Write data to file', flush=True) -output.write_h5ad(par['output'], compression='gzip') \ No newline at end of file diff --git a/src/tasks/batch_integration/process_dataset/config.vsh.yaml b/src/tasks/batch_integration/process_dataset/config.vsh.yaml deleted file mode 100644 index 73ea5815c3..0000000000 --- a/src/tasks/batch_integration/process_dataset/config.vsh.yaml +++ /dev/null @@ -1,18 +0,0 @@ -__merge__: ../api/comp_process_dataset.yaml -functionality: - name: process_dataset - description: Preprocess adata object for data integration - resources: - - type: python_script - path: script.py - - path: /src/common/helper_functions/subset_anndata.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - pypi: - - scib==1.1.5 - - type: nextflow - directives: - label: [highmem, midcpu , midtime] diff --git a/src/tasks/batch_integration/process_dataset/script.py b/src/tasks/batch_integration/process_dataset/script.py deleted file mode 100644 index cf8af4c4b7..0000000000 --- a/src/tasks/batch_integration/process_dataset/script.py +++ /dev/null @@ -1,66 +0,0 @@ -import sys -import anndata as ad - -## VIASH START -par = { - 'input': 'resources_test/common/pancreas/dataset.h5ad', - 'hvgs': 2000, - 'obs_label': 'cell_type', - 'obs_batch': 'batch', - 'subset_hvg': False, - 'output': 'output.h5ad' -} -meta = { - "config": "target/nextflow/batch_integration/process_dataset/.config.vsh.yaml", - "resources_dir": "src/common/helper_functions" -} -## VIASH END - -# import helper functions -sys.path.append(meta['resources_dir']) -from 
subset_anndata import read_config_slots_info, subset_anndata - -print('Read input', flush=True) -input = ad.read_h5ad(par['input']) - -def compute_batched_hvg(adata, n_hvgs): - adata = adata.copy() - adata.X = adata.layers['normalized'].copy() - if n_hvgs > adata.n_vars or n_hvgs <= 0: - hvg_list = adata.var_names.tolist() - else: - import scib - hvg_list = scib.pp.hvg_batch( - adata, - batch_key='batch', - target_genes=n_hvgs, - adataOut=False - ) - adata.var['hvg'] = adata.var_names.isin(hvg_list) - del adata.X - return adata - -print(f'Select {par["hvgs"]} highly variable genes', flush=True) -adata_with_hvg = compute_batched_hvg(input, n_hvgs=par['hvgs']) - -if par['subset_hvg']: - print('Subsetting to HVG dimensions', flush=True) - adata_with_hvg = adata_with_hvg[:, adata_with_hvg.var['hvg']].copy() - -print(">> Figuring out which data needs to be copied to which output file", flush=True) -# use par arguments to look for label and batch value in different slots -slot_mapping = { - "obs": { - "label": par["obs_label"], - "batch": par["obs_batch"], - } -} -slot_info = read_config_slots_info(meta["config"], slot_mapping) - -print(">> Create output object", flush=True) -output_dataset = subset_anndata(adata_with_hvg, slot_info["output_dataset"]) -output_solution = subset_anndata(adata_with_hvg, slot_info["output_solution"]) - -print('Writing adatas to file', flush=True) -output_dataset.write(par['output_dataset'], compression='gzip') -output_solution.write(par['output_solution'], compression='gzip') diff --git a/src/tasks/batch_integration/resources_scripts/process_datasets.sh b/src/tasks/batch_integration/resources_scripts/process_datasets.sh deleted file mode 100755 index b49c203af8..0000000000 --- a/src/tasks/batch_integration/resources_scripts/process_datasets.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash - -cat > /tmp/params.yaml << 'HERE' -input_states: s3://openproblems-data/resources/datasets/**/state.yaml -rename_keys: 'input:output_dataset' -settings: '{"output_dataset": "$id/dataset.h5ad", "output_solution": "$id/solution.h5ad"}' -output_state: "$id/state.yaml" -publish_dir: s3://openproblems-data/resources/batch_integration/datasets -HERE - -cat > /tmp/nextflow.config << HERE -process { - executor = 'awsbatch' - withName:'.*publishStatesProc' { - memory = '16GB' - disk = '100GB' - } - withLabel:highmem { - memory = '350GB' - } -} -HERE - -tw launch https://github.com/openproblems-bio/openproblems.git \ - --revision main_build \ - --pull-latest \ - --main-script target/nextflow/batch_integration/workflows/process_datasets/main.nf \ - --workspace 53907369739130 \ - --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ - --params-file /tmp/params.yaml \ - --entry-name auto \ - --config /tmp/nextflow.config \ - --labels batch_integration,process_datasets \ No newline at end of file diff --git a/src/tasks/batch_integration/resources_scripts/run_benchmark.sh b/src/tasks/batch_integration/resources_scripts/run_benchmark.sh deleted file mode 100755 index cd83810680..0000000000 --- a/src/tasks/batch_integration/resources_scripts/run_benchmark.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash - -RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)" -publish_dir="s3://openproblems-data/resources/batch_integration/results/${RUN_ID}" - -cat > /tmp/params.yaml << HERE -input_states: s3://openproblems-data/resources/batch_integration/datasets/**/state.yaml -rename_keys: 'input_dataset:output_dataset,input_solution:output_solution' -output_state: "state.yaml" -publish_dir: "$publish_dir" -HERE - -tw launch 
https://github.com/openproblems-bio/openproblems.git \ - --revision main_build \ - --pull-latest \ - --main-script target/nextflow/batch_integration/workflows/run_benchmark/main.nf \ - --workspace 53907369739130 \ - --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ - --params-file /tmp/params.yaml \ - --entry-name auto \ - --config src/wf_utils/labels_tw.config \ - --labels batch_integration,full \ No newline at end of file diff --git a/src/tasks/batch_integration/resources_scripts/run_benchmark_test.sh b/src/tasks/batch_integration/resources_scripts/run_benchmark_test.sh deleted file mode 100755 index eca3049d3a..0000000000 --- a/src/tasks/batch_integration/resources_scripts/run_benchmark_test.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -cat > /tmp/params.yaml << 'HERE' -input_states: s3://openproblems-data/resources_test/batch_integration/**/state.yaml -rename_keys: 'input_dataset:output_dataset,input_solution:output_solution' -output_state: "state.yaml" -publish_dir: s3://openproblems-nextflow/temp/batch_integration/ -HERE - -cat > /tmp/nextflow.config << HERE -process { - executor = 'awsbatch' -} -HERE - -tw launch https://github.com/openproblems-bio/openproblems.git \ - --revision main_build \ - --pull-latest \ - --main-script target/nextflow/batch_integration/workflows/run_benchmark/main.nf \ - --workspace 53907369739130 \ - --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ - --params-file /tmp/params.yaml \ - --entry-name auto \ - --config /tmp/nextflow.config \ - --labels batch_integration,test \ No newline at end of file diff --git a/src/tasks/batch_integration/resources_test_scripts/process.sh b/src/tasks/batch_integration/resources_test_scripts/process.sh deleted file mode 100755 index 3ab0dd2a4d..0000000000 --- a/src/tasks/batch_integration/resources_test_scripts/process.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash - -# get the root of the directory -REPO_ROOT=$(git rev-parse --show-toplevel) - -# ensure that the command below is run from the root of the repository -cd "$REPO_ROOT" - -set -e - -RAW_DATA=resources_test/common -DATASET_DIR=resources_test/batch_integration - -mkdir -p $DATASET_DIR - -# process dataset -echo Running process_dataset -nextflow run . \ - -main-script target/nextflow/batch_integration/workflows/process_datasets/main.nf \ - -profile docker \ - -entry auto \ - --input_states "$RAW_DATA/**/state.yaml" \ - --rename_keys 'input:output_dataset' \ - --settings '{"output_dataset": "$id/dataset.h5ad", "output_solution": "$id/solution.h5ad"}' \ - --publish_dir "$DATASET_DIR" \ - --output_state '$id/state.yaml' -# output_state should be moved to settings once workaround is solved - -for id in pancreas cxg_mouse_pancreas_atlas; do - if [ ! 
-f $DATASET_DIR/$id/dataset.h5ad ]; then - echo "Dataset $id not found" - exit 1 - fi - - echo Running BBKNN on $id - viash run src/tasks/batch_integration/methods/bbknn/config.vsh.yaml -- \ - --input $DATASET_DIR/$id/dataset.h5ad \ - --output $DATASET_DIR/$id/integrated_graph.h5ad - - echo Running SCVI on $id - viash run src/tasks/batch_integration/methods/scvi/config.vsh.yaml -- \ - --input $DATASET_DIR/$id/dataset.h5ad \ - --output $DATASET_DIR/$id/integrated_embedding.h5ad - - echo Running combat on $id - viash run src/tasks/batch_integration/methods/combat/config.vsh.yaml -- \ - --input $DATASET_DIR/$id/dataset.h5ad \ - --output $DATASET_DIR/$id/integrated_feature.h5ad -done \ No newline at end of file diff --git a/src/tasks/batch_integration/transformers/embed_to_graph/config.vsh.yaml b/src/tasks/batch_integration/transformers/embed_to_graph/config.vsh.yaml deleted file mode 100644 index e841081a91..0000000000 --- a/src/tasks/batch_integration/transformers/embed_to_graph/config.vsh.yaml +++ /dev/null @@ -1,19 +0,0 @@ -__merge__: ../../api/comp_transformer_embedding_to_graph.yaml -functionality: - name: embed_to_graph - info: - label: Embedding to Graph - summary: Transform an embedding to a graph output. - description: | - Transform an embedding to a graph output by applying the k nearest neighbors algorithm. - resources: - - type: python_script - path: script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/transformers/embed_to_graph/script.py b/src/tasks/batch_integration/transformers/embed_to_graph/script.py deleted file mode 100644 index 74166eb77c..0000000000 --- a/src/tasks/batch_integration/transformers/embed_to_graph/script.py +++ /dev/null @@ -1,33 +0,0 @@ -import sys -import scanpy as sc - -## VIASH START -par = { - 'input': 'resources_test/batch_integration/pancreas/integrated_embedding.h5ad', - 'ouput': 'output.h5ad' -} - -meta = { - 'functionality': 'foo', - 'config': 'bar' -} -## VIASH END - -sys.path.append(meta["resources_dir"]) -from read_anndata_partial import read_anndata - - -print('Read input', flush=True) -adata = read_anndata( - par['input'], - obs='obs', - obsm='obsm', - uns='uns' -) - - -print('Run kNN...', flush=True) -sc.pp.neighbors(adata, use_rep='X_emb') - -print("Store outputs", flush=True) -adata.write_h5ad(par['output'], compression='gzip') \ No newline at end of file diff --git a/src/tasks/batch_integration/transformers/feature_to_embed/config.vsh.yaml b/src/tasks/batch_integration/transformers/feature_to_embed/config.vsh.yaml deleted file mode 100644 index e08013c63b..0000000000 --- a/src/tasks/batch_integration/transformers/feature_to_embed/config.vsh.yaml +++ /dev/null @@ -1,20 +0,0 @@ -__merge__: ../../api/comp_transformer_feature_to_embedding.yaml -functionality: - name: feature_to_embed - info: - type: transformer - label: Feature to Embedding - summary: Transform a feature output to an embedding. - description: | - Transform a feature output to an embedding by computing a PCA on the corrected counts. 
- resources: - - type: python_script - path: script.py - - type: python_script - path: /src/common/helper_functions/read_anndata_partial.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [midtime, midmem, lowcpu] diff --git a/src/tasks/batch_integration/transformers/feature_to_embed/script.py b/src/tasks/batch_integration/transformers/feature_to_embed/script.py deleted file mode 100644 index 0e022db8b1..0000000000 --- a/src/tasks/batch_integration/transformers/feature_to_embed/script.py +++ /dev/null @@ -1,41 +0,0 @@ -import sys -import scanpy as sc - -## VIASH START -par = { - 'input': 'resources_test/batch_integration/pancreas/integrated_feature.h5ad', - 'ouput': 'output.h5ad' -} - -meta = { - 'functionality': 'foo', - 'config': 'bar' -} - -## VIASH END - -sys.path.append(meta["resources_dir"]) -from read_anndata_partial import read_anndata - - -print('Read input', flush=True) -adata = read_anndata( - par['input'], - X='layers/corrected_counts', - obs='obs', - var='var', - uns='uns' -) - - -print('Run PCA', flush=True) -adata.obsm['X_emb'] = sc.pp.pca( - adata.X, - n_comps=50, - use_highly_variable=False, # Do we want to set this to True? - svd_solver='arpack', - return_info=False -) - -print('Store outputs', flush=True) -adata.write_h5ad(par['output'], compression='gzip') \ No newline at end of file diff --git a/src/tasks/batch_integration/workflows/process_datasets/config.vsh.yaml b/src/tasks/batch_integration/workflows/process_datasets/config.vsh.yaml deleted file mode 100644 index 3273e84165..0000000000 --- a/src/tasks/batch_integration/workflows/process_datasets/config.vsh.yaml +++ /dev/null @@ -1,30 +0,0 @@ -functionality: - name: "process_datasets" - namespace: "batch_integration/workflows" - argument_groups: - - name: Inputs - arguments: - - name: "--input" - __merge__: "/src/tasks/batch_integration/api/file_common_dataset.yaml" - required: true - direction: input - - name: Outputs - arguments: - - name: "--output_dataset" - __merge__: /src/tasks/batch_integration/api/file_dataset.yaml - required: true - direction: output - - name: "--output_solution" - __merge__: /src/tasks/batch_integration/api/file_solution.yaml - required: true - direction: output - resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - path: /src/wf_utils/helper.nf - dependencies: - - name: common/check_dataset_schema - - name: batch_integration/process_dataset -platforms: - - type: nextflow diff --git a/src/tasks/batch_integration/workflows/process_datasets/main.nf b/src/tasks/batch_integration/workflows/process_datasets/main.nf deleted file mode 100644 index 59cfee9f47..0000000000 --- a/src/tasks/batch_integration/workflows/process_datasets/main.nf +++ /dev/null @@ -1,54 +0,0 @@ -include { findArgumentSchema } from "${meta.resources_dir}/helper.nf" - -workflow auto { - findStates(params, meta.config) - | meta.workflow.run( - auto: [publish: "state"] - ) -} - -workflow run_wf { - take: - input_ch - - main: - output_ch = input_ch - - | check_dataset_schema.run( - fromState: { id, state -> - def schema = findArgumentSchema(meta.config, "input") - def schemaYaml = tempFile("schema.yaml") - writeYaml(schema, schemaYaml) - [ - "input": state.input, - "schema": schemaYaml - ] - }, - toState: { id, output, state -> - // read the output to see if dataset passed the qc - def checks = readYaml(output.output) - state + [ - "dataset": checks["exit_code"] == 0 ? 
state.input : null, - ] - } - ) - - // remove datasets which didn't pass the schema check - | filter { id, state -> - state.dataset != null - } - - | process_dataset.run( - fromState: [ input: "dataset" ], - toState: [ - output_dataset: "output_dataset", - output_solution: "output_solution" - ] - ) - - // only output the files for which an output file was specified - | setState(["output_dataset", "output_solution"]) - - emit: - output_ch -} diff --git a/src/tasks/batch_integration/workflows/process_datasets/run_nextflow.sh b/src/tasks/batch_integration/workflows/process_datasets/run_nextflow.sh deleted file mode 100755 index 28e9382879..0000000000 --- a/src/tasks/batch_integration/workflows/process_datasets/run_nextflow.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -# Run this prior to executing this script: -# bin/viash_build -q 'batch_integration' - -# get the root of the directory -REPO_ROOT=$(git rev-parse --show-toplevel) - -# ensure that the command below is run from the root of the repository -cd "$REPO_ROOT" - -set -e - -export NXF_VER=22.04.5 - -nextflow run . \ - -main-script target/nextflow/batch_integration/workflows/process_datasets/main.nf \ - -profile docker \ - -entry auto \ - -c src/wf_utils/labels_ci.config \ - --id resources_test \ - --input_states "resources_test/common/**/state.yaml" \ - --rename_keys 'input:output_dataset' \ - --settings '{"output_dataset": "dataset.h5ad", "output_solution": "solution.h5ad"}' \ - --publish_dir "output/test" \ No newline at end of file diff --git a/src/tasks/batch_integration/workflows/run_benchmark/config.vsh.yaml b/src/tasks/batch_integration/workflows/run_benchmark/config.vsh.yaml deleted file mode 100644 index fd6f6811d2..0000000000 --- a/src/tasks/batch_integration/workflows/run_benchmark/config.vsh.yaml +++ /dev/null @@ -1,115 +0,0 @@ -functionality: - name: "run_benchmark" - namespace: "batch_integration/workflows" - argument_groups: - - name: Inputs - arguments: - - name: "--input_dataset" - __merge__: /src/tasks/batch_integration/api/file_dataset.yaml - required: true - direction: input - - name: "--input_solution" - __merge__: /src/tasks/batch_integration/api/file_solution.yaml - required: true - direction: input - - name: Outputs - arguments: - - name: "--output_scores" - type: file - required: true - direction: output - description: A yaml file containing the scores of each of the methods - default: score_uns.yaml - - name: "--output_method_configs" - type: file - required: true - direction: output - default: method_configs.yaml - - name: "--output_metric_configs" - type: file - required: true - direction: output - default: metric_configs.yaml - - name: "--output_dataset_info" - type: file - required: true - direction: output - default: dataset_uns.yaml - - name: "--output_task_info" - type: file - required: true - direction: output - default: task_info.yaml - - name: Methods - arguments: - - name: "--method_ids" - type: string - multiple: true - description: A list of method ids to run. If not specified, all methods will be run. 
- resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - type: file - path: ../../api/task_info.yaml - dependencies: - - name: common/check_dataset_schema - - name: common/extract_metadata - - name: batch_integration/methods/bbknn - - name: batch_integration/methods/combat - - name: batch_integration/methods/fastmnn_embedding - - name: batch_integration/methods/fastmnn_feature - - name: batch_integration/methods/liger - - name: batch_integration/methods/mnn_correct - - name: batch_integration/methods/mnnpy - - name: batch_integration/methods/pyliger - - name: batch_integration/methods/scalex_embed - - name: batch_integration/methods/scalex_feature - - name: batch_integration/methods/scanorama_embed - - name: batch_integration/methods/scanorama_feature - - name: batch_integration/methods/scanvi - - name: batch_integration/methods/scvi - - name: batch_integration/control_methods/no_integration/batch_embed - alias: no_integration_batch_embed - - name: batch_integration/control_methods/no_integration/global_embed - alias: no_integration_global_embed - - name: batch_integration/control_methods/no_integration/global_feature - alias: no_integration_global_feature - - name: batch_integration/control_methods/no_integration/global_graph - alias: no_integration_global_graph - - name: batch_integration/control_methods/perfect_integration/celltype_embed - alias: perfect_integration_celltype_embed - - name: batch_integration/control_methods/perfect_integration/celltype_jitter_embed - alias: perfect_integration_celltype_jitter_embed - - name: batch_integration/control_methods/random_integration/batch_embed - alias: random_integration_batch_embed - - name: batch_integration/control_methods/random_integration/batch_feature - alias: random_integration_batch_feature - - name: batch_integration/control_methods/random_integration/batch_graph - alias: random_integration_batch_graph - - name: batch_integration/control_methods/random_integration/celltype_embed - alias: random_integration_celltype_embed - - name: batch_integration/control_methods/random_integration/celltype_feature - alias: random_integration_celltype_feature - - name: batch_integration/control_methods/random_integration/celltype_graph - alias: random_integration_celltype_graph - - name: batch_integration/control_methods/random_integration/global_embed - alias: random_integration_global_embed - - name: batch_integration/control_methods/random_integration/global_feature - alias: random_integration_global_feature - - name: batch_integration/control_methods/random_integration/global_graph - alias: random_integration_global_graph - - name: batch_integration/transformers/feature_to_embed - - name: batch_integration/transformers/embed_to_graph - - name: batch_integration/metrics/asw_batch - - name: batch_integration/metrics/asw_label - - name: batch_integration/metrics/cell_cycle_conservation - - name: batch_integration/metrics/clustering_overlap - - name: batch_integration/metrics/graph_connectivity - - name: batch_integration/metrics/hvg_overlap - - name: batch_integration/metrics/isolated_label_asw - - name: batch_integration/metrics/isolated_label_f1 - - name: batch_integration/metrics/kbet - - name: batch_integration/metrics/pcr -platforms: - - type: nextflow diff --git a/src/tasks/batch_integration/workflows/run_benchmark/main.nf b/src/tasks/batch_integration/workflows/run_benchmark/main.nf deleted file mode 100644 index d86293f2a5..0000000000 --- a/src/tasks/batch_integration/workflows/run_benchmark/main.nf +++ /dev/null @@ 
-1,258 +0,0 @@ -workflow auto { - findStates(params, meta.config) - | meta.workflow.run( - auto: [publish: "state"] - ) -} - -workflow run_wf { - take: - input_ch - - main: - - // construct list of methods - methods = [ - bbknn, - combat, - fastmnn_embedding, - fastmnn_feature, - liger, - mnn_correct, - mnnpy, - pyliger, - scalex_embed, - scalex_feature, - scanorama_embed, - scanorama_feature, - scanvi, - scvi, - no_integration_batch_embed, - no_integration_global_embed, - no_integration_global_feature, - no_integration_global_graph, - perfect_integration_celltype_embed, - perfect_integration_celltype_jitter_embed, - random_integration_batch_embed, - random_integration_batch_feature, - random_integration_batch_graph, - random_integration_celltype_embed, - random_integration_celltype_feature, - random_integration_celltype_graph, - random_integration_global_embed, - random_integration_global_feature, - random_integration_global_graph, - ] - - // construct list of metrics - metrics = [ - asw_batch, - asw_label, - cell_cycle_conservation, - clustering_overlap, - graph_connectivity, - hvg_overlap, - isolated_label_asw, - isolated_label_f1, - kbet, - pcr - ] - - /**************************** - * EXTRACT DATASET METADATA * - ****************************/ - dataset_ch = input_ch - // store join id - | map{ id, state -> - [id, state + ["_meta": [join_id: id]]] - } - - // extract the dataset metadata - | extract_metadata.run( - fromState: [input: "input_solution"], - toState: { id, output, state -> - state + [ - dataset_uns: readYaml(output.output).uns - ] - } - ) - - /*************************** - * RUN METHODS AND METRICS * - ***************************/ - // run all methods - method_out_ch1 = dataset_ch - | runEach( - components: methods, - - // use the 'filter' argument to only run a method on the normalisation the component is asking for - filter: { id, state, comp -> - def norm = state.dataset_uns.normalization_id - def pref = comp.config.functionality.info.preferred_normalization - // if the preferred normalisation is none at all, - // we can pass whichever dataset we want - def norm_check = (norm == "log_cp10k" && pref == "counts") || norm == pref - def method_check = !state.method_ids || state.method_ids.contains(comp.config.functionality.name) - - method_check && norm_check - }, - - // define a new 'id' by appending the method name to the dataset id - id: { id, state, comp -> - id + "." 
+ comp.config.functionality.name - }, - - // use 'fromState' to fetch the arguments the component requires from the overall state - fromState: [input: "input_dataset"], - - // use 'toState' to publish that component's outputs to the overall state - toState: { id, output, state, comp -> - state + [ - method_id: comp.config.functionality.name, - method_output: output.output, - method_subtype: comp.config.functionality.info.subtype - ] - } - ) - - - // append feature->embed transformations - method_out_ch2 = method_out_ch1 - | runEach( - components: feature_to_embed, - id: { id, state, comp -> - id + "_f2e" - }, - filter: { id, state, comp -> state.method_subtype == "feature"}, - fromState: [ input: "method_output" ], - toState: { id, output, state, comp -> - state + [ - method_output: output.output, - method_subtype: comp.config.functionality.info.subtype - ] - } - ) - | mix(method_out_ch1) - - // append embed->graph transformations - method_out_ch3 = method_out_ch2 - | runEach( - components: embed_to_graph, - id: { id, state, comp -> - id + "_e2g" - }, - filter: { id, state, comp -> state.method_subtype == "embedding"}, - fromState: [ input: "method_output" ], - toState: { id, output, state, comp -> - state + [ - method_output: output.output, - method_subtype: comp.config.functionality.info.subtype - ] - } - ) - | mix(method_out_ch2) - - // run metrics - score_ch = method_out_ch3 - | runEach( - components: metrics, - id: { id, state, comp -> - id + "." + comp.config.functionality.name - }, - filter: { id, state, comp -> - state.method_subtype == comp.config.functionality.info.subtype - }, - fromState: [ - input_integrated: "method_output", - input_solution: "input_solution" - ], - toState: { id, output, state, comp -> - state + [ - metric_id: comp.config.functionality.name, - metric_output: output.output - ] - } - ) - - - /****************************** - * GENERATE OUTPUT YAML FILES * - ******************************/ - // TODO: can we store everything below in a separate helper function? 
- - // extract the dataset metadata - dataset_meta_ch = dataset_ch - // only keep one of the normalization methods - | filter{ id, state -> - state.dataset_uns.normalization_id == "log_cp10k" - } - | joinStates { ids, states -> - // store the dataset metadata in a file - def dataset_uns = states.collect{state -> - def uns = state.dataset_uns.clone() - uns.remove("normalization_id") - uns - } - def dataset_uns_yaml_blob = toYamlBlob(dataset_uns) - def dataset_uns_file = tempFile("dataset_uns.yaml") - dataset_uns_file.write(dataset_uns_yaml_blob) - - ["output", [output_dataset_info: dataset_uns_file]] - } - - output_ch = score_ch - // extract scores - | extract_metadata.run( - key: "extract_scores", - fromState: [input: "metric_output"], - toState: { id, output, state -> - state + [ - score_uns: readYaml(output.output).uns - ] - } - ) - - | joinStates { ids, states -> - // store the method configs in a file - def method_configs = methods.collect{it.config} - def method_configs_yaml_blob = toYamlBlob(method_configs) - def method_configs_file = tempFile("method_configs.yaml") - method_configs_file.write(method_configs_yaml_blob) - - // store the metric configs in a file - def metric_configs = metrics.collect{it.config} - def metric_configs_yaml_blob = toYamlBlob(metric_configs) - def metric_configs_file = tempFile("metric_configs.yaml") - metric_configs_file.write(metric_configs_yaml_blob) - - // store the task info in a file - def task_info_file = meta.resources_dir.resolve("task_info.yaml") - - // store the scores in a file - def score_uns = states.collect{it.score_uns} - def score_uns_yaml_blob = toYamlBlob(score_uns) - def score_uns_file = tempFile("score_uns.yaml") - score_uns_file.write(score_uns_yaml_blob) - - // create state - def new_state = [ - output_method_configs: method_configs_file, - output_metric_configs: metric_configs_file, - output_task_info: task_info_file, - output_scores: score_uns_file, - _meta: states[0]._meta - ] - - ["output", new_state] - } - - // merge all of the output data - | mix(dataset_meta_ch) - | joinStates{ ids, states -> - def mergedStates = states.inject([:]) { acc, m -> acc + m } - [ids[0], mergedStates] - } - - emit: - output_ch -} diff --git a/src/tasks/batch_integration/workflows/run_benchmark/run_test.sh b/src/tasks/batch_integration/workflows/run_benchmark/run_test.sh deleted file mode 100755 index a24ebb706f..0000000000 --- a/src/tasks/batch_integration/workflows/run_benchmark/run_test.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash - -# get the root of the directory -REPO_ROOT=$(git rev-parse --show-toplevel) - -# ensure that the command below is run from the root of the repository -cd "$REPO_ROOT" - -set -e - -# export TOWER_WORKSPACE_ID=53907369739130 - -DATASETS_DIR="resources_test/batch_integration" -OUTPUT_DIR="output/temp" - -if [ ! -d "$OUTPUT_DIR" ]; then - mkdir -p "$OUTPUT_DIR" -fi - -export NXF_VER=22.04.5 -nextflow run . 
\ - -main-script target/nextflow/batch_integration/workflows/run_benchmark/main.nf \ - -profile docker \ - -resume \ - -c src/wf_utils/labels_ci.config \ - -entry auto \ - --input_states "$DATASETS_DIR/**/state.yaml" \ - --rename_keys 'input_dataset:output_dataset,input_solution:output_solution' \ - --settings '{"output_scores": "scores.yaml", "output_dataset_info": "dataset_info.yaml", "output_method_configs": "method_configs.yaml", "output_metric_configs": "metric_configs.yaml", "output_task_info": "task_info.yaml"}' \ - --publish_dir "$OUTPUT_DIR" \ - --output_state "state.yaml" \ No newline at end of file diff --git a/src/tasks/denoising/README.md b/src/tasks/denoising/README.md index 5f33715180..9eb865b69a 100644 --- a/src/tasks/denoising/README.md +++ b/src/tasks/denoising/README.md @@ -1,357 +1,3 @@ # Denoising - -Removing noise in sparse single-cell RNA-sequencing count data - -Path: -[`src/tasks/denoising`](https://github.com/openproblems-bio/openproblems/tree/main/src/tasks/denoising) - -## Motivation - -Single-cell RNA-Seq protocols only detect a fraction of the mRNA -molecules present in each cell. As a result, the measurements (UMI -counts) observed for each gene and each cell are associated with -generally high levels of technical noise ([Grün et al., -2014](https://www.nature.com/articles/nmeth.2930)). Denoising describes -the task of estimating the true expression level of each gene in each -cell. In the single-cell literature, this task is also referred to as -*imputation*, a term which is typically used for missing data problems -in statistics. Similar to the use of the terms “dropout”, “missing -data”, and “technical zeros”, this terminology can create confusion -about the underlying measurement process ([Sarkar and Stephens, -2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)). - -## Description - -A key challenge in evaluating denoising methods is the general lack of a -ground truth. A recent benchmark study ([Hou et al., -2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x)) -relied on flow-sorted datasets, mixture control experiments ([Tian et -al., 2019](https://www.nature.com/articles/s41592-019-0425-8)), and -comparisons with bulk RNA-Seq data. Since each of these approaches -suffers from specific limitations, it is difficult to combine these -different approaches into a single quantitative measure of denoising -accuracy. Here, we instead rely on an approach termed molecular -cross-validation (MCV), which was specifically developed to quantify -denoising accuracy in the absence of a ground truth ([Batson et al., -2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the -observed molecules in a given scRNA-Seq dataset are first partitioned -between a *training* and a *test* dataset. Next, a denoising method is -applied to the training dataset. Finally, denoising accuracy is measured -by comparing the result to the test dataset. The authors show that both -in theory and in practice, the measured denoising accuracy is -representative of the accuracy that would be obtained on a ground truth -dataset. 
- -## Authors & contributors - -| name | roles | -|:------------------|:-------------------| -| Wesley Lewis | author, maintainer | -| Scott Gigante | author, maintainer | -| Robrecht Cannoodt | author | -| Kai Waldrant | author | - -## API - -``` mermaid -flowchart LR - file_common_dataset("Common dataset") - comp_process_dataset[/"Data processor"/] - file_train("Training data") - file_test("Test data") - comp_control_method[/"Control method"/] - comp_method[/"Method"/] - comp_metric[/"Metric"/] - file_denoised("Denoised data") - file_score("Score") - file_common_dataset---comp_process_dataset - comp_process_dataset-->file_train - comp_process_dataset-->file_test - file_train---comp_control_method - file_train---comp_method - file_test---comp_control_method - file_test---comp_metric - comp_control_method-->file_denoised - comp_method-->file_denoised - comp_metric-->file_score - file_denoised---comp_metric -``` - -## File format: Common dataset - -A dataset processed by the common dataset processing pipeline. - -Example file: `resources_test/common/pancreas/dataset.h5ad` - -Description: - -This dataset contains both raw counts and normalized data matrices, as -well as a PCA embedding, HVG selection and a kNN graph. - -Format: - -
- - AnnData object - obs: 'dataset_id', 'assay', 'assay_ontology_term_id', 'cell_type', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'organism', 'organism_ontology_term_id', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_ontology_term_id', 'tissue_general', 'tissue_general_ontology_term_id', 'batch', 'soma_joinid', 'size_factors' - var: 'feature_id', 'feature_name', 'soma_joinid', 'hvg', 'hvg_score' - obsm: 'X_pca' - obsp: 'knn_distances', 'knn_connectivities' - varm: 'pca_loadings' - layers: 'counts', 'normalized' - uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'normalization_id', 'pca_variance', 'knn' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:--------------------------------------------------|:----------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `obs["dataset_id"]` | `string` | (*Optional*) Identifier for the dataset from which the cell data is derived, useful for tracking and referencing purposes. | -| `obs["assay"]` | `string` | (*Optional*) Type of assay used to generate the cell data, indicating the methodology or technique employed. | -| `obs["assay_ontology_term_id"]` | `string` | (*Optional*) Experimental Factor Ontology (`EFO:`) term identifier for the assay, providing a standardized reference to the assay type. | -| `obs["cell_type"]` | `string` | (*Optional*) Classification of the cell type based on its characteristics and function within the tissue or organism. | -| `obs["cell_type_ontology_term_id"]` | `string` | (*Optional*) Cell Ontology (`CL:`) term identifier for the cell type, offering a standardized reference to the specific cell classification. | -| `obs["development_stage"]` | `string` | (*Optional*) Stage of development of the organism or tissue from which the cell is derived, indicating its maturity or developmental phase. | -| `obs["development_stage_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the developmental stage, providing a standardized reference to the organism’s developmental phase. If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Developmental Stages (`HsapDv:`) ontology is used. If the organism is mouse (`organism_ontology_term_id == 'NCBITaxon:10090'`), then the Mouse Developmental Stages (`MmusDv:`) ontology is used. Otherwise, the Uberon (`UBERON:`) ontology is used. | -| `obs["disease"]` | `string` | (*Optional*) Information on any disease or pathological condition associated with the cell or donor. | -| `obs["disease_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the disease, enabling standardized disease classification and referencing. Must be a term from the Mondo Disease Ontology (`MONDO:`) ontology term, or `PATO:0000461` from the Phenotype And Trait Ontology (`PATO:`). | -| `obs["donor_id"]` | `string` | (*Optional*) Identifier for the donor from whom the cell sample is obtained. | -| `obs["is_primary_data"]` | `boolean` | (*Optional*) Indicates whether the data is primary (directly obtained from experiments) or has been computationally derived from other primary data. | -| `obs["organism"]` | `string` | (*Optional*) Organism from which the cell sample is obtained. | -| `obs["organism_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the organism, providing a standardized reference for the organism. Must be a term from the NCBI Taxonomy Ontology (`NCBITaxon:`) which is a child of `NCBITaxon:33208`. | -| `obs["self_reported_ethnicity"]` | `string` | (*Optional*) Ethnicity of the donor as self-reported, relevant for studies considering genetic diversity and population-specific traits. 
| -| `obs["self_reported_ethnicity_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the self-reported ethnicity, providing a standardized reference for ethnic classifications. If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Ancestry Ontology (`HANCESTRO:`) is used. | -| `obs["sex"]` | `string` | (*Optional*) Biological sex of the donor or source organism, crucial for studies involving sex-specific traits or conditions. | -| `obs["sex_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the biological sex, ensuring standardized classification of sex. Only `PATO:0000383`, `PATO:0000384` and `PATO:0001340` are allowed. | -| `obs["suspension_type"]` | `string` | (*Optional*) Type of suspension or medium in which the cells were stored or processed, important for understanding cell handling and conditions. | -| `obs["tissue"]` | `string` | (*Optional*) Specific tissue from which the cells were derived, key for context and specificity in cell studies. | -| `obs["tissue_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the tissue, providing a standardized reference for the tissue type. For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. | -| `obs["tissue_general"]` | `string` | (*Optional*) General category or classification of the tissue, useful for broader grouping and comparison of cell data. | -| `obs["tissue_general_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the general tissue category, aiding in standardizing and grouping tissue types. For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. | -| `obs["batch"]` | `string` | (*Optional*) A batch identifier. This label is very context-dependent and may be a combination of the tissue, assay, donor, etc. | -| `obs["soma_joinid"]` | `integer` | (*Optional*) If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the cell. | -| `obs["size_factors"]` | `double` | (*Optional*) The size factors created by the normalisation method, if any. | -| `var["feature_id"]` | `string` | (*Optional*) Unique identifier for the feature, usually a ENSEMBL gene id. | -| `var["feature_name"]` | `string` | A human-readable name for the feature, usually a gene symbol. | -| `var["soma_joinid"]` | `integer` | (*Optional*) If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the feature. | -| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. | -| `var["hvg_score"]` | `double` | A score for the feature indicating how highly variable it is. | -| `obsm["X_pca"]` | `double` | The resulting PCA embedding. | -| `obsp["knn_distances"]` | `double` | K nearest neighbors distance matrix. | -| `obsp["knn_connectivities"]` | `double` | K nearest neighbors connectivities matrix. | -| `varm["pca_loadings"]` | `double` | The PCA loadings matrix. | -| `layers["counts"]` | `integer` | Raw counts. | -| `layers["normalized"]` | `double` | Normalised expression values. 
| -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. This is different from the `obs.dataset_id` field, which is the identifier for the dataset from which the cell data is derived. | -| `uns["dataset_name"]` | `string` | A human-readable name for the dataset. | -| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | -| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | -| `uns["dataset_summary"]` | `string` | Short description of the dataset. | -| `uns["dataset_description"]` | `string` | Long description of the dataset. | -| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | -| `uns["normalization_id"]` | `string` | Which normalization was used. | -| `uns["pca_variance"]` | `double` | The PCA variance objects. | -| `uns["knn"]` | `object` | Supplementary K nearest neighbors data. | - -
- -## Component type: Data processor - -Path: -[`src/denoising`](https://github.com/openproblems-bio/openproblems/tree/main/src/denoising) - -A denoising dataset processor. - -Arguments: - -
- -| Name | Type | Description | -|:-----------------|:-------|:------------------------------------------------------------------| -| `--input` | `file` | A dataset processed by the common dataset processing pipeline. | -| `--output_train` | `file` | (*Output*) The subset of molecules used for the training dataset. | -| `--output_test` | `file` | (*Output*) The subset of molecules used for the test dataset. | - -
- -## File format: Training data - -The subset of molecules used for the training dataset - -Example file: `resources_test/denoising/pancreas/train.h5ad` - -Format: - -
- - AnnData object - layers: 'counts' - uns: 'dataset_id' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:--------------------|:----------|:-------------------------------------| -| `layers["counts"]` | `integer` | Raw counts. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | - -
- -## File format: Test data - -The subset of molecules used for the test dataset - -Example file: `resources_test/denoising/pancreas/test.h5ad` - -Format: - -
- - AnnData object - layers: 'counts' - uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'train_sum' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:-----------------------------|:----------|:-------------------------------------------------------------------------------| -| `layers["counts"]` | `integer` | Raw counts. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["dataset_name"]` | `string` | Nicely formatted name. | -| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | -| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | -| `uns["dataset_summary"]` | `string` | Short description of the dataset. | -| `uns["dataset_description"]` | `string` | Long description of the dataset. | -| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | -| `uns["train_sum"]` | `integer` | The total number of counts in the training dataset. | - -
- -## Component type: Control method - -Path: -[`src/denoising/control_methods`](https://github.com/openproblems-bio/openproblems/tree/main/src/denoising/control_methods) - -Quality control methods for verifying the pipeline. - -Arguments: - -
- -| Name | Type | Description | -|:----------------|:-------|:---------------------------------------------------------------| -| `--input_train` | `file` | The subset of molecules used for the training dataset. | -| `--input_test` | `file` | The subset of molecules used for the test dataset. | -| `--output` | `file` | (*Output*) A denoised dataset as output by a denoising method. | - -
- -## Component type: Method - -Path: -[`src/denoising/methods`](https://github.com/openproblems-bio/openproblems/tree/main/src/denoising/methods) - -A denoising method. - -Arguments: - -
- -| Name | Type | Description | -|:----------------|:-------|:---------------------------------------------------------------| -| `--input_train` | `file` | The subset of molecules used for the training dataset. | -| `--output` | `file` | (*Output*) A denoised dataset as output by a denoising method. | - -
- -## Component type: Metric - -Path: -[`src/denoising/metrics`](https://github.com/openproblems-bio/openproblems/tree/main/src/denoising/metrics) - -A denoising metric. - -Arguments: - -
- -| Name | Type | Description | -|:-------------------|:-------|:----------------------------------------------------| -| `--input_test` | `file` | The subset of molecules used for the test dataset. | -| `--input_denoised` | `file` | A denoised dataset as output by a denoising method. | -| `--output` | `file` | (*Output*) Metric score file. | - -
- -## File format: Denoised data - -A denoised dataset as output by a denoising method. - -Example file: `resources_test/denoising/pancreas/denoised.h5ad` - -Format: - -
- - AnnData object - layers: 'denoised' - uns: 'dataset_id', 'method_id' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:---------------------|:----------|:-------------------------------------| -| `layers["denoised"]` | `integer` | denoised data. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["method_id"]` | `string` | A unique identifier for the method. | - -
- -## File format: Score - -NA - -Example file: `resources_test/denoising/pancreas/score.h5ad` - -Description: - -Metric score file - -Format: - -
- - AnnData object - uns: 'dataset_id', 'method_id', 'metric_ids', 'metric_values' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:-----------------------|:---------|:---------------------------------------------------------------------------------------------| -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["method_id"]` | `string` | A unique identifier for the method. | -| `uns["metric_ids"]` | `string` | One or more unique metric identifiers. | -| `uns["metric_values"]` | `double` | The metric values obtained for the given prediction. Must be of same length as ‘metric_ids’. | - -
- +# This task has been moved to [https://github.com/openproblems-bio/task_denoising](https://github.com/openproblems-bio/task_denoising)! diff --git a/src/tasks/denoising/api/comp_control_method.yaml b/src/tasks/denoising/api/comp_control_method.yaml deleted file mode 100644 index 6fe13f2a35..0000000000 --- a/src/tasks/denoising/api/comp_control_method.yaml +++ /dev/null @@ -1,33 +0,0 @@ -functionality: - namespace: "denoising/control_methods" - info: - type: control_method - type_info: - label: Control method - summary: Quality control methods for verifying the pipeline. - description: | - These components have the same interface as the regular methods - but also receive the solution object as input. It serves as a - starting point to test the relative accuracy of new methods in - the task, and also as a quality control for the metrics defined - in the task. - arguments: - - name: "--input_train" - __merge__: file_train.yaml - direction: input - required: true - - name: "--input_test" - __merge__: file_test.yaml - direction: input - required: true - - name: "--output" - __merge__: file_denoised.yaml - direction: output - required: true - test_resources: - - type: python_script - path: /src/common/comp_tests/check_method_config.py - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py - - path: /resources_test/denoising/pancreas - dest: resources_test/denoising/pancreas \ No newline at end of file diff --git a/src/tasks/denoising/api/comp_method.yaml b/src/tasks/denoising/api/comp_method.yaml deleted file mode 100644 index 517723772d..0000000000 --- a/src/tasks/denoising/api/comp_method.yaml +++ /dev/null @@ -1,26 +0,0 @@ -functionality: - namespace: "denoising/methods" - info: - type: method - type_info: - label: Method - summary: A denoising method. - description: | - A denoising method to remove noise (i.e. technical artifacts) from a dataset. - arguments: - - name: "--input_train" - __merge__: file_train.yaml - direction: input - required: true - - name: "--output" - __merge__: file_denoised.yaml - direction: output - required: true - test_resources: - - type: python_script - path: /src/common/comp_tests/check_method_config.py - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py - - path: /resources_test/denoising/pancreas - dest: resources_test/denoising/pancreas - - path: /src/common/library.bib \ No newline at end of file diff --git a/src/tasks/denoising/api/comp_metric.yaml b/src/tasks/denoising/api/comp_metric.yaml deleted file mode 100644 index c2ef922239..0000000000 --- a/src/tasks/denoising/api/comp_metric.yaml +++ /dev/null @@ -1,31 +0,0 @@ -functionality: - namespace: "denoising/metrics" - info: - type: metric - type_info: - label: Metric - summary: A denoising metric. - description: | - A metric for evaluating denoised datasets. 
- arguments: - - name: "--input_test" - __merge__: file_test.yaml - direction: input - required: true - - name: "--input_denoised" - __merge__: file_denoised.yaml - direction: input - required: true - - name: "--output" - __merge__: file_score.yaml - direction: output - required: true - test_resources: - - type: python_script - path: /src/common/comp_tests/check_metric_config.py - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py - - path: /resources_test/denoising/pancreas - dest: resources_test/denoising/pancreas - - path: /src/common/library.bib - \ No newline at end of file diff --git a/src/tasks/denoising/api/comp_process_dataset.yaml b/src/tasks/denoising/api/comp_process_dataset.yaml deleted file mode 100644 index ce6874c0ea..0000000000 --- a/src/tasks/denoising/api/comp_process_dataset.yaml +++ /dev/null @@ -1,27 +0,0 @@ -functionality: - namespace: "denoising" - info: - type: process_dataset - type_info: - label: Data processor - summary: A denoising dataset processor. - description: | - A component for processing a Common Dataset into a task-specific dataset. - arguments: - - name: "--input" - __merge__: /src/datasets/api/file_common_dataset.yaml - direction: input - required: true - - name: "--output_train" - __merge__: file_train.yaml - direction: output - required: true - - name: "--output_test" - __merge__: file_test.yaml - direction: output - required: true - test_resources: - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py - - path: /resources_test/common/pancreas - dest: resources_test/common/pancreas diff --git a/src/tasks/denoising/api/file_common_dataset.yaml b/src/tasks/denoising/api/file_common_dataset.yaml deleted file mode 100644 index ff913ce0de..0000000000 --- a/src/tasks/denoising/api/file_common_dataset.yaml +++ /dev/null @@ -1,40 +0,0 @@ -type: file -example: "resources_test/common/pancreas/dataset.h5ad" -info: - label: "Common Dataset" - summary: A subset of the common dataset. - slots: - layers: - - type: integer - name: counts - description: Raw counts - required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - name: dataset_name - type: string - description: Nicely formatted name. - required: true - - type: string - name: dataset_url - description: Link to the original source of the dataset. - required: false - - name: dataset_reference - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: dataset_summary - type: string - description: Short description of the dataset. - required: true - - name: dataset_description - type: string - description: Long description of the dataset. - required: true - - name: dataset_organism - type: string - description: The organism of the sample in the dataset. - required: false diff --git a/src/tasks/denoising/api/file_denoised.yaml b/src/tasks/denoising/api/file_denoised.yaml deleted file mode 100644 index fc79694028..0000000000 --- a/src/tasks/denoising/api/file_denoised.yaml +++ /dev/null @@ -1,21 +0,0 @@ -type: file -example: "resources_test/denoising/pancreas/denoised.h5ad" -info: - label: "Denoised data" - summary: A denoised dataset as output by a denoising method. 
- slots: - layers: - - type: integer - name: denoised - description: denoised data - required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - type: string - name: method_id - description: "A unique identifier for the method" - required: true - \ No newline at end of file diff --git a/src/tasks/denoising/api/file_score.yaml b/src/tasks/denoising/api/file_score.yaml deleted file mode 100644 index 4f34eeb7f7..0000000000 --- a/src/tasks/denoising/api/file_score.yaml +++ /dev/null @@ -1,21 +0,0 @@ -type: file -description: "Metric score file" -example: "resources_test/denoising/pancreas/score.h5ad" -info: - label: "Score" - slots: - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - - type: string - name: method_id - description: "A unique identifier for the method" - - type: string - name: metric_ids - description: "One or more unique metric identifiers" - multiple: true - - type: double - name: metric_values - description: "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'." - multiple: true diff --git a/src/tasks/denoising/api/file_test.yaml b/src/tasks/denoising/api/file_test.yaml deleted file mode 100644 index 371b3054f7..0000000000 --- a/src/tasks/denoising/api/file_test.yaml +++ /dev/null @@ -1,44 +0,0 @@ -type: file -example: "resources_test/denoising/pancreas/test.h5ad" -info: - label: "Test data" - summary: The subset of molecules used for the test dataset - slots: - layers: - - type: integer - name: counts - description: Raw counts - required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - name: dataset_name - type: string - description: Nicely formatted name. - required: true - - type: string - name: dataset_url - description: Link to the original source of the dataset. - required: false - - name: dataset_reference - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: dataset_summary - type: string - description: Short description of the dataset. - required: true - - name: dataset_description - type: string - description: Long description of the dataset. - required: true - - name: dataset_organism - type: string - description: The organism of the sample in the dataset. - required: false - - name: train_sum - type: integer - description: The total number of counts in the training dataset. 
- required: true \ No newline at end of file diff --git a/src/tasks/denoising/api/file_train.yaml b/src/tasks/denoising/api/file_train.yaml deleted file mode 100644 index 302eae2d5c..0000000000 --- a/src/tasks/denoising/api/file_train.yaml +++ /dev/null @@ -1,16 +0,0 @@ -type: file -example: "resources_test/denoising/pancreas/train.h5ad" -info: - label: "Training data" - summary: The subset of molecules used for the training dataset - slots: - layers: - - type: integer - name: counts - description: Raw counts - required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true \ No newline at end of file diff --git a/src/tasks/denoising/api/task_info.yaml b/src/tasks/denoising/api/task_info.yaml deleted file mode 100644 index f7de1118f2..0000000000 --- a/src/tasks/denoising/api/task_info.yaml +++ /dev/null @@ -1,54 +0,0 @@ -name: denoising -label: Denoising -v1: - path: openproblems/tasks/denoising/README.md - commit: 3fe9251ba906061b6769eed2ac9da0db5f8e26bb -summary: "Removing noise in sparse single-cell RNA-sequencing count data" -image: "thumbnail.svg" -motivation: | - Single-cell RNA-Seq protocols only detect a fraction of the mRNA molecules present - in each cell. As a result, the measurements (UMI counts) observed for each gene and each - cell are associated with generally high levels of technical noise ([Grün et al., - 2014](https://www.nature.com/articles/nmeth.2930)). Denoising describes the task of - estimating the true expression level of each gene in each cell. In the single-cell - literature, this task is also referred to as *imputation*, a term which is typically - used for missing data problems in statistics. Similar to the use of the terms "dropout", - "missing data", and "technical zeros", this terminology can create confusion about the - underlying measurement process ([Sarkar and Stephens, - 2020](https://www.biorxiv.org/content/10.1101/2020.04.07.030007v2)). -description: | - A key challenge in evaluating denoising methods is the general lack of a ground truth. A - recent benchmark study ([Hou et al., - 2020](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02132-x)) - relied on flow-sorted datasets, mixture control experiments ([Tian et al., - 2019](https://www.nature.com/articles/s41592-019-0425-8)), and comparisons with bulk - RNA-Seq data. Since each of these approaches suffers from specific limitations, it is - difficult to combine these different approaches into a single quantitative measure of - denoising accuracy. Here, we instead rely on an approach termed molecular - cross-validation (MCV), which was specifically developed to quantify denoising accuracy - in the absence of a ground truth ([Batson et al., - 2019](https://www.biorxiv.org/content/10.1101/786269v1)). In MCV, the observed molecules - in a given scRNA-Seq dataset are first partitioned between a *training* and a *test* - dataset. Next, a denoising method is applied to the training dataset. Finally, denoising - accuracy is measured by comparing the result to the test dataset. The authors show that - both in theory and in practice, the measured denoising accuracy is representative of the - accuracy that would be obtained on a ground truth dataset. 
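The molecular cross-validation split described above can be illustrated with a short sketch. This is not the deleted `process_dataset` component, only an assumption of how such a split can be done: each observed UMI is independently assigned to the training set with some probability (here 0.9) via binomial thinning, so train and test always sum to the original counts.

```python
import numpy as np

def mcv_split(counts: np.ndarray, train_frac: float = 0.9, seed: int = 0):
    """Partition the observed molecules of each cell into a train and a test matrix.

    Every UMI is kept in the training set with probability `train_frac`;
    the remaining molecules form the test set, so train + test == counts.
    """
    rng = np.random.default_rng(seed)
    counts = counts.astype(np.int64)
    train = rng.binomial(counts, train_frac)
    test = counts - train
    return train, test

# toy usage: 5 cells x 4 genes of raw counts
counts = np.random.default_rng(1).poisson(5, size=(5, 4))
train, test = mcv_split(counts)
assert (train + test == counts).all()
```

The split fraction and the independent binomial assignment are illustrative assumptions; the benchmark's own data processor may choose them differently.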
-authors: - - name: "Wesley Lewis" - roles: [ author, maintainer ] - info: - github: wes-lewis - - name: "Scott Gigante" - roles: [ author, maintainer ] - info: - github: scottgigante - orcid: "0000-0002-4544-2764" - - name: Robrecht Cannoodt - roles: [ author ] - info: - github: rcannood - orcid: "0000-0003-3641-729X" - - name: Kai Waldrant - roles: [ author ] - info: - github: KaiWaldrant \ No newline at end of file diff --git a/src/tasks/denoising/api/thumbnail.svg b/src/tasks/denoising/api/thumbnail.svg deleted file mode 100644 index 65936f0e1e..0000000000 --- a/src/tasks/denoising/api/thumbnail.svg +++ /dev/null @@ -1 +0,0 @@ -dim-2dim-1dim-2dim-1 \ No newline at end of file diff --git a/src/tasks/denoising/control_methods/no_denoising/config.vsh.yaml b/src/tasks/denoising/control_methods/no_denoising/config.vsh.yaml deleted file mode 100644 index 64a35f9986..0000000000 --- a/src/tasks/denoising/control_methods/no_denoising/config.vsh.yaml +++ /dev/null @@ -1,22 +0,0 @@ -__merge__: ../../api/comp_control_method.yaml -functionality: - name: "no_denoising" - info: - label: No Denoising - summary: "negative control by copying train counts" - description: "This method serves as a negative control, where the denoised data is a copy of the unaltered training data. This represents the scoring threshold if denoising was not performed on the data." - v1: - path: openproblems/tasks/denoising/methods/baseline.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - variants: - no_denoising: - preferred_normalization: counts - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [midtime, midmem, midcpu] diff --git a/src/tasks/denoising/control_methods/no_denoising/script.py b/src/tasks/denoising/control_methods/no_denoising/script.py deleted file mode 100644 index 97c9a4184c..0000000000 --- a/src/tasks/denoising/control_methods/no_denoising/script.py +++ /dev/null @@ -1,22 +0,0 @@ -import anndata as ad - -## VIASH START -par = { - 'input_train': 'output_train.h5ad', - 'output': 'output_ND.h5ad', -} -meta = { - 'functionality_name': 'foo', -} -## VIASH END - -print("Load input data", flush=True) -input_train = ad.read_h5ad(par['input_train']) - -print("Process data", flush=True) -input_train.layers["denoised"] = input_train.layers['counts'] - -input_train.uns["method_id"] = meta['functionality_name'] - -print("Write Data", flush=True) -input_train.write_h5ad(par['output'],compression="gzip") diff --git a/src/tasks/denoising/control_methods/perfect_denoising/config.vsh.yaml b/src/tasks/denoising/control_methods/perfect_denoising/config.vsh.yaml deleted file mode 100644 index b16862360b..0000000000 --- a/src/tasks/denoising/control_methods/perfect_denoising/config.vsh.yaml +++ /dev/null @@ -1,22 +0,0 @@ -__merge__: ../../api/comp_control_method.yaml -functionality: - name: "perfect_denoising" - info: - label: Perfect Denoising - summary: "Positive control by copying the test counts" - description: "This method serves as a positive control, where the test data is copied 1-to-1 to the denoised data. This makes it seem as if the data is perfectly denoised as it will be compared to the test data in the metrics." 
- v1: - path: openproblems/tasks/denoising/methods/baseline.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - variants: - perfect_denoising: - preferred_normalization: counts - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [midtime, midmem, midcpu] diff --git a/src/tasks/denoising/control_methods/perfect_denoising/script.py b/src/tasks/denoising/control_methods/perfect_denoising/script.py deleted file mode 100644 index c280a4a3bc..0000000000 --- a/src/tasks/denoising/control_methods/perfect_denoising/script.py +++ /dev/null @@ -1,24 +0,0 @@ -import anndata as ad - -## VIASH START -par = { - 'input_train': 'resources_test/denoising/pancreas/train.h5ad', - 'input_test': 'resources_test/denoising/pancreas/test.h5ad', - 'output': 'output_PD.h5ad', -} -meta = { - 'functionality_name': 'foo', -} -## VIASH END - -print("Load input data", flush=True) -input_train = ad.read_h5ad(par['input_train']) -input_test = ad.read_h5ad(par['input_test']) - -print("Process data", flush=True) -input_train.layers["denoised"] = input_test.layers['counts'] - -input_train.uns["method_id"] = meta['functionality_name'] - -print("Write Data", flush=True) -input_train.write_h5ad(par['output'],compression="gzip") diff --git a/src/tasks/denoising/methods/alra/config.vsh.yaml b/src/tasks/denoising/methods/alra/config.vsh.yaml deleted file mode 100644 index 374d317fce..0000000000 --- a/src/tasks/denoising/methods/alra/config.vsh.yaml +++ /dev/null @@ -1,43 +0,0 @@ -__merge__: ../../api/comp_method.yaml -functionality: - name: "alra" - info: - label: ALRA - summary: "ALRA imputes missing values in scRNA-seq data by computing rank-k approximation, thresholding by gene, and rescaling the matrix." - description: | - Adaptively-thresholded Low Rank Approximation (ALRA). - - ALRA is a method for imputation of missing values in single cell RNA-sequencing data, - described in the preprint, "Zero-preserving imputation of scRNA-seq data using low-rank approximation" - available [here](https://www.biorxiv.org/content/early/2018/08/22/397588). Given a - scRNA-seq expression matrix, ALRA first computes its rank-k approximation using randomized SVD. - Next, each row (gene) is thresholded by the magnitude of the most negative value of that gene. - Finally, the matrix is rescaled. 
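The deleted component wraps the ALRA R package; the Python sketch below only illustrates the three steps named in the description (rank-k SVD approximation, per-gene thresholding at the magnitude of the gene's most negative value, rescaling). The choice of k, the rescaling rule, and the use of a plain truncated SVD instead of randomized SVD are simplifying assumptions.

```python
import numpy as np
from scipy.sparse.linalg import svds

def alra_like(norm_expr: np.ndarray, k: int = 20) -> np.ndarray:
    """Simplified ALRA-style imputation on a cells x genes normalized matrix."""
    # 1) rank-k approximation via truncated SVD
    u, s, vt = svds(norm_expr, k=k)
    low_rank = u @ np.diag(s) @ vt

    # 2) threshold each gene (column) by the magnitude of its most negative value
    thresh = np.abs(low_rank.min(axis=0))
    imputed = np.where(np.abs(low_rank) > thresh, low_rank, 0.0)
    imputed = np.maximum(imputed, 0.0)

    # 3) rescale each gene so its non-zero mean matches the observed non-zero mean
    for g in range(norm_expr.shape[1]):
        obs_nz = norm_expr[norm_expr[:, g] > 0, g]
        imp_nz = imputed[imputed[:, g] > 0, g]
        if obs_nz.size and imp_nz.size:
            imputed[imputed[:, g] > 0, g] *= obs_nz.mean() / imp_nz.mean()

    # keep observed positive values where thresholding zeroed them out
    return np.where((imputed == 0) & (norm_expr > 0), norm_expr, imputed)
```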
- reference: "linderman2018zero" - repository_url: "https://github.com/KlugerLab/ALRA" - documentation_url: https://github.com/KlugerLab/ALRA/blob/master/README.md - v1: - path: openproblems/tasks/denoising/methods/alra.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - variants: - alra: - preferred_normalization: counts - arguments: - - name: "--norm" - type: string - choices: ["sqrt", "log"] - default: "log" - description: Normalization method - resources: - - type: r_script - path: script.R -platforms: - - type: docker - image: openproblems/base_r:1.0.0 - setup: - - type: r - cran: [ Matrix, rsvd ] - github: KlugerLab/ALRA - - type: nextflow - directives: - label: [midtime, highmem, highcpu] diff --git a/src/tasks/denoising/methods/alra/script.R b/src/tasks/denoising/methods/alra/script.R deleted file mode 100644 index 9a5b237c6f..0000000000 --- a/src/tasks/denoising/methods/alra/script.R +++ /dev/null @@ -1,53 +0,0 @@ -cat(">> Loading dependencies\n") -library(anndata, warn.conflicts = FALSE) -library(ALRA, warn.conflicts = FALSE) - -## VIASH START -par <- list( - input_train = "resources_test/denoising/pancreas/train.h5ad", - norm = "log", - output = "output.h5ad" -) -meta <- list( - functionality_name = "alra" -) -## VIASH END - -cat(">> Load input data\n") -input_train <- read_h5ad(par$input_train, backed = "r") - -cat(">> Set normalization method\n") -if (par$norm == "sqrt") { - norm_fn <- sqrt - denorm_fn <- function(x) x^2 -} else if (par$norm == "log") { - norm_fn <- log1p - denorm_fn <- expm1 -} else { - stop("Unknown normalization method: ", par$norm) -} - -cat(">> Normalize data\n") -data <- as.matrix(input_train$layers[["counts"]]) -totalPerCell <- rowSums(data) -data <- sweep(data, 1, totalPerCell, "/") -data <- norm_fn(data) - -cat(">> Run ALRA\n") -data <- alra(data)$A_norm_rank_k_cor_sc -data <- denorm_fn(data) -data <- sweep(data, 1, totalPerCell, "*") - -cat(">> Store output\n") -output <- AnnData( - layers = list(denoised = data), - obs = input_train$obs[, c(), drop = FALSE], - var = input_train$var[, c(), drop = FALSE], - uns = list( - dataset_id = input_train$uns[["dataset_id"]], - method_id = meta$functionality_name - ) -) - -cat(">> Write output to file\n") -output$write_h5ad(par$output, compression = "gzip") diff --git a/src/tasks/denoising/methods/dca/config.vsh.yaml b/src/tasks/denoising/methods/dca/config.vsh.yaml deleted file mode 100644 index 33c6079866..0000000000 --- a/src/tasks/denoising/methods/dca/config.vsh.yaml +++ /dev/null @@ -1,45 +0,0 @@ -__merge__: ../../api/comp_method.yaml -functionality: - name: "dca" - info: - label: DCA - summary: "A deep autoencoder with ZINB loss function to address the dropout effect in count data" - description: | - "Deep Count Autoencoder - - Removes the dropout effect by taking the count structure, overdispersed nature and sparsity of the data into account - using a deep autoencoder with zero-inflated negative binomial (ZINB) loss function." 
- reference: "eraslan2019single" - documentation_url: "https://github.com/theislab/dca#readme" - repository_url: "https://github.com/theislab/dca" - v1: - path: openproblems/tasks/denoising/methods/dca.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - variants: - dca: - preferred_normalization: counts - arguments: - - name: "--epochs" - type: "integer" - default: 300 - description: "Number of total epochs in training" - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: python:3.9 - setup: - - type: apt - packages: procps - - type: python - packages: - - anndata~=0.8.0 - - scanpy - - pyyaml - - requests - - jsonschema - - "git+https://github.com/scottgigante-immunai/dca.git@patch-1" - - type: nextflow - directives: - label: [midtime, highmem, highcpu] diff --git a/src/tasks/denoising/methods/dca/script.py b/src/tasks/denoising/methods/dca/script.py deleted file mode 100644 index d35f3c00a5..0000000000 --- a/src/tasks/denoising/methods/dca/script.py +++ /dev/null @@ -1,39 +0,0 @@ -import anndata as ad -from dca.api import dca - -## VIASH START -par = { - 'input_train': 'resources_test/denoising/pancreas/train.h5ad', - 'output': 'output_dca.h5ad', - 'epochs': 300, -} -meta = { - 'functionality_name': 'dca', -} -## VIASH END - -print("load input data", flush=True) -input_train = ad.read_h5ad(par['input_train'], backed="r") - -print("Remove unneeded data", flush=True) -output = ad.AnnData( - X=input_train.layers["counts"], - obs=input_train.obs[[]], - var=input_train.var[[]], - uns={ - "dataset_id": input_train.uns["dataset_id"], - "method_id": meta["functionality_name"] - } -) - -del input_train - -print("Run DCA", flush=True) -dca(output, epochs=par["epochs"]) - -print("Move output to correct location", flush=True) -output.layers["denoised"] = output.X -del output.X - -print("Writing data", flush=True) -output.write_h5ad(par["output"], compression="gzip") diff --git a/src/tasks/denoising/methods/knn_smoothing/config.vsh.yaml b/src/tasks/denoising/methods/knn_smoothing/config.vsh.yaml deleted file mode 100644 index b0c55ae0d8..0000000000 --- a/src/tasks/denoising/methods/knn_smoothing/config.vsh.yaml +++ /dev/null @@ -1,41 +0,0 @@ -__merge__: ../../api/comp_method.yaml -functionality: - name: "knn_smoothing" - info: - label: KNN Smoothing - summary: "Iterative kNN-smoothing denoises scRNA-seq data by iteratively increasing the size of neighbourhoods for smoothing until a maximum k value is reached." - description: "Iterative kNN-smoothing is a method to repair or denoise noisy scRNA-seq - expression matrices. Given a scRNA-seq expression matrix, KNN-smoothing first - applies initial normalisation and smoothing. Then, a chosen number of - principal components is used to calculate Euclidean distances between cells. - Minimally sized neighbourhoods are initially determined from these Euclidean - distances, and expression profiles are shared between neighbouring cells. - Then, the resultant smoothed matrix is used as input to the next step of - smoothing, where the size (k) of the considered neighbourhoods is increased, - leading to greater smoothing. This process continues until a chosen maximum k - value has been reached, at which point the iteratively smoothed object is - then optionally scaled to yield a final result." 
- reference: "wagner2018knearest" - documentation_url: "https://github.com/yanailab/knn-smoothing#readme" - repository_url: "https://github.com/yanailab/knn-smoothing" - v1: - path: openproblems/tasks/denoising/methods/knn_smoothing.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - variants: - knn_smoothing: - preferred_normalization: counts - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - packages: - - scipy - github: - - scottgigante-immunai/knn-smoothing@python_package - - type: nextflow - directives: - label: [midtime, highmem, highcpu] diff --git a/src/tasks/denoising/methods/knn_smoothing/script.py b/src/tasks/denoising/methods/knn_smoothing/script.py deleted file mode 100644 index 450da2012a..0000000000 --- a/src/tasks/denoising/methods/knn_smoothing/script.py +++ /dev/null @@ -1,39 +0,0 @@ -import knn_smooth -import anndata as ad - -## VIASH START -par = { - 'input_train': 'resources_test/denoising/pancreas/train.h5ad', - 'output': 'output_knn.h5ad', -} -meta = { - 'functionality_name': 'foo', -} -## VIASH END - -print("Load input data", flush=True) -input_train = ad.read_h5ad(par["input_train"], backed="r") - -print("Remove unneeded data", flush=True) -X = input_train.layers["counts"].astype(float).transpose().toarray() - -# Create output AnnData for later use -output = ad.AnnData( - obs=input_train.obs[[]], - var=input_train.var[[]], - uns={ - "dataset_id": input_train.uns["dataset_id"], - "method_id": meta["functionality_name"] - } -) - -del input_train - -print("Run KNN smoothing", flush=True) -X = knn_smooth.knn_smoothing(X, k=10).transpose() - -print("Process data", flush=True) -output.layers["denoised"] = X - -print("Writing data", flush=True) -output.write_h5ad(par["output"], compression="gzip") diff --git a/src/tasks/denoising/methods/magic/config.vsh.yaml b/src/tasks/denoising/methods/magic/config.vsh.yaml deleted file mode 100644 index 380666a1b5..0000000000 --- a/src/tasks/denoising/methods/magic/config.vsh.yaml +++ /dev/null @@ -1,63 +0,0 @@ -__merge__: ../../api/comp_method.yaml -functionality: - name: "magic" - info: - label: MAGIC - summary: "MAGIC imputes and denoises scRNA-seq data that is noisy or dropout-prone." - description: "MAGIC (Markov Affinity-based Graph Imputation of Cells) is a method for - imputation and denoising of noisy or dropout-prone single cell RNA-sequencing - data. Given a normalised scRNA-seq expression matrix, it first calculates - Euclidean distances between each pair of cells in the dataset, which is then - augmented using a Gaussian kernel (function) and row-normalised to give a - normalised affinity matrix. A t-step markov process is then calculated, by - powering this affinity matrix t times. Finally, the powered affinity matrix - is right-multiplied by the normalised data, causing the final imputed values - to take the value of a per-gene average weighted by the affinities of cells. - The resultant imputed matrix is then rescaled, to more closely match the - magnitude of measurements in the normalised (input) matrix." 
- reference: "van2018recovering" - documentation_url: "https://github.com/KrishnaswamyLab/MAGIC#readme" - repository_url: "https://github.com/KrishnaswamyLab/MAGIC" - v1: - path: openproblems/tasks/denoising/methods/magic.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - variants: - magic: - magic_approx: - solver: approximate - magic_knn_naive: - norm: log - decay: none - t: 1 - preferred_normalization: counts - arguments: - - name: "--solver" - type: "string" - choices: ["exact", "approximate"] - default: "exact" - description: Which solver to use. - - name: "--norm" - type: string - choices: ["sqrt", "log"] - default: "log" - description: Normalization method - - name: "--decay" - type: integer - default: 1 - description: sets decay rate of kernel tails - - name: "--t" - type: integer - default: 3 - description: power to which the diffusion operator is powered - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - pip: [scprep, magic-impute, scipy, scikit-learn<1.2] - - type: nextflow - directives: - label: [midtime, highmem, highcpu] diff --git a/src/tasks/denoising/methods/magic/script.py b/src/tasks/denoising/methods/magic/script.py deleted file mode 100644 index 075d2e21cd..0000000000 --- a/src/tasks/denoising/methods/magic/script.py +++ /dev/null @@ -1,76 +0,0 @@ -import anndata as ad -import numpy as np -import scprep -from magic import MAGIC -import scipy - - -## VIASH START -par = { - "input_train": "resources_test/denoising/pancreas/train.h5ad", - "output": "output_magic.h5ad", - "solver": "exact", - "norm": "sqrt", - "decay": 1, - "t": 3, -} -meta = { - "functionality_name": "foo", -} -## VIASH END - -print("Load data", flush=True) -input_train = ad.read_h5ad(par["input_train"], backed="r") - -print("Set normalization method", flush=True) -if par["norm"] == "sqrt": - norm_fn = np.sqrt - denorm_fn = np.square -elif par["norm"] == "log": - norm_fn = np.log1p - denorm_fn = np.expm1 -else: - raise ValueError("Unknown normalization method: " + par["norm"] + ".") - -print("Remove unneeded data", flush=True) -X = input_train.layers["counts"] - -# Create output AnnData for later use -output = ad.AnnData( - obs=input_train.obs[[]], - var=input_train.var[[]], - uns={ - "dataset_id": input_train.uns["dataset_id"], - "method_id": meta["functionality_name"] - } -) - -del input_train - -print("Normalize data", flush=True) -X, libsize = scprep.normalize.library_size_normalize( - X, - rescale=1, - return_library_size=True -) -X = scprep.utils.matrix_transform(X, norm_fn) - -print("Run MAGIC", flush=True) -magic = MAGIC( - solver=par["solver"], - decay=par["decay"], - t=par["t"], - verbose=False, -) -X = magic.fit_transform(X, genes="all_genes") - -print("Denormalizing data", flush=True) -X = scprep.utils.matrix_transform(X, denorm_fn) -X = scprep.utils.matrix_vector_elementwise_multiply(X, libsize, axis=0) - -print("Create output AnnData", flush=True) -output.layers["denoised"] = X - -print("Write Data", flush=True) -output.write_h5ad(par["output"], compression="gzip") - diff --git a/src/tasks/denoising/methods/saver/config.vsh.yaml b/src/tasks/denoising/methods/saver/config.vsh.yaml deleted file mode 100644 index 3c997fc36f..0000000000 --- a/src/tasks/denoising/methods/saver/config.vsh.yaml +++ /dev/null @@ -1,32 +0,0 @@ -__merge__: ../../api/comp_method.yaml -functionality: - name: saver - status: disabled - info: - label: SAVER - summary: SAVER (Single-cell Analysis Via Expression 
Recovery) implements a regularized regression prediction and empirical Bayes method to recover the true gene expression profile. - description: | - SAVER takes advantage of gene-to-gene relationships to recover the true expression level of each gene in each cell, - removing technical variation while retaining biological variation across cells (https://github.com/mohuangx/SAVER). - SAVER uses a post-quality-control scRNA-seq dataset with UMI counts as input. SAVER assumes that the count of each - gene in each cell follows a Poisson-gamma mixture, also known as a negative binomial model. Instead of specifying - the gamma prior, we estimate the prior parameters in an empirical Bayes-like approach with a Poisson LASSO regression, - using the expression of other genes as predictors. Once the prior parameters are estimated, SAVER outputs the - posterior distribution of the true expression, which quantifies estimation uncertainty, and the posterior mean is - used as the SAVER recovered expression value. - reference: huang2018savergene - repository_url: https://github.com/mohuangx/SAVER - documentation_url: https://mohuangx.github.io/SAVER/index.html - preferred_normalization: counts - resources: - - type: r_script - path: script.R -platforms: - - type: docker - image: openproblems/base_r:1.0.0 - setup: - - type: r - github: mohuangx/SAVER - - type: nextflow - directives: - label: [midtime, highmem, highcpu] diff --git a/src/tasks/denoising/methods/saver/script.R b/src/tasks/denoising/methods/saver/script.R deleted file mode 100644 index f6a44f4c3a..0000000000 --- a/src/tasks/denoising/methods/saver/script.R +++ /dev/null @@ -1,39 +0,0 @@ -cat(">> Loading dependencies\n") -library(anndata, warn.conflicts = FALSE) -library(SAVER, warn.conflicts = FALSE) -library(Matrix, warn.conflicts = FALSE) - -## VIASH START -par <- list( - input_train = "resources_test/denoising/pancreas/train.h5ad", - norm = "log", - output = "output.h5ad" -) -meta <- list( - functionality_name = "saver", - ncpus = 30 -) -## VIASH END - -cat(">> Load input data\n") -input_train <- read_h5ad(par$input_train, backed = "r") - -cat(">> Normalize data\n") -data <- as(t(input_train$layers[["counts"]]), "CsparseMatrix") - -cat(">> Run SAVER\n") -data <- t(saver(data, ncores = meta$ncpus, estimates.only = TRUE)) - -cat(">> Store output\n") -output <- AnnData( - layers = list(denoised = data), - obs = input_train$obs[, c(), drop = FALSE], - var = input_train$var[, c(), drop = FALSE], - uns = list( - dataset_id = input_train$uns[["dataset_id"]], - method_id = meta$functionality_name - ) -) - -cat(">> Write output to file\n") -output$write_h5ad(par$output, compression = "gzip") diff --git a/src/tasks/denoising/metrics/mse/config.vsh.yaml b/src/tasks/denoising/metrics/mse/config.vsh.yaml deleted file mode 100644 index 8330a8de31..0000000000 --- a/src/tasks/denoising/metrics/mse/config.vsh.yaml +++ /dev/null @@ -1,30 +0,0 @@ -__merge__: ../../api/comp_metric.yaml -functionality: - name: "mse" - info: - metrics: - - name: mse - label: Mean-squared error - summary: "The mean squared error between the denoised counts and the true counts." 
- description: "The mean squared error between the denoised counts of the training dataset and the true counts of the test dataset after reweighing by the train/test ratio" - reference: batson2019molecular - v1: - path: openproblems/tasks/denoising/metrics/mse.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - maximize: false - min: 0 - max: "+.inf" - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - packages: - - scikit-learn - - scprep - - type: nextflow - directives: - label: [midtime, highmem, midcpu] diff --git a/src/tasks/denoising/metrics/mse/script.py b/src/tasks/denoising/metrics/mse/script.py deleted file mode 100644 index eba964f132..0000000000 --- a/src/tasks/denoising/metrics/mse/script.py +++ /dev/null @@ -1,51 +0,0 @@ -import anndata as ad -import scanpy as sc -import sklearn.metrics -import scprep - -## VIASH START -par = { - 'input_test': 'resources_test/denoising/pancreas/test.h5ad', - 'input_denoised': 'resources_test/denoising/pancreas/magic.h5ad', - 'output': 'output_mse.h5ad' -} -meta = { - 'functionality_name': 'mse' -} -## VIASH END - -print("Load data", flush=True) -input_denoised = ad.read_h5ad(par['input_denoised'], backed="r") -input_test = ad.read_h5ad(par['input_test'], backed="r") - -test_data = ad.AnnData(X=input_test.layers["counts"], dtype="float") -denoised_data = ad.AnnData(X=input_denoised.layers["denoised"], dtype="float") - -print("Normalize data", flush=True) - -# scaling and transformation -target_sum = 10000 - -sc.pp.normalize_total(test_data, target_sum) -sc.pp.log1p(test_data) - -sc.pp.normalize_total(denoised_data, target_sum) -sc.pp.log1p(denoised_data) - -print("Compute mse value", flush=True) -error = sklearn.metrics.mean_squared_error( - scprep.utils.toarray(test_data.X), scprep.utils.toarray(denoised_data.X) -) - -print("Store mse value", flush=True) -output = ad.AnnData( - uns={ key: val for key, val in input_test.uns.items() }, -) - -output.uns["method_id"] = input_denoised.uns["method_id"] -output.uns["metric_ids"] = meta['functionality_name'] -output.uns["metric_values"] = error - -print("Write adata to file", flush=True) -output.write_h5ad(par['output'], compression="gzip") - diff --git a/src/tasks/denoising/metrics/poisson/config.vsh.yaml b/src/tasks/denoising/metrics/poisson/config.vsh.yaml deleted file mode 100644 index e523a9306e..0000000000 --- a/src/tasks/denoising/metrics/poisson/config.vsh.yaml +++ /dev/null @@ -1,28 +0,0 @@ -__merge__: ../../api/comp_metric.yaml -functionality: - name: "poisson" - info: - metrics: - - name: poisson - label: Poisson Loss - summary: "The Poisson log likelihood of the true counts observed in the distribution of denoised counts" - description: "The Poisson log likelihood of observing the true counts of the test dataset given the distribution given in the denoised dataset." 
- reference: batson2019molecular - v1: - path: openproblems/tasks/denoising/metrics/poisson.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - maximize: false - min: 0 - max: "+.inf" - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - pip: scprep - - type: nextflow - directives: - label: [midtime, highmem, midcpu] \ No newline at end of file diff --git a/src/tasks/denoising/metrics/poisson/script.py b/src/tasks/denoising/metrics/poisson/script.py deleted file mode 100644 index 537ccf0119..0000000000 --- a/src/tasks/denoising/metrics/poisson/script.py +++ /dev/null @@ -1,46 +0,0 @@ -import anndata as ad -import scprep -import numpy as np - -## VIASH START -par = { - 'input_denoised': 'output_magic.h5ad', - 'input_test': 'output_test.h5ad', - 'output': 'output_poisson.h5ad' -} -meta = { - 'functionality_name': 'poisson' -} -## VIASH END - -print("Load Data", flush=True) -input_denoised = ad.read_h5ad(par['input_denoised'], backed="r") -input_test = ad.read_h5ad(par['input_test'], backed="r") - -test_data = scprep.utils.toarray(input_test.layers["counts"]) -denoised_data = scprep.utils.toarray(input_denoised.layers["denoised"]) - -print("Compute metric value", flush=True) -# scaling -initial_sum = input_test.uns["train_sum"] -target_sum = test_data.sum() -denoised_data = denoised_data * target_sum / initial_sum - -# from molecular_cross_validation.mcv_sweep import poisson_nll_loss -# copied from: https://github.com/czbiohub/molecular-cross-validation/blob/master/src/molecular_cross_validation/mcv_sweep.py -def poisson_nll_loss(y_pred: np.ndarray, y_true: np.ndarray) -> float: - return (y_pred - y_true * np.log(y_pred + 1e-6)).mean() - -error = poisson_nll_loss(test_data, denoised_data) - -print("Store poisson value", flush=True) -output = ad.AnnData( - uns={ key: val for key, val in input_test.uns.items() }, -) - -output.uns["method_id"] = input_denoised.uns["method_id"] -output.uns["metric_ids"] = meta['functionality_name'] -output.uns["metric_values"] = error - -print("Write adata to file", flush=True) -output.write_h5ad(par['output'], compression="gzip") diff --git a/src/tasks/denoising/process_dataset/config.vsh.yaml b/src/tasks/denoising/process_dataset/config.vsh.yaml deleted file mode 100644 index c9b5b06c1a..0000000000 --- a/src/tasks/denoising/process_dataset/config.vsh.yaml +++ /dev/null @@ -1,37 +0,0 @@ -__merge__: ../api/comp_process_dataset.yaml -functionality: - name: "process_dataset" - description: | - Split data using molecular cross-validation. - - Splits molecules into two (potentially overlapping) groups using a fraction ratio. - These are output as two separate AnnData objects. - arguments: - - name: "--method" - type: "string" - description: "The process method to assign train/test." - choices: ["mcv"] - default: "mcv" - - name: "--train_frac" - type: "double" - description: "The fraction the molecules need to be split to train dataset" - default: 0.9 - - name: "--seed" - type: "integer" - description: "A seed for the subsampling." 
- example: 123 - resources: - - type: python_script - path: script.py - - path: helper.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - packages: - - numpy - - scipy - - type: nextflow - directives: - label: [highmem, midcpu , midtime] diff --git a/src/tasks/denoising/process_dataset/helper.py b/src/tasks/denoising/process_dataset/helper.py deleted file mode 100644 index 2044ed4c6e..0000000000 --- a/src/tasks/denoising/process_dataset/helper.py +++ /dev/null @@ -1,55 +0,0 @@ -# MIT License - -# Copyright (c) 2019 Chan Zuckerberg Biohub - -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: - -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. - -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - -# Copied from https://github.com/czbiohub/molecular-cross-validation/blob/master/src/molecular_cross_validation/util.py - - -from typing import Tuple - -import numpy as np - -def split_molecules( - umis: np.ndarray, - data_split: float, - overlap_factor: float = 0.0, - random_state: np.random.RandomState = None, -) -> Tuple[np.ndarray, np.ndarray]: - """Splits molecules into two (potentially overlapping) groups. 
- :param umis: Array of molecules to split - :param data_split: Proportion of molecules to assign to the first group - :param overlap_factor: Overlap correction factor, if desired - :param random_state: For reproducible sampling - :return: umis_X and umis_Y, representing ``split`` and ``~(1 - split)`` counts - sampled from the input array - """ - if random_state is None: - random_state = np.random.RandomState() - - umis_X_disjoint = random_state.binomial(umis, data_split - overlap_factor) - umis_Y_disjoint = random_state.binomial( - umis - umis_X_disjoint, (1 - data_split) / (1 - data_split + overlap_factor) - ) - overlap_factor = umis - umis_X_disjoint - umis_Y_disjoint - umis_X = umis_X_disjoint + overlap_factor - umis_Y = umis_Y_disjoint + overlap_factor - - return umis_X, umis_Y \ No newline at end of file diff --git a/src/tasks/denoising/process_dataset/script.py b/src/tasks/denoising/process_dataset/script.py deleted file mode 100644 index 94a5884046..0000000000 --- a/src/tasks/denoising/process_dataset/script.py +++ /dev/null @@ -1,75 +0,0 @@ -import sys -import anndata as ad -import numpy as np - -## VIASH START -par = { - 'input': "resources_test/common/pancreas/dataset.h5ad", - 'output_train': "train.h5ad", - 'output_test': "test.h5ad", - 'train_frac': 0.9, - 'seed': 0 -} -meta = { - "functionality_name": "process_dataset", - "resources_dir": "src/tasks/denoising/process_dataset" -} -## VIASH END - -# add helper scripts to path -sys.path.append(meta["resources_dir"]) -from helper import split_molecules - -# set random state -random_state = np.random.RandomState(par['seed']) - -print(">> Load Data", flush=True) -adata = ad.read_h5ad(par["input"]) - -# remove all layers except for counts -for key in list(adata.layers.keys()): - if key != "counts": - del adata.layers[key] - -# round counts and convert to int -counts = np.array(adata.layers["counts"]).round().astype(int) - -print(">> process and split data", flush=True) -train_data, test_data = split_molecules( - counts.data, par["train_frac"], 0.0, random_state -) - -X_train = counts.copy() -X_test = counts.copy() -X_train.data = train_data -X_test.data = test_data -X_train.eliminate_zeros() -X_test.eliminate_zeros() - -# copy adata to train_set, test_set -output_train = ad.AnnData( - layers={"counts": X_train}, - obs=adata.obs[[]], - var=adata.var[[]], - uns={"dataset_id": adata.uns["dataset_id"]} -) -test_uns_keys = ["dataset_id", "dataset_name", "dataset_url", "dataset_reference", "dataset_summary", "dataset_description", "dataset_organism"] -output_test = ad.AnnData( - layers={"counts": X_test}, - obs=adata.obs[[]], - var=adata.var[[]], - uns={key: adata.uns[key] for key in test_uns_keys} -) - -# add additional information for the train set -output_test.uns["train_sum"] = X_train.sum() - -# Remove no cells that do not have enough reads -is_missing = np.array(X_train.sum(axis=0) == 0) - -output_train = output_train[:, ~is_missing.flatten()] -output_test = output_test[:, ~is_missing.flatten()] - -print(">> Write to file", flush=True) -output_train.write_h5ad(par["output_train"]) -output_test.write_h5ad(par["output_test"]) diff --git a/src/tasks/denoising/resources_scripts/process_datasets.sh b/src/tasks/denoising/resources_scripts/process_datasets.sh deleted file mode 100755 index 873b9fb0b4..0000000000 --- a/src/tasks/denoising/resources_scripts/process_datasets.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash - -cat > /tmp/params.yaml << 'HERE' -id: denoising_process_datasets -input_states: 
s3://openproblems-data/resources/datasets/**/log_cp10k/state.yaml -rename_keys: 'input:output_dataset' -settings: '{"output_train": "$id/train.h5ad", "output_test": "$id/test.h5ad"}' -output_state: "$id/state.yaml" -publish_dir: s3://openproblems-data/resources/denoising/datasets -HERE - -cat > /tmp/nextflow.config << HERE -process { - executor = 'awsbatch' - withName:'.*publishStatesProc' { - memory = '16GB' - disk = '100GB' - } - withLabel:highmem { - memory = '350GB' - } -} -HERE - -tw launch https://github.com/openproblems-bio/openproblems.git \ - --revision main_build \ - --pull-latest \ - --main-script target/nextflow/denoising/workflows/process_datasets/main.nf \ - --workspace 53907369739130 \ - --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ - --params-file /tmp/params.yaml \ - --entry-name auto \ - --config /tmp/nextflow.config \ - --labels denoising,process_datasets \ No newline at end of file diff --git a/src/tasks/denoising/resources_scripts/run_benchmark.sh b/src/tasks/denoising/resources_scripts/run_benchmark.sh deleted file mode 100755 index 8e38568ac8..0000000000 --- a/src/tasks/denoising/resources_scripts/run_benchmark.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)" -publish_dir="s3://openproblems-data/resources/denoising/results/${RUN_ID}" - -# make sure only log_cp10k is used -cat > /tmp/params.yaml << HERE -input_states: s3://openproblems-data/resources/denoising/datasets/**/log_cp10k/state.yaml -rename_keys: 'input_train:output_train,input_test:output_test' -output_state: "state.yaml" -publish_dir: "$publish_dir" -HERE - -tw launch https://github.com/openproblems-bio/openproblems.git \ - --revision main_build \ - --pull-latest \ - --main-script target/nextflow/denoising/workflows/run_benchmark/main.nf \ - --workspace 53907369739130 \ - --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ - --params-file /tmp/params.yaml \ - --entry-name auto \ - --config src/wf_utils/labels_tw.config \ - --labels denoising,full \ No newline at end of file diff --git a/src/tasks/denoising/resources_scripts/run_benchmark_test.sh b/src/tasks/denoising/resources_scripts/run_benchmark_test.sh deleted file mode 100755 index c9023c26f1..0000000000 --- a/src/tasks/denoising/resources_scripts/run_benchmark_test.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -cat > /tmp/params.yaml << 'HERE' -input_states: s3://openproblems-data/resources_test/denoising/**/state.yaml -rename_keys: 'input_train:output_train,input_test:output_test' -output_state: "state.yaml" -publish_dir: s3://openproblems-nextflow/temp/denoising/ -HERE - -cat > /tmp/nextflow.config << HERE -process { - executor = 'awsbatch' -} -HERE - -tw launch https://github.com/openproblems-bio/openproblems.git \ - --revision main_build \ - --pull-latest \ - --main-script target/nextflow/denoising/workflows/run_benchmark/main.nf \ - --workspace 53907369739130 \ - --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ - --params-file /tmp/params.yaml \ - --entry-name auto \ - --config /tmp/nextflow.config \ - --labels denoising,test \ No newline at end of file diff --git a/src/tasks/denoising/resources_test_scripts/pancreas.sh b/src/tasks/denoising/resources_test_scripts/pancreas.sh deleted file mode 100755 index c737b39c2e..0000000000 --- a/src/tasks/denoising/resources_test_scripts/pancreas.sh +++ /dev/null @@ -1,51 +0,0 @@ -#!/bin/bash - -# get the root of the directory -REPO_ROOT=$(git rev-parse --show-toplevel) - -# ensure that the command below is run from the root of the repository -cd "$REPO_ROOT" - -set -e - 
-RAW_DATA=resources_test/common -DATASET_DIR=resources_test/denoising - -mkdir -p $DATASET_DIR - -# process dataset -echo Running process_dataset -nextflow run . \ - -main-script target/nextflow/denoising/workflows/process_datasets/main.nf \ - -profile docker \ - -entry auto \ - --input_states "$RAW_DATA/**/state.yaml" \ - --rename_keys 'input:output_dataset' \ - --settings '{"output_train": "$id/train.h5ad", "output_test": "$id/test.h5ad"}' \ - --publish_dir "$DATASET_DIR" \ - --output_state '$id/state.yaml' - -# run one method -viash run src/tasks/denoising/methods/magic/config.vsh.yaml -- \ - --input_train $DATASET_DIR/pancreas/train.h5ad \ - --output $DATASET_DIR/pancreas/denoised.h5ad - -# run one metric -viash run src/tasks/denoising/metrics/poisson/config.vsh.yaml -- \ - --input_denoised $DATASET_DIR/pancreas/denoised.h5ad \ - --input_test $DATASET_DIR/pancreas/test.h5ad \ - --output $DATASET_DIR/pancreas/score.h5ad - -# # run benchmark -# export NXF_VER=22.04.5 - -# nextflow \ -# run . \ -# -main-script src/tasks/denoising/workflows/run/main.nf \ -# -profile docker \ -# -resume \ -# --id pancreas \ -# --input_train $DATASET_DIR/train.h5ad \ -# --input_test $DATASET_DIR/test.h5ad \ -# --output scores.tsv \ -# --publish_dir $DATASET_DIR/ \ No newline at end of file diff --git a/src/tasks/denoising/workflows/process_datasets/config.vsh.yaml b/src/tasks/denoising/workflows/process_datasets/config.vsh.yaml deleted file mode 100644 index 6fc095704b..0000000000 --- a/src/tasks/denoising/workflows/process_datasets/config.vsh.yaml +++ /dev/null @@ -1,30 +0,0 @@ -functionality: - name: "process_datasets" - namespace: "denoising/workflows" - argument_groups: - - name: Inputs - arguments: - - name: "--input" - required: true - example: dataset.h5ad - __merge__: "/src/tasks/denoising/api/file_common_dataset.yaml" - - name: Outputs - arguments: - - name: "--output_train" - __merge__: "/src/tasks/denoising/api/file_train.yaml" - direction: output - required: true - - name: "--output_test" - __merge__: "/src/tasks/denoising/api/file_test.yaml" - direction: output - required: true - resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - path: /src/wf_utils/helper.nf - dependencies: - - name: common/check_dataset_schema - - name: denoising/process_dataset -platforms: - - type: nextflow diff --git a/src/tasks/denoising/workflows/process_datasets/main.nf b/src/tasks/denoising/workflows/process_datasets/main.nf deleted file mode 100644 index 4437206b09..0000000000 --- a/src/tasks/denoising/workflows/process_datasets/main.nf +++ /dev/null @@ -1,54 +0,0 @@ -include { findArgumentSchema } from "${meta.resources_dir}/helper.nf" - -workflow auto { - findStates(params, meta.config) - | meta.workflow.run( - auto: [publish: "state"] - ) -} - -workflow run_wf { - take: - input_ch - - main: - output_ch = input_ch - - | check_dataset_schema.run( - fromState: { id, state -> - def schema = findArgumentSchema(meta.config, "input") - def schemaYaml = tempFile("schema.yaml") - writeYaml(schema, schemaYaml) - [ - "input": state.input, - "schema": schemaYaml - ] - }, - toState: { id, output, state -> - // read the output to see if dataset passed the qc - def checks = readYaml(output.output) - state + [ - "dataset": checks["exit_code"] == 0 ? 
state.input : null, - ] - } - ) - - // remove datasets which didn't pass the schema check - | filter { id, state -> - state.dataset != null - } - - | process_dataset.run( - fromState: [ input: "dataset" ], - toState: [ - output_train: "output_train", - output_test: "output_test" - ] - ) - - // only output the files for which an output file was specified - | setState(["output_train", "output_test"]) - - emit: - output_ch -} diff --git a/src/tasks/denoising/workflows/process_datasets/run_test.sh b/src/tasks/denoising/workflows/process_datasets/run_test.sh deleted file mode 100755 index ed8484693b..0000000000 --- a/src/tasks/denoising/workflows/process_datasets/run_test.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -# Run this prior to executing this script: -# bin/viash_build -q 'batch_integration' - -# get the root of the directory -REPO_ROOT=$(git rev-parse --show-toplevel) - -# ensure that the command below is run from the root of the repository -cd "$REPO_ROOT" - -set -e - -export NXF_VER=22.04.5 - -nextflow run . \ - -main-script target/nextflow/denoising/workflows/process_datasets/main.nf \ - -profile docker \ - -entry auto \ - -c src/wf_utils/labels_ci.config \ - --id run_test \ - --input_states "resources_test/common/**/state.yaml" \ - --rename_keys 'input:output_dataset' \ - --settings '{"output_train": "train.h5ad", "output_test": "test.h5ad"}' \ - --publish_dir "resources_test/denoising" \ No newline at end of file diff --git a/src/tasks/denoising/workflows/run_benchmark/config.vsh.yaml b/src/tasks/denoising/workflows/run_benchmark/config.vsh.yaml deleted file mode 100644 index 5b1cf3dd04..0000000000 --- a/src/tasks/denoising/workflows/run_benchmark/config.vsh.yaml +++ /dev/null @@ -1,67 +0,0 @@ -functionality: - name: "run_benchmark" - namespace: "denoising/workflows" - argument_groups: - - name: Inputs - arguments: - - name: "--input_train" - __merge__: "/src/tasks/denoising/api/file_train.yaml" - required: true - direction: input - - name: "--input_test" - __merge__: "/src/tasks/denoising/api/file_test.yaml" - required: true - direction: input - - name: Outputs - arguments: - - name: "--output_scores" - type: file - required: true - direction: output - description: A yaml file containing the scores of each of the methods - default: score_uns.yaml - - name: "--output_method_configs" - type: file - required: true - direction: output - default: method_configs.yaml - - name: "--output_metric_configs" - type: file - required: true - direction: output - default: metric_configs.yaml - - name: "--output_dataset_info" - type: file - required: true - direction: output - default: dataset_uns.yaml - - name: "--output_task_info" - type: file - required: true - direction: output - default: task_info.yaml - - name: Methods - arguments: - - name: "--method_ids" - type: string - multiple: true - description: A list of method ids to run. If not specified, all methods will be run. 
- resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - type: file - path: "../../api/task_info.yaml" - dependencies: - - name: common/check_dataset_schema - - name: common/extract_metadata - - name: denoising/control_methods/no_denoising - - name: denoising/control_methods/perfect_denoising - - name: denoising/methods/alra - - name: denoising/methods/dca - - name: denoising/methods/knn_smoothing - - name: denoising/methods/magic - - name: denoising/metrics/mse - - name: denoising/metrics/poisson -platforms: - - type: nextflow diff --git a/src/tasks/denoising/workflows/run_benchmark/main.nf b/src/tasks/denoising/workflows/run_benchmark/main.nf deleted file mode 100644 index 8b8f6ebd8d..0000000000 --- a/src/tasks/denoising/workflows/run_benchmark/main.nf +++ /dev/null @@ -1,184 +0,0 @@ -workflow auto { - findStates(params, meta.config) - | meta.workflow.run( - auto: [publish: "state"] - ) -} - -workflow run_wf { - take: - input_ch - - main: - - // construct list of methods - methods = [ - no_denoising, - perfect_denoising, - alra, - dca, - knn_smoothing, - magic - ] - - // construct list of metrics - metrics = [ - mse, - poisson - ] - - /**************************** - * EXTRACT DATASET METADATA * - ****************************/ - dataset_ch = input_ch - // store join id - | map{ id, state -> - [id, state + ["_meta": [join_id: id]]] - } - - // extract the dataset metadata - | extract_metadata.run( - fromState: [input: "input_test"], - toState: { id, output, state -> - state + [ - dataset_uns: readYaml(output.output).uns - ] - } - ) - - /*************************** - * RUN METHODS AND METRICS * - ***************************/ - score_ch = dataset_ch - - // run all methods - | runEach( - components: methods, - - // use the 'filter' argument to only run a defined method or all methods - filter: { id, state, comp -> - def method_check = !state.method_ids || state.method_ids.contains(comp.config.functionality.name) - - method_check - }, - - // define a new 'id' by appending the method name to the dataset id - id: { id, state, comp -> - id + "." + comp.config.functionality.name - }, - // use 'fromState' to fetch the arguments the component requires from the overall state - fromState: [ - input_train: "input_train", - input_test: "input_test" - ], - // use 'toState' to publish that component's outputs to the overall state - toState: { id, output, state, comp -> - state + [ - method_id: comp.config.functionality.name, - method_output: output.output - ] - } - ) - - // run all metrics - | runEach( - components: metrics, - id: { id, state, comp -> - id + "." + comp.config.functionality.name - }, - // use 'fromState' to fetch the arguments the component requires from the overall state - fromState: [ - input_test: "input_test", - input_denoised: "method_output" - ], - // use 'toState' to publish that component's outputs to the overall state - toState: { id, output, state, comp -> - state + [ - metric_id: comp.config.functionality.name, - metric_output: output.output - ] - } - ) - - /****************************** - * GENERATE OUTPUT YAML FILES * - ******************************/ - // TODO: can we store everything below in a separate helper function? 
- // NOTE: the 'denoising' task doesn't use normalized data, - // so code related to normalization_ids is commented out - - // extract the dataset metadata - dataset_meta_ch = dataset_ch - // // only keep one of the normalization methods - // | filter{ id, state -> - // state.dataset_uns.normalization_id == "log_cp10k" - // } - | joinStates { ids, states -> - // store the dataset metadata in a file - def dataset_uns = states.collect{state -> - def uns = state.dataset_uns.clone() - // uns.remove("normalization_id") - uns - } - def dataset_uns_yaml_blob = toYamlBlob(dataset_uns) - def dataset_uns_file = tempFile("dataset_uns.yaml") - dataset_uns_file.write(dataset_uns_yaml_blob) - - ["output", [output_dataset_info: dataset_uns_file]] - } - - output_ch = score_ch - - // extract the scores - | extract_metadata.run( - key: "extract_scores", - fromState: [input: "metric_output"], - toState: { id, output, state -> - state + [ - score_uns: readYaml(output.output).uns - ] - } - ) - - | joinStates { ids, states -> - // store the method configs in a file - def method_configs = methods.collect{it.config} - def method_configs_yaml_blob = toYamlBlob(method_configs) - def method_configs_file = tempFile("method_configs.yaml") - method_configs_file.write(method_configs_yaml_blob) - - // store the metric configs in a file - def metric_configs = metrics.collect{it.config} - def metric_configs_yaml_blob = toYamlBlob(metric_configs) - def metric_configs_file = tempFile("metric_configs.yaml") - metric_configs_file.write(metric_configs_yaml_blob) - - def task_info_file = meta.resources_dir.resolve("task_info.yaml") - - // store the scores in a file - def score_uns = states.collect{it.score_uns} - def score_uns_yaml_blob = toYamlBlob(score_uns) - def score_uns_file = tempFile("score_uns.yaml") - score_uns_file.write(score_uns_yaml_blob) - - def new_state = [ - output_method_configs: method_configs_file, - output_metric_configs: metric_configs_file, - output_task_info: task_info_file, - output_scores: score_uns_file, - _meta: states[0]._meta - ] - - ["output", new_state] - } - - // merge all of the output data - | mix(dataset_meta_ch) - | joinStates{ ids, states -> - def mergedStates = states.inject([:]) { acc, m -> acc + m } - [ids[0], mergedStates] - } - - emit: - output_ch -} \ No newline at end of file diff --git a/src/tasks/denoising/workflows/run_benchmark/run_test.sh b/src/tasks/denoising/workflows/run_benchmark/run_test.sh deleted file mode 100755 index 9b31877c52..0000000000 --- a/src/tasks/denoising/workflows/run_benchmark/run_test.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -# get the root of the directory -REPO_ROOT=$(git rev-parse --show-toplevel) - -# ensure that the command below is run from the root of the repository -cd "$REPO_ROOT" - -set -e - -DATASETS_DIR="resources_test/denoising" -OUTPUT_DIR="output/temp" - -if [ ! -d "$OUTPUT_DIR" ]; then - mkdir -p "$OUTPUT_DIR" -fi - -export NXF_VER=22.04.5 -nextflow run . 
\ - -main-script target/nextflow/denoising/workflows/run_benchmark/main.nf \ - -profile docker \ - -resume \ - -entry auto \ - -c src/wf_utils/labels_ci.config \ - --input_states "$DATASETS_DIR/**/state.yaml" \ - --rename_keys 'input_train:output_train,input_test:output_test' \ - --settings '{"output_scores": "scores.yaml", "output_dataset_info": "dataset_info.yaml", "output_method_configs": "method_configs.yaml", "output_metric_configs": "metric_configs.yaml", "output_task_info": "task_info.yaml"}' \ - --publish_dir "$OUTPUT_DIR" \ - --output_state "state.yaml" diff --git a/src/tasks/dimensionality_reduction/README.md b/src/tasks/dimensionality_reduction/README.md index c5bc42e09d..c70842900c 100644 --- a/src/tasks/dimensionality_reduction/README.md +++ b/src/tasks/dimensionality_reduction/README.md @@ -1,376 +1,3 @@ # Dimensionality reduction for 2D visualization - -Reduction of high-dimensional datasets to 2D for visualization & -interpretation - -Path: -[`src/tasks/dimensionality_reduction`](https://github.com/openproblems-bio/openproblems/tree/main/src/tasks/dimensionality_reduction) - -## Motivation - -Data visualisation is an important part of all stages of single-cell -analysis, from initial quality control to interpretation and -presentation of final results. For bulk RNA-seq studies, linear -dimensionality reduction techniques such as PCA and MDS are commonly -used to visualise the variation between samples. While these methods are -highly effective they can only be used to show the first few components -of variation which cannot fully represent the increased complexity and -number of observations in single-cell datasets. For this reason -non-linear techniques (most notably t-SNE and UMAP) have become the -standard for visualising single-cell studies. These methods attempt to -compress a dataset into a two-dimensional space while attempting to -capture as much of the variance between observations as possible. Many -methods for solving this problem now exist. In general these methods try -to preserve distances, while some additionally consider aspects such as -density within the embedded space or conservation of continuous -trajectories. Despite almost every single-cell study using one of these -visualisations there has been debate as to whether they can effectively -capture the variation in single-cell datasets \[@chari2023speciousart\]. - -## Description - -The dimensionality reduction task attempts to quantify the ability of -methods to embed the information present in complex single-cell studies -into a two-dimensional space. Thus, this task is specifically designed -for dimensionality reduction for visualisation and does not consider -other uses of dimensionality reduction in standard single-cell workflows -such as improving the signal-to-noise ratio (and in fact several of the -methods use PCA as a pre-processing step for this reason). Unlike most -tasks, methods for the dimensionality reduction task must accept a -matrix containing expression values normalised to 10,000 counts per cell -and log transformed (log-10k) and produce a two-dimensional coordinate -for each cell. Pre-normalised matrices are required to enforce -consistency between the metric evaluation (which generally requires -normalised data) and the method runs. When these are not consistent, -methods that use the same normalisation as used in the metric tend to -score more highly. For some methods we also evaluate the pre-processing -recommended by the method. 
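As a concrete illustration of the log-10k input that methods in this task must accept, a minimal sketch (assuming an AnnData object with raw counts in `layers["counts"]`, such as the example common dataset referenced below) could look like:

```python
import anndata as ad
import scanpy as sc

# assumes raw counts are stored in layers["counts"]; the path is the example
# common dataset used elsewhere in this repository
adata = ad.read_h5ad("resources_test/common/pancreas/dataset.h5ad")
adata.X = adata.layers["counts"].copy()
sc.pp.normalize_total(adata, target_sum=1e4)  # normalise to 10,000 counts per cell
sc.pp.log1p(adata)                            # log transform -> "log-10k"
adata.layers["normalized"] = adata.X
```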
- -## Authors & contributors - -| name | roles | -|:-----------------------|:-------------------| -| Luke Zappia | maintainer, author | -| Michal Klein | author | -| Scott Gigante | author | -| Ben DeMeo | author | -| Robrecht Cannoodt | author | -| Kai Waldrant | contributor | -| Sai Nirmayi Yasa | contributor | -| Juan A. Cordero Varela | contributor | - -## API - -``` mermaid -flowchart LR - file_common_dataset("Common dataset") - comp_process_dataset[/"Data processor"/] - file_dataset("Dataset") - file_solution("Test data") - comp_control_method[/"Control method"/] - comp_method[/"Method"/] - comp_metric[/"Metric"/] - file_embedding("Embedding") - file_score("Score") - file_common_dataset---comp_process_dataset - comp_process_dataset-->file_dataset - comp_process_dataset-->file_solution - file_dataset---comp_control_method - file_dataset---comp_method - file_solution---comp_control_method - file_solution---comp_metric - comp_control_method-->file_embedding - comp_method-->file_embedding - comp_metric-->file_score - file_embedding---comp_metric -``` - -## File format: Common dataset - -A dataset processed by the common dataset processing pipeline. - -Example file: `resources_test/common/pancreas/dataset.h5ad` - -Description: - -This dataset contains both raw counts and normalized data matrices, as -well as a PCA embedding, HVG selection and a kNN graph. - -Format: - -
- - AnnData object - obs: 'dataset_id', 'assay', 'assay_ontology_term_id', 'cell_type', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'organism', 'organism_ontology_term_id', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_ontology_term_id', 'tissue_general', 'tissue_general_ontology_term_id', 'batch', 'soma_joinid', 'size_factors' - var: 'feature_id', 'feature_name', 'soma_joinid', 'hvg', 'hvg_score' - obsm: 'X_pca' - obsp: 'knn_distances', 'knn_connectivities' - varm: 'pca_loadings' - layers: 'counts', 'normalized' - uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'normalization_id', 'pca_variance', 'knn' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:--------------------------------------------------|:----------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `obs["dataset_id"]` | `string` | (*Optional*) Identifier for the dataset from which the cell data is derived, useful for tracking and referencing purposes. | -| `obs["assay"]` | `string` | (*Optional*) Type of assay used to generate the cell data, indicating the methodology or technique employed. | -| `obs["assay_ontology_term_id"]` | `string` | (*Optional*) Experimental Factor Ontology (`EFO:`) term identifier for the assay, providing a standardized reference to the assay type. | -| `obs["cell_type"]` | `string` | (*Optional*) Classification of the cell type based on its characteristics and function within the tissue or organism. | -| `obs["cell_type_ontology_term_id"]` | `string` | (*Optional*) Cell Ontology (`CL:`) term identifier for the cell type, offering a standardized reference to the specific cell classification. | -| `obs["development_stage"]` | `string` | (*Optional*) Stage of development of the organism or tissue from which the cell is derived, indicating its maturity or developmental phase. | -| `obs["development_stage_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the developmental stage, providing a standardized reference to the organism’s developmental phase. If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Developmental Stages (`HsapDv:`) ontology is used. If the organism is mouse (`organism_ontology_term_id == 'NCBITaxon:10090'`), then the Mouse Developmental Stages (`MmusDv:`) ontology is used. Otherwise, the Uberon (`UBERON:`) ontology is used. | -| `obs["disease"]` | `string` | (*Optional*) Information on any disease or pathological condition associated with the cell or donor. | -| `obs["disease_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the disease, enabling standardized disease classification and referencing. Must be a term from the Mondo Disease Ontology (`MONDO:`) ontology term, or `PATO:0000461` from the Phenotype And Trait Ontology (`PATO:`). | -| `obs["donor_id"]` | `string` | (*Optional*) Identifier for the donor from whom the cell sample is obtained. | -| `obs["is_primary_data"]` | `boolean` | (*Optional*) Indicates whether the data is primary (directly obtained from experiments) or has been computationally derived from other primary data. | -| `obs["organism"]` | `string` | (*Optional*) Organism from which the cell sample is obtained. | -| `obs["organism_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the organism, providing a standardized reference for the organism. Must be a term from the NCBI Taxonomy Ontology (`NCBITaxon:`) which is a child of `NCBITaxon:33208`. | -| `obs["self_reported_ethnicity"]` | `string` | (*Optional*) Ethnicity of the donor as self-reported, relevant for studies considering genetic diversity and population-specific traits. 
| -| `obs["self_reported_ethnicity_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the self-reported ethnicity, providing a standardized reference for ethnic classifications. If the organism is human (`organism_ontology_term_id == 'NCBITaxon:9606'`), then the Human Ancestry Ontology (`HANCESTRO:`) is used. | -| `obs["sex"]` | `string` | (*Optional*) Biological sex of the donor or source organism, crucial for studies involving sex-specific traits or conditions. | -| `obs["sex_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the biological sex, ensuring standardized classification of sex. Only `PATO:0000383`, `PATO:0000384` and `PATO:0001340` are allowed. | -| `obs["suspension_type"]` | `string` | (*Optional*) Type of suspension or medium in which the cells were stored or processed, important for understanding cell handling and conditions. | -| `obs["tissue"]` | `string` | (*Optional*) Specific tissue from which the cells were derived, key for context and specificity in cell studies. | -| `obs["tissue_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the tissue, providing a standardized reference for the tissue type. For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. | -| `obs["tissue_general"]` | `string` | (*Optional*) General category or classification of the tissue, useful for broader grouping and comparison of cell data. | -| `obs["tissue_general_ontology_term_id"]` | `string` | (*Optional*) Ontology term identifier for the general tissue category, aiding in standardizing and grouping tissue types. For organoid or tissue samples, the Uber-anatomy ontology (`UBERON:`) is used. The term ids must be a child term of `UBERON:0001062` (anatomical entity). For cell cultures, the Cell Ontology (`CL:`) is used. The term ids cannot be `CL:0000255`, `CL:0000257` or `CL:0000548`. | -| `obs["batch"]` | `string` | (*Optional*) A batch identifier. This label is very context-dependent and may be a combination of the tissue, assay, donor, etc. | -| `obs["soma_joinid"]` | `integer` | (*Optional*) If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the cell. | -| `obs["size_factors"]` | `double` | (*Optional*) The size factors created by the normalisation method, if any. | -| `var["feature_id"]` | `string` | (*Optional*) Unique identifier for the feature, usually a ENSEMBL gene id. | -| `var["feature_name"]` | `string` | A human-readable name for the feature, usually a gene symbol. | -| `var["soma_joinid"]` | `integer` | (*Optional*) If the dataset was retrieved from CELLxGENE census, this is a unique identifier for the feature. | -| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. | -| `var["hvg_score"]` | `double` | A score for the feature indicating how highly variable it is. | -| `obsm["X_pca"]` | `double` | The resulting PCA embedding. | -| `obsp["knn_distances"]` | `double` | K nearest neighbors distance matrix. | -| `obsp["knn_connectivities"]` | `double` | K nearest neighbors connectivities matrix. | -| `varm["pca_loadings"]` | `double` | The PCA loadings matrix. | -| `layers["counts"]` | `integer` | Raw counts. | -| `layers["normalized"]` | `double` | Normalised expression values. 
| -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. This is different from the `obs.dataset_id` field, which is the identifier for the dataset from which the cell data is derived. | -| `uns["dataset_name"]` | `string` | A human-readable name for the dataset. | -| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | -| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | -| `uns["dataset_summary"]` | `string` | Short description of the dataset. | -| `uns["dataset_description"]` | `string` | Long description of the dataset. | -| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | -| `uns["normalization_id"]` | `string` | Which normalization was used. | -| `uns["pca_variance"]` | `double` | The PCA variance objects. | -| `uns["knn"]` | `object` | Supplementary K nearest neighbors data. | - -
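
A short sketch of loading the example common dataset and touching a few of the slots listed above (purely illustrative; which fields to inspect is an assumption):

```python
import anndata as ad

adata = ad.read_h5ad("resources_test/common/pancreas/dataset.h5ad")

print(adata.layers["counts"].shape)       # raw counts
print(adata.layers["normalized"].shape)   # normalised expression values
print(adata.obsm["X_pca"].shape)          # PCA embedding
print(int(adata.var["hvg"].sum()))        # number of highly variable genes
print(adata.uns["normalization_id"])      # which normalisation was used
```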
- -## Component type: Data processor - -Path: -[`src/dimensionality_reduction`](https://github.com/openproblems-bio/openproblems/tree/main/src/dimensionality_reduction) - -A dimensionality reduction dataset processor. - -Arguments: - -
- -| Name | Type | Description | -|:--------------------|:-------|:---------------------------------------------------------------| -| `--input` | `file` | A dataset processed by the common dataset processing pipeline. | -| `--output_dataset` | `file` | (*Output*) The dataset to pass to a method. | -| `--output_solution` | `file` | (*Output*) The data for evaluating a dimensionality reduction. | - -
- -## File format: Dataset - -The dataset to pass to a method. - -Example file: -`resources_test/dimensionality_reduction/pancreas/dataset.h5ad` - -Format: - -
- - AnnData object - var: 'hvg_score' - layers: 'counts', 'normalized' - uns: 'dataset_id', 'normalization_id' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:--------------------------|:----------|:-------------------------------------------------------------------------------------| -| `var["hvg_score"]` | `double` | High variability gene score (normalized dispersion). The greater, the more variable. | -| `layers["counts"]` | `integer` | Raw counts. | -| `layers["normalized"]` | `double` | Normalized expression values. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["normalization_id"]` | `string` | Which normalization was used. | - -
- -## File format: Test data - -The data for evaluating a dimensionality reduction. - -Example file: -`resources_test/dimensionality_reduction/pancreas/solution.h5ad` - -Format: - -
- - AnnData object - obs: 'cell_type' - var: 'hvg_score' - layers: 'counts', 'normalized' - uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'normalization_id' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:-----------------------------|:----------|:---------------------------------------------------------------------------------------------------------| -| `obs["cell_type"]` | `string` | Classification of the cell type based on its characteristics and function within the tissue or organism. | -| `var["hvg_score"]` | `double` | High variability gene score (normalized dispersion). The greater, the more variable. | -| `layers["counts"]` | `integer` | Raw counts. | -| `layers["normalized"]` | `double` | Normalized expression values. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["dataset_name"]` | `string` | Nicely formatted name. | -| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | -| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | -| `uns["dataset_summary"]` | `string` | Short description of the dataset. | -| `uns["dataset_description"]` | `string` | Long description of the dataset. | -| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | -| `uns["normalization_id"]` | `string` | Which normalization was used. | - -
- -## Component type: Control method - -Path: -[`src/dimensionality_reduction/control_methods`](https://github.com/openproblems-bio/openproblems/tree/main/src/dimensionality_reduction/control_methods) - -Quality control methods for verifying the pipeline. - -Arguments: - -
- -| Name | Type | Description | -|:-------------------|:-------|:--------------------------------------------------------------| -| `--input` | `file` | The dataset to pass to a method. | -| `--input_solution` | `file` | The data for evaluating a dimensionality reduction. | -| `--output` | `file` | (*Output*) A dataset with dimensionality reduction embedding. | - -
- -## Component type: Method - -Path: -[`src/dimensionality_reduction/methods`](https://github.com/openproblems-bio/openproblems/tree/main/src/dimensionality_reduction/methods) - -A dimensionality reduction method. - -Arguments: - -
- -| Name | Type | Description | -|:-----------|:-------|:--------------------------------------------------------------| -| `--input` | `file` | The dataset to pass to a method. | -| `--output` | `file` | (*Output*) A dataset with dimensionality reduction embedding. | - -
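
A minimal, hypothetical method against this interface (the use of `TruncatedSVD`, the output path and the `method_id` are assumptions for illustration; real methods are viash components with a config and script):

```python
import anndata as ad
from sklearn.decomposition import TruncatedSVD

# read the task dataset (example file referenced above)
dataset = ad.read_h5ad("resources_test/dimensionality_reduction/pancreas/dataset.h5ad")

# reduce the normalised expression matrix to two dimensions
emb = TruncatedSVD(n_components=2).fit_transform(dataset.layers["normalized"])

# write an embedding file following the "Embedding" format below
output = ad.AnnData(
    obs=dataset.obs[[]],
    obsm={"X_emb": emb},
    uns={
        "dataset_id": dataset.uns["dataset_id"],
        "normalization_id": dataset.uns["normalization_id"],
        "method_id": "truncated_svd",  # hypothetical method id
    },
)
output.write_h5ad("embedding.h5ad", compression="gzip")
```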
- -## Component type: Metric - -Path: -[`src/dimensionality_reduction/metrics`](https://github.com/openproblems-bio/openproblems/tree/main/src/dimensionality_reduction/metrics) - -A dimensionality reduction metric. - -Arguments: - -
- -| Name | Type | Description | -|:--------------------|:-------|:----------------------------------------------------| -| `--input_embedding` | `file` | A dataset with dimensionality reduction embedding. | -| `--input_solution` | `file` | The data for evaluating a dimensionality reduction. | -| `--output` | `file` | (*Output*) Metric score file. | - -
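
And a minimal, hypothetical metric against this interface (trustworthiness is used here purely as an example score; the paths, neighbourhood size and metric id are assumptions):

```python
import anndata as ad
import numpy as np
from scipy.sparse import issparse
from sklearn.manifold import trustworthiness

embedding = ad.read_h5ad("resources_test/dimensionality_reduction/pancreas/embedding.h5ad")
solution = ad.read_h5ad("resources_test/dimensionality_reduction/pancreas/solution.h5ad")

expr = solution.layers["normalized"]
expr = expr.toarray() if issparse(expr) else np.asarray(expr)

# how well local neighbourhoods in expression space are preserved in the embedding
score = trustworthiness(expr, embedding.obsm["X_emb"], n_neighbors=15)

# write a score file following the "Score" format below
output = ad.AnnData(
    uns={
        "dataset_id": solution.uns["dataset_id"],
        "normalization_id": solution.uns["normalization_id"],
        "method_id": embedding.uns["method_id"],
        "metric_ids": ["trustworthiness"],  # hypothetical metric id
        "metric_values": [float(score)],
    },
)
output.write_h5ad("score.h5ad", compression="gzip")
```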
- -## File format: Embedding - -A dataset with dimensionality reduction embedding. - -Example file: -`resources_test/dimensionality_reduction/pancreas/embedding.h5ad` - -Format: - -
- - AnnData object - obsm: 'X_emb' - uns: 'dataset_id', 'method_id', 'normalization_id' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:--------------------------|:---------|:-------------------------------------| -| `obsm["X_emb"]` | `double` | The dimensionally reduced embedding. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["method_id"]` | `string` | A unique identifier for the method. | -| `uns["normalization_id"]` | `string` | Which normalization was used. | - -
- -## File format: Score - -Metric score file - -Example file: -`resources_test/dimensionality_reduction/pancreas/score.h5ad` - -Format: - -
- - AnnData object - uns: 'dataset_id', 'normalization_id', 'method_id', 'metric_ids', 'metric_values' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:--------------------------|:---------|:---------------------------------------------------------------------------------------------| -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["normalization_id"]` | `string` | Which normalization was used. | -| `uns["method_id"]` | `string` | A unique identifier for the method. | -| `uns["metric_ids"]` | `string` | One or more unique metric identifiers. | -| `uns["metric_values"]` | `double` | The metric values obtained for the given prediction. Must be of same length as ‘metric_ids’. | - -
- +# This task has been moved to [https://github.com/openproblems-bio/task_dimensionality_reduction](https://github.com/openproblems-bio/task_dimensionality_reduction)! diff --git a/src/tasks/dimensionality_reduction/api/comp_control_method.yaml b/src/tasks/dimensionality_reduction/api/comp_control_method.yaml deleted file mode 100644 index dfa346752f..0000000000 --- a/src/tasks/dimensionality_reduction/api/comp_control_method.yaml +++ /dev/null @@ -1,33 +0,0 @@ -functionality: - namespace: dimensionality_reduction/control_methods - info: - type: control_method - type_info: - label: Control method - summary: Quality control methods for verifying the pipeline. - description: | - Control methods have the same interface as the regular methods - but also receive the solution object as input. It serves as a - starting point to test the relative accuracy of new methods in - the task, and also as a quality control for the metrics defined - in the task. - arguments: - - name: "--input" - __merge__: file_dataset.yaml - direction: input - required: true - - name: "--input_solution" - __merge__: file_solution.yaml - direction: input - required: true - - name: "--output" - __merge__: file_embedding.yaml - direction: output - required: true - test_resources: - - path: /resources_test/dimensionality_reduction/pancreas/ - dest: resources_test/dimensionality_reduction/pancreas/ - - type: python_script - path: /src/common/comp_tests/check_method_config.py - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/api/comp_method.yaml b/src/tasks/dimensionality_reduction/api/comp_method.yaml deleted file mode 100644 index 34d63607a4..0000000000 --- a/src/tasks/dimensionality_reduction/api/comp_method.yaml +++ /dev/null @@ -1,27 +0,0 @@ -functionality: - namespace: dimensionality_reduction/methods - info: - type: method - type_info: - label: Method - summary: A dimensionality reduction method. - description: | - A dimensionality reduction method to summarise the biological - information in a dataset in as few dimensions as possible. - arguments: - - name: "--input" - __merge__: file_dataset.yaml - direction: input - required: true - - name: "--output" - __merge__: file_embedding.yaml - direction: output - required: true - test_resources: - - path: /resources_test/dimensionality_reduction/pancreas/ - dest: resources_test/dimensionality_reduction/pancreas/ - - type: python_script - path: /src/common/comp_tests/check_method_config.py - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py - - path: /src/common/library.bib diff --git a/src/tasks/dimensionality_reduction/api/comp_metric.yaml b/src/tasks/dimensionality_reduction/api/comp_metric.yaml deleted file mode 100644 index 8cd90e4ca1..0000000000 --- a/src/tasks/dimensionality_reduction/api/comp_metric.yaml +++ /dev/null @@ -1,30 +0,0 @@ -functionality: - namespace: dimensionality_reduction/metrics - info: - type: metric - type_info: - label: Metric - summary: A dimensionality reduction metric. - description: | - A metric for evaluating dimensionality reductions. 
- arguments: - - name: "--input_embedding" - direction: input - __merge__: file_embedding.yaml - required: true - - name: "--input_solution" - __merge__: file_solution.yaml - direction: input - required: true - - name: "--output" - __merge__: file_score.yaml - direction: output - required: true - test_resources: - - path: /resources_test/dimensionality_reduction/pancreas/ - dest: resources_test/dimensionality_reduction/pancreas/ - - type: python_script - path: /src/common/comp_tests/check_metric_config.py - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py - - path: /src/common/library.bib diff --git a/src/tasks/dimensionality_reduction/api/comp_process_dataset.yaml b/src/tasks/dimensionality_reduction/api/comp_process_dataset.yaml deleted file mode 100644 index 1f7b150871..0000000000 --- a/src/tasks/dimensionality_reduction/api/comp_process_dataset.yaml +++ /dev/null @@ -1,27 +0,0 @@ -functionality: - namespace: dimensionality_reduction - info: - type: process_dataset - type_info: - label: Data processor - summary: A dimensionality reduction dataset processor. - description: | - A component for processing a Common Dataset into a task-specific dataset. - arguments: - - name: "--input" - __merge__: /src/datasets/api/file_common_dataset.yaml - direction: input - required: true - - name: "--output_dataset" - __merge__: file_dataset.yaml - direction: output - required: true - - name: "--output_solution" - __merge__: file_solution.yaml - direction: output - required: true - test_resources: - - path: /resources_test/common/pancreas/ - dest: resources_test/common/pancreas/ - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/api/file_common_dataset.yaml b/src/tasks/dimensionality_reduction/api/file_common_dataset.yaml deleted file mode 100644 index dba599da9a..0000000000 --- a/src/tasks/dimensionality_reduction/api/file_common_dataset.yaml +++ /dev/null @@ -1,58 +0,0 @@ -type: file -example: "resources_test/dimensionality_reduction/pancreas/dataset.h5ad" -info: - label: "Dataset" - summary: "The dataset to pass to a method." - slots: - layers: - - type: integer - name: counts - description: Raw counts - required: true - - type: double - name: normalized - description: Normalized expression values - required: true - obs: - - type: string - name: cell_type - description: Classification of the cell type based on its characteristics and function within the tissue or organism. - required: true - var: - - type: double - name: hvg_score - description: High variability gene score (normalized dispersion). The greater, the more variable. - required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - name: dataset_name - type: string - description: Nicely formatted name. - required: true - - type: string - name: dataset_url - description: Link to the original source of the dataset. - required: false - - name: dataset_reference - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: dataset_summary - type: string - description: Short description of the dataset. - required: true - - name: dataset_description - type: string - description: Long description of the dataset. - required: true - - name: dataset_organism - type: string - description: The organism of the sample in the dataset. 
- required: false - - type: string - name: normalization_id - description: "Which normalization was used" - required: true diff --git a/src/tasks/dimensionality_reduction/api/file_dataset.yaml b/src/tasks/dimensionality_reduction/api/file_dataset.yaml deleted file mode 100644 index 8061f8f0c5..0000000000 --- a/src/tasks/dimensionality_reduction/api/file_dataset.yaml +++ /dev/null @@ -1,29 +0,0 @@ -type: file -example: "resources_test/dimensionality_reduction/pancreas/dataset.h5ad" -info: - label: "Dataset" - summary: "The dataset to pass to a method." - slots: - layers: - - type: integer - name: counts - description: Raw counts - required: true - - type: double - name: normalized - description: Normalized expression values - required: true - var: - - type: double - name: hvg_score - description: High variability gene score (normalized dispersion). The greater, the more variable. - required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - type: string - name: normalization_id - description: "Which normalization was used" - required: true diff --git a/src/tasks/dimensionality_reduction/api/file_embedding.yaml b/src/tasks/dimensionality_reduction/api/file_embedding.yaml deleted file mode 100644 index c33d76ae8f..0000000000 --- a/src/tasks/dimensionality_reduction/api/file_embedding.yaml +++ /dev/null @@ -1,25 +0,0 @@ -type: file -example: "resources_test/dimensionality_reduction/pancreas/embedding.h5ad" -info: - label: "Embedding" - summary: "A dataset with dimensionality reduction embedding." - slots: - obsm: - - type: double - name: X_emb - description: The dimensionally reduced embedding. - required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - type: string - name: method_id - description: "A unique identifier for the method" - required: true - - type: string - name: normalization_id - description: "Which normalization was used" - required: true - diff --git a/src/tasks/dimensionality_reduction/api/file_score.yaml b/src/tasks/dimensionality_reduction/api/file_score.yaml deleted file mode 100644 index 71200ef9e1..0000000000 --- a/src/tasks/dimensionality_reduction/api/file_score.yaml +++ /dev/null @@ -1,29 +0,0 @@ -type: file -example: "resources_test/dimensionality_reduction/pancreas/score.h5ad" -info: - label: "Score" - summary: "Metric score file" - slots: - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - type: string - name: normalization_id - description: "Which normalization was used" - required: true - - type: string - name: method_id - description: "A unique identifier for the method" - required: true - - type: string - name: metric_ids - description: "One or more unique metric identifiers" - multiple: true - required: true - - type: double - name: metric_values - description: "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'." 
- multiple: true - required: true \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/api/file_solution.yaml b/src/tasks/dimensionality_reduction/api/file_solution.yaml deleted file mode 100644 index 9d08f8fb7a..0000000000 --- a/src/tasks/dimensionality_reduction/api/file_solution.yaml +++ /dev/null @@ -1,58 +0,0 @@ -type: file -example: "resources_test/dimensionality_reduction/pancreas/solution.h5ad" -info: - label: "Test data" - summary: "The data for evaluating a dimensionality reduction." - slots: - layers: - - type: integer - name: counts - description: Raw counts - required: true - - type: double - name: normalized - description: Normalized expression values - required: true - obs: - - type: string - name: cell_type - description: Classification of the cell type based on its characteristics and function within the tissue or organism. - required: true - var: - - type: double - name: hvg_score - description: High variability gene score (normalized dispersion). The greater, the more variable. - required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - name: dataset_name - type: string - description: Nicely formatted name. - required: true - - type: string - name: dataset_url - description: Link to the original source of the dataset. - required: false - - name: dataset_reference - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: dataset_summary - type: string - description: Short description of the dataset. - required: true - - name: dataset_description - type: string - description: Long description of the dataset. - required: true - - name: dataset_organism - type: string - description: The organism of the sample in the dataset. - required: false - - type: string - name: normalization_id - description: "Which normalization was used" - required: true diff --git a/src/tasks/dimensionality_reduction/api/task_info.yaml b/src/tasks/dimensionality_reduction/api/task_info.yaml deleted file mode 100644 index 4f24ae9764..0000000000 --- a/src/tasks/dimensionality_reduction/api/task_info.yaml +++ /dev/null @@ -1,73 +0,0 @@ -name: dimensionality_reduction -label: "Dimensionality reduction for 2D visualization" -v1: - path: openproblems/tasks/dimensionality_reduction/README.md - commit: b353a462f6ea353e0fc43d0f9fcbbe621edc3a0b -summary: Reduction of high-dimensional datasets to 2D for visualization & interpretation -image: "thumbnail.svg" -motivation: | - Data visualisation is an important part of all stages of single-cell analysis, from - initial quality control to interpretation and presentation of final results. For bulk RNA-seq - studies, linear dimensionality reduction techniques such as PCA and MDS are commonly used - to visualise the variation between samples. While these methods are highly effective they - can only be used to show the first few components of variation which cannot fully represent - the increased complexity and number of observations in single-cell datasets. For this reason - non-linear techniques (most notably t-SNE and UMAP) have become the standard for visualising - single-cell studies. These methods attempt to compress a dataset into a two-dimensional space - while attempting to capture as much of the variance between observations as possible. Many - methods for solving this problem now exist. 
In general these methods try to preserve distances, - while some additionally consider aspects such as density within the embedded space or conservation - of continuous trajectories. Despite almost every single-cell study using one of these visualisations - there has been debate as to whether they can effectively capture the variation in single-cell - datasets [@chari2023speciousart]. -description: | - The dimensionality reduction task attempts to quantify the ability of methods to embed the - information present in complex single-cell studies into a two-dimensional space. Thus, this task - is specifically designed for dimensionality reduction for visualisation and does not consider other - uses of dimensionality reduction in standard single-cell workflows such as improving the - signal-to-noise ratio (and in fact several of the methods use PCA as a pre-processing step for this - reason). Unlike most tasks, methods for the dimensionality reduction task must accept a matrix - containing expression values normalised to 10,000 counts per cell and log transformed (log-10k) and - produce a two-dimensional coordinate for each cell. Pre-normalised matrices are required to - enforce consistency between the metric evaluation (which generally requires normalised data) and - the method runs. When these are not consistent, methods that use the same normalisation as used in - the metric tend to score more highly. For some methods we also evaluate the pre-processing - recommended by the method. -authors: - - name: Luke Zappia - roles: [ maintainer, author ] - info: - github: lazappi - - name: Michal Klein - roles: [ author ] - info: - github: michalk8 - - name: Scott Gigante - roles: [ author ] - info: - github: scottgigante - orcid: "0000-0002-4544-2764" - - name: Ben DeMeo - roles: [ author ] - info: - github: bendemeo - - name: Robrecht Cannoodt - roles: [ author ] - info: - github: rcannood - orcid: 0000-0003-3641-729X - - name: Kai Waldrant - roles: [ contributor ] - info: - github: KaiWaldrant - orcid: 0009-0003-8555-1361 - - name: Sai Nirmayi Yasa - roles: [ contributor ] - info: - github: sainirmayi - orcid: 0009-0003-6319-9803 - - name: Juan A. Cordero Varela - roles: [ contributor ] - info: - github: jacorvar - orcid: 0000-0002-7373-5433 diff --git a/src/tasks/dimensionality_reduction/api/thumbnail.svg b/src/tasks/dimensionality_reduction/api/thumbnail.svg deleted file mode 100644 index 62911379a1..0000000000 --- a/src/tasks/dimensionality_reduction/api/thumbnail.svg +++ /dev/null @@ -1 +0,0 @@ -dim-2dim-1 \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/control_methods/random_features/config.vsh.yaml b/src/tasks/dimensionality_reduction/control_methods/random_features/config.vsh.yaml deleted file mode 100644 index 6c0d36ad44..0000000000 --- a/src/tasks/dimensionality_reduction/control_methods/random_features/config.vsh.yaml +++ /dev/null @@ -1,22 +0,0 @@ -__merge__: ../../api/comp_control_method.yaml -functionality: - name: "random_features" - info: - label: Random Features - summary: "Negative control by randomly embedding into a 2D space." - description: "This method serves as a negative control, where the data is randomly embedded into a two-dimensional space, with no attempt to preserve the original structure." 
- v1: - path: openproblems/tasks/dimensionality_reduction/methods/baseline.py - commit: 80b37e7a6aa27df4436f400397564c01276817e0 - preferred_normalization: counts - variants: - random_features: - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [midtime, highmem, highcpu] \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/control_methods/random_features/script.py b/src/tasks/dimensionality_reduction/control_methods/random_features/script.py deleted file mode 100644 index 7908207bda..0000000000 --- a/src/tasks/dimensionality_reduction/control_methods/random_features/script.py +++ /dev/null @@ -1,34 +0,0 @@ -import anndata as ad -import numpy as np - -## VIASH START -par = { - "input": "resources_test/dimensionality_reduction/pancreas/test.h5ad", - "output": "reduced.h5ad", -} -meta = { - "functionality_name": "random_features", -} -## VIASH END - -print("Load input data", flush=True) -input = ad.read_h5ad(par["input"]) - -print("Create random embedding", flush=True) -X_emb = np.random.normal(0, 1, (input.shape[0], 2)) - -print("Create output AnnData", flush=True) -output = ad.AnnData( - obs=input.obs[[]], - obsm={ - "X_emb": X_emb - }, - uns={ - "dataset_id": input.uns["dataset_id"], - "normalization_id": input.uns["normalization_id"], - "method_id": meta["functionality_name"] - } -) - -print("Write output to file", flush=True) -output.write_h5ad(par["output"], compression="gzip") \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/control_methods/spectral_features/config.vsh.yaml b/src/tasks/dimensionality_reduction/control_methods/spectral_features/config.vsh.yaml deleted file mode 100644 index b3ae5aa95b..0000000000 --- a/src/tasks/dimensionality_reduction/control_methods/spectral_features/config.vsh.yaml +++ /dev/null @@ -1,41 +0,0 @@ -__merge__: ../../api/comp_control_method.yaml -functionality: - name: "spectral_features" - info: - label: Spectral Features - summary: "Positive control using 1000-dimensional diffusion maps as an embedding." - description: "This serves as a positive control since it uses 1000-dimensional diffusion maps as an embedding" - v1: - path: openproblems/tasks/dimensionality_reduction/methods/baseline.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: log_cp10k - variants: - spectral_features: - arguments: - - name: "--n_comps" - type: integer - default: 1000 - description: "Number of components to use for the embedding." - - name: t - type: integer - default: 1 - description: "Number to power the eigenvalues by." - - name: n_retries - type: integer - default: 1 - description: "Number of times to retry if the embedding fails, each time adding noise."
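For orientation, the embedding computed by the script that follows can be written compactly. Given the UMAP nearest-neighbour graph $W$ with degree matrix $D$, the `--n_comps` smallest eigenpairs of the symmetrically normalised graph Laplacian are taken, and each eigenvector is scaled by its eigenvalue raised to the power `t`:

$L = I - D^{-1/2} W D^{-1/2}, \qquad L v_i = \lambda_i v_i, \qquad \text{embedding column } i = \lambda_i^{\,t} \, v_i$

This is a reading of the code below rather than a separate specification; the `n_retries` argument only controls how often the eigensolver is retried with added noise when it fails to converge.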
- resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - pypi: - - umap-learn - - scipy - - numpy - - type: nextflow - directives: - label: [midtime, highmem, highcpu] diff --git a/src/tasks/dimensionality_reduction/control_methods/spectral_features/script.py b/src/tasks/dimensionality_reduction/control_methods/spectral_features/script.py deleted file mode 100644 index cf8633120c..0000000000 --- a/src/tasks/dimensionality_reduction/control_methods/spectral_features/script.py +++ /dev/null @@ -1,77 +0,0 @@ -import anndata as ad -import umap - -## VIASH START -par = { - "input": "resources_test/dimensionality_reduction/pancreas/test.h5ad", - "output": "reduced.h5ad", - "n_comps": 2, -} -meta = { - "functionality_name": "foo", -} -## VIASH END - -def diffusion_map(graph, n_comps, t, n_retries): - import numpy as np - import scipy.sparse.linalg - - diag_data = np.asarray(graph.sum(axis=0)) - identity = scipy.sparse.identity(graph.shape[0], dtype=np.float64) - diag = scipy.sparse.spdiags( - 1.0 / np.sqrt(diag_data), 0, graph.shape[0], graph.shape[0] - ) - laplacian = identity - diag * graph * diag - num_lanczos_vectors = max(2 * n_comps + 1, int(np.sqrt(graph.shape[0]))) - try: - eigenvalues, eigenvectors = scipy.sparse.linalg.eigsh( - laplacian, - n_comps, - which="SM", - ncv=num_lanczos_vectors, - tol=1e-4, - v0=np.ones(laplacian.shape[0]), - maxiter=graph.shape[0] * 5, - ) - return (eigenvalues**t) * eigenvectors - except scipy.sparse.linalg.ArpackNoConvergence: - if n_retries > 0: - # add some noise and try again - graph_rand = graph.copy().tocoo() - graph_rand.row = np.random.choice( - graph_rand.shape[0], len(graph_rand.row), replace=True - ) - graph_rand.data *= 0.01 - return diffusion_map( - graph + graph_rand, n_comps, t, n_retries=n_retries - 1 - ) - else: - raise - -print("Load input data", flush=True) -input = ad.read_h5ad(par["input"]) - -print("Create high dimensionally embedding with all features", flush=True) - -n_comps = min(par["n_comps"], min(input.shape) - 2) - -graph = umap.UMAP(transform_mode="graph").fit_transform(input.layers["normalized"]) - -X_emb = diffusion_map(graph, n_comps, t=par["t"], n_retries=par["n_retries"]) - - -print("Create output AnnData", flush=True) -output = ad.AnnData( - obs=input.obs[[]], - obsm={ - "X_emb": X_emb - }, - uns={ - "dataset_id": input.uns["dataset_id"], - "normalization_id": input.uns["normalization_id"], - "method_id": meta["functionality_name"] - } -) - -print("Write output to file", flush=True) -output.write_h5ad(par["output"], compression="gzip") \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/control_methods/true_features/config.vsh.yaml b/src/tasks/dimensionality_reduction/control_methods/true_features/config.vsh.yaml deleted file mode 100644 index a83d393072..0000000000 --- a/src/tasks/dimensionality_reduction/control_methods/true_features/config.vsh.yaml +++ /dev/null @@ -1,22 +0,0 @@ -__merge__: ../../api/comp_control_method.yaml -functionality: - name: "true_features" - info: - label: True Features - summary: "Positive control by retaining the dimensionality without loss of information." 
- description: "This serves as a positive control since the original high-dimensional data is retained as is, without any loss of information" - v1: - path: openproblems/tasks/dimensionality_reduction/methods/baseline.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: log_cp10k - variants: - true_features: - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [midtime, highmem, highcpu] diff --git a/src/tasks/dimensionality_reduction/control_methods/true_features/script.py b/src/tasks/dimensionality_reduction/control_methods/true_features/script.py deleted file mode 100644 index 1a58cd4984..0000000000 --- a/src/tasks/dimensionality_reduction/control_methods/true_features/script.py +++ /dev/null @@ -1,33 +0,0 @@ -import anndata as ad - -## VIASH START -par = { - "input": "resources_test/dimensionality_reduction/pancreas/test.h5ad", - "output": "reduced.h5ad", -} -meta = { - "functionality_name": "true_features", -} -## VIASH END - -print("Load input data", flush=True) -input = ad.read_h5ad(par["input"]) - -print("Create high dimensionally embedding with all features", flush=True) -X_emb = input.layers["normalized"].toarray() - -print("Create output AnnData", flush=True) -output = ad.AnnData( - obs=input.obs[[]], - obsm={ - "X_emb": X_emb - }, - uns={ - "dataset_id": input.uns["dataset_id"], - "normalization_id": input.uns["normalization_id"], - "method_id": meta["functionality_name"] - } -) - -print("Write output to file", flush=True) -output.write_h5ad(par["output"], compression="gzip") \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/methods/densmap/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/densmap/config.vsh.yaml deleted file mode 100644 index ff5764a561..0000000000 --- a/src/tasks/dimensionality_reduction/methods/densmap/config.vsh.yaml +++ /dev/null @@ -1,45 +0,0 @@ -__merge__: ../../api/comp_method.yaml -functionality: - name: "densmap" - info: - label: densMAP - summary: "Modified UMAP with preservation of local density information" - description: "A modification of UMAP that adds an extra cost term in order to preserve information about the relative local density of the data. It is performed on the same inputs as UMAP." - reference: "narayan2021assessing" - repository_url: https://github.com/lmcinnes/umap - documentation_url: https://github.com/lmcinnes/umap#readme - v1: - path: openproblems/tasks/dimensionality_reduction/methods/umap.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: log_cp10k - variants: - densmap_logCP10k: - densmap_pca_logCP10k: - n_pca_dims: 50 - densmap_logCP10k_1kHVG: - n_hvg: 1000 - densmap_pca_logCP10k_1kHVG: - n_pca_dims: 50 - n_hvg: 1000 - arguments: - - name: "--n_hvg" - type: integer - description: Number of highly variable genes to subset to. If not specified, the input matrix will not be subset. - - name: "--n_pca_dims" - type: integer - description: Number of PCA dimensions to use. If not specified, no PCA will be performed. 
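A minimal sketch of what the densMAP flag changes relative to plain UMAP, on toy data (`X`, `emb_umap` and `emb_densmap` are illustrative names; the real component additionally subsets to HVGs and optionally applies PCA first, as in the script that follows):

import numpy as np
from umap import UMAP

# toy data standing in for the (optionally PCA-reduced) expression matrix
X = np.random.default_rng(0).normal(size=(200, 50))

emb_umap = UMAP(densmap=False, random_state=42).fit_transform(X)     # standard UMAP
emb_densmap = UMAP(densmap=True, random_state=42).fit_transform(X)   # adds the local-density preservation term

Both calls use the same inputs; only the extra cost term differs, which is why the method is evaluated on the same variants as UMAP.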
- resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - packages: - - umap-learn - - pynndescent==0.5.11 - - type: native - - type: nextflow - directives: - label: [midtime, highmem, highcpu] diff --git a/src/tasks/dimensionality_reduction/methods/densmap/script.py b/src/tasks/dimensionality_reduction/methods/densmap/script.py deleted file mode 100644 index 985c95d78a..0000000000 --- a/src/tasks/dimensionality_reduction/methods/densmap/script.py +++ /dev/null @@ -1,54 +0,0 @@ -import anndata as ad -from umap import UMAP -import scanpy as sc - -## VIASH START -par = { - "input": "resources_test/dimensionality_reduction/pancreas/train.h5ad", - "output": "reduced.h5ad", - "n_pca_dims": 50, - "n_hvg": 1000 -} -meta = { - "functionality_name": "foo", -} -## VIASH END - -print("Load input data", flush=True) -input = ad.read_h5ad(par["input"]) -X_mat = input.layers["normalized"] - -if par["n_hvg"]: - print(f"Select top {par['n_hvg']} high variable genes", flush=True) - idx = input.var["hvg_score"].to_numpy().argsort()[::-1][:par["n_hvg"]] - X_mat = X_mat[:, idx] - -if par["n_pca_dims"]: - print("Apply PCA to normalized data", flush=True) - umap_input = sc.tl.pca( - X_mat, - n_comps=par["n_pca_dims"], - svd_solver="arpack" - ) -else: - print("Use normalized data as input for UMAP", flush=True) - umap_input = X_mat - -print("Run densMAP", flush=True) -X_emb = UMAP(densmap=True, random_state=42).fit_transform(umap_input) - -print("Create output AnnData", flush=True) -output = ad.AnnData( - obs=input.obs[[]], - obsm={ - "X_emb": X_emb - }, - uns={ - "dataset_id": input.uns["dataset_id"], - "normalization_id": input.uns["normalization_id"], - "method_id": meta["functionality_name"] - } -) - -print("Write output to file", flush=True) -output.write_h5ad(par["output"], compression="gzip") \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/methods/diffusion_map/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/diffusion_map/config.vsh.yaml deleted file mode 100644 index ced082c708..0000000000 --- a/src/tasks/dimensionality_reduction/methods/diffusion_map/config.vsh.yaml +++ /dev/null @@ -1,31 +0,0 @@ -__merge__: ../../api/comp_method.yaml -functionality: - name: diffusion_map - info: - label: Diffusion Map - summary: Finding meaningful geometric descriptions of datasets using diffusion maps. - description: Implements diffusion map method of data parametrization, including creation and visualization of diffusion map, clustering with diffusion K-means and regression using adaptive regression model. - reference: coifman2006diffusion - documentation_url: https://bioconductor.org/packages/release/bioc/html/destiny.html - repository_url: https://github.com/theislab/destiny - v1: - path: openproblems/tasks/dimensionality_reduction/methods/diffusion_map.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: log_cp10k - resources: - - type: r_script - path: script.R - arguments: - - name: "--n_dim" - type: integer - description: Number of dimensions. 
- default: 3 -platforms: - - type: docker - image: openproblems/base_r:1.0.0 - setup: - - type: r - bioc: destiny - - type: nextflow - directives: - label: [midtime, highmem, highcpu] diff --git a/src/tasks/dimensionality_reduction/methods/diffusion_map/script.R b/src/tasks/dimensionality_reduction/methods/diffusion_map/script.R deleted file mode 100644 index a9146c8db9..0000000000 --- a/src/tasks/dimensionality_reduction/methods/diffusion_map/script.R +++ /dev/null @@ -1,37 +0,0 @@ -requireNamespace("anndata", quietly = TRUE) -requireNamespace("diffusionMap", quietly = TRUE) - -## VIASH START -par <- list( - input = "resources_test/dimensionality_reduction/pancreas/dataset.h5ad", - output = "output.h5ad", - n_dim = 3 -) -## VIASH END - -cat("Reading input files\n") -input <- anndata::read_h5ad(par$input) - -cat("Running destiny diffusion map\n") -# create SummarizedExperiment object -sce <- SingleCellExperiment::SingleCellExperiment( - assays = list( - logcounts = t(as.matrix(input$layers[["normalized"]])) - ) -) -dm <- destiny::DiffusionMap(sce) -X_emb <- destiny::eigenvectors(dm)[, seq_len(par$n_dim)] - -cat("Write output AnnData to file\n") -output <- anndata::AnnData( - uns = list( - dataset_id = input$uns[["dataset_id"]], - normalization_id = input$uns[["normalization_id"]], - method_id = meta$functionality_name - ), - obsm = list( - X_emb = X_emb - ), - shape = input$shape -) -output$write_h5ad(par$output, compression = "gzip") diff --git a/src/tasks/dimensionality_reduction/methods/ivis/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/ivis/config.vsh.yaml deleted file mode 100644 index aa3c5ca0b4..0000000000 --- a/src/tasks/dimensionality_reduction/methods/ivis/config.vsh.yaml +++ /dev/null @@ -1,44 +0,0 @@ -# see https://github.com/openproblems-bio/openproblems/blob/9ebb777b3b76337e731a3b99f4bf39462a15c4cc/openproblems/tasks/dimensionality_reduction/methods/ivis.py - -__merge__: ../../api/comp_method.yaml -functionality: - name: "ivis" - info: - label: "ivis" - summary: "Structure-preserving dimensionality reduction using a siamese neural network trained on triplets." - description: | - ivis is a machine learning library for reducing dimensionality of very large datasets using Siamese Neural Networks. - ivis preserves global data structures in a low-dimensional space, adds new data points to existing embeddings using - a parametric mapping function, and scales linearly to millions of observations. - reference: szubert2019structurepreserving - repository_url: "https://github.com/beringresearch/ivis" - documentation_url: "https://github.com/beringresearch/ivis#readme" - v1: - path: openproblems/tasks/dimensionality_reduction/methods/ivis.py - commit: 93d2161a08da3edf249abedff5111fb5ce527552 - preferred_normalization: log_cp10k - variants: - ivis_logCPM_1kHVG: - arguments: - - name: '--n_pca_dims' - type: integer - default: 50 - description: Number of principal components of PCA to use. - - name: "--n_hvg" - type: integer - description: Number of highly variable genes to subset to. If not specified, the input matrix will not be subset. 
- default: 1000 - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - packages: - - ivis[cpu] - - tensorflow<2.16 - - type: nextflow - directives: - label: [midtime, highmem, highcpu] diff --git a/src/tasks/dimensionality_reduction/methods/ivis/script.py b/src/tasks/dimensionality_reduction/methods/ivis/script.py deleted file mode 100644 index 1eade8b74d..0000000000 --- a/src/tasks/dimensionality_reduction/methods/ivis/script.py +++ /dev/null @@ -1,57 +0,0 @@ -import anndata as ad -import scanpy as sc -from ivis import Ivis - -# todo: allow using gpus instead! - -## VIASH START -par = { - "input": "resources_test/dimensionality_reduction/pancreas/dataset.h5ad", - "output": "reduced.h5ad", - "n_hvg": 1000, - "n_pca_dims": 50 -} -meta = { - "functionality_name": "foo", -} -## VIASH END - -print("Load input data", flush=True) -input = ad.read_h5ad(par["input"]) -X_mat = input.layers["normalized"] - -if par["n_hvg"]: - print(f"Select top {par['n_hvg']} high variable genes", flush=True) - idx = input.var["hvg_score"].to_numpy().argsort()[::-1][:par["n_hvg"]] - X_mat = X_mat[:, idx] - -print(f"Running PCA with {par['n_pca_dims']} dimensions", flush=True) -X_pca = sc.tl.pca(X_mat, n_comps=par["n_pca_dims"], svd_solver="arpack") - -print("Run ivis", flush=True) -# parameters taken from: -# https://bering-ivis.readthedocs.io/en/latest/scanpy_singlecell.html#reducing-dimensionality-using-ivis -ivis = Ivis( - k=15, - model="maaten", - n_epochs_without_progress=5, - verbose=0, - embedding_dims=2, -) -X_emb = ivis.fit_transform(X_pca) - -print("Create output AnnData", flush=True) -output = ad.AnnData( - obs=input.obs[[]], - obsm={ - "X_emb": X_emb - }, - uns={ - "dataset_id": input.uns["dataset_id"], - "normalization_id": input.uns["normalization_id"], - "method_id": meta["functionality_name"] - } -) - -print("Write output to file", flush=True) -output.write_h5ad(par["output"], compression="gzip") \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/methods/lmds/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/lmds/config.vsh.yaml deleted file mode 100644 index 2b651271a9..0000000000 --- a/src/tasks/dimensionality_reduction/methods/lmds/config.vsh.yaml +++ /dev/null @@ -1,44 +0,0 @@ -__merge__: ../../api/comp_method.yaml - -functionality: - name: lmds - - info: - label: LMDS - summary: Landmark Multi-Dimensional Scaling - description: | - Landmark Multi-Dimensional Scaling (LMDS) is a method for dimensionality reduction that is based on the concept of multi-dimensional scaling. - LMDS is a non-linear dimensionality reduction method that is based on the concept of multi-dimensional scaling. - preferred_normalization: log_cp10k - reference: saelens2019comparison - documentation_url: https://dynverse.org/lmds/ - repository_url: https://github.com/dynverse/lmds - - arguments: - - name: "--n_dim" - type: integer - description: Number of dimensions. - default: 2 - - name: "--n_landmarks" - type: integer - description: Number of landmarks. - default: 1000 - - name: "--distance_method" - type: string - description: Number of clusters to be estimated over the input dataset. 
- choices: ["euclidean", "pearson", "spearman", "cosine", "chisquared", "hamming", "kullback", "manhattan", "maximum", "canberra", "minkowski"] - default: "pearson" - - resources: - - type: r_script - path: script.R - -platforms: - - type: docker - image: openproblems/base_r:1.0.0 - setup: - - type: r - cran: [ Matrix, lmds ] - - type: nextflow - directives: - label: [midtime, highmem, midcpu] diff --git a/src/tasks/dimensionality_reduction/methods/lmds/script.R b/src/tasks/dimensionality_reduction/methods/lmds/script.R deleted file mode 100644 index ae9461c496..0000000000 --- a/src/tasks/dimensionality_reduction/methods/lmds/script.R +++ /dev/null @@ -1,39 +0,0 @@ -requireNamespace("anndata", quietly = TRUE) -requireNamespace("lmds", quietly = TRUE) - -## VIASH START -par <- list( - input = "resources_test/dimensionality_reduction/pancreas/dataset.h5ad", - output = "output.h5ad", - n_dim = 3, - n_landmarks = 1000, - distance_method = "pearson" -) -## VIASH END - -cat("Reading input files\n") -input <- anndata::read_h5ad(par$input) - -# TODO: if we wanted to, we could compute the distance -# matrix in batches. This would be useful for large datasets. -cat("Running LMDS\n") -X_emb <- lmds::lmds( - input$layers[["normalized"]], - ndim = par$n_dim, - num_landmarks = par$n_landmarks, - distance_method = par$distance_method -) - -cat("Write output AnnData to file\n") -output <- anndata::AnnData( - uns = list( - dataset_id = input$uns[["dataset_id"]], - method_id = meta$functionality_name, - normalization_id = input$uns[["normalization_id"]] - ), - obsm = list( - X_emb = X_emb - ), - shape = input$shape -) -output$write_h5ad(par$output, compression = "gzip") diff --git a/src/tasks/dimensionality_reduction/methods/neuralee/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/neuralee/config.vsh.yaml deleted file mode 100644 index 0d3d0234c4..0000000000 --- a/src/tasks/dimensionality_reduction/methods/neuralee/config.vsh.yaml +++ /dev/null @@ -1,55 +0,0 @@ -__merge__: ../../api/comp_method.yaml -functionality: - name: "neuralee" - info: - label: NeuralEE - summary: "Non-linear method that uses a neural network to preserve pairwise distances between data points in a high-dimensional space." - description: | - A neural network implementation of elastic embedding. It is a - non-linear method that preserves pairwise distances between data points. - NeuralEE uses a neural network to optimize an objective function that - measures the difference between pairwise distances in the original - high-dimensional space and the two-dimensional space. It is computed on both - the recommended input from the package authors of 500 HVGs selected from a - logged expression matrix (without sequencing depth scaling) and the default - logCPM matrix with 1000 HVGs. - reference: "xiong2020neuralee" - repository_url: "https://github.com/HiBearME/NeuralEE" - documentation_url: "https://github.com/HiBearME/NeuralEE#readme" - v1: - path: openproblems/tasks/dimensionality_reduction/methods/neuralee.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: log_cp10k - variants: - neuralee_default: - normalize: true - n_hvg: 500 - neuralee_logCP10k_1kHVG: - normalize: false - n_hvg: 1000 - arguments: - - name: "--n_iter" - type: integer - description: Number of iterations. - - name: "--n_hvg" - type: integer - description: Number of highly variable genes to subset to. If not specified, the input matrix will not be subset. 
- default: 1000 - - name: "--normalize" - type: boolean - default: false - description: Whether to perform own normalization - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - packages: - - torch - - "git+https://github.com/michalk8/neuralee@8946abf" - - type: nextflow - directives: - label: [midtime, highmem, highcpu] diff --git a/src/tasks/dimensionality_reduction/methods/neuralee/script.py b/src/tasks/dimensionality_reduction/methods/neuralee/script.py deleted file mode 100644 index bd13a2f34d..0000000000 --- a/src/tasks/dimensionality_reduction/methods/neuralee/script.py +++ /dev/null @@ -1,78 +0,0 @@ -import anndata as ad -import torch -from neuralee.embedding import NeuralEE -from neuralee.dataset import GeneExpressionDataset - -# todo: allow gpu -device = torch.device("cpu") - -## VIASH START -par = { - "input": "resources_test/dimensionality_reduction/pancreas/train.h5ad", - "output": "reduced.h5ad", - "n_hvg": 1000, - "n_iter": 10, - "normalize": True -} -meta = { - "functionality_name": "foo", -} -## VIASH END - -print("Load input data", flush=True) -input = ad.read_h5ad(par["input"]) - -if par["normalize"]: - print("Performing own normalization", flush=True) - # perform own normalization based on the "recommended" preprocessing taken from example notebooks, e.g.: - # https://github.com/HiBearME/NeuralEE/blob/master/tests/notebooks/retina_dataset.ipynb - dataset = GeneExpressionDataset(input.layers["counts"]) - dataset.log_shift() - if par["n_hvg"]: - dataset.subsample_genes(par["n_hvg"]) - dataset.standardscale() - -else: - X_mat = input.layers["normalized"] - - if par["n_hvg"]: - print(f"Select top {par['n_hvg']} high variable genes", flush=True) - idx = input.var["hvg_score"].to_numpy().argsort()[-par["n_hvg"]:] - X_mat = X_mat[:, idx] - - print("Using pre-normalized data", flush=True) - dataset = GeneExpressionDataset(X_mat) - - -# estimate the affinity matrix -batch_size = min(1000, input.n_obs) -print(f"Use {batch_size} cells as batch to estimate the affinity matrix", flush=True) -dataset.affinity_split(N_small=batch_size) - -print("Create NeuralEE object", flush=True) -NEE = NeuralEE(dataset, d=2, device=device) -fine_tune_kwargs = dict(verbose=False) - -if par["n_iter"]: - fine_tune_kwargs["maxit"] = par["n_iter"] - -print("Run NeuralEE", flush=True) -res = NEE.fine_tune(**fine_tune_kwargs) - -X_emb = res["X"].detach().cpu().numpy() - -print("Create output AnnData", flush=True) -output = ad.AnnData( - obs=input.obs[[]], - obsm={ - "X_emb": X_emb - }, - uns={ - "dataset_id": input.uns["dataset_id"], - "normalization_id": input.uns["normalization_id"], - "method_id": meta["functionality_name"] - } -) - -print("Write output to file", flush=True) -output.write_h5ad(par["output"], compression="gzip") \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/methods/pca/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/pca/config.vsh.yaml deleted file mode 100644 index 11d3841fb6..0000000000 --- a/src/tasks/dimensionality_reduction/methods/pca/config.vsh.yaml +++ /dev/null @@ -1,40 +0,0 @@ -__merge__: ../../api/comp_method.yaml -functionality: - name: "pca" - info: - label: "PCA" - summary: A linear method that finds orthogonal directions to compute the two-dimensional embedding. - description: | - Principal Component Analysis is a linear method that finds orthogonal - directions in the data that capture the most variance. 
The first two - principal components are chosen as the two-dimensional embedding. PCA - is calculated on the logCPM expression matrix with and without selecting 1000 - HVGs. - reference: pearson1901pca - repository_url: https://github.com/scikit-learn/scikit-learn - documentation_url: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html - v1: - path: openproblems/tasks/dimensionality_reduction/methods/pca.py - commit: 154ccb9fd99113f3d28d9c3f139194539a0290f9 - preferred_normalization: log_cp10k - variants: - pca_logCP10k: - pca_logCP10k_1kHVG: - n_hvg: 1000 - arguments: - - name: "--n_hvg" - type: integer - description: Number of highly variable genes to subset to. If not specified, the input matrix will not be subset. - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - packages: scanpy - - type: nextflow - directives: - label: [midtime, highmem, highcpu] diff --git a/src/tasks/dimensionality_reduction/methods/pca/script.py b/src/tasks/dimensionality_reduction/methods/pca/script.py deleted file mode 100644 index 81cff3441f..0000000000 --- a/src/tasks/dimensionality_reduction/methods/pca/script.py +++ /dev/null @@ -1,41 +0,0 @@ -import anndata as ad -import scanpy as sc - -## VIASH START -par = { - "input": "resources_test/dimensionality_reduction/pancreas/train.h5ad", - "output": "reduced.h5ad", - "n_hvg": 1000 -} -meta = { - "functionality_name": "foo", -} -## VIASH END - -print("Load input data", flush=True) -input = ad.read_h5ad(par["input"]) -X_mat = input.layers["normalized"] - -if par["n_hvg"]: - print(f"Select top {par['n_hvg']} high variable genes", flush=True) - idx = input.var["hvg_score"].to_numpy().argsort()[::-1][:par["n_hvg"]] - X_mat = X_mat[:, idx] - -print(f"Running PCA", flush=True) -X_emb = sc.tl.pca(X_mat, n_comps=2, svd_solver="arpack")[:, :2] - -print("Create output AnnData", flush=True) -output = ad.AnnData( - obs=input.obs[[]], - obsm={ - "X_emb": X_emb - }, - uns={ - "dataset_id": input.uns["dataset_id"], - "normalization_id": input.uns["normalization_id"], - "method_id": meta["functionality_name"] - } -) - -print("Write output to file", flush=True) -output.write_h5ad(par["output"], compression="gzip") \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/methods/phate/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/phate/config.vsh.yaml deleted file mode 100644 index ff63659780..0000000000 --- a/src/tasks/dimensionality_reduction/methods/phate/config.vsh.yaml +++ /dev/null @@ -1,58 +0,0 @@ -__merge__: ../../api/comp_method.yaml -functionality: - name: "phate" - info: - label: PHATE - summary: Preserving trajectories in a dataset by using heat diffusion potential. - description: | - PHATE or "Potential of Heat - diffusion for Affinity - based Transition - Embedding" uses the potential of heat diffusion to preserve trajectories in a - dataset via a diffusion process. It is an affinity - based method that - creates an embedding by finding the dominant eigenvalues of a Markov - transition matrix. We evaluate several variants including using the - recommended square - root transformed CPM matrix as input, this input with - the gamma parameter set to zero and the normal logCPM transformed matrix with - and without HVG selection.
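A rough sketch of how the variants above map onto PHATE calls, assuming the `phate` package API used in the script that follows (the variable names are illustrative; the component itself reads an AnnData file and subsets HVGs before calling PHATE, and the variants mostly differ in the input normalisation rather than in the constructor arguments):

from phate import PHATE

phate_default = PHATE(n_pca=50, gamma=1, verbose=False)          # sqrt-CP10k input, default gamma
phate_sqrt = PHATE(n_pca=50, gamma=0, verbose=False)             # sqrt-CP10k input, gamma set to 0
phate_logCP10k = PHATE(n_pca=50, gamma=1, verbose=False)         # log-CP10k input
phate_logCP10k_1kHVG = PHATE(n_pca=50, gamma=1, verbose=False)   # log-CP10k input, top 1000 HVGs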
- reference: "moon2019visualizing" - repository_url: "https://github.com/KrishnaswamyLab/PHATE" - documentation_url: "https://github.com/KrishnaswamyLab/PHATE#readme" - v1: - path: openproblems/tasks/dimensionality_reduction/methods/phate.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: sqrt_cp10k - variants: - phate_default: - phate_sqrt: - gamma: 0 - phate_logCP10k: - preferred_normalization: log_cp10k - phate_logCP10k_1kHVG: - n_hvg: 1000 - preferred_normalization: log_cp10k - arguments: - - name: '--n_pca_dims' - type: integer - default: 50 - description: Number of principal components of PCA to use. - - name: "--n_hvg" - type: integer - description: Number of highly variable genes to subset to. If not specified, the input matrix will not be subset. - - name: '--gamma' - type: double - description: Gamma value - default: 1 - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - packages: - - phate==1.0.* - - scprep - - "scikit-learn<1.2" - - type: nextflow - directives: - label: [midtime, highmem, highcpu] diff --git a/src/tasks/dimensionality_reduction/methods/phate/script.py b/src/tasks/dimensionality_reduction/methods/phate/script.py deleted file mode 100644 index a21d9e0d87..0000000000 --- a/src/tasks/dimensionality_reduction/methods/phate/script.py +++ /dev/null @@ -1,45 +0,0 @@ -import anndata as ad -from phate import PHATE - -## VIASH START -par = { - "input": "resources_test/dimensionality_reduction/pancreas/train.h5ad", - "output": "reduced.h5ad", - "n_pca_dims": 50, - "n_hvg": 1000, - "gamma": 1 -} -meta = { - "functionality_name": "foo", -} -## VIASH END - -print("Load input data", flush=True) -input = ad.read_h5ad(par["input"]) - -X_mat = input.layers["normalized"] - -if par["n_hvg"]: - print(f"Subsetting to {par['n_hvg']} HVG", flush=True) - idx = input.var["hvg_score"].to_numpy().argsort()[::-1][:par["n_hvg"]] - X_mat = X_mat[:, idx] - -print("Run PHATE", flush=True) -phate_op = PHATE(n_pca=par["n_pca_dims"], verbose=False, n_jobs=-1, gamma=par["gamma"]) -X_emb = phate_op.fit_transform(X_mat) - -print("Create output AnnData", flush=True) -output = ad.AnnData( - obs=input.obs[[]], - obsm={ - "X_emb": X_emb - }, - uns={ - "dataset_id": input.uns["dataset_id"], - "normalization_id": input.uns["normalization_id"], - "method_id": meta["functionality_name"] - } -) - -print("Write output to file", flush=True) -output.write_h5ad(par["output"], compression="gzip") \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/methods/pymde/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/pymde/config.vsh.yaml deleted file mode 100644 index 2f733bb714..0000000000 --- a/src/tasks/dimensionality_reduction/methods/pymde/config.vsh.yaml +++ /dev/null @@ -1,41 +0,0 @@ -__merge__: ../../api/comp_method.yaml -functionality: - name: pymde - info: - label: PyMDE - summary: "A Python implementation of Minimum-Distortion Embedding" - description: | - PyMDE is a Python implementation of Minimum-Distortion Embedding. It is a non-linear - method that preserves distances between cells or neighbourhoods in the original space. 
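A minimal sketch of the two modes selected by `--embed_method`, on toy data (`data` stands in for the PCA matrix the component actually passes in; the calls mirror those made in the script that follows):

import numpy as np
import pymde

data = np.random.default_rng(0).normal(size=(100, 20)).astype("float32")

# "neighbors": emphasise preserving local neighbourhood structure
emb_local = pymde.preserve_neighbors(data, embedding_dim=2).embed()
# "distances": emphasise preserving pairwise distances
emb_global = pymde.preserve_distances(data, embedding_dim=2).embed()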
- reference: agrawal2021mde - repository_url: https://github.com/cvxgrp/pymde - documentation_url: https://pymde.org - v1: - path: openproblems/tasks/dimensionality_reduction/methods/pymde.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: log_cp10k - arguments: - - name: --embed_method - type: string - description: The method to use for embedding. Options are 'umap' and 'tsne'. - default: neighbors - choices: [ neighbors, distances ] - - name: --n_hvg - type: integer - description: Number of highly variable genes to subset to. If not specified, the input matrix will not be subset. - - name: --n_pca_dims - type: integer - description: Number of principal components to use for the initial PCA step. - default: 100 - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - packages: pymde - - type: nextflow - directives: - label: [midtime, highmem, highcpu] diff --git a/src/tasks/dimensionality_reduction/methods/pymde/script.py b/src/tasks/dimensionality_reduction/methods/pymde/script.py deleted file mode 100644 index 612582d8c3..0000000000 --- a/src/tasks/dimensionality_reduction/methods/pymde/script.py +++ /dev/null @@ -1,59 +0,0 @@ -import anndata as ad -import scanpy as sc -import pymde - -## VIASH START -par = { - "input": "resources_test/dimensionality_reduction/pancreas/dataset.h5ad", - "output": "reduced.h5ad", - "embed_method": "neighbors", - "n_hvg": 1000, - "n_pca_dims": 50, -} -meta = { - "functionality_name": "foo", -} -## VIASH END - -if par["embed_method"] == "neighbors": - mde_fn = pymde.preserve_neighbors -elif par["embed_method"] == "distances": - mde_fn = pymde.preserve_distances -else: - raise ValueError(f"Unknown embedding method: {par['embed_method']}") - -print("Load input data", flush=True) -input = ad.read_h5ad(par["input"]) -X_mat = input.layers["normalized"] - -if par["n_hvg"]: - print(f"Select top {par['n_hvg']} high variable genes", flush=True) - idx = input.var["hvg_score"].to_numpy().argsort()[::-1][:par["n_hvg"]] - X_mat = X_mat[:, idx] - -print(f"Compute PCA", flush=True) -X_pca = sc.tl.pca(X_mat, n_comps=par["n_pca_dims"], svd_solver="arpack") - -print(f"Run MDE", flush=True) -X_emb = ( - mde_fn(X_pca, embedding_dim=2, verbose=True) - .embed(verbose=True) - .detach() - .numpy() -) - -print("Create output AnnData", flush=True) -output = ad.AnnData( - obs=input.obs[[]], - obsm={ - "X_emb": X_emb - }, - uns={ - "dataset_id": input.uns["dataset_id"], - "normalization_id": input.uns["normalization_id"], - "method_id": meta["functionality_name"] - } -) - -print("Write output to file", flush=True) -output.write_h5ad(par["output"], compression="gzip") \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/methods/simlr/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/simlr/config.vsh.yaml deleted file mode 100644 index ba4b7b3b84..0000000000 --- a/src/tasks/dimensionality_reduction/methods/simlr/config.vsh.yaml +++ /dev/null @@ -1,57 +0,0 @@ -__merge__: ../../api/comp_method.yaml - -functionality: - name: simlr - - info: - label: SIMLR - summary: Multikernel-based learning of distance metrics from gene expression data for dimension reduction, clustering and visulaization. 
- description: | - Single-cell Interpretation via Multikernel LeaRning (SIMLR) learns cell-to-cell similarity measures from single-cell RNA-seq data using Gaussian kernels with various hyperparameters in order to perform dimension reduction, clustering and visualization. - SIMLR assumes that if C separable populations exist among the N cells, then the similarity matrix should have an approximate block-diagonal structure with C blocks whereby cells have larger similarities to other cells within the same subpopulations. Learned similarity between two cells should be small if the Euclidean distance between them is large. The cell-to-cell similarity is computed using an optimization framework over an N x N similarity matrix, a low-dimensional auxiliary matrix enforcing low rank constraint on the similarity matrix, and the kernel weights. - Dimension reduction is achieved by the stochastic neighbor embedding methodology with the learned similarities as input. - preferred_normalization: log_cp10k - reference: "wang2017visualization" - documentation_url: https://github.com/BatzoglouLabSU/SIMLR/blob/SIMLR/README.md - repository_url: https://github.com/BatzoglouLabSU/SIMLR - - arguments: - - name: "--n_dim" - type: integer - description: Number of dimensions. - - name: "--n_clusters" - type: integer - description: Number of clusters to be estimated over the input dataset. - - name: "--tuning_param" - type: integer - default: 10 - description: Tuning parameter k passed to SIMLR. - - name: "--impute" - type: boolean - default: false - description: Should the input data be imputed? - - name: "--normalize" - type: boolean - default: false - description: Should the input data be normalized? - - name: "--cores_ratio" - type: integer - default: 1 - description: Ratio of the number of cores to be used when computing the multi-kernel.
- - resources: - - type: r_script - path: script.R - -platforms: - - type: docker - image: openproblems/base_r:1.0.0 - setup: - - type: r - packages: [ grDevices ] - cran: [ Matrix, parallel, Rcpp, pracma, RcppAnnoy, RSpectra, igraph ] - bioc: [ SIMLR ] - - type: native - - type: nextflow - directives: - label: [midtime, highmem, midcpu] diff --git a/src/tasks/dimensionality_reduction/methods/simlr/script.R b/src/tasks/dimensionality_reduction/methods/simlr/script.R deleted file mode 100644 index 0622076c08..0000000000 --- a/src/tasks/dimensionality_reduction/methods/simlr/script.R +++ /dev/null @@ -1,69 +0,0 @@ -requireNamespace("anndata", quietly = TRUE) -requireNamespace("SIMLR", quietly = TRUE) - -## VIASH START -par <- list( - input = "resources_test/dimensionality_reduction/pancreas/dataset.h5ad", - output = "output.h5ad", - n_clusters = NULL, - n_dim = NA, - tuning_param = 10, - impute = FALSE, - normalize = FALSE, - cores_ratio = 1 -) -meta <- list( - functionality_name = "simlr" -) -## VIASH END - -cat("Reading input files\n") -input <- anndata::read_h5ad(par$input) - -X <- t(as.matrix(input$layers[["normalized"]])) - -if (is.null(par$n_clusters)) { - cat("Estimating the number of clusters\n") - set.seed(1) - NUMC = 2:5 - estimates <- SIMLR::SIMLR_Estimate_Number_of_Clusters( - X = X, - NUMC = NUMC, - cores.ratio = par$cores_ratio - ) - n_clusters <- NUMC[which.min(estimates$K2)] -} else { - n_clusters <- par$n_clusters -} - -if (is.null(par$n_dim)) { - n_dim <- NA -} else { - n_dim <- par$n_dim -} - -cat("Running SIMLR\n") -simlr_result <- SIMLR::SIMLR( - X = X, - c = n_clusters, - no.dim = n_dim, - k = par$tuning_param, - if.impute = par$impute, - normalize = par$normalize, - cores.ratio = par$cores_ratio -) -obsm_X_emb <- simlr_result$ydata - -cat("Write output AnnData to file\n") -output <- anndata::AnnData( - uns = list( - dataset_id = input$uns[["dataset_id"]], - method_id = meta$functionality_name, - normalization_id = input$uns[["normalization_id"]] - ), - obsm = list( - X_emb = obsm_X_emb - ), - shape = input$shape -) -output$write_h5ad(par$output, compression = "gzip") diff --git a/src/tasks/dimensionality_reduction/methods/tsne/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/tsne/config.vsh.yaml deleted file mode 100644 index cedaba0484..0000000000 --- a/src/tasks/dimensionality_reduction/methods/tsne/config.vsh.yaml +++ /dev/null @@ -1,49 +0,0 @@ -__merge__: ../../api/comp_method.yaml -functionality: - name: "tsne" - info: - label: t-SNE - summary: "Minimizing Kullback-Leibler divergence by converting similarities into joint probabilities between data points and the low/high dimensional embedding." - description: | - t-distributed Stochastic Neighbor Embedding converts similarities - between data points to joint probabilities and tries to minimize the - Kullback-Leibler divergence between the joint probabilities of the - low-dimensional embedding and the high-dimensional data. We use the - implementation in the scanpy package with the result of PCA on the logCPM - expression matrix (with and without HVG selection). 
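The objective described above is, in the notation of the original t-SNE paper, the Kullback-Leibler divergence between the joint probabilities $p_{ij}$ computed from the high-dimensional data and $q_{ij}$ computed from the two-dimensional embedding:

$C = KL(P \,\|\, Q) = \sum_{i \neq j} p_{ij} \log \frac{p_{ij}}{q_{ij}}$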
- reference: vandermaaten2008visualizing - repository_url: "https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html#sklearn.manifold.TSNE" - documentation_url: "https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html#sklearn.manifold.TSNE" - v1: - path: openproblems/tasks/dimensionality_reduction/methods/tsne.py - commit: 154ccb9fd99113f3d28d9c3f139194539a0290f9 - preferred_normalization: log_cp10k - variants: - tsne_logCP10k: - tsne_logCP10k_1kHVG: - n_hvg: 1000 - arguments: - - name: "--n_hvg" - type: integer - description: Number of highly variable genes to subset to. If not specified, the input matrix will not be subset. - - name: "--n_pca_dims" - type: integer - description: Number of PCA dimensions to use. If not specified, no PCA will be performed. - default: 50 - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: apt - packages: - - cmake - - gcc - - type: python - github: - - DmitryUlyanov/Multicore-TSNE - - type: nextflow - directives: - label: [midtime, highmem, highcpu] diff --git a/src/tasks/dimensionality_reduction/methods/tsne/script.py b/src/tasks/dimensionality_reduction/methods/tsne/script.py deleted file mode 100644 index 171e17bded..0000000000 --- a/src/tasks/dimensionality_reduction/methods/tsne/script.py +++ /dev/null @@ -1,47 +0,0 @@ -import anndata as ad -import scanpy as sc - -## VIASH START -par = { - "input": "resources_test/dimensionality_reduction/pancreas/train.h5ad", - "output": "reduced.h5ad", - "n_pca_dims": 50, - "n_hvg": 1000 -} -meta = { - "functionality_name": "foo", -} -## VIASH END - -print("Load input data", flush=True) -input = ad.read_h5ad(par["input"]) - -X_mat = input.layers["normalized"] - -if par["n_hvg"]: - print(f"Subsetting to {par['n_hvg']} HVG", flush=True) - idx = input.var["hvg_score"].to_numpy().argsort()[::-1][:par["n_hvg"]] - X_mat = X_mat[:, idx] - -print("Computing PCA", flush=True) -input.obsm["X_pca"] = sc.tl.pca(X_mat, n_comps=par["n_pca_dims"], svd_solver="arpack") - -print("Run t-SNE", flush=True) -sc.tl.tsne(input, use_rep="X_pca", n_pcs=par["n_pca_dims"]) -X_emb = input.obsm["X_tsne"].copy() - -print("Create output AnnData", flush=True) -output = ad.AnnData( - obs=input.obs[[]], - obsm={ - "X_emb": X_emb - }, - uns={ - "dataset_id": input.uns["dataset_id"], - "normalization_id": input.uns["normalization_id"], - "method_id": meta["functionality_name"] - } -) - -print("Write output to file", flush=True) -output.write_h5ad(par["output"], compression="gzip") \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/methods/umap/config.vsh.yaml b/src/tasks/dimensionality_reduction/methods/umap/config.vsh.yaml deleted file mode 100644 index a073e9dbe3..0000000000 --- a/src/tasks/dimensionality_reduction/methods/umap/config.vsh.yaml +++ /dev/null @@ -1,50 +0,0 @@ -__merge__: ../../api/comp_method.yaml -functionality: - name: "umap" - info: - label: UMAP - summary: "A manifold learning algorithm that utilizes topological data analysis for dimension reduction." - description: | - Uniform Manifold Approximation and Projection is an algorithm for - dimension reduction based on manifold learning techniques and ideas from - topological data analysis. We perform UMAP on the logCPM expression matrix - before and after HVG selection and with and without PCA as a pre-processing - step. 
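For comparison with the t-SNE objective above: UMAP minimises a fuzzy-set cross-entropy between the membership strengths $v_{ij}$ of the high-dimensional fuzzy graph and their low-dimensional counterparts $w_{ij}$ (notation following the UMAP paper):

$C = \sum_{i \neq j} \left[ v_{ij} \log \frac{v_{ij}}{w_{ij}} + (1 - v_{ij}) \log \frac{1 - v_{ij}}{1 - w_{ij}} \right]$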
- reference : "mcinnes2018umap" - repository_url: "https://github.com/lmcinnes/umap" - documentation_url: "https://github.com/lmcinnes/umap#readme" - v1: - path: openproblems/tasks/dimensionality_reduction/methods/umap.py - commit: 14d70b330cae09527a6d4c4e552db240601e31cf - preferred_normalization: log_cp10k - variants: - umap_logCP10k: - umap_pca_logCP10k: - n_pca_dims: 50 - umap_logCP10k_1kHVG: - n_hvg: 1000 - umap_pca_logCP10k_1kHVG: - n_pca_dims: 50 - n_hvg: 1000 - arguments: - - name: "--n_hvg" - type: integer - description: Number of highly variable genes to subset to. If not specified, the input matrix will not be subset. - default: 1000 - - name: "--n_pca_dims" - type: integer - description: Number of PCA dimensions to use. If not specified, no PCA will be performed. - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - packages: - - umap-learn - - pynndescent==0.5.11 - - type: nextflow - directives: - label: [midtime, highmem, highcpu] diff --git a/src/tasks/dimensionality_reduction/methods/umap/script.py b/src/tasks/dimensionality_reduction/methods/umap/script.py deleted file mode 100644 index 800e65328c..0000000000 --- a/src/tasks/dimensionality_reduction/methods/umap/script.py +++ /dev/null @@ -1,54 +0,0 @@ -import anndata as ad -from umap import UMAP -import scanpy as sc - -## VIASH START -par = { - "input": "resources_test/dimensionality_reduction/pancreas/train.h5ad", - "output": "reduced.h5ad", - "n_pca_dims": 50, - "n_hvg": 1000 -} -meta = { - "functionality_name": "foo", -} -## VIASH END - -print("Load input data", flush=True) -input = ad.read_h5ad(par["input"]) -X_mat = input.layers["normalized"] - -if par["n_hvg"]: - print(f"Select top {par['n_hvg']} high variable genes", flush=True) - idx = input.var["hvg_score"].to_numpy().argsort()[::-1][:par["n_hvg"]] - X_mat = X_mat[:, idx] - -if par["n_pca_dims"]: - print("Apply PCA to normalized data", flush=True) - umap_input = sc.tl.pca( - X_mat, - n_comps=par["n_pca_dims"], - svd_solver="arpack" - ) -else: - print("Use normalized data as input for UMAP", flush=True) - umap_input = X_mat - -print("Run UMAP", flush=True) -X_emb = UMAP(densmap=False, random_state=42).fit_transform(umap_input) - -print("Create output AnnData", flush=True) -output = ad.AnnData( - obs=input.obs[[]], - obsm={ - "X_emb": X_emb - }, - uns={ - "dataset_id": input.uns["dataset_id"], - "normalization_id": input.uns["normalization_id"], - "method_id": meta["functionality_name"] - } -) - -print("Write output to file", flush=True) -output.write_h5ad(par["output"], compression="gzip") \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/metrics/clustering_performance/config.vsh.yaml b/src/tasks/dimensionality_reduction/metrics/clustering_performance/config.vsh.yaml deleted file mode 100644 index 67f1078f13..0000000000 --- a/src/tasks/dimensionality_reduction/metrics/clustering_performance/config.vsh.yaml +++ /dev/null @@ -1,61 +0,0 @@ -__merge__: ../../api/comp_metric.yaml - -functionality: - name: clustering_performance - info: - metrics: - - name: normalized_mutual_information - label: NMI - summary: Normalized Mutual Information (NMI) is a measure of the concordance between clustering obtained from the reduced-dimensional embeddings and the cell labels. 
- description: | - The Normalized Mutual Information (NMI) is a measure of the similarity between cluster labels obtained from the clustering of dimensionality reduction embeddings and the true cell labels. It is a normalization of the Mutual Information (MI) score to scale the results between 0 (no mutual information) and 1 (perfect correlation). - Mutual Information quantifies the "amount of information" obtained about one random variable by observing the other random variable. Assuming two label assignments X and Y, it is given by: - $MI(X,Y) = \sum_{x=1}^{X}\sum_{y=1}^{Y}p(x,y)log(\frac{P(x,y)}{P(x)P'(y)})$, - where P(x,y) is the joint probability mass function of X and Y, and P(x), P'(y) are the marginal probability mass functions of X and Y respectively. The mutual information is normalized by some generalized mean of H(X) and H(Y). Therefore, Normalized Mutual Information can be defined as: - $NMI(X,Y) = \frac{MI(X,Y)}{mean(H(X),H(Y))}$, - where H(X) and H(Y) are the entropies of X and Y respectively. Higher NMI score suggests that the method is effective in preserving relevant information. - reference: emmons2016analysis - documentation_url: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.normalized_mutual_info_score.html - repository_url: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.normalized_mutual_info_score.html - min: 0 - max: 1 - maximize: true - - name: adjusted_rand_index - label: ARI - summary: Adjusted Rand Index (ARI) is a measure of the similarities between two cluster assignments of the reduced-dimensional embeddings and the true cell types. - description: | - Adjusted Rand Index (ARI) is a measure of similarity between two clusterings by considering all pairs of samples and counting pairs that are assigned in the same or different clusters in the predicted (from the reduced dimensional embeddings) and true clusterings (cell type labels). It is the Rand Index (RI) adjusted for chance. - Assuming the C as the cell type labels and K as the clustering of the reduced dimensional embedding, Rand Index can be defined as: - $RI = \frac{a + b}{{C}_{2}^{n_{samples}}}$, - where 'a' is the number of pairs of elements that are in the same set in C and in the same set in K, 'b' is the number of pairs of elements that are in different sets in C and in different sets in K, and ${C}_{2}^{n_{samples}}$ is the total number of possible pairs in the dataset. Random label assignments can be discounted as follows: - $ARI = \frac{RI - E[RI]}{max(RI) - E[RI]}$, - where E[RI] is the expected RI of random labellings. - reference: santos2009on - documentation_url: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_rand_score.html#sklearn.metrics.adjusted_rand_score - repository_url: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_rand_score.html#sklearn.metrics.adjusted_rand_score - min: 0 - max: 1 - maximize: true - - # Component-specific parameters - arguments: - - name: "--nmi_avg_method" - type: string - default: arithmetic - description: Method to compute normalizer in the denominator for normalized mutual information score calculation. 
- choices: [ min, geometric, arithmetic, max ] - - resources: - - type: python_script - path: script.py - -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - packages: [ scikit-learn, scanpy, leidenalg ] - - type: native - - type: nextflow - directives: - label: [midtime, midmem, midcpu] diff --git a/src/tasks/dimensionality_reduction/metrics/clustering_performance/script.py b/src/tasks/dimensionality_reduction/metrics/clustering_performance/script.py deleted file mode 100644 index eff2d5cd97..0000000000 --- a/src/tasks/dimensionality_reduction/metrics/clustering_performance/script.py +++ /dev/null @@ -1,63 +0,0 @@ -import anndata as ad -import scanpy as sc -from sklearn.cluster import KMeans -from sklearn.metrics import normalized_mutual_info_score -from sklearn.metrics import adjusted_rand_score - -## VIASH START -par = { - 'input_embedding': 'resources_test/dimensionality_reduction/pancreas/embedding.h5ad', - 'input_solution': 'resources_test/dimensionality_reduction/pancreas/solution.h5ad', - 'output': 'output.h5ad', - 'nmi_avg_method': 'arithmetic' -} -meta = { - 'functionality_name': 'clustering_performance' -} -## VIASH END - -print('Reading input files', flush=True) -input_embedding = ad.read_h5ad(par['input_embedding']) -input_solution = ad.read_h5ad(par['input_solution']) - -print('Compute metrics', flush=True) - -# Perform Leiden clustering on dimensionality reduction embedding -n = 20 -resolutions = [2 * x / n for x in range(1, n + 1)] -score_max = 0 -res_max = resolutions[0] -key_max = None -score_all = [] - -if "neighbors" not in input_embedding.uns: - sc.pp.neighbors(input_embedding, use_rep="X_emb") - -for res in resolutions: - key_added = f"X_emb_leiden_{res}" - sc.tl.leiden(input_embedding, resolution=res, key_added=key_added) - score = normalized_mutual_info_score(input_solution.obs["cell_type"], input_embedding.obs[key_added], average_method = par['nmi_avg_method']) - score_all.append(score) - - if score_max < score: - score_max = score - res_max = res - key_max = key_added - -# Compute NMI scores -nmi = normalized_mutual_info_score(input_solution.obs["cell_type"], input_embedding.obs[key_max], average_method = par['nmi_avg_method']) - -# Compute ARI scores -ari = adjusted_rand_score(input_solution.obs["cell_type"], input_embedding.obs[key_max]) - -print("Write output AnnData to file", flush=True) -output = ad.AnnData( - uns={ - 'dataset_id': input_embedding.uns['dataset_id'], - 'normalization_id': input_embedding.uns['normalization_id'], - 'method_id': input_embedding.uns['method_id'], - 'metric_ids': [ 'normalized_mutual_information', 'adjusted_rand_index' ], - 'metric_values': [ nmi, ari ] - } -) -output.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/dimensionality_reduction/metrics/coranking/config.vsh.yaml b/src/tasks/dimensionality_reduction/metrics/coranking/config.vsh.yaml deleted file mode 100644 index 6787e88f7e..0000000000 --- a/src/tasks/dimensionality_reduction/metrics/coranking/config.vsh.yaml +++ /dev/null @@ -1,166 +0,0 @@ -__merge__: ../../api/comp_metric.yaml -functionality: - name: "coranking" - # description: | - # This is a set of metrics which all use a co-ranking matrix as the basis of the metric. - info: - metrics: - - name: continuity_at_k30 - label: Continuity at k=30 - reference: venna2006local - summary: "The continuity metric at k=30 computed on the co-ranking matrix between expression matrix and embedding."
- description: "The continuity metric at k=30 computed on the co-ranking matrix between expression matrix and embedding." - repository_url: https://github.com/gdkrmr/coRanking/ - documentation_url: https://coranking.guido-kraemer.com/ - min: 0 - max: 1 - maximize: true - v1: - path: openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py - commit: e3be930c6d4bbd656ab1e656badb52bb50e6cdd6 - note: | - The original v1 implementations consisted of a lot of helper functions which were - derived from the pyDRMetrics package. This version uses the coRanking package - to avoid reimplementing and potentially introducing a lot of bugs in how - the various metrics are computed. - - In addition, the references for each of the metrics were looked up to - properly attribute the original authors of each of the metrics. - - name: trustworthiness_at_k30 - label: Trustworthiness at k=30 - summary: "The trustworthiness metric at k=30 computed on the co-ranking matrix between expression matrix and embedding." - description: "The trustworthiness metric at k=30 computed on the co-ranking matrix between expression matrix and embedding." - repository_url: https://github.com/gdkrmr/coRanking/ - documentation_url: https://coranking.guido-kraemer.com/ - reference: venna2006local - min: 0 - max: 1 - maximize: true - v1: - path: openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py - commit: e3be930c6d4bbd656ab1e656badb52bb50e6cdd6 - note: | - The original v1 implementations consisted of a lot of helper functions which were - derived from the pyDRMetrics package. This version uses the coRanking package - to avoid reimplementing and potentially introducing a lot of bugs in how - the various metrics are computed. - - In addition, the references for each of the metrics were looked up to - properly attribute the original authors of each of the metrics. - - name: qnx_at_k30 - label: The value for QNX at k=30 - summary: "The QNX metric at k=30 computed on the co-ranking matrix between expression matrix and embedding." - description: "The QNX metric at k=30 computed on the co-ranking matrix between expression matrix and embedding." - repository_url: https://github.com/gdkrmr/coRanking/ - documentation_url: https://coranking.guido-kraemer.com/ - reference: lee2009quality - min: 0 - max: 1 - maximize: true - v1: - path: openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py - commit: e3be930c6d4bbd656ab1e656badb52bb50e6cdd6 - note: | - The original v1 implementations consisted of a lot of helper functions which were - derived from the pyDRMetrics package. This version uses the coRanking package - to avoid reimplementing and potentially introducing a lot of bugs in how - the various metrics are computed. - - In addition, the references for each of the metrics were looked up to - properly attribute the original authors of each of the metrics. - - name: lcmc_at_k30 - label: The value for LCMC at k=30 - summary: "The LCMC metric at k=30 computed on the co-ranking matrix between expression matrix and embedding." - description: "The LCMC metric at k=30 computed on the co-ranking matrix between expression matrix and embedding." 
- repository_url: https://github.com/gdkrmr/coRanking/ - documentation_url: https://coranking.guido-kraemer.com/ - reference: chen2009local - min: 0 - max: 1 - maximize: true - v1: - path: openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py - commit: e3be930c6d4bbd656ab1e656badb52bb50e6cdd6 - note: | - The original v1 implementations consisted of a lot of helper functions which were - derived from the pyDRMetrics package. This version uses the coRanking package - to avoid reimplementing and potentially introducing a lot of bugs in how - the various metrics are computed. - - In addition, the references for each of the metrics were looked up to - properly attribute the original authors of each of the metrics. - - name: qnx_auc - label: Area under the QNX curve - summary: "The AU-QNX metric at k=30 computed on the co-ranking matrix between expression matrix and embedding." - description: "The AU-QNX metric at k=30 computed on the co-ranking matrix between expression matrix and embedding." - repository_url: https://github.com/gdkrmr/coRanking/ - documentation_url: https://coranking.guido-kraemer.com/ - reference: lueks2011evaluate - min: 0 - max: 1 - maximize: true - v1: - path: openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py - commit: e3be930c6d4bbd656ab1e656badb52bb50e6cdd6 - note: | - The original v1 implementations consisted of a lot of helper functions which were - derived from the pyDRMetrics package. This version uses the coRanking package - to avoid reimplementing and potentially introducing a lot of bugs in how - the various metrics are computed. - - In addition, the references for each of the metrics were looked up to - properly attribute the original authors of each of the metrics. - - name: qlocal - label: Local quality measure - summary: "The local quality metric computed on the co-ranking matrix between expression matrix and embedding." - description: "The local quality metric computed on the co-ranking matrix between expression matrix and embedding." - repository_url: https://github.com/gdkrmr/coRanking/ - documentation_url: https://coranking.guido-kraemer.com/ - reference: lueks2011evaluate - min: 0 - max: 1 - maximize: true - v1: - path: openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py - commit: e3be930c6d4bbd656ab1e656badb52bb50e6cdd6 - note: | - The original v1 implementations consisted of a lot of helper functions which were - derived from the pyDRMetrics package. This version uses the coRanking package - to avoid reimplementing and potentially introducing a lot of bugs in how - the various metrics are computed. - - In addition, the references for each of the metrics were looked up to - properly attribute the original authors of each of the metrics. - - name: qglobal - label: Global quality measure - summary: "The Global quality metric computed on the co-ranking matrix between expression matrix and embedding." - description: "The Global quality metric computed on the co-ranking matrix between expression matrix and embedding." - repository_url: https://github.com/gdkrmr/coRanking/ - documentation_url: https://coranking.guido-kraemer.com/ - reference: lueks2011evaluate - min: 0 - max: 1 - maximize: true - v1: - path: openproblems/tasks/dimensionality_reduction/metrics/nn_ranking.py - commit: e3be930c6d4bbd656ab1e656badb52bb50e6cdd6 - note: | - The original v1 implementations consisted of a lot of helper functions which were - derived from the pyDRMetrics package. 
This version uses the coRanking package - to avoid reimplementing and potentially introducing a lot of bugs in how - the various metrics are computed. - - In addition, the references for each of the metrics were looked up to - properly attribute the original authors of each of the metrics. - resources: - - type: r_script - path: script.R -platforms: - - type: docker - image: openproblems/base_r:1.0.0 - setup: - - type: r - cran: [ coRanking ] - - type: nextflow - directives: - label: [midtime, highmem, midcpu] diff --git a/src/tasks/dimensionality_reduction/metrics/coranking/script.R b/src/tasks/dimensionality_reduction/metrics/coranking/script.R deleted file mode 100644 index 7fcce8c2f8..0000000000 --- a/src/tasks/dimensionality_reduction/metrics/coranking/script.R +++ /dev/null @@ -1,101 +0,0 @@ -library(anndata) -library(coRanking) - -## VIASH START -par <- list( - "input_embedding" = "resources_test/dimensionality_reduction/pancreas/reduced.h5ad", - "input_solution" = "resources_test/dimensionality_reduction/pancreas/test.h5ad", - "output" = "score.h5ad" -) -## VIASH END - -cat("Read anndata objects") -input_solution <- anndata::read_h5ad(par[["input_solution"]]) -input_embedding <- anndata::read_h5ad(par[["input_embedding"]]) - -# get datasets -high_dim <- input_solution$layers[["normalized"]] -X_emb <- input_embedding$obsm[["X_emb"]] - -if (any(is.na(X_emb))) { - continuity_at_k30 <- - trustworthiness_at_k30 <- - qnx_at_k30 <- - lcmc_at_k30 <- - qnx_auc <- - qlocal <- - qglobal <- - 0 -} else { - cat("Compute pairwise distances\n") - # TODO: computing a square distance matrix is problematic for large datasets! - # TODO: should we use a different distance metric for the high_dim? - # TODO: or should we subset to the HVG? - dist_highdim <- coRanking:::euclidean(as.matrix(high_dim)) - dist_emb <- coRanking:::euclidean(as.matrix(X_emb)) - - cat("Compute ranking matrices\n") - rmat_highdim <- rankmatrix(dist_highdim, input = "dist") - rmat_emb <- rankmatrix(dist_emb, input = "dist") - - cat("Compute coranking matrix\n") - corank <- coranking(rmat_highdim, rmat_emb, "rank") - - cat("Compute metrics\n") - # Compute QNX. This is a curve indicating the percentage of points - # that are mild in- and extrusions or keep their rank. - qnx <- Q_NX(corank) - - # Calculate the local continuity meta-criterion from a co-ranking matrix. 
- lcmc <- LCMC(corank) - - # the values of qnx are split into local and global values by kmax - kmax <- which.max(lcmc) - - # check certain quality values at k=30 - k30 <- 30 - trustworthiness_at_k30 <- coRanking:::cm.M_T(corank, k30) - continuity_at_k30 <- coRanking:::cm.M_C(corank, k30) - qnx_at_k30 <- qnx[[k30]] - lcmc_at_k30 <- lcmc[[k30]] - - # area under the QNX curve - qnx_auc <- mean(qnx) - - # local quality measure - qlocal <- mean(qnx[seq_len(kmax)]) - - # global quality measure - qglobal <- mean(qnx[-seq_len(kmax)]) -} - -cat("construct output AnnData\n") -output <- AnnData( - shape = c(0L, 0L), - uns = list( - dataset_id = input_solution$uns[["dataset_id"]], - normalization_id = input_solution$uns[["normalization_id"]], - method_id = input_embedding$uns[["method_id"]], - metric_ids = c( - "continuity_at_k30", - "trustworthiness_at_k30", - "qnx_at_k30", - "lcmc_at_k30", - "qnx_auc", - "qlocal", - "qglobal" - ), - metric_values = c( - continuity_at_k30, - trustworthiness_at_k30, - qnx_at_k30, - lcmc_at_k30, - qnx_auc, - qlocal, - qglobal - ) - ) -) - -cat("Write to file\n") -output$write_h5ad(par$output) diff --git a/src/tasks/dimensionality_reduction/metrics/density_preservation/config.vsh.yaml b/src/tasks/dimensionality_reduction/metrics/density_preservation/config.vsh.yaml deleted file mode 100644 index 4b1e9f3a32..0000000000 --- a/src/tasks/dimensionality_reduction/metrics/density_preservation/config.vsh.yaml +++ /dev/null @@ -1,43 +0,0 @@ -__merge__: ../../api/comp_metric.yaml -functionality: - name: "density_preservation" - info: - metrics: - - name: density_preservation - label: Density preservation - summary: "Similarity between local densities in the high-dimensional data and the reduced data." - description: | - "Similarity between local densities in the high-dimensional data and the reduced data. - This is computed as the pearson correlation of local radii with the local radii in the original data space." - reference: narayan2021assessing - min: -1 - max: 1 - maximize: true - v1: - path: openproblems/tasks/dimensionality_reduction/metrics/density.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - arguments: - - name: "--n_neighbors" - type: integer - default: 30 - description: "Number of neighbors to use for density estimation." - - name: "--seed" - type: integer - default: 42 - description: "Random seed." 
- resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - packages: - - scipy - - numpy - - umap-learn - - pynndescent~=0.5.11 - - type: nextflow - directives: - label: [midtime, lowmem, midcpu] diff --git a/src/tasks/dimensionality_reduction/metrics/density_preservation/script.py b/src/tasks/dimensionality_reduction/metrics/density_preservation/script.py deleted file mode 100644 index 9bf44397c2..0000000000 --- a/src/tasks/dimensionality_reduction/metrics/density_preservation/script.py +++ /dev/null @@ -1,132 +0,0 @@ - - -import anndata as ad -import numpy as np -from typing import Optional -from umap import UMAP -from scipy.stats import pearsonr - -## VIASH START -par = { - "input_embedding": "resources_test/dimensionality_reduction/pancreas/reduced.h5ad", - "input_solution": "resources_test/dimensionality_reduction/pancreas/test.h5ad", - "output": "score.h5ad", - "n_neighbors": 30, - "seed": 42, -} -## VIASH END - -# Interpreted from: -# https://github.com/lmcinnes/umap/blob/317ce81dc64aec9e279aa1374ac809d9ced236f6/umap/umap_.py#L1190-L1243 -# -# Author: Leland McInnes -# -# License: BSD 3 clause -def _calculate_radii( - X: np.ndarray, - n_neighbors: int = 30, - random_state: Optional[int] = None -) -> np.ndarray: - from umap.umap_ import fuzzy_simplicial_set - from umap.umap_ import nearest_neighbors - - (knn_indices, knn_dists, _) = nearest_neighbors( - X, - n_neighbors, - "euclidean", - {}, - False, - random_state, - verbose=False, - ) - - emb_graph, _, _, emb_dists = fuzzy_simplicial_set( - X, - n_neighbors, - random_state, - "euclidean", - {}, - knn_indices, - knn_dists, - verbose=False, - return_dists=True, - ) - - emb_graph = emb_graph.tocoo() - emb_graph.sum_duplicates() - emb_graph.eliminate_zeros() - - n_vertices = emb_graph.shape[1] - - mu_sum = np.zeros(n_vertices, dtype=np.float32) - re = np.zeros(n_vertices, dtype=np.float32) - - head = emb_graph.row - tail = emb_graph.col - for i in range(len(head)): - j = head[i] - k = tail[i] - D = emb_dists[j, k] - mu = emb_graph.data[i] - re[j] += mu * D - re[k] += mu * D - mu_sum[j] += mu - mu_sum[k] += mu - - epsilon = 1e-8 - return np.log(epsilon + (re / mu_sum)) - -def compute_density_preservation( - X_emb: np.ndarray, - high_dim: np.ndarray, - n_neighbors: int = 30, - random_state: Optional[int] = None -) -> float: - if np.any(np.isnan(X_emb)): - return 0.0 - - print("Compute local radii in original data", flush=True) - ro = _calculate_radii( - high_dim, - n_neighbors=n_neighbors, - random_state=random_state - ) - - print("Compute local radii of embedding", flush=True) - re = _calculate_radii( - X_emb, - n_neighbors=n_neighbors, - random_state=random_state - ) - - print("Compute pearson correlation", flush=True) - return pearsonr(ro, re)[0] - - -print("Load data", flush=True) -input_solution = ad.read_h5ad(par["input_solution"]) -input_embedding = ad.read_h5ad(par["input_embedding"]) - -high_dim = input_solution.layers["normalized"] -X_emb = input_embedding.obsm["X_emb"] - -density_preservation = compute_density_preservation( - X_emb=X_emb, - high_dim=high_dim, - n_neighbors=par["n_neighbors"], - random_state=par["seed"] -) - -print("Create output AnnData object", flush=True) -output = ad.AnnData( - uns={ - "dataset_id": input_solution.uns["dataset_id"], - "normalization_id": input_solution.uns["normalization_id"], - "method_id": input_embedding.uns["method_id"], - "metric_ids": [ "density_preservation" ], - "metric_values": [ 
density_preservation ] - } -) - -print("Write data to file", flush=True) -output.write_h5ad(par["output"], compression="gzip") \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/metrics/distance_correlation/config.vsh.yaml b/src/tasks/dimensionality_reduction/metrics/distance_correlation/config.vsh.yaml deleted file mode 100644 index b08c93db2c..0000000000 --- a/src/tasks/dimensionality_reduction/metrics/distance_correlation/config.vsh.yaml +++ /dev/null @@ -1,50 +0,0 @@ -__merge__: ../../api/comp_metric.yaml -functionality: - name: distance_correlation - info: - metrics: - - name: distance_correlation - label: Distance Correlation - summary: "Calculates the distance correlation by computing Spearman correlations between distances." - description: "Calculates the distance correlation by computing Spearman correlations between distances on the full (or processed) data matrix and the dimensionally-reduced matrix." - reference: kruskal1964mds - min: 0 - max: "+.inf" - maximize: true - v1: - path: openproblems/tasks/dimensionality_reduction/metrics/distance_correlation.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - note: This metric was ported but will probably be removed soon. - - name: distance_correlation_spectral - label: Distance Correlation Spectral - summary: "Spearman correlation between all pairwise diffusion distances in the original and dimension-reduced data." - description: "Spearman correlation between all pairwise diffusion distances in the original and dimension-reduced data." - reference: coifman2006diffusion - min: 0 - max: "+.inf" - maximize: true - v1: - path: openproblems/tasks/dimensionality_reduction/metrics/root_mean_square_error.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - note: This metric was ported but will probably be removed soon. - arguments: - - name: "--spectral" - type: boolean_true - description: Calculate the spectral root mean squared error. 
- resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - packages: - - umap-learn - - scikit-learn - - numpy - - pynndescent~=0.5.11 - - scipy - - type: nextflow - directives: - label: [midtime, highmem, midcpu] diff --git a/src/tasks/dimensionality_reduction/metrics/distance_correlation/script.py b/src/tasks/dimensionality_reduction/metrics/distance_correlation/script.py deleted file mode 100644 index 5d8e325126..0000000000 --- a/src/tasks/dimensionality_reduction/metrics/distance_correlation/script.py +++ /dev/null @@ -1,59 +0,0 @@ -import anndata as ad -import numpy as np -import sklearn.decomposition -import scipy.stats -import scipy.spatial -from sklearn.metrics import pairwise_distances -import umap -import umap.spectral - -## VIASH START -par = { - "input_embedding": "resources_test/dimensionality_reduction/pancreas/embedding.h5ad", - "input_solution": "resources_test/dimensionality_reduction/pancreas/solution.h5ad", - "output": "score.h5ad", -} -## VIASH END - -def _distance_correlation(X, X_emb): - high_dimensional_distance_vector = scipy.spatial.distance.pdist(X) - low_dimensional_distance_vector = scipy.spatial.distance.pdist(X_emb) - corr = scipy.stats.spearmanr( - low_dimensional_distance_vector, high_dimensional_distance_vector - ) - return corr - -print("Load data", flush=True) -input_solution = ad.read_h5ad(par["input_solution"]) -input_embedding = ad.read_h5ad(par["input_embedding"]) - -high_dim = input_solution.layers["normalized"] -X_emb = input_embedding.obsm["X_emb"] - -print("Compute distance correlation after truncated SVD", flush=True) -n_svd = 500 -svd_emb = sklearn.decomposition.TruncatedSVD(n_svd).fit_transform(high_dim) -dist_corr = _distance_correlation(svd_emb, X_emb).correlation - -#! Explicitly not changing it to use diffusion map method as this will have a positive effect on the diffusion map method for this specific metric. -print("Compute distance correlation after spectral embedding", flush=True) -n_comps = min(1000, min(input_solution.shape) - 2) -umap_graph = umap.UMAP(transform_mode="graph").fit_transform(high_dim) -spectral_emb = umap.spectral.spectral_layout( - high_dim, umap_graph, n_comps, random_state=np.random.default_rng() -) -dist_corr_spectral = _distance_correlation(spectral_emb, X_emb).correlation - -print("Create output AnnData object", flush=True) -output = ad.AnnData( - uns={ - "dataset_id": input_solution.uns["dataset_id"], - "normalization_id": input_solution.uns["normalization_id"], - "method_id": input_embedding.uns["method_id"], - "metric_ids": [ "distance_correlation", "distance_correlation_spectral" ], - "metric_values": [ dist_corr, dist_corr_spectral ] - } -) - -print("Write data to file", flush=True) -output.write_h5ad(par["output"], compression="gzip") \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/metrics/trustworthiness/config.vsh.yaml b/src/tasks/dimensionality_reduction/metrics/trustworthiness/config.vsh.yaml deleted file mode 100644 index 5f75fa8e26..0000000000 --- a/src/tasks/dimensionality_reduction/metrics/trustworthiness/config.vsh.yaml +++ /dev/null @@ -1,31 +0,0 @@ -__merge__: ../../api/comp_metric.yaml -functionality: - name: "trustworthiness" - info: - metrics: - - name: trustworthiness - label: Trustworthiness at k=15 - summary: "A measurement of similarity between the rank of each point's nearest neighbors in the high-dimensional data and the reduced data."
- description: "A measurement of similarity between the rank of each point's nearest neighbors in the high-dimensional data and the reduced data." - reference: venna2006local - min: 0 - max: 1 - maximize: true - v1: - path: openproblems/tasks/dimensionality_reduction/metrics/trustworthiness.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - note: This metric is already included in the 'coranking' component and can be removed. - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - packages: - - scikit-learn - - numpy - - type: nextflow - directives: - label: [midtime, highmem, lowcpu] diff --git a/src/tasks/dimensionality_reduction/metrics/trustworthiness/script.py b/src/tasks/dimensionality_reduction/metrics/trustworthiness/script.py deleted file mode 100644 index 410a0b3263..0000000000 --- a/src/tasks/dimensionality_reduction/metrics/trustworthiness/script.py +++ /dev/null @@ -1,37 +0,0 @@ -import anndata as ad -import numpy as np -from sklearn import manifold - -## VIASH START -par = { - "input_embedding": "resources_test/dimensionality_reduction/pancreas/reduced.h5ad", - "input_solution": "resources_test/dimensionality_reduction/pancreas/test.h5ad", - "output": "score.h5ad", -} -## VIASH END - -print("Load data", flush=True) -input_solution = ad.read_h5ad(par["input_solution"]) -input_embedding = ad.read_h5ad(par["input_embedding"]) - -high_dim = input_solution.layers["normalized"] -X_emb = input_embedding.obsm["X_emb"] - -print("Compute trustworthiness of the embedding", flush=True) -trustworthiness = manifold.trustworthiness( - high_dim, X_emb, n_neighbors=15, metric="euclidean" -) - -print("Create output AnnData object", flush=True) -output = ad.AnnData( - uns={ - "dataset_id": input_solution.uns["dataset_id"], - "normalization_id": input_solution.uns["normalization_id"], - "method_id": input_embedding.uns["method_id"], - "metric_ids": [ "trustworthiness" ], - "metric_values": [ trustworthiness ] - } -) - -print("Write data to file", flush=True) -output.write_h5ad(par["output"], compression="gzip") \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/process_dataset/config.vsh.yaml b/src/tasks/dimensionality_reduction/process_dataset/config.vsh.yaml deleted file mode 100644 index d6f62e0c7e..0000000000 --- a/src/tasks/dimensionality_reduction/process_dataset/config.vsh.yaml +++ /dev/null @@ -1,13 +0,0 @@ -__merge__: ../api/comp_process_dataset.yaml -functionality: - name: "process_dataset" - resources: - - type: python_script - path: script.py - - path: /src/common/helper_functions/subset_anndata.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [midtime, highmem, highcpu] diff --git a/src/tasks/dimensionality_reduction/process_dataset/script.py b/src/tasks/dimensionality_reduction/process_dataset/script.py deleted file mode 100644 index 9563ed56f0..0000000000 --- a/src/tasks/dimensionality_reduction/process_dataset/script.py +++ /dev/null @@ -1,34 +0,0 @@ -import sys -import anndata as ad - -## VIASH START -par = { - "input": "resources_test/common/pancreas/dataset.h5ad", - "output_dataset": "train.h5ad", - "output_solution": "test.h5ad", -} -meta = { - "functionality_name": "split_data", - "config": "src/tasks/dimensionality_reduction/process_dataset/.config.vsh.yaml" -} -## VIASH END - -# import helper functions -sys.path.append(meta['resources_dir']) -from subset_anndata import read_config_slots_info,
subset_anndata - -print(">> Load Data", flush=True) -adata = ad.read_h5ad(par["input"]) - -print(">> Figuring out which data needs to be copied to which output file", flush=True) -slot_info = read_config_slots_info(meta["config"]) - -print(">> Creating train data", flush=True) -output_dataset = subset_anndata(adata, slot_info["output_dataset"]) - -print(">> Creating test data", flush=True) -output_solution = subset_anndata(adata, slot_info["output_solution"]) - -print(">> Writing", flush=True) -output_dataset.write_h5ad(par["output_dataset"]) -output_solution.write_h5ad(par["output_solution"]) diff --git a/src/tasks/dimensionality_reduction/resources_scripts/process_datasets.sh b/src/tasks/dimensionality_reduction/resources_scripts/process_datasets.sh deleted file mode 100755 index 11e911edac..0000000000 --- a/src/tasks/dimensionality_reduction/resources_scripts/process_datasets.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash - -cat > /tmp/params.yaml << 'HERE' -id: dimensionality_reduction_process_datasets -input_states: s3://openproblems-data/resources/datasets/**/state.yaml -rename_keys: 'input:output_dataset' -settings: '{"output_dataset": "$id/dataset.h5ad", "output_solution": "$id/solution.h5ad"}' -output_state: "$id/state.yaml" -publish_dir: s3://openproblems-data/resources/dimensionality_reduction/datasets -HERE - -cat > /tmp/nextflow.config << HERE -process { - executor = 'awsbatch' - withName:'.*publishStatesProc' { - memory = '16GB' - disk = '100GB' - } - withLabel:highmem { - memory = '350GB' - } -} -HERE - -tw launch https://github.com/openproblems-bio/openproblems.git \ - --revision main_build \ - --pull-latest \ - --main-script target/nextflow/dimensionality_reduction/workflows/process_datasets/main.nf \ - --workspace 53907369739130 \ - --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ - --params-file /tmp/params.yaml \ - --entry-name auto \ - --config /tmp/nextflow.config \ - --labels dimensionality_reduction,process_datasets \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/resources_scripts/run_benchmark.sh b/src/tasks/dimensionality_reduction/resources_scripts/run_benchmark.sh deleted file mode 100755 index 5cf975d3b5..0000000000 --- a/src/tasks/dimensionality_reduction/resources_scripts/run_benchmark.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash - -RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)" -publish_dir="s3://openproblems-data/resources/dimensionality_reduction/results/${RUN_ID}" - -cat > /tmp/params.yaml << HERE -input_states: s3://openproblems-data/resources/dimensionality_reduction/datasets/**/state.yaml -rename_keys: 'input_dataset:output_dataset,input_solution:output_solution' -output_state: "state.yaml" -publish_dir: "$publish_dir" -HERE - -tw launch https://github.com/openproblems-bio/openproblems.git \ - --revision main_build \ - --pull-latest \ - --main-script target/nextflow/dimensionality_reduction/workflows/run_benchmark/main.nf \ - --workspace 53907369739130 \ - --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ - --params-file /tmp/params.yaml \ - --entry-name auto \ - --config src/wf_utils/labels_tw.config \ - --labels dimensionality_reduction,full \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/resources_scripts/run_benchmark_test.sh b/src/tasks/dimensionality_reduction/resources_scripts/run_benchmark_test.sh deleted file mode 100755 index be6defda0f..0000000000 --- a/src/tasks/dimensionality_reduction/resources_scripts/run_benchmark_test.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -cat > /tmp/params.yaml << 'HERE' 
-input_states: s3://openproblems-data/resources_test/dimensionality_reduction/**/state.yaml -rename_keys: 'input_dataset:output_dataset,input_solution:output_solution' -output_state: "state.yaml" -publish_dir: s3://openproblems-nextflow/temp/dimensionality-reduction/ -HERE - -cat > /tmp/nextflow.config << HERE -process { - executor = 'awsbatch' -} -HERE - -tw launch https://github.com/openproblems-bio/openproblems.git \ - --revision main_build \ - --pull-latest \ - --main-script target/nextflow/dimensionality_reduction/workflows/run_benchmark/main.nf \ - --workspace 53907369739130 \ - --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ - --params-file /tmp/params.yaml \ - --entry-name auto \ - --config /tmp/nextflow.config \ - --labels dimensionality_reduction,test \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/resources_test_scripts/pancreas.sh b/src/tasks/dimensionality_reduction/resources_test_scripts/pancreas.sh deleted file mode 100755 index 03ec1659b6..0000000000 --- a/src/tasks/dimensionality_reduction/resources_test_scripts/pancreas.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash -#make sure the following command has been executed -#viash ns build -q 'dimensionality_reduction|common' - -# get the root of the directory -REPO_ROOT=$(git rev-parse --show-toplevel) - -# ensure that the command below is run from the root of the repository -cd "$REPO_ROOT" - -set -e - -RAW_DATA=resources_test/common -DATASET_DIR=resources_test/dimensionality_reduction - -mkdir -p $DATASET_DIR - -# process dataset -echo Running process_dataset -nextflow run . \ - -main-script target/nextflow/dimensionality_reduction/workflows/process_datasets/main.nf \ - -profile docker \ - -entry auto \ - --input_states "$RAW_DATA/**/state.yaml" \ - --rename_keys 'input:output_dataset' \ - --settings '{"output_dataset": "$id/dataset.h5ad", "output_solution": "$id/solution.h5ad"}' \ - --publish_dir "$DATASET_DIR" \ - --output_state '$id/state.yaml' -# output_state should be moved to settings once workaround is solved - - -# run one method -viash run src/tasks/dimensionality_reduction/methods/densmap/config.vsh.yaml -- \ - --input $DATASET_DIR/pancreas/dataset.h5ad \ - --output $DATASET_DIR/pancreas/embedding.h5ad - -# run one metric -viash run src/tasks/dimensionality_reduction/metrics/distance_correlation/config.vsh.yaml -- \ - --input_embedding $DATASET_DIR/pancreas/embedding.h5ad \ - --input_solution $DATASET_DIR/pancreas/solution.h5ad \ - --output $DATASET_DIR/pancreas/score.h5ad - -# # run benchmark -# export NXF_VER=22.04.5 - -# # after having added a split dataset component -# nextflow \ -# run . 
\ -# -main-script src/tasks/dimensionality_reduction/workflows/run/main.nf \ -# -profile docker \ -# --id pancreas \ -# --input_dataset $DATASET_DIR/dataset.h5ad \ -# --input_solution $DATASET_DIR/solution.h5ad \ -# --output scores.tsv \ -# --publish_dir $DATASET_DIR/ \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/workflows/process_datasets/config.vsh.yaml b/src/tasks/dimensionality_reduction/workflows/process_datasets/config.vsh.yaml deleted file mode 100644 index d6aa723b00..0000000000 --- a/src/tasks/dimensionality_reduction/workflows/process_datasets/config.vsh.yaml +++ /dev/null @@ -1,30 +0,0 @@ -functionality: - name: "process_datasets" - namespace: "dimensionality_reduction/workflows" - argument_groups: - - name: Inputs - arguments: - - name: "--input" - __merge__: "/src/tasks/dimensionality_reduction/api/file_common_dataset.yaml" - required: true - direction: input - - name: Outputs - arguments: - - name: "--output_dataset" - __merge__: /src/tasks/dimensionality_reduction/api/file_dataset.yaml - required: true - direction: output - - name: "--output_solution" - __merge__: /src/tasks/dimensionality_reduction/api/file_solution.yaml - required: true - direction: output - resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - path: /src/wf_utils/helper.nf - dependencies: - - name: common/check_dataset_schema - - name: dimensionality_reduction/process_dataset -platforms: - - type: nextflow \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/workflows/process_datasets/main.nf b/src/tasks/dimensionality_reduction/workflows/process_datasets/main.nf deleted file mode 100644 index 8d34f77e82..0000000000 --- a/src/tasks/dimensionality_reduction/workflows/process_datasets/main.nf +++ /dev/null @@ -1,54 +0,0 @@ -include { findArgumentSchema } from "${meta.resources_dir}/helper.nf" - -workflow auto { - findStates(params, meta.config) - | meta.workflow.run( - auto: [publish: "state"] - ) -} - -workflow run_wf { - take: - input_ch - - main: - output_ch = input_ch - - | check_dataset_schema.run( - fromState: { id, state -> - def schema = findArgumentSchema(meta.config, "input") - def schemaYaml = tempFile("schema.yaml") - writeYaml(schema, schemaYaml) - [ - "input": state.input, - "schema": schemaYaml - ] - }, - toState: { id, output, state -> - // read the output to see if dataset passed the qc - def checks = readYaml(output.output) - state + [ - "dataset": checks["exit_code"] == 0 ? 
state.input : null, - ] - } - ) - - // remove datasets which didn't pass the schema check - | filter { id, state -> - state.dataset != null - } - - | process_dataset.run( - fromState: [input: "dataset"], - toState: [ - output_dataset: "output_dataset", - output_solution: "output_solution" - ] - ) - - // only output the files for which an output file was specified - | setState(["output_dataset", "output_solution"]) - - emit: - output_ch -} \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/workflows/process_datasets/run_test.sh b/src/tasks/dimensionality_reduction/workflows/process_datasets/run_test.sh deleted file mode 100644 index d16cd7736f..0000000000 --- a/src/tasks/dimensionality_reduction/workflows/process_datasets/run_test.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -# Run this prior to executing this script: -# bin/viash_build -q 'batch_integration' - -# get the root of the directory -REPO_ROOT=$(git rev-parse --show-toplevel) - -# ensure that the command below is run from the root of the repository -cd "$REPO_ROOT" - -set -e - -export NXF_VER=22.04.5 - -nextflow run . \ - -main-script target/nextflow/dimensionality_reduction/workflows/process_datasets/main.nf \ - -profile docker \ - -entry auto \ - -c src/wf_utils/labels_ci.config \ - --id run_test \ - --input_states "resources_test/common/**/state.yaml" \ - --rename_keys 'input:output_dataset' \ - --settings '{"output_dataset": "dataset.h5ad", "output_solution": "solution.h5ad"}' \ - --publish_dir "resources_test/dimensionality_reduction" \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/workflows/run_benchmark/config.vsh.yaml b/src/tasks/dimensionality_reduction/workflows/run_benchmark/config.vsh.yaml deleted file mode 100644 index aa751624d6..0000000000 --- a/src/tasks/dimensionality_reduction/workflows/run_benchmark/config.vsh.yaml +++ /dev/null @@ -1,82 +0,0 @@ -functionality: - name: "run_benchmark" - namespace: "dimensionality_reduction/workflows" - argument_groups: - - name: Inputs - arguments: - - name: "--input_dataset" - __merge__: "/src/tasks/dimensionality_reduction/api/file_dataset.yaml" - required: true - direction: input - - name: "--input_solution" - __merge__: "/src/tasks/dimensionality_reduction/api/file_solution.yaml" - required: true - direction: input - - name: Outputs - arguments: - - name: "--output_scores" - type: file - required: true - direction: output - description: A yaml file containing the scores of each of the methods - default: score_uns.yaml - - name: "--output_method_configs" - type: file - required: true - direction: output - default: method_configs.yaml - - name: "--output_metric_configs" - type: file - required: true - direction: output - default: metric_configs.yaml - - name: "--output_dataset_info" - type: file - required: true - direction: output - default: dataset_uns.yaml - - name: "--output_task_info" - type: file - required: true - direction: output - default: task_info.yaml - - name: Methods - arguments: - - name: "--method_ids" - type: string - multiple: true - description: A list of method ids to run. If not specified, all methods will be run. 
- resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - type: file - path: "../../api/task_info.yaml" - dependencies: - - name: common/check_dataset_schema - - name: common/extract_metadata - - name: dimensionality_reduction/control_methods/random_features - - name: dimensionality_reduction/control_methods/spectral_features - - name: dimensionality_reduction/control_methods/true_features - - name: dimensionality_reduction/methods/densmap - - name: dimensionality_reduction/methods/diffusion_map - - name: dimensionality_reduction/methods/ivis - - name: dimensionality_reduction/methods/lmds - - name: dimensionality_reduction/methods/neuralee - - name: dimensionality_reduction/methods/pca - - name: dimensionality_reduction/methods/phate - - name: dimensionality_reduction/methods/pymde - - name: dimensionality_reduction/methods/simlr - - name: dimensionality_reduction/methods/tsne - - name: dimensionality_reduction/methods/umap - - name: dimensionality_reduction/metrics/clustering_performance - - name: dimensionality_reduction/metrics/coranking - - name: dimensionality_reduction/metrics/density_preservation - - name: dimensionality_reduction/metrics/distance_correlation - - name: dimensionality_reduction/metrics/trustworthiness - # test_resources: - # - type: nextflow_script - # path: main.nf - # entrypoint: test_wf -platforms: - - type: nextflow \ No newline at end of file diff --git a/src/tasks/dimensionality_reduction/workflows/run_benchmark/main.nf b/src/tasks/dimensionality_reduction/workflows/run_benchmark/main.nf deleted file mode 100644 index 1ba9251f9f..0000000000 --- a/src/tasks/dimensionality_reduction/workflows/run_benchmark/main.nf +++ /dev/null @@ -1,210 +0,0 @@ -workflow auto { - findStates(params, meta.config) - | meta.workflow.run( - auto: [publish: "state"] - ) -} - -workflow run_wf { - take: - input_ch - - main: - - // construct list of methods - methods = [ - // controls - random_features, - spectral_features, - true_features, - // methods - densmap, - diffusion_map, - ivis, - lmds, - neuralee, - pca, - phate, - pymde, - simlr, - tsne, - umap - ] - - // construct list of metrics - metrics = [ - clustering_performance, - coranking, - density_preservation, - distance_correlation, - trustworthiness - ] - - - /**************************** - * EXTRACT DATASET METADATA * - ****************************/ - dataset_ch = input_ch - // store join id - | map{ id, state -> - [id, state + ["_meta": [join_id: id]]] - } - - // extract the dataset metadata - | extract_metadata.run( - fromState: [input: "input_solution"], - toState: { id, output, state -> - state + [ - dataset_uns: readYaml(output.output).uns - ] - } - ) - - /*************************** - * RUN METHODS AND METRICS * - ***************************/ - score_ch = dataset_ch - - // run all methods - | runEach( - components: methods, - - // use the 'filter' argument to only run a method on the normalisation the component is asking for - filter: { id, state, comp -> - def norm = state.dataset_uns.normalization_id - def pref = comp.config.functionality.info.preferred_normalization - // if the preferred normalisation is none at all, - // we can pass whichever dataset we want - def norm_check = (norm == "log_cp10k" && pref == "counts") || norm == pref - def method_check = !state.method_ids || state.method_ids.contains(comp.config.functionality.name) - - method_check && norm_check - }, - - // define a new 'id' by appending the method name to the dataset id - id: { id, state, comp -> - id + "." 
+ comp.config.functionality.name - }, - - // use 'fromState' to fetch the arguments the component requires from the overall state - fromState: { id, state, comp -> - def new_args = [ - input: state.input_dataset - ] - if (comp.config.functionality.info.type == "control_method") { - new_args.input_solution = state.input_solution - } - new_args - }, - - // use 'toState' to publish that component's outputs to the overall state - toState: { id, output, state, comp -> - state + [ - method_id: comp.config.functionality.name, - method_output: output.output - ] - } - ) - - // run all metrics - | runEach( - components: metrics, - id: { id, state, comp -> - id + "." + comp.config.functionality.name - }, - // use 'fromState' to fetch the arguments the component requires from the overall state - fromState: { id, state, comp -> - [ - input_solution: state.input_solution, - input_embedding: state.method_output - ] - }, - // use 'toState' to publish that component's outputs to the overall state - toState: { id, output, state, comp -> - state + [ - metric_id: comp.config.functionality.name, - metric_output: output.output - ] - } - ) - - /****************************** - * GENERATE OUTPUT YAML FILES * - ******************************/ - // TODO: can we store everything below in a separate helper function? - - // extract the dataset metadata - dataset_meta_ch = dataset_ch - // only keep one of the normalization methods - | filter{ id, state -> - state.dataset_uns.normalization_id == "log_cp10k" - } - | joinStates { ids, states -> - // store the dataset metadata in a file - def dataset_uns = states.collect{state -> - def uns = state.dataset_uns.clone() - uns.remove("normalization_id") - uns - } - def dataset_uns_yaml_blob = toYamlBlob(dataset_uns) - def dataset_uns_file = tempFile("dataset_uns.yaml") - dataset_uns_file.write(dataset_uns_yaml_blob) - - ["output", [output_dataset_info: dataset_uns_file]] - } - - output_ch = score_ch - - // extract the scores - | extract_metadata.run( - key: "extract_scores", - fromState: [input: "metric_output"], - toState: { id, output, state -> - state + [ - score_uns: readYaml(output.output).uns - ] - } - ) - - | joinStates { ids, states -> - // store the method configs in a file - def method_configs = methods.collect{it.config} - def method_configs_yaml_blob = toYamlBlob(method_configs) - def method_configs_file = tempFile("method_configs.yaml") - method_configs_file.write(method_configs_yaml_blob) - - // store the metric configs in a file - def metric_configs = metrics.collect{it.config} - def metric_configs_yaml_blob = toYamlBlob(metric_configs) - def metric_configs_file = tempFile("metric_configs.yaml") - metric_configs_file.write(metric_configs_yaml_blob) - - def task_info_file = meta.resources_dir.resolve("task_info.yaml") - - // store the scores in a file - def score_uns = states.collect{it.score_uns} - def score_uns_yaml_blob = toYamlBlob(score_uns) - def score_uns_file = tempFile("score_uns.yaml") - score_uns_file.write(score_uns_yaml_blob) - - def new_state = [ - output_method_configs: method_configs_file, - output_metric_configs: metric_configs_file, - output_task_info: task_info_file, - output_scores: score_uns_file, - _meta: states[0]._meta - ] - - ["output", new_state] - } - - // merge all of the output data - | mix(dataset_meta_ch) - | joinStates{ ids, states -> - def mergedStates = states.inject([:]) { acc, m -> acc + m } - [ids[0], mergedStates] - } - - emit: - output_ch -} diff --git a/src/tasks/dimensionality_reduction/workflows/run_benchmark/run_test.sh 
b/src/tasks/dimensionality_reduction/workflows/run_benchmark/run_test.sh deleted file mode 100755 index 4bd2b01008..0000000000 --- a/src/tasks/dimensionality_reduction/workflows/run_benchmark/run_test.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -# get the root of the directory -REPO_ROOT=$(git rev-parse --show-toplevel) - -# ensure that the command below is run from the root of the repository -cd "$REPO_ROOT" - -set -e - -DATASETS_DIR="resources_test/dimensionality_reduction" -OUTPUT_DIR="output/temp" - -if [ ! -d "$OUTPUT_DIR" ]; then - mkdir -p "$OUTPUT_DIR" -fi - -export NXF_VER=22.04.5 -nextflow run . \ - -main-script target/nextflow/dimensionality_reduction/workflows/run_benchmark/main.nf \ - -profile docker \ - -resume \ - -entry auto \ - -c src/wf_utils/labels_ci.config \ - --input_states "$DATASETS_DIR/**/state.yaml" \ - --rename_keys 'input_dataset:output_dataset,input_solution:output_solution' \ - --settings '{"output_scores": "scores.yaml", "output_dataset_info": "dataset_info.yaml", "output_method_configs": "method_configs.yaml", "output_metric_configs": "metric_configs.yaml", "output_task_info": "task_info.yaml"}' \ - --publish_dir "$OUTPUT_DIR" \ - --output_state "state.yaml" \ No newline at end of file diff --git a/src/tasks/label_projection/README.md b/src/tasks/label_projection/README.md index 7694bc0aa6..d47aa44b5b 100644 --- a/src/tasks/label_projection/README.md +++ b/src/tasks/label_projection/README.md @@ -1,370 +1,3 @@ # Label projection - -Automated cell type annotation from rich, labeled reference data - -Path: -[`src/tasks/label_projection`](https://github.com/openproblems-bio/openproblems/tree/main/src/tasks/label_projection) - -## Motivation - -A major challenge for integrating single cell datasets is creating -matching cell type annotations for each cell. One of the most common -strategies for annotating cell types is referred to as -[“cluster-then-annotate”](https://www.nature.com/articles/s41576-018-0088-9) -whereby cells are aggregated into clusters based on feature similarity -and then manually characterized based on differential gene expression or -previously identified marker genes. Recently, methods have emerged to -build on this strategy and annotate cells using [known marker -genes](https://www.nature.com/articles/s41592-019-0535-3). However, -these strategies pose a difficulty for integrating atlas-scale datasets -as the particular annotations may not match. - -## Description - -To ensure that the cell type labels in newly generated datasets match -existing reference datasets, some methods align cells to a previously -annotated [reference -dataset](https://academic.oup.com/bioinformatics/article/35/22/4688/54802990) -and then *project* labels from the reference to the new dataset. - -Here, we compare methods for annotation based on a reference dataset. -The datasets consist of two or more samples of single cell profiles that -have been manually annotated with matching labels. These datasets are -then split into training and test batches, and the task of each method -is to train a cell type classifier on the training set and project those -labels onto the test set.
- -## Authors & contributors - -| name | roles | -|:------------------|:-------------------| -| Nikolay Markov | author, maintainer | -| Scott Gigante | author | -| Robrecht Cannoodt | author | - -## API - -``` mermaid -flowchart LR - file_common_dataset("Common Dataset") - comp_process_dataset[/"Data processor"/] - file_train("Training data") - file_test("Test data") - file_solution("Solution") - comp_control_method[/"Control method"/] - comp_method[/"Method"/] - comp_metric[/"Metric"/] - file_prediction("Prediction") - file_score("Score") - file_common_dataset---comp_process_dataset - comp_process_dataset-->file_train - comp_process_dataset-->file_test - comp_process_dataset-->file_solution - file_train---comp_control_method - file_train---comp_method - file_test---comp_control_method - file_test---comp_method - file_solution---comp_control_method - file_solution---comp_metric - comp_control_method-->file_prediction - comp_method-->file_prediction - comp_metric-->file_score - file_prediction---comp_metric -``` - -## File format: Common Dataset - -A subset of the common dataset. - -Example file: `resources_test/common/pancreas/dataset.h5ad` - -Format: - -
- - AnnData object - obs: 'cell_type', 'batch' - var: 'hvg', 'hvg_score' - obsm: 'X_pca' - layers: 'counts', 'normalized' - uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'normalization_id' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:-----------------------------|:----------|:-------------------------------------------------------------------------------| -| `obs["cell_type"]` | `string` | Cell type information. | -| `obs["batch"]` | `string` | Batch information. | -| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. | -| `var["hvg_score"]` | `double` | A ranking of the features by hvg. | -| `obsm["X_pca"]` | `double` | The resulting PCA embedding. | -| `layers["counts"]` | `integer` | Raw counts. | -| `layers["normalized"]` | `double` | Normalized expression values. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["dataset_name"]` | `string` | Nicely formatted name. | -| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | -| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | -| `uns["dataset_summary"]` | `string` | Short description of the dataset. | -| `uns["dataset_description"]` | `string` | Long description of the dataset. | -| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | -| `uns["normalization_id"]` | `string` | Which normalization was used. | - -
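For orientation, the sketch below shows how these slots could be accessed with `anndata`. It is illustrative only; the path is the example file mentioned above and the field names are simply those listed in the table.

``` python
import anndata as ad

# Load the example common dataset (path taken from the example above).
adata = ad.read_h5ad("resources_test/common/pancreas/dataset.h5ad")

# Access the slots documented in the table.
counts = adata.layers["counts"]            # raw counts
normalized = adata.layers["normalized"]    # normalized expression values
cell_types = adata.obs["cell_type"]        # cell type information
batches = adata.obs["batch"]               # batch information
pca = adata.obsm["X_pca"]                  # PCA embedding

print(adata.uns["dataset_id"], adata.uns["normalization_id"])
```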
- -## Component type: Data processor - -Path: -[`src/label_projection`](https://github.com/openproblems-bio/openproblems/tree/main/src/label_projection) - -A label projection dataset processor. - -Arguments: - -
- -| Name | Type | Description | -|:--------------------|:-------|:-------------------------------------------| -| `--input` | `file` | A subset of the common dataset. | -| `--output_train` | `file` | (*Output*) The training data. | -| `--output_test` | `file` | (*Output*) The test data (without labels). | -| `--output_solution` | `file` | (*Output*) The solution for the test data. | - -
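A rough sketch of what such a processor might do is shown below. It assumes the split is made by holding out one batch and that labels are copied from `obs["cell_type"]` to `obs["label"]`; the file names are placeholders and the actual component may use a different strategy.

``` python
import anndata as ad

# Hypothetical train/test/solution split by batch; the real processor may differ.
adata = ad.read_h5ad("dataset.h5ad")
adata.obs["label"] = adata.obs["cell_type"]

batches = adata.obs["batch"].astype(str)
test_batch = sorted(batches.unique())[-1]      # hold out one batch (assumption)
is_test = (batches == test_batch).to_numpy()

train = adata[~is_test].copy()
solution = adata[is_test].copy()
test = solution.copy()

# Keep only the obs columns documented for each file; the test file ships without labels.
train.obs = train.obs[["label", "batch"]]
solution.obs = solution.obs[["label", "batch"]]
test.obs = test.obs[["batch"]]

train.write_h5ad("train.h5ad")
test.write_h5ad("test.h5ad")
solution.write_h5ad("solution.h5ad")
```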
- -## File format: Training data - -The training data - -Example file: `resources_test/label_projection/pancreas/train.h5ad` - -Format: - -
- - AnnData object - obs: 'label', 'batch' - var: 'hvg', 'hvg_score' - obsm: 'X_pca' - layers: 'counts', 'normalized' - uns: 'dataset_id', 'normalization_id' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:--------------------------|:----------|:-------------------------------------------------------------------------| -| `obs["label"]` | `string` | Ground truth cell type labels. | -| `obs["batch"]` | `string` | Batch information. | -| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. | -| `var["hvg_score"]` | `double` | A ranking of the features by hvg. | -| `obsm["X_pca"]` | `double` | The resulting PCA embedding. | -| `layers["counts"]` | `integer` | Raw counts. | -| `layers["normalized"]` | `double` | Normalized counts. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["normalization_id"]` | `string` | Which normalization was used. | - -
- -## File format: Test data - -The test data (without labels) - -Example file: `resources_test/label_projection/pancreas/test.h5ad` - -Format: - -
- - AnnData object - obs: 'batch' - var: 'hvg', 'hvg_score' - obsm: 'X_pca' - layers: 'counts', 'normalized' - uns: 'dataset_id', 'normalization_id' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:--------------------------|:----------|:-------------------------------------------------------------------------| -| `obs["batch"]` | `string` | Batch information. | -| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. | -| `var["hvg_score"]` | `double` | A ranking of the features by hvg. | -| `obsm["X_pca"]` | `double` | The resulting PCA embedding. | -| `layers["counts"]` | `integer` | Raw counts. | -| `layers["normalized"]` | `double` | Normalized counts. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["normalization_id"]` | `string` | Which normalization was used. | - -
- -## File format: Solution - -The solution for the test data - -Example file: `resources_test/label_projection/pancreas/solution.h5ad` - -Format: - -
- - AnnData object - obs: 'label', 'batch' - var: 'hvg', 'hvg_score' - obsm: 'X_pca' - layers: 'counts', 'normalized' - uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'normalization_id' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:-----------------------------|:----------|:-------------------------------------------------------------------------------| -| `obs["label"]` | `string` | Ground truth cell type labels. | -| `obs["batch"]` | `string` | Batch information. | -| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. | -| `var["hvg_score"]` | `double` | A ranking of the features by hvg. | -| `obsm["X_pca"]` | `double` | The resulting PCA embedding. | -| `layers["counts"]` | `integer` | Raw counts. | -| `layers["normalized"]` | `double` | Normalized counts. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["dataset_name"]` | `string` | Nicely formatted name. | -| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | -| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | -| `uns["dataset_summary"]` | `string` | Short description of the dataset. | -| `uns["dataset_description"]` | `string` | Long description of the dataset. | -| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | -| `uns["normalization_id"]` | `string` | Which normalization was used. | - -
- -## Component type: Control method - -Path: -[`src/label_projection/control_methods`](https://github.com/openproblems-bio/openproblems/tree/main/src/label_projection/control_methods) - -Quality control methods for verifying the pipeline. - -Arguments: - -
- -| Name | Type | Description | -|:-------------------|:-------|:--------------------------------| -| `--input_train` | `file` | The training data. | -| `--input_test` | `file` | The test data (without labels). | -| `--input_solution` | `file` | The solution for the test data. | -| `--output` | `file` | (*Output*) The prediction file. | - -
- -## Component type: Method - -Path: -[`src/label_projection/methods`](https://github.com/openproblems-bio/openproblems/tree/main/src/label_projection/methods) - -A label projection method. - -Arguments: - -
- -| Name | Type | Description | -|:----------------|:-------|:--------------------------------| -| `--input_train` | `file` | The training data. | -| `--input_test` | `file` | The test data (without labels). | -| `--output` | `file` | (*Output*) The prediction file. | - -
- -## Component type: Metric - -Path: -[`src/label_projection/metrics`](https://github.com/openproblems-bio/openproblems/tree/main/src/label_projection/metrics) - -A label projection metric. - -Arguments: - -
- -| Name | Type | Description | -|:---------------------|:-------|:--------------------------------| -| `--input_solution` | `file` | The solution for the test data. | -| `--input_prediction` | `file` | The prediction file. | -| `--output` | `file` | (*Output*) Metric score file. | - -
- -## File format: Prediction - -The prediction file - -Example file: `resources_test/label_projection/pancreas/prediction.h5ad` - -Format: - -
- - AnnData object - obs: 'label_pred' - uns: 'dataset_id', 'normalization_id', 'method_id' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:--------------------------|:---------|:-------------------------------------| -| `obs["label_pred"]` | `string` | Predicted labels for the test cells. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["normalization_id"]` | `string` | Which normalization was used. | -| `uns["method_id"]` | `string` | A unique identifier for the method. | - -
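As an illustration of this format, the sketch below writes a minimal prediction file with the required slots. The cell names, labels and identifiers are placeholders; in practice the `obs_names` and the `dataset_id`/`normalization_id` values are copied from the test data.

``` python
import anndata as ad
import pandas as pd

# Placeholder predictions; obs_names must match the test data.
prediction = ad.AnnData(
    obs=pd.DataFrame({"label_pred": ["beta", "alpha"]}, index=["cell_1", "cell_2"])
)
prediction.uns["dataset_id"] = "pancreas"         # copied from the test data
prediction.uns["normalization_id"] = "log_cp10k"  # copied from the test data
prediction.uns["method_id"] = "my_method"         # identifier of the method
prediction.write_h5ad("prediction.h5ad", compression="gzip")
```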
- -## File format: Score - -Metric score file - -Example file: `resources_test/label_projection/pancreas/score.h5ad` - -Format: - -
- - AnnData object - uns: 'dataset_id', 'normalization_id', 'method_id', 'metric_ids', 'metric_values' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:--------------------------|:---------|:---------------------------------------------------------------------------------------------| -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["normalization_id"]` | `string` | Which normalization was used. | -| `uns["method_id"]` | `string` | A unique identifier for the method. | -| `uns["metric_ids"]` | `string` | One or more unique metric identifiers. | -| `uns["metric_values"]` | `double` | The metric values obtained for the given prediction. Must be of same length as ‘metric_ids’. | - -
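For reference, a score file with these slots can be inspected roughly as follows (a sketch, assuming the example path above is available); `metric_ids` and `metric_values` are paired element-wise.

``` python
import anndata as ad
import numpy as np

# Read the example score file and print one line per metric.
score = ad.read_h5ad("resources_test/label_projection/pancreas/score.h5ad")
metric_ids = np.atleast_1d(score.uns["metric_ids"])
metric_values = np.atleast_1d(score.uns["metric_values"])
for metric_id, value in zip(metric_ids, metric_values):
    print(f"{score.uns['method_id']}\t{metric_id}\t{value}")
```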
- +# This task has been moved to [https://github.com/openproblems-bio/task_label_projection](https://github.com/openproblems-bio/task_label_projection)! diff --git a/src/tasks/label_projection/api/comp_control_method.yaml b/src/tasks/label_projection/api/comp_control_method.yaml deleted file mode 100644 index d32de4ab2c..0000000000 --- a/src/tasks/label_projection/api/comp_control_method.yaml +++ /dev/null @@ -1,38 +0,0 @@ -functionality: - namespace: "label_projection/control_methods" - info: - type: control_method - type_info: - label: Control method - summary: Quality control methods for verifying the pipeline. - description: | - This folder contains control components for the task. - These components have the same interface as the regular methods - but also receive the solution object as input. It serves as a - starting point to test the relative accuracy of new methods in - the task, and also as a quality control for the metrics defined - in the task. - arguments: - - name: "--input_train" - __merge__: file_train.yaml - direction: input - required: true - - name: "--input_test" - __merge__: file_test.yaml - direction: input - required: true - - name: "--input_solution" - __merge__: file_solution.yaml - direction: input - required: true - - name: "--output" - __merge__: file_prediction.yaml - direction: output - required: true - test_resources: - - path: /resources_test/label_projection/pancreas - dest: resources_test/label_projection/pancreas - - type: python_script - path: /src/common/comp_tests/check_method_config.py - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py \ No newline at end of file diff --git a/src/tasks/label_projection/api/comp_method.yaml b/src/tasks/label_projection/api/comp_method.yaml deleted file mode 100644 index 1b7cb0dabc..0000000000 --- a/src/tasks/label_projection/api/comp_method.yaml +++ /dev/null @@ -1,31 +0,0 @@ -functionality: - namespace: "label_projection/methods" - info: - type: method - type_info: - label: Method - summary: A label projection method. - description: | - A label projection method to predict the labels of a new "test" - dataset based on an annotated "training" dataset. - arguments: - - name: "--input_train" - __merge__: file_train.yaml - direction: input - required: true - - name: "--input_test" - __merge__: file_test.yaml - direction: input - required: true - - name: "--output" - __merge__: file_prediction.yaml - direction: output - required: true - test_resources: - - path: /resources_test/label_projection/pancreas - dest: resources_test/label_projection/pancreas - - type: python_script - path: /src/common/comp_tests/check_method_config.py - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py - - path: /src/common/library.bib diff --git a/src/tasks/label_projection/api/comp_metric.yaml b/src/tasks/label_projection/api/comp_metric.yaml deleted file mode 100644 index ce81b0f89f..0000000000 --- a/src/tasks/label_projection/api/comp_metric.yaml +++ /dev/null @@ -1,31 +0,0 @@ -functionality: - namespace: "label_projection/metrics" - info: - type: metric - type_info: - label: Metric - summary: A label projection metric. - description: | - A metric for evaluating predicted labels. 
- arguments: - - name: "--input_solution" - __merge__: file_solution.yaml - direction: input - required: true - - name: "--input_prediction" - __merge__: file_prediction.yaml - direction: input - required: true - - name: "--output" - __merge__: file_score.yaml - required: true - direction: output - test_resources: - - path: /resources_test/label_projection/pancreas - dest: resources_test/label_projection/pancreas - - type: python_script - path: /src/common/comp_tests/check_metric_config.py - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py - - path: /src/common/library.bib - diff --git a/src/tasks/label_projection/api/comp_process_dataset.yaml b/src/tasks/label_projection/api/comp_process_dataset.yaml deleted file mode 100644 index 03c2ea3726..0000000000 --- a/src/tasks/label_projection/api/comp_process_dataset.yaml +++ /dev/null @@ -1,32 +0,0 @@ -functionality: - namespace: "label_projection" - info: - type: process_dataset - type_info: - label: Data processor - summary: A label projection dataset processor. - description: | - A component for processing a Common Dataset into a task-specific dataset. - arguments: - - name: "--input" - __merge__: file_common_dataset.yaml - direction: input - required: true - - name: "--output_train" - __merge__: file_train.yaml - direction: output - required: true - - name: "--output_test" - __merge__: file_test.yaml - direction: output - required: true - - name: "--output_solution" - __merge__: file_solution.yaml - direction: output - required: true - test_resources: - - path: /resources_test/common/pancreas - dest: resources_test/common/pancreas - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py - diff --git a/src/tasks/label_projection/api/file_common_dataset.yaml b/src/tasks/label_projection/api/file_common_dataset.yaml deleted file mode 100644 index eeb01ffd1e..0000000000 --- a/src/tasks/label_projection/api/file_common_dataset.yaml +++ /dev/null @@ -1,72 +0,0 @@ -type: file -example: "resources_test/common/pancreas/dataset.h5ad" -info: - label: "Common Dataset" - summary: A subset of the common dataset. - slots: - layers: - - type: integer - name: counts - description: Raw counts - required: true - - type: double - name: normalized - description: Normalized expression values - required: true - obs: - - type: string - name: cell_type - description: Cell type information - required: true - - type: string - name: batch - description: Batch information - required: true - var: - - type: boolean - name: hvg - description: Whether or not the feature is considered to be a 'highly variable gene' - required: true - - type: double - name: hvg_score - description: A ranking of the features by hvg. - required: true - obsm: - - type: double - name: X_pca - description: The resulting PCA embedding. - required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - name: dataset_name - type: string - description: Nicely formatted name. - required: true - - type: string - name: dataset_url - description: Link to the original source of the dataset. - required: false - - name: dataset_reference - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: dataset_summary - type: string - description: Short description of the dataset. - required: true - - name: dataset_description - type: string - description: Long description of the dataset. 
- required: true - - name: dataset_organism - type: string - description: The organism of the sample in the dataset. - required: false - - type: string - name: normalization_id - description: "Which normalization was used" - required: true - diff --git a/src/tasks/label_projection/api/file_prediction.yaml b/src/tasks/label_projection/api/file_prediction.yaml deleted file mode 100644 index 36efa87af0..0000000000 --- a/src/tasks/label_projection/api/file_prediction.yaml +++ /dev/null @@ -1,24 +0,0 @@ -type: file -example: "resources_test/label_projection/pancreas/prediction.h5ad" -info: - label: "Prediction" - summary: "The prediction file" - slots: - obs: - - type: string - name: label_pred - description: Predicted labels for the test cells. - required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - type: string - name: normalization_id - description: "Which normalization was used" - required: true - - type: string - name: method_id - description: "A unique identifier for the method" - required: true diff --git a/src/tasks/label_projection/api/file_score.yaml b/src/tasks/label_projection/api/file_score.yaml deleted file mode 100644 index 7ee5eaa8ee..0000000000 --- a/src/tasks/label_projection/api/file_score.yaml +++ /dev/null @@ -1,29 +0,0 @@ -type: file -example: "resources_test/label_projection/pancreas/score.h5ad" -info: - label: "Score" - summary: "Metric score file" - slots: - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - type: string - name: normalization_id - description: "Which normalization was used" - required: true - - type: string - name: method_id - description: "A unique identifier for the method" - required: true - - type: string - name: metric_ids - description: "One or more unique metric identifiers" - multiple: true - required: true - - type: double - name: metric_values - description: "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'." - multiple: true - required: true diff --git a/src/tasks/label_projection/api/file_solution.yaml b/src/tasks/label_projection/api/file_solution.yaml deleted file mode 100644 index c7591678e0..0000000000 --- a/src/tasks/label_projection/api/file_solution.yaml +++ /dev/null @@ -1,71 +0,0 @@ -type: file -example: "resources_test/label_projection/pancreas/solution.h5ad" -info: - label: "Solution" - summary: "The solution for the test data" - slots: - layers: - - type: integer - name: counts - description: Raw counts - required: true - - type: double - name: normalized - description: Normalized counts - required: true - obs: - - type: string - name: label - description: Ground truth cell type labels - required: true - - type: string - name: batch - description: Batch information - required: true - var: - - type: boolean - name: hvg - description: Whether or not the feature is considered to be a 'highly variable gene' - required: true - - type: double - name: hvg_score - description: A ranking of the features by hvg. - required: true - obsm: - - type: double - name: X_pca - description: The resulting PCA embedding. - required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - name: dataset_name - type: string - description: Nicely formatted name. - required: true - - type: string - name: dataset_url - description: Link to the original source of the dataset. 
- required: false - - name: dataset_reference - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: dataset_summary - type: string - description: Short description of the dataset. - required: true - - name: dataset_description - type: string - description: Long description of the dataset. - required: true - - name: dataset_organism - type: string - description: The organism of the sample in the dataset. - required: false - - type: string - name: normalization_id - description: "Which normalization was used" - required: true diff --git a/src/tasks/label_projection/api/file_test.yaml b/src/tasks/label_projection/api/file_test.yaml deleted file mode 100644 index 9cb2177da5..0000000000 --- a/src/tasks/label_projection/api/file_test.yaml +++ /dev/null @@ -1,43 +0,0 @@ -type: file -example: "resources_test/label_projection/pancreas/test.h5ad" -info: - label: "Test data" - summary: "The test data (without labels)" - slots: - layers: - - type: integer - name: counts - description: Raw counts - required: true - - type: double - name: normalized - description: Normalized counts - required: true - obs: - - type: string - name: batch - description: Batch information - required: true - var: - - type: boolean - name: hvg - description: Whether or not the feature is considered to be a 'highly variable gene' - required: true - - type: double - name: hvg_score - description: A ranking of the features by hvg. - required: true - obsm: - - type: double - name: X_pca - description: The resulting PCA embedding. - required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - type: string - name: normalization_id - description: "Which normalization was used" - required: true diff --git a/src/tasks/label_projection/api/file_train.yaml b/src/tasks/label_projection/api/file_train.yaml deleted file mode 100644 index d615fc5693..0000000000 --- a/src/tasks/label_projection/api/file_train.yaml +++ /dev/null @@ -1,47 +0,0 @@ -type: file -example: "resources_test/label_projection/pancreas/train.h5ad" -info: - label: "Training data" - summary: "The training data" - slots: - layers: - - type: integer - name: counts - description: Raw counts - required: true - - type: double - name: normalized - description: Normalized counts - required: true - obs: - - type: string - name: label - description: Ground truth cell type labels - required: true - - type: string - name: batch - description: Batch information - required: true - var: - - type: boolean - name: hvg - description: Whether or not the feature is considered to be a 'highly variable gene' - required: true - - type: double - name: hvg_score - description: A ranking of the features by hvg. - required: true - obsm: - - type: double - name: X_pca - description: The resulting PCA embedding. 
- required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - type: string - name: normalization_id - description: "Which normalization was used" - required: true diff --git a/src/tasks/label_projection/api/task_info.yaml b/src/tasks/label_projection/api/task_info.yaml deleted file mode 100644 index 07b6b0120d..0000000000 --- a/src/tasks/label_projection/api/task_info.yaml +++ /dev/null @@ -1,46 +0,0 @@ -name: label_projection -label: Label projection -v1: - path: openproblems/tasks/label_projection/README.md - commit: 817ea64a526c7251f74c9a7a6dba98e8602b94a8 -summary: Automated cell type annotation from rich, labeled reference data -image: "thumbnail.svg" -motivation: | - A major challenge for integrating single cell datasets is creating matching - cell type annotations for each cell. One of the most common strategies for - annotating cell types is referred to as - ["cluster-then-annotate"](https://www.nature.com/articles/s41576-018-0088-9) - whereby cells are aggregated into clusters based on feature similarity and - then manually characterized based on differential gene expression or previously - identified marker genes. Recently, methods have emerged to build on this - strategy and annotate cells using - [known marker genes](https://www.nature.com/articles/s41592-019-0535-3). - However, these strategies pose a difficulty for integrating atlas-scale - datasets as the particular annotations may not match. -description: | - To ensure that the cell type labels in newly generated datasets match - existing reference datasets, some methods align cells to a previously - annotated [reference dataset](https://academic.oup.com/bioinformatics/article/35/22/4688/54802990) - and then _project_ labels from the reference to the new dataset. - - Here, we compare methods for annotation based on a reference dataset. - The datasets consist of two or more samples of single cell profiles that - have been manually annotated with matching labels. These datasets are then - split into training and test batches, and the task of each method is to - train a cell type classifier on the training set and project those labels - onto the test set. 
-authors: - - name: "Nikolay Markov" - roles: [ author, maintainer ] - info: - github: mxposed - - name: "Scott Gigante" - roles: [ author ] - info: - github: scottgigante - orcid: "0000-0002-4544-2764" - - name: Robrecht Cannoodt - roles: [ author ] - info: - github: rcannood - orcid: "0000-0003-3641-729X" \ No newline at end of file diff --git a/src/tasks/label_projection/api/thumbnail.svg b/src/tasks/label_projection/api/thumbnail.svg deleted file mode 100644 index 3a0c47b5c2..0000000000 --- a/src/tasks/label_projection/api/thumbnail.svg +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/src/tasks/label_projection/control_methods/majority_vote/config.vsh.yaml b/src/tasks/label_projection/control_methods/majority_vote/config.vsh.yaml deleted file mode 100644 index 8f0915a1dd..0000000000 --- a/src/tasks/label_projection/control_methods/majority_vote/config.vsh.yaml +++ /dev/null @@ -1,22 +0,0 @@ -__merge__: ../../api/comp_control_method.yaml -functionality: - name: "majority_vote" - info: - label: Majority Vote - summary: "A control-type method that predicts all cells to belong to the most abundant cell type in the dataset" - description: "A control-type method that predicts all cells to belong to the most abundant cell type in the dataset" - v1: - path: openproblems/tasks/label_projection/methods/baseline.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - variants: - majority_vote: - preferred_normalization: counts - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [midtime, lowmem, lowcpu] diff --git a/src/tasks/label_projection/control_methods/majority_vote/script.py b/src/tasks/label_projection/control_methods/majority_vote/script.py deleted file mode 100644 index 0fc6446f0d..0000000000 --- a/src/tasks/label_projection/control_methods/majority_vote/script.py +++ /dev/null @@ -1,26 +0,0 @@ -import anndata as ad - -## VIASH START -par = { - 'input_train': 'resources_test/label_projection/pancreas/train.h5ad', - 'input_test': 'resources_test/label_projection/pancreas/test.h5ad', - 'output': 'output.h5ad' -} -meta = { - 'functionality_name': 'foo' -} -## VIASH END - -print("Load data", flush=True) -input_train = ad.read_h5ad(par['input_train']) -input_test = ad.read_h5ad(par['input_test']) - -print("Compute majority vote", flush=True) -majority = input_train.obs.label.value_counts().index[0] - -print("Create prediction object", flush=True) -input_test.obs["label_pred"] = majority - -print("Write output to file", flush=True) -input_test.uns["method_id"] = meta["functionality_name"] -input_test.write_h5ad(par["output"], compression="gzip") diff --git a/src/tasks/label_projection/control_methods/random_labels/config.vsh.yaml b/src/tasks/label_projection/control_methods/random_labels/config.vsh.yaml deleted file mode 100644 index 728157a644..0000000000 --- a/src/tasks/label_projection/control_methods/random_labels/config.vsh.yaml +++ /dev/null @@ -1,25 +0,0 @@ -__merge__: ../../api/comp_control_method.yaml -functionality: - name: "random_labels" - info: - label: Random Labels - summary: "a negative control, where the labels are randomly predicted." - description: "A negative control, where the labels are randomly predicted without training the data." 
- v1: - path: openproblems/tasks/label_projection/methods/baseline.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: counts - variants: - random_labels: - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - packages: scanpy - - type: nextflow - directives: - label: [midtime, lowmem, lowcpu] diff --git a/src/tasks/label_projection/control_methods/random_labels/script.py b/src/tasks/label_projection/control_methods/random_labels/script.py deleted file mode 100644 index a57a9d37f2..0000000000 --- a/src/tasks/label_projection/control_methods/random_labels/script.py +++ /dev/null @@ -1,33 +0,0 @@ -import anndata as ad -import numpy as np - -## VIASH START -par = { - 'input_train': 'resources_test/label_projection/pancreas/train.h5ad', - 'input_test': 'resources_test/label_projection/pancreas/test.h5ad', - 'output': 'output.h5ad' -} -meta = { - 'functionality_name': 'foo' -} -## VIASH END - -print("Load data", flush=True) -input_train = ad.read_h5ad(par['input_train']) -input_test = ad.read_h5ad(par['input_test']) - -print("Compute label distribution", flush=True) -label_distribution = input_train.obs.label.value_counts() -label_distribution = label_distribution / label_distribution.sum() - -print("Create prediction object", flush=True) -input_test.obs["label_pred"] = np.random.choice( - label_distribution.index, - size=input_test.n_obs, - replace=True, - p=label_distribution -) - -print("Write output to file", flush=True) -input_test.uns["method_id"] = meta["functionality_name"] -input_test.write_h5ad(par["output"], compression="gzip") diff --git a/src/tasks/label_projection/control_methods/true_labels/config.vsh.yaml b/src/tasks/label_projection/control_methods/true_labels/config.vsh.yaml deleted file mode 100644 index ec536fcc7d..0000000000 --- a/src/tasks/label_projection/control_methods/true_labels/config.vsh.yaml +++ /dev/null @@ -1,22 +0,0 @@ -__merge__: ../../api/comp_control_method.yaml -functionality: - name: "true_labels" - info: - label: True labels - summary: "a positive control, solution labels are copied 1 to 1 to the predicted data." - description: "A positive control, where the solution labels are copied 1 to 1 to the predicted data." 
- v1: - path: openproblems/tasks/label_projection/methods/baseline.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: counts - variants: - true_labels: - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [midtime, lowmem, lowcpu] diff --git a/src/tasks/label_projection/control_methods/true_labels/script.py b/src/tasks/label_projection/control_methods/true_labels/script.py deleted file mode 100644 index dc9354c290..0000000000 --- a/src/tasks/label_projection/control_methods/true_labels/script.py +++ /dev/null @@ -1,25 +0,0 @@ -import anndata as ad - -## VIASH START -par = { - 'input_train': 'resources_test/label_projection/pancreas/train.h5ad', - 'input_test': 'resources_test/label_projection/pancreas/test.h5ad', - 'input_solution': 'resources_test/label_projection/pancreas/test.h5ad', - 'output': 'output.h5ad' -} -meta = { - 'functionality_name': 'foo' -} -## VIASH END - -print("Load data", flush=True) -# input_train = ad.read_h5ad(par['input_train']) -input_test = ad.read_h5ad(par['input_test']) -input_solution = ad.read_h5ad(par['input_solution']) - -print("Create prediction object", flush=True) -input_test.obs["label_pred"] = input_solution.obs["label"] - -print("Write output to file", flush=True) -input_test.uns["method_id"] = meta["functionality_name"] -input_test.write_h5ad(par["output"], compression="gzip") diff --git a/src/tasks/label_projection/methods/knn/config.vsh.yaml b/src/tasks/label_projection/methods/knn/config.vsh.yaml deleted file mode 100644 index 499fa69e81..0000000000 --- a/src/tasks/label_projection/methods/knn/config.vsh.yaml +++ /dev/null @@ -1,37 +0,0 @@ -__merge__: ../../api/comp_method.yaml -functionality: - name: "knn" - info: - label: KNN - summary: "Assumes cells with similar gene expression belong to the same cell type, and assigns an unlabelled cell the most common cell type among its k nearest neighbors in PCA space." - description: | - Using the "k-nearest neighbours" approach, which is a - popular machine learning algorithm for classification and regression tasks. - The assumption underlying KNN in this context is that cells with similar gene - expression profiles tend to belong to the same cell type. For each unlabelled - cell, this method computes the $k$ labelled cells (in this case, 5) with the - smallest distance in PCA space, and assigns that cell the most common cell - type among its $k$ nearest neighbors. 
- reference : "cover1967nearest" - repository_url: https://github.com/scikit-learn/scikit-learn - documentation_url: "https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html" - v1: - path: openproblems/tasks/label_projection/methods/knn_classifier.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: log_cp10k - variants: - knn_classifier_log_cp10k: - knn_classifier_scran: - preferred_normalization: log_scran_pooling - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - packages: [scikit-learn, jsonschema] - - type: nextflow - directives: - label: [midtime, midmem, lowcpu] diff --git a/src/tasks/label_projection/methods/knn/script.py b/src/tasks/label_projection/methods/knn/script.py deleted file mode 100644 index 44b8b6f4de..0000000000 --- a/src/tasks/label_projection/methods/knn/script.py +++ /dev/null @@ -1,28 +0,0 @@ -import anndata as ad -import sklearn.neighbors - -## VIASH START -par = { - 'input_train': 'resources_test/label_projection/pancreas/train.h5ad', - 'input_test': 'resources_test/label_projection/pancreas/test.h5ad', - 'output': 'output.h5ad' -} -meta = { - 'functionality_name': 'foo', -} -## VIASH END - -print("Load input data", flush=True) -input_train = ad.read_h5ad(par['input_train']) -input_test = ad.read_h5ad(par['input_test']) - -print("Fit to train data", flush=True) -classifier = sklearn.neighbors.KNeighborsClassifier() -classifier.fit(input_train.obsm["X_pca"], input_train.obs["label"].astype(str)) - -print("Predict on test data", flush=True) -input_test.obs["label_pred"] = classifier.predict(input_test.obsm["X_pca"]) - -print("Write output to file", flush=True) -input_test.uns["method_id"] = meta["functionality_name"] -input_test.write_h5ad(par['output'], compression="gzip") diff --git a/src/tasks/label_projection/methods/logistic_regression/config.vsh.yaml b/src/tasks/label_projection/methods/logistic_regression/config.vsh.yaml deleted file mode 100644 index 88f4c2d5af..0000000000 --- a/src/tasks/label_projection/methods/logistic_regression/config.vsh.yaml +++ /dev/null @@ -1,34 +0,0 @@ -__merge__: ../../api/comp_method.yaml -functionality: - name: "logistic_regression" - info: - label: Logistic Regression - summary: "Logistic Regression with 100-dimensional PCA coordinates estimates parameters for multivariate classification by minimizing cross entropy loss over cell type classes." - description: | - Logistic Regression estimates parameters of a logistic function for - multivariate classification tasks. Here, we use 100-dimensional whitened PCA - coordinates as independent variables, and the model minimises the cross - entropy loss over all cell type classes. 
- reference: "hosmer2013applied" - repository_url: https://github.com/scikit-learn/scikit-learn - documentation_url: "https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html" - v1: - path: openproblems/tasks/label_projection/methods/logistic_regression.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: log_cp10k - variants: - logistic_regression_log_cp10k: - logistic_regression_scran: - preferred_normalization: log_scran_pooling - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - packages: scikit-learn - - type: nextflow - directives: - label: [midtime, midmem, lowcpu] diff --git a/src/tasks/label_projection/methods/logistic_regression/script.py b/src/tasks/label_projection/methods/logistic_regression/script.py deleted file mode 100644 index e8796c1b75..0000000000 --- a/src/tasks/label_projection/methods/logistic_regression/script.py +++ /dev/null @@ -1,28 +0,0 @@ -import anndata as ad -import sklearn.linear_model - -## VIASH START -par = { - 'input_train': 'resources_test/label_projection/pancreas/train.h5ad', - 'input_test': 'resources_test/label_projection/pancreas/test.h5ad', - 'output': 'output.h5ad' -} -meta = { - 'functionality_name': 'foo', -} -## VIASH END - -print("Load input data", flush=True) -input_train = ad.read_h5ad(par['input_train']) -input_test = ad.read_h5ad(par['input_test']) - -print("Fit to train data", flush=True) -classifier = sklearn.linear_model.LogisticRegression() -classifier.fit(input_train.obsm["X_pca"], input_train.obs["label"].astype(str)) - -print("Predict on test data", flush=True) -input_test.obs["label_pred"] = classifier.predict(input_test.obsm["X_pca"]) - -print("Write output to file", flush=True) -input_test.uns["method_id"] = meta["functionality_name"] -input_test.write_h5ad(par['output'], compression="gzip") \ No newline at end of file diff --git a/src/tasks/label_projection/methods/mlp/config.vsh.yaml b/src/tasks/label_projection/methods/mlp/config.vsh.yaml deleted file mode 100644 index 9c7e92fc68..0000000000 --- a/src/tasks/label_projection/methods/mlp/config.vsh.yaml +++ /dev/null @@ -1,47 +0,0 @@ -__merge__: ../../api/comp_method.yaml -functionality: - name: "mlp" - info: - label: Multilayer perceptron - summary: "A neural network with 100-dimensional PCA input, two hidden layers, and gradient descent weight updates to minimize cross entropy loss." - description: | - Multi-Layer Perceptron is a type of artificial neural network that - consists of multiple layers of interconnected neurons. Each neuron computes a - weighted sum of all neurons in the previous layer and transforms it with - nonlinear activation function. The output layer provides the final - prediction, and network weights are updated by gradient descent to minimize - the cross entropy loss. Here, the input data is 100-dimensional whitened PCA - coordinates for each cell, and we use two hidden layers of 100 neurons each. 
- reference: "hinton1989connectionist" - repository_url: https://github.com/scikit-learn/scikit-learn - documentation_url: "https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html" - v1: - path: openproblems/tasks/label_projection/methods/mlp.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: log_cp10k - variants: - mlp_log_cp10k: - mlp_scran: - preferred_normalization: log_scran_pooling - arguments: - - name: "--hidden_layer_sizes" - type: "integer" - multiple: true - description: "The ith element represents the number of neurons in the ith hidden layer." - default: [100, 100] - - name: "--max_iter" - type: "integer" - default: 1000 - description: "Maximum number of iterations" - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - packages: scikit-learn - - type: nextflow - directives: - label: [midtime, midmem, lowcpu] diff --git a/src/tasks/label_projection/methods/mlp/script.py b/src/tasks/label_projection/methods/mlp/script.py deleted file mode 100644 index c98fba3954..0000000000 --- a/src/tasks/label_projection/methods/mlp/script.py +++ /dev/null @@ -1,31 +0,0 @@ -import anndata as ad -from sklearn.neural_network import MLPClassifier - -## VIASH START -par = { - 'input_train': 'resources_test/label_projection/pancreas/train.h5ad', - 'input_test': 'resources_test/label_projection/pancreas/test.h5ad', - 'output': 'output.h5ad' -} -meta = { - 'functionality_name': 'foo', -} -## VIASH END - -print("Load input data", flush=True) -input_train = ad.read_h5ad(par['input_train']) -input_test = ad.read_h5ad(par['input_test']) - -print("Fit to train data", flush=True) -classifier = MLPClassifier( - max_iter=par["max_iter"], - hidden_layer_sizes=tuple(par["hidden_layer_sizes"]) -) -classifier.fit(input_train.obsm["X_pca"], input_train.obs["label"].astype(str)) - -print("Predict on test data", flush=True) -input_test.obs["label_pred"] = classifier.predict(input_test.obsm["X_pca"]) - -print("Write output to file", flush=True) -input_test.uns["method_id"] = meta["functionality_name"] -input_test.write_h5ad(par['output'], compression="gzip") \ No newline at end of file diff --git a/src/tasks/label_projection/methods/naive_bayes/config.vsh.yaml b/src/tasks/label_projection/methods/naive_bayes/config.vsh.yaml deleted file mode 100644 index 90f6e72a52..0000000000 --- a/src/tasks/label_projection/methods/naive_bayes/config.vsh.yaml +++ /dev/null @@ -1,33 +0,0 @@ -__merge__: ../../api/comp_method.yaml -functionality: - name: "naive_bayes" - info: - label: Naive Bayesian Classifier - summary: "Naive Bayes classification using feature probabilities to project cell type labels from a reference dataset." - description: | - Naive Bayes classification leverages probabilistic models based on Bayes' theorem - to classify cells into different types. In the context of single-cell datasets, this method - utilizes the probabilities of features to project cell type labels from a reference dataset - to new datasets. The algorithm assumes independence between features, making it computationally - efficient and well-suited for high-dimensional data. It is particularly useful for annotating - cells in atlas-scale datasets, ensuring consistency and alignment with existing reference annotations. 
- reference: "hosmer2013applied" - repository_url: https://github.com/scikit-learn/scikit-learn - documentation_url: "https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html" - preferred_normalization: log_cp10k - variants: - naive_bayes_log_cp10k: - naive_bayes_scran: - preferred_normalization: log_scran_pooling - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - packages: scikit-learn - - type: nextflow - directives: - label: [midtime, midmem, lowcpu] diff --git a/src/tasks/label_projection/methods/naive_bayes/script.py b/src/tasks/label_projection/methods/naive_bayes/script.py deleted file mode 100644 index 542c088dca..0000000000 --- a/src/tasks/label_projection/methods/naive_bayes/script.py +++ /dev/null @@ -1,28 +0,0 @@ -import anndata as ad -import sklearn.naive_bayes - -## VIASH START -par = { - 'input_train': 'resources_test/label_projection/pancreas/train.h5ad', - 'input_test': 'resources_test/label_projection/pancreas/test.h5ad', - 'output': 'output.h5ad' -} -meta = { - 'functionality_name': 'foo', -} -## VIASH END - -print("Load input data", flush=True) -input_train = ad.read_h5ad(par['input_train']) -input_test = ad.read_h5ad(par['input_test']) - -print("Fit to train data", flush=True) -classifier = sklearn.naive_bayes.GaussianNB() -classifier.fit(input_train.obsm["X_pca"], input_train.obs["label"].astype(str)) - -print("Predict on test data", flush=True) -input_test.obs["label_pred"] = classifier.predict(input_test.obsm["X_pca"]) - -print("Write output to file", flush=True) -input_test.uns["method_id"] = meta["functionality_name"] -input_test.write_h5ad(par['output'], compression="gzip") \ No newline at end of file diff --git a/src/tasks/label_projection/methods/scanvi/config.vsh.yaml b/src/tasks/label_projection/methods/scanvi/config.vsh.yaml deleted file mode 100644 index 6c36ead072..0000000000 --- a/src/tasks/label_projection/methods/scanvi/config.vsh.yaml +++ /dev/null @@ -1,46 +0,0 @@ -__merge__: ../../api/comp_method.yaml -functionality: - name: "scanvi" - info: - label: scANVI - summary: "scANVI predicts cell type labels for unlabelled test data by leveraging cell type labels, modelling uncertainty and using deep neural networks with stochastic optimization." - description: | - single-cell ANnotation using Variational Inference is a - semi-supervised variant of the scVI(Lopez et al. 2018) algorithm. Like scVI, - scANVI uses deep neural networks and stochastic optimization to model - uncertainty caused by technical noise and bias in single - cell - transcriptomics measurements. However, scANVI also leverages cell type labels - in the generative modelling. In this approach, scANVI is used to predict the - cell type labels of the unlabelled test data. - reference: "lotfollahi2020query" - repository_url: "https://github.com/scverse/scvi-tools" - documentation_url: https://scarches.readthedocs.io/en/latest/scanvi_surgery_pipeline.html - v1: - path: openproblems/tasks/label_projection/methods/scvi_tools.py - commit: e3be930c6d4bbd656ab1e656badb52bb50e6cdd6 - preferred_normalization: counts - variants: - scanvi_all_genes: - scanvi_hvg: - num_hvg: 2000 - arguments: - - name: "--num_hvg" - type: integer - description: "The number of HVG genes to subset to." 
- resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_pytorch_nvidia:1.0.0 - setup: - - type: python - packages: - - scarches - - scvi-tools>=1.1.0 - - type: docker - run: | - pip install -U "jax[cuda12_pip]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html - - type: nextflow - directives: - label: [midtime, midmem, highcpu, gpu] diff --git a/src/tasks/label_projection/methods/scanvi/script.py b/src/tasks/label_projection/methods/scanvi/script.py deleted file mode 100644 index d34fccd932..0000000000 --- a/src/tasks/label_projection/methods/scanvi/script.py +++ /dev/null @@ -1,78 +0,0 @@ -import anndata as ad -import scarches as sca -import pandas as pd - -# followed procedure from here: -# https://scarches.readthedocs.io/en/latest/scanvi_surgery_pipeline.html - -## VIASH START -par = { - 'input_train': 'resources_test/label_projection/pancreas/train.h5ad', - 'input_test': 'resources_test/label_projection/pancreas/test.h5ad', - 'output': 'output.h5ad', - 'num_hvg': 2000 -} -meta = { - 'functionality_name': 'scanvi' -} -## VIASH END - -print("Load input data", flush=True) -input_train = ad.read_h5ad(par['input_train']) -input_test = ad.read_h5ad(par['input_test']) - -if par["num_hvg"]: - print("Subsetting to HVG", flush=True) - hvg_idx = input_train.var['hvg_score'].to_numpy().argsort()[:par["num_hvg"]] - input_train = input_train[:,hvg_idx] - input_test = input_test[:,hvg_idx] - -print("Concatenating train and test data", flush=True) -input_train.obs['is_test'] = False -input_test.obs['is_test'] = True -input_test.obs['label'] = "Unknown" -adata = ad.concat([input_train, input_test], merge = "same") -del input_train - -print("Create SCANVI model and train it on fully labelled reference dataset", flush=True) -sca.models.SCVI.setup_anndata( - adata, - batch_key="batch", - labels_key="label", - layer="counts" -) - -vae = sca.models.SCVI( - adata, - n_layers=2, - encode_covariates=True, - deeply_inject_covariates=False, - use_layer_norm="both", - use_batch_norm="none", -) - -print("Create the SCANVI model instance with ZINB loss", flush=True) -scanvae = sca.models.SCANVI.from_scvi_model(vae, unlabeled_category = "Unknown") - -print("Train SCANVI model", flush=True) -scanvae.train() - -print("Make predictions", flush=True) -preds = scanvae.predict(adata) - -print("Store outputs", flush=True) -output = ad.AnnData( - obs=pd.DataFrame( - {"label_pred": preds[adata.obs['is_test'].values]}, - index=input_test.obs.index, - ), - var=input_test.var[[]], - uns={ - "dataset_id": input_test.uns["dataset_id"], - "normalization_id": input_test.uns["normalization_id"], - "method_id": meta["functionality_name"], - }, -) - -print("Write output to file", flush=True) -output.write_h5ad(par["output"], compression="gzip") diff --git a/src/tasks/label_projection/methods/scanvi_scarches/config.vsh.yaml b/src/tasks/label_projection/methods/scanvi_scarches/config.vsh.yaml deleted file mode 100644 index ccf2f449b4..0000000000 --- a/src/tasks/label_projection/methods/scanvi_scarches/config.vsh.yaml +++ /dev/null @@ -1,53 +0,0 @@ -__merge__: ../../api/comp_method.yaml - -functionality: - name: scanvi_scarches - info: - label: scANVI+scArches - summary: 'Query to reference single-cell integration with transfer learning with scANVI and scArches' - description: 'scArches+scANVI or "Single-cell architecture surgery" is a deep learning method for mapping new datasets onto a pre-existing reference model, using transfer learning and parameter 
optimization. It first uses scANVI to build a reference model from the training data, and then apply scArches to map the test data onto the reference model and make predictions.' - reference: lotfollahi2020query - documentation_url: https://docs.scvi-tools.org - repository_url: https://github.com/scverse/scvi-tools - preferred_normalization: counts - v1: - path: openproblems/tasks/label_projection/methods/scvi_tools.py - commit: e3be930c6d4bbd656ab1e656badb52bb50e6cdd6 - variants: - scanvi_scarches: - arguments: - - name: "--n_latent" - type: "integer" - default: 30 - description: "Number of units in the latent layer" - - name: "--n_layers" - type: "integer" - default: 2 - description: "Number of hidden layers" - - name: "--n_hidden" - type: "integer" - default: 128 - description: "Number of units in the hidden layers" - - name: "--dropout_rate" - type: "double" - default: 0.2 - description: "Rate of dropout applied in training" - - name: "--max_epochs" - type: "integer" - default: 2 - description: "Maximum number of training epochs" - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_pytorch_nvidia:1.0.0 - setup: - - type: python - pypi: scvi-tools>=1.1.0 - - type: docker - run: | - pip install -U "jax[cuda12_pip]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html - - type: nextflow - directives: - label: [midtime, midmem, midcpu, gpu] diff --git a/src/tasks/label_projection/methods/scanvi_scarches/script.py b/src/tasks/label_projection/methods/scanvi_scarches/script.py deleted file mode 100644 index 73c9c0f1fa..0000000000 --- a/src/tasks/label_projection/methods/scanvi_scarches/script.py +++ /dev/null @@ -1,61 +0,0 @@ -import anndata as ad -import numpy as np -import scvi - -## VIASH START -par = { - "input_train": "resources_test/label_projection/pancreas/train.h5ad", - "input_test": "resources_test/label_projection/pancreas/test.h5ad", - "output": "output.h5ad", - "n_latent": 30, - "n_layers": 2, - "n_hidden": 128, - "dropout_rate": 0.2, - "max_epochs": 200, -} -meta = {"functionality_name": "scanvi_xgboost"} -## VIASH END - -print("Reading input files", flush=True) -input_train = ad.read_h5ad(par["input_train"]) -input_test = ad.read_h5ad(par["input_test"]) -input_train.X = input_train.layers["counts"] -input_test.X = input_test.layers["counts"] - -print("Train model", flush=True) -unlabeled_category = "Unknown" - -scvi.model.SCVI.setup_anndata(input_train, batch_key="batch", labels_key="label") - -# specific scArches parameters -arches_params = dict( - use_layer_norm="both", - use_batch_norm="none", - encode_covariates=True, - dropout_rate=par["dropout_rate"], - n_hidden=par["n_hidden"], - n_layers=par["n_layers"], - n_latent=par["n_latent"], -) -scvi_model = scvi.model.SCVI(input_train, **arches_params) -train_kwargs = dict( - train_size=0.9, - early_stopping=True, -) -scvi_model.train(**train_kwargs) -model = scvi.model.SCANVI.from_scvi_model( - scvi_model, unlabeled_category=unlabeled_category -) -model.train(**train_kwargs) - -query_model = scvi.model.SCANVI.load_query_data(input_test, model) -train_kwargs = dict(max_epochs=par["max_epochs"], early_stopping=True) -query_model.train(plan_kwargs=dict(weight_decay=0.0), **train_kwargs) - -print("Generate predictions", flush=True) -input_test.obs["label"] = "Unknown" -input_test.obs["label_pred"] = query_model.predict(input_test) - -print("Write output AnnData to file", flush=True) -input_test.uns["method_id"] = meta["functionality_name"] 
-input_test.write_h5ad(par["output"], compression="gzip") diff --git a/src/tasks/label_projection/methods/seurat_transferdata/config.vsh.yaml b/src/tasks/label_projection/methods/seurat_transferdata/config.vsh.yaml deleted file mode 100644 index d51b532917..0000000000 --- a/src/tasks/label_projection/methods/seurat_transferdata/config.vsh.yaml +++ /dev/null @@ -1,36 +0,0 @@ -__merge__: ../../api/comp_method.yaml -functionality: - status: disabled - name: "seurat_transferdata" - info: - label: Seurat TransferData - summary: "Seurat reference mapping predicts cell types for unlabelled cells using PCA distances, labelled anchors, and transfer anchors from Seurat, with SCTransform normalization." - description: | - Seurat reference mapping is a cell type label transfer method provided by the - Seurat package. Gene expression counts are first normalised by SCTransform - before computing PCA. Then it finds mutual nearest neighbours, known as - transfer anchors, between the labelled and unlabelled part of the data in PCA - space, and computes each cell's distance to each of the anchor pairs. - Finally, it uses the labelled anchors to predict cell types for unlabelled - cells based on these distances. - reference: "hao2021integrated" - repository_url: "https://github.com/satijalab/seurat" - documentation_url: "https://satijalab.org/seurat/articles/integration_mapping.html" - v1: - path: openproblems/tasks/label_projection/methods/seurat.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - preferred_normalization: log_cp10k - variants: - seurat: - resources: - - type: r_script - path: script.R -platforms: - - type: docker - image: openproblems/base_r:1.0.0 - setup: - - type: r - cran: [ Matrix>=1.5.3, Seurat, rlang ] - - type: nextflow - directives: - label: [midtime, highmem, highcpu] diff --git a/src/tasks/label_projection/methods/seurat_transferdata/script.R b/src/tasks/label_projection/methods/seurat_transferdata/script.R deleted file mode 100644 index 999eb769ce..0000000000 --- a/src/tasks/label_projection/methods/seurat_transferdata/script.R +++ /dev/null @@ -1,81 +0,0 @@ -cat(">> Loading dependencies\n") -library(Matrix, warn.conflicts = FALSE) -library(anndata, warn.conflicts = FALSE) -requireNamespace("Seurat", quietly = TRUE) -library(magrittr, warn.conflicts = FALSE) - -## VIASH START -par <- list( - input_train = "resources_test/label_projection/pancreas/train.h5ad", - input_test = "resources_test/label_projection/pancreas/test.h5ad", - output = "output.h5ad" -) -## VIASH END - -packageVersion("Matrix") - -cat(">> Load input data\n") -input_train <- read_h5ad(par$input_train) -input_test <- read_h5ad(par$input_test) - -# sce_train <- zellkonverter::readH5AD(par$input_train) -# obj_train <- Seurat::as.Seurat(sce_train, data = "normalized") -# sce_test <- zellkonverter::readH5AD(par$input_test) -# obj_test <- Seurat::as.Seurat(sce_test, data = "normalized") - -cat(">> Converting AnnData to Seurat\n") -anndataToSeurat <- function(adata) { - # interpreted from https://github.com/satijalab/seurat/blob/v3.1.0/R/objects.R - obj <- - SeuratObject::CreateSeuratObject( - counts = as(Matrix::t(adata$layers[["counts"]]), "CsparseMatrix") - ) %>% - SeuratObject::SetAssayData( - slot = "data", - new.data = as(Matrix::t(adata$layers[["normalized"]]), "CsparseMatrix") - ) %>% - SeuratObject::AddMetaData( - adata$obs - ) - - # set hvg - SeuratObject::VariableFeatures(obj) <- adata$var_names[adata$var[["hvg"]]] - - # set embedding - # could add loadings and stdev - embed <- 
SeuratObject::CreateDimReducObject( - embeddings = adata$obsm[["X_pca"]], - key = "PC_" - ) - obj[["pca"]] <- embed - - # return - obj -} - -obj_train <- anndataToSeurat(input_train) -obj_test <- anndataToSeurat(input_test) - -cat(">> Find transfer anchors\n") -npcs <- ncol(obj_train[["pca"]]) -anchors <- Seurat::FindTransferAnchors( - reference = obj_train, - query = obj_test, - npcs = npcs, - dims = seq_len(npcs), - verbose = FALSE -) - -cat(">> Predict on test data\n") -query <- Seurat::TransferData( - anchorset = anchors, - reference = obj_train, - query = obj_test, - refdata = list(labels = "label"), - verbose = FALSE -) -input_test$obs[["label_pred"]] <- query$predicted.labels[input_test$obs_names] - -cat(">> Write output to file\n") -input_test$uns[["method_id"]] <- meta[["functionality_name"]] -input_test$write_h5ad(par$output, compression = "gzip") diff --git a/src/tasks/label_projection/methods/xgboost/config.vsh.yaml b/src/tasks/label_projection/methods/xgboost/config.vsh.yaml deleted file mode 100644 index 516308fbdd..0000000000 --- a/src/tasks/label_projection/methods/xgboost/config.vsh.yaml +++ /dev/null @@ -1,34 +0,0 @@ -__merge__: ../../api/comp_method.yaml -functionality: - name: "xgboost" - info: - label: XGBoost - summary: "XGBoost is a decision tree model that averages multiple trees with gradient boosting." - description: | - XGBoost is a gradient boosting decision tree model that learns multiple tree - structures in the form of a series of input features and their values, - leading to a prediction decision, and averages predictions from all its - trees. Here, input features are normalised gene expression values. - reference: "chen2016xgboost" - repository_url: "https://github.com/dmlc/xgboost" - documentation_url: "https://xgboost.readthedocs.io/en/stable/index.html" - v1: - path: openproblems/tasks/label_projection/methods/xgboost.py - commit: e3be930c6d4bbd656ab1e656badb52bb50e6cdd6 - preferred_normalization: log_cp10k - variants: - xgboost_log_cp10k: - xgboost_scran: - preferred_normalization: log_scran_pooling - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - packages: xgboost - - type: nextflow - directives: - label: [midtime, midmem, midcpu] diff --git a/src/tasks/label_projection/methods/xgboost/script.py b/src/tasks/label_projection/methods/xgboost/script.py deleted file mode 100644 index c56eae59d5..0000000000 --- a/src/tasks/label_projection/methods/xgboost/script.py +++ /dev/null @@ -1,39 +0,0 @@ -import anndata as ad -import xgboost as xgb - -## VIASH START -par = { - 'input_train': 'resources_test/label_projection/pancreas/train.h5ad', - 'input_test': 'resources_test/label_projection/pancreas/test.h5ad', - 'output': 'output.h5ad' -} -meta = { - 'functionality_name': 'foo', -} -## VIASH END - -print("Load input data", flush=True) -input_train = ad.read_h5ad(par['input_train']) -input_test = ad.read_h5ad(par['input_test']) -input_layer = "normalized" - -print("Transform into integers", flush=True) -input_train.obs["label_int"] = input_train.obs["label"].cat.codes -categories = input_train.obs["label"].cat.categories - -print("Convert AnnDatas into datasets", flush=True) -xg_train = xgb.DMatrix(input_train.layers[input_layer], label=input_train.obs["label_int"]) -xg_test = xgb.DMatrix(input_test.layers[input_layer]) - -print("Fit on train data", flush=True) -param = {'objective': 'multi:softmax', 'num_class': len(categories)} -watchlist = [(xg_train, 
"train")] -xgb_op = xgb.train(param, xg_train, evals=watchlist) - -print("Predict on test data", flush=True) -pred = xgb_op.predict(xg_test).astype(int) -input_test.obs["label_pred"] = categories[pred] - -print("Write output to file", flush=True) -input_test.uns["method_id"] = meta["functionality_name"] -input_test.write_h5ad(par['output'], compression="gzip") \ No newline at end of file diff --git a/src/tasks/label_projection/metrics/accuracy/config.vsh.yaml b/src/tasks/label_projection/metrics/accuracy/config.vsh.yaml deleted file mode 100644 index 8fc7021ffa..0000000000 --- a/src/tasks/label_projection/metrics/accuracy/config.vsh.yaml +++ /dev/null @@ -1,28 +0,0 @@ -__merge__: ../../api/comp_metric.yaml -functionality: - name: "accuracy" - info: - metrics: - - name: accuracy - label: Accuracy - summary: "The percentage of correctly predicted labels." - description: "The percentage of correctly predicted labels." - min: 0 - max: 1 - maximize: true - reference: grandini2020metrics - v1: - path: openproblems/tasks/label_projection/metrics/accuracy.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - packages: scikit-learn - - type: nextflow - directives: - label: [midtime, midmem, midcpu] diff --git a/src/tasks/label_projection/metrics/accuracy/script.py b/src/tasks/label_projection/metrics/accuracy/script.py deleted file mode 100644 index 80795111d5..0000000000 --- a/src/tasks/label_projection/metrics/accuracy/script.py +++ /dev/null @@ -1,36 +0,0 @@ -import numpy as np -import sklearn.preprocessing -import anndata as ad - -## VIASH START -par = { - 'input_prediction': 'resources_test/label_projection/pancreas/knn.h5ad', - 'input_solution': 'resources_test/label_projection/pancreas/solution.h5ad', - 'output': 'output.h5ad' -} -meta = { - 'functionality_name': 'accuracy' -} -## VIASH END - -print("Load data", flush=True) -input_prediction = ad.read_h5ad(par['input_prediction']) -input_solution = ad.read_h5ad(par['input_solution']) - -assert (input_prediction.obs_names == input_solution.obs_names).all(), "obs_names not the same in prediction and solution inputs" - -print("Encode labels", flush=True) -cats = list(input_solution.obs["label"].dtype.categories) + list(input_prediction.obs["label_pred"].dtype.categories) -encoder = sklearn.preprocessing.LabelEncoder().fit(cats) -input_solution.obs["label"] = encoder.transform(input_solution.obs["label"]) -input_prediction.obs["label_pred"] = encoder.transform(input_prediction.obs["label_pred"]) - -print("Compute prediction accuracy", flush=True) -accuracy = np.mean(input_solution.obs["label"] == input_prediction.obs["label_pred"]) - -print("Store metric value", flush=True) -input_prediction.uns["metric_ids"] = "accuracy" -input_prediction.uns["metric_values"] = accuracy - -print("Writing adata to file", flush=True) -input_prediction.write_h5ad(par['output'], compression="gzip") diff --git a/src/tasks/label_projection/metrics/f1/config.vsh.yaml b/src/tasks/label_projection/metrics/f1/config.vsh.yaml deleted file mode 100644 index f5abc0caa6..0000000000 --- a/src/tasks/label_projection/metrics/f1/config.vsh.yaml +++ /dev/null @@ -1,50 +0,0 @@ -__merge__: ../../api/comp_metric.yaml -functionality: - name: "f1" - info: - metrics: - - name: f1_weighted - label: F1 weighted - summary: "Average weigthed support between each labels F1 score" - description: "Calculates the F1 score for each label, and 
find their average weighted by support (the number of true instances for each label). This alters 'macro' to account for label imbalance; it can result in an F-score that is not between precision and recall." - reference: grandini2020metrics - min: 0 - max: 1 - maximize: true - v1: - path: openproblems/tasks/label_projection/metrics/f1.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - - name: f1_macro - label: F1 macro - summary: "Unweighted mean of each label F1-score" - description: "Calculates the F1 score for each label, and find their unweighted mean. This does not take label imbalance into account." - reference: grandini2020metrics - min: 0 - max: 1 - maximize: true - v1: - path: openproblems/tasks/label_projection/metrics/f1.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - - name: f1_micro - label: F1 micro - summary: "Calculation of TP, FN and FP." - description: "Calculates the F1 score globally by counting the total true positives, false negatives and false positives." - reference: grandini2020metrics - min: 0 - max: 1 - maximize: true - v1: - path: openproblems/tasks/label_projection/metrics/f1.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - packages: scikit-learn - - type: nextflow - directives: - label: [midtime, midmem, midcpu] diff --git a/src/tasks/label_projection/metrics/f1/script.py b/src/tasks/label_projection/metrics/f1/script.py deleted file mode 100644 index 4d4b1a2395..0000000000 --- a/src/tasks/label_projection/metrics/f1/script.py +++ /dev/null @@ -1,43 +0,0 @@ -from sklearn.metrics import f1_score -import sklearn.preprocessing -import anndata as ad - -## VIASH START -par = { - 'input_prediction': 'resources_test/label_projection/pancreas/knn.h5ad', - 'input_solution': 'resources_test/label_projection/pancreas/solution.h5ad', - 'average': 'weighted', - 'output': 'output.h5ad' -} -meta = { - 'functionality_name': 'f1' -} -## VIASH END - -print("Load data", flush=True) -input_prediction = ad.read_h5ad(par['input_prediction']) -input_solution = ad.read_h5ad(par['input_solution']) - -assert (input_prediction.obs_names == input_solution.obs_names).all(), "obs_names not the same in prediction and solution inputs" - -print("Encode labels", flush=True) -cats = list(input_solution.obs["label"].dtype.categories) + list(input_prediction.obs["label_pred"].dtype.categories) -encoder = sklearn.preprocessing.LabelEncoder().fit(cats) -input_solution.obs["label"] = encoder.transform(input_solution.obs["label"]) -input_prediction.obs["label_pred"] = encoder.transform(input_prediction.obs["label_pred"]) - -print("Compute F1 score", flush=True) -metric_type = [ "macro", "micro", "weighted" ] -metric_id = [ "f1_" + x for x in metric_type] -metric_value = [ f1_score( - input_solution.obs["label"], - input_prediction.obs["label_pred"], - average=x - ) for x in metric_type ] - -print("Store metric value", flush=True) -input_prediction.uns["metric_ids"] = metric_id -input_prediction.uns["metric_values"] = metric_value - -print("Writing adata to file", flush=True) -input_prediction.write_h5ad(par['output'], compression="gzip") diff --git a/src/tasks/label_projection/process_dataset/config.vsh.yaml b/src/tasks/label_projection/process_dataset/config.vsh.yaml deleted file mode 100644 index aa010876cb..0000000000 --- a/src/tasks/label_projection/process_dataset/config.vsh.yaml +++ /dev/null @@ -1,31 +0,0 @@ -__merge__: 
../api/comp_process_dataset.yaml -functionality: - name: "process_dataset" - arguments: - - name: "--method" - type: "string" - description: "The process method to assign train/test." - choices: ["batch", "random"] - default: "batch" - - name: "--obs_label" - type: "string" - description: "Which .obs slot to use as label." - default: "cell_type" - - name: "--obs_batch" - type: "string" - description: "Which .obs slot to use as batch covariate." - default: "batch" - - name: "--seed" - type: "integer" - description: "A seed for the subsampling." - example: 123 - resources: - - type: python_script - path: script.py - - path: /src/common/helper_functions/subset_anndata.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [highmem, midcpu , midtime] diff --git a/src/tasks/label_projection/process_dataset/script.py b/src/tasks/label_projection/process_dataset/script.py deleted file mode 100644 index 0f2c5482b6..0000000000 --- a/src/tasks/label_projection/process_dataset/script.py +++ /dev/null @@ -1,78 +0,0 @@ -import sys -import random -import numpy as np -import anndata as ad - -## VIASH START -par = { - 'input': 'resources_test/common/pancreas/dataset.h5ad', - 'method': 'batch', - 'seed': None, - 'obs_batch': 'batch', - 'obs_label': 'cell_type', - 'output_train': 'train.h5ad', - 'output_test': 'test.h5ad', - 'output_solution': 'solution.h5ad' -} -meta = { - 'resources_dir': 'src/tasks/label_projection/process_dataset', - 'config': 'src/tasks/label_projection/process_dataset/.config.vsh.yaml' -} -## VIASH END - -# import helper functions -sys.path.append(meta['resources_dir']) -from subset_anndata import read_config_slots_info, subset_anndata - -# set seed if need be -if par["seed"]: - print(f">> Setting seed to {par['seed']}") - random.seed(par["seed"]) - -print(">> Load data", flush=True) -adata = ad.read_h5ad(par["input"]) -print("input:", adata) - -print(f">> Process data using {par['method']} method") -if par["method"] == "batch": - batch_info = adata.obs[par["obs_batch"]] - batch_categories = batch_info.dtype.categories - test_batches = random.sample(list(batch_categories), 1) - is_test = [ x in test_batches for x in batch_info ] -elif par["method"] == "random": - train_ix = np.random.choice(adata.n_obs, round(adata.n_obs * 0.8), replace=False) - is_test = [ not x in train_ix for x in range(0, adata.n_obs) ] - -# subset the different adatas -print(">> Figuring which data needs to be copied to which output file", flush=True) -# use par arguments to look for label and batch value in different slots -slot_mapping = { - "obs": { - "label": par["obs_label"], - "batch": par["obs_batch"], - } -} -slot_info = read_config_slots_info(meta["config"], slot_mapping) - -print(">> Creating train data", flush=True) -output_train = subset_anndata( - adata[[not x for x in is_test]], - slot_info["output_train"] -) - -print(">> Creating test data", flush=True) -output_test = subset_anndata( - adata[is_test], - slot_info["output_test"] -) - -print(">> Creating solution data", flush=True) -output_solution = subset_anndata( - adata[is_test], - slot_info['output_solution'] -) - -print(">> Writing data", flush=True) -output_train.write_h5ad(par["output_train"]) -output_test.write_h5ad(par["output_test"]) -output_solution.write_h5ad(par["output_solution"]) diff --git a/src/tasks/label_projection/resources_scripts/process_datasets.sh b/src/tasks/label_projection/resources_scripts/process_datasets.sh deleted file mode 100755 index d5c6353ff5..0000000000 
--- a/src/tasks/label_projection/resources_scripts/process_datasets.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash - -cat > /tmp/params.yaml << 'HERE' -id: label_projection_process_datasets -input_states: s3://openproblems-data/resources/datasets/**/state.yaml -rename_keys: 'input:output_dataset' -settings: '{"output_train": "$id/train.h5ad", "output_test": "$id/test.h5ad", "output_solution": "$id/solution.h5ad"}' -output_state: "$id/state.yaml" -publish_dir: s3://openproblems-data/resources/label_projection/datasets -HERE - -cat > /tmp/nextflow.config << HERE -process { - executor = 'awsbatch' - withName:'.*publishStatesProc' { - memory = '16GB' - disk = '100GB' - } - withLabel:highmem { - memory = '350GB' - } -} -HERE - -tw launch https://github.com/openproblems-bio/openproblems.git \ - --revision main_build \ - --pull-latest \ - --main-script target/nextflow/label_projection/workflows/process_datasets/main.nf \ - --workspace 53907369739130 \ - --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ - --params-file /tmp/params.yaml \ - --entry-name auto \ - --config /tmp/nextflow.config \ - --labels label_projection,process_datasets \ No newline at end of file diff --git a/src/tasks/label_projection/resources_scripts/run_benchmark.sh b/src/tasks/label_projection/resources_scripts/run_benchmark.sh deleted file mode 100755 index 8733e22f52..0000000000 --- a/src/tasks/label_projection/resources_scripts/run_benchmark.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)" -publish_dir="s3://openproblems-data/resources/label_projection/results/${RUN_ID}" - -cat > /tmp/params.yaml << HERE -input_states: s3://openproblems-data/resources/label_projection/datasets/**/state.yaml -rename_keys: 'input_train:output_train,input_test:output_test,input_solution:output_solution' -output_state: "state.yaml" -settings: '{"method_ids": "scanvi_scarches"}' -publish_dir: "$publish_dir" -HERE - -tw launch https://github.com/openproblems-bio/openproblems.git \ - --revision main_build \ - --pull-latest \ - --main-script target/nextflow/label_projection/workflows/run_benchmark/main.nf \ - --workspace 53907369739130 \ - --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ - --params-file /tmp/params.yaml \ - --entry-name auto \ - --config src/wf_utils/labels_tw.config \ - --labels label_projection,full \ No newline at end of file diff --git a/src/tasks/label_projection/resources_scripts/run_benchmark_test.sh b/src/tasks/label_projection/resources_scripts/run_benchmark_test.sh deleted file mode 100755 index caf699a384..0000000000 --- a/src/tasks/label_projection/resources_scripts/run_benchmark_test.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -cat > /tmp/params.yaml << 'HERE' -input_states: s3://openproblems-data/resources_test/label_projection/**/state.yaml -rename_keys: 'input_train:output_train,input_test:output_test,input_solution:output_solution' -output_state: "state.yaml" -publish_dir: s3://openproblems-nextflow/temp/label_projection/ -HERE - -cat > /tmp/nextflow.config << HERE -process { - executor = 'awsbatch' -} -HERE - -tw launch https://github.com/openproblems-bio/openproblems.git \ - --revision main_build \ - --pull-latest \ - --main-script target/nextflow/label_projection/workflows/run_benchmark/main.nf \ - --workspace 53907369739130 \ - --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ - --params-file /tmp/params.yaml \ - --entry-name auto \ - --config /tmp/nextflow.config \ - --labels label_projection,test \ No newline at end of file diff --git 
a/src/tasks/label_projection/resources_test_scripts/pancreas.sh b/src/tasks/label_projection/resources_test_scripts/pancreas.sh deleted file mode 100755 index 5a69340510..0000000000 --- a/src/tasks/label_projection/resources_test_scripts/pancreas.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash - -# get the root of the directory -REPO_ROOT=$(git rev-parse --show-toplevel) - -# ensure that the command below is run from the root of the repository -cd "$REPO_ROOT" - -set -e - -RAW_DATA=resources_test/common -DATASET_DIR=resources_test/label_projection - -mkdir -p $DATASET_DIR - -# process dataset -echo Running process_dataset -nextflow run . \ - -main-script target/nextflow/label_projection/workflows/process_datasets/main.nf \ - -profile docker \ - -entry auto \ - --input_states "$RAW_DATA/**/state.yaml" \ - --rename_keys 'input:output_dataset' \ - --settings '{"output_train": "$id/train.h5ad", "output_test": "$id/test.h5ad", "output_solution": "$id/solution.h5ad"}' \ - --publish_dir "$DATASET_DIR" \ - --output_state '$id/state.yaml' -# output_state should be moved to settings once workaround is solved - -# run one method -viash run src/tasks/label_projection/methods/knn/config.vsh.yaml -- \ - --input_train $DATASET_DIR/pancreas/train.h5ad \ - --input_test $DATASET_DIR/pancreas/test.h5ad \ - --output $DATASET_DIR/pancreas/prediction.h5ad - -# run one metric -viash run src/tasks/label_projection/metrics/accuracy/config.vsh.yaml -- \ - --input_prediction $DATASET_DIR/pancreas/prediction.h5ad \ - --input_solution $DATASET_DIR/pancreas/solution.h5ad \ - --output $DATASET_DIR/pancreas/score.h5ad diff --git a/src/tasks/label_projection/workflows/process_datasets/config.vsh.yaml b/src/tasks/label_projection/workflows/process_datasets/config.vsh.yaml deleted file mode 100644 index 09b2e9a829..0000000000 --- a/src/tasks/label_projection/workflows/process_datasets/config.vsh.yaml +++ /dev/null @@ -1,34 +0,0 @@ -functionality: - name: "process_datasets" - namespace: "label_projection/workflows" - argument_groups: - - name: Inputs - arguments: - - name: "--input" - __merge__: "/src/tasks/label_projection/api/file_common_dataset.yaml" - required: true - direction: input - - name: Outputs - arguments: - - name: "--output_train" - __merge__: /src/tasks/label_projection/api/file_train.yaml - required: true - direction: output - - name: "--output_test" - __merge__: /src/tasks/label_projection/api/file_test.yaml - required: true - direction: output - - name: "--output_solution" - __merge__: /src/tasks/label_projection/api/file_solution.yaml - required: true - direction: output - resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - path: /src/wf_utils/helper.nf - dependencies: - - name: common/check_dataset_schema - - name: label_projection/process_dataset -platforms: - - type: nextflow diff --git a/src/tasks/label_projection/workflows/process_datasets/main.nf b/src/tasks/label_projection/workflows/process_datasets/main.nf deleted file mode 100644 index 88cf24935c..0000000000 --- a/src/tasks/label_projection/workflows/process_datasets/main.nf +++ /dev/null @@ -1,55 +0,0 @@ -include { findArgumentSchema } from "${meta.resources_dir}/helper.nf" - -workflow auto { - findStates(params, meta.config) - | meta.workflow.run( - auto: [publish: "state"] - ) -} - -workflow run_wf { - take: - input_ch - - main: - output_ch = input_ch - - | check_dataset_schema.run( - fromState: { id, state -> - def schema = findArgumentSchema(meta.config, "input") - def schemaYaml = tempFile("schema.yaml") - 
writeYaml(schema, schemaYaml) - [ - "input": state.input, - "schema": schemaYaml - ] - }, - toState: { id, output, state -> - // read the output to see if dataset passed the qc - def checks = readYaml(output.output) - state + [ - "dataset": checks["exit_code"] == 0 ? state.input : null, - ] - } - ) - - // remove datasets which didn't pass the schema check - | filter { id, state -> - state.dataset != null - } - - | process_dataset.run( - fromState: [ input: "dataset" ], - toState: [ - output_train: "output_train", - output_test: "output_test", - output_solution: "output_solution" - ] - ) - - // only output the files for which an output file was specified - | setState(["output_train", "output_test", "output_solution"]) - - emit: - output_ch -} diff --git a/src/tasks/label_projection/workflows/run_benchmark/config.vsh.yaml b/src/tasks/label_projection/workflows/run_benchmark/config.vsh.yaml deleted file mode 100644 index 083bb47a5a..0000000000 --- a/src/tasks/label_projection/workflows/run_benchmark/config.vsh.yaml +++ /dev/null @@ -1,77 +0,0 @@ -functionality: - name: "run_benchmark" - namespace: "label_projection/workflows" - argument_groups: - - name: Inputs - arguments: - - name: "--input_train" - __merge__: /src/tasks/label_projection/api/file_train.yaml - type: file - direction: input - required: true - - name: "--input_test" - __merge__: /src/tasks/label_projection/api/file_test.yaml - type: file - direction: input - required: true - - name: "--input_solution" - __merge__: /src/tasks/label_projection/api/file_solution.yaml - type: file - direction: input - required: true - - name: Outputs - arguments: - - name: "--output_scores" - type: file - required: true - direction: output - description: A yaml file containing the scores of each of the methods - default: score_uns.yaml - - name: "--output_method_configs" - type: file - required: true - direction: output - default: method_configs.yaml - - name: "--output_metric_configs" - type: file - required: true - direction: output - default: metric_configs.yaml - - name: "--output_dataset_info" - type: file - required: true - direction: output - default: dataset_uns.yaml - - name: "--output_task_info" - type: file - required: true - direction: output - default: task_info.yaml - - name: Methods - arguments: - - name: "--method_ids" - type: string - multiple: true - description: A list of method ids to run. If not specified, all methods will be run. 
- resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - type: file - path: "../../api/task_info.yaml" - dependencies: - - name: common/check_dataset_schema - - name: common/extract_metadata - - name: label_projection/control_methods/true_labels - - name: label_projection/control_methods/majority_vote - - name: label_projection/control_methods/random_labels - - name: label_projection/methods/knn - - name: label_projection/methods/logistic_regression - - name: label_projection/methods/mlp - - name: label_projection/methods/scanvi - - name: label_projection/methods/scanvi_scarches - - name: label_projection/methods/xgboost - - name: label_projection/metrics/accuracy - - name: label_projection/metrics/f1 -platforms: - - type: nextflow \ No newline at end of file diff --git a/src/tasks/label_projection/workflows/run_benchmark/main.nf b/src/tasks/label_projection/workflows/run_benchmark/main.nf deleted file mode 100644 index 5dafc98d1e..0000000000 --- a/src/tasks/label_projection/workflows/run_benchmark/main.nf +++ /dev/null @@ -1,200 +0,0 @@ -workflow auto { - findStates(params, meta.config) - | meta.workflow.run( - auto: [publish: "state"] - ) -} - -workflow run_wf { - take: - input_ch - - main: - - // construct list of methods - methods = [ - true_labels, - majority_vote, - random_labels, - knn, - logistic_regression, - mlp, - scanvi, - scanvi_scarches, - // seurat_transferdata, - xgboost - ] - - // construct list of metrics - metrics = [ - accuracy, - f1 - ] - - /**************************** - * EXTRACT DATASET METADATA * - ****************************/ - dataset_ch = input_ch - // store join id - | map{ id, state -> - [id, state + ["_meta": [join_id: id]]] - } - - // extract the dataset metadata - | extract_metadata.run( - fromState: [input: "input_solution"], - toState: { id, output, state -> - state + [ - dataset_uns: readYaml(output.output).uns - ] - } - ) - - /*************************** - * RUN METHODS AND METRICS * - ***************************/ - score_ch = dataset_ch - - // run all methods - | runEach( - components: methods, - - // use the 'filter' argument to only run a method on the normalisation the component is asking for - filter: { id, state, comp -> - def norm = state.dataset_uns.normalization_id - def pref = comp.config.functionality.info.preferred_normalization - // if the preferred normalisation is none at all, - // we can pass whichever dataset we want - def norm_check = (norm == "log_cp10k" && pref == "counts") || norm == pref - def method_check = !state.method_ids || state.method_ids.contains(comp.config.functionality.name) - - method_check && norm_check - }, - - // define a new 'id' by appending the method name to the dataset id - id: { id, state, comp -> - id + "." + comp.config.functionality.name - }, - - // use 'fromState' to fetch the arguments the component requires from the overall state - fromState: { id, state, comp -> - def new_args = [ - input_train: state.input_train, - input_test: state.input_test - ] - if (comp.config.functionality.info.type == "control_method") { - new_args.input_solution = state.input_solution - } - new_args - }, - - // use 'toState' to publish that component's outputs to the overall state - toState: { id, output, state, comp -> - state + [ - method_id: comp.config.functionality.name, - method_output: output.output - ] - } - ) - - // run all metrics - | runEach( - components: metrics, - id: { id, state, comp -> - id + "." 
+ comp.config.functionality.name - }, - // use 'fromState' to fetch the arguments the component requires from the overall state - fromState: [ - input_solution: "input_solution", - input_prediction: "method_output" - ], - // use 'toState' to publish that component's outputs to the overall state - toState: { id, output, state, comp -> - state + [ - metric_id: comp.config.functionality.name, - metric_output: output.output - ] - } - ) - - - /****************************** - * GENERATE OUTPUT YAML FILES * - ******************************/ - // TODO: can we store everything below in a separate helper function? - - // extract the dataset metadata - dataset_meta_ch = dataset_ch - // only keep one of the normalization methods - | filter{ id, state -> - state.dataset_uns.normalization_id == "log_cp10k" - } - | joinStates { ids, states -> - // store the dataset metadata in a file - def dataset_uns = states.collect{state -> - def uns = state.dataset_uns.clone() - uns.remove("normalization_id") - uns - } - def dataset_uns_yaml_blob = toYamlBlob(dataset_uns) - def dataset_uns_file = tempFile("dataset_uns.yaml") - dataset_uns_file.write(dataset_uns_yaml_blob) - - ["output", [output_dataset_info: dataset_uns_file]] - } - - output_ch = score_ch - - // extract the scores - | extract_metadata.run( - key: "extract_scores", - fromState: [input: "metric_output"], - toState: { id, output, state -> - state + [ - score_uns: readYaml(output.output).uns - ] - } - ) - - | joinStates { ids, states -> - // store the method configs in a file - def method_configs = methods.collect{it.config} - def method_configs_yaml_blob = toYamlBlob(method_configs) - def method_configs_file = tempFile("method_configs.yaml") - method_configs_file.write(method_configs_yaml_blob) - - // store the metric configs in a file - def metric_configs = metrics.collect{it.config} - def metric_configs_yaml_blob = toYamlBlob(metric_configs) - def metric_configs_file = tempFile("metric_configs.yaml") - metric_configs_file.write(metric_configs_yaml_blob) - - def task_info_file = meta.resources_dir.resolve("task_info.yaml") - - // store the scores in a file - def score_uns = states.collect{it.score_uns} - def score_uns_yaml_blob = toYamlBlob(score_uns) - def score_uns_file = tempFile("score_uns.yaml") - score_uns_file.write(score_uns_yaml_blob) - - def new_state = [ - output_method_configs: method_configs_file, - output_metric_configs: metric_configs_file, - output_task_info: task_info_file, - output_scores: score_uns_file, - _meta: states[0]._meta - ] - - ["output", new_state] - } - - // merge all of the output data - | mix(dataset_meta_ch) - | joinStates{ ids, states -> - def mergedStates = states.inject([:]) { acc, m -> acc + m } - [ids[0], mergedStates] - } - - emit: - output_ch -} \ No newline at end of file diff --git a/src/tasks/label_projection/workflows/run_benchmark/run_test.sh b/src/tasks/label_projection/workflows/run_benchmark/run_test.sh deleted file mode 100755 index e9c712af48..0000000000 --- a/src/tasks/label_projection/workflows/run_benchmark/run_test.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash - -# get the root of the directory -REPO_ROOT=$(git rev-parse --show-toplevel) - -# ensure that the command below is run from the root of the repository -cd "$REPO_ROOT" - -set -e - -# export TOWER_WORKSPACE_ID=53907369739130 - -DATASETS_DIR="resources_test/label_projection" -OUTPUT_DIR="output/temp" - -if [ ! -d "$OUTPUT_DIR" ]; then - mkdir -p "$OUTPUT_DIR" -fi - -export NXF_VER=22.04.5 -nextflow run . 
\ - -main-script target/nextflow/label_projection/workflows/run_benchmark/main.nf \ - -profile docker \ - -resume \ - -entry auto \ - -c src/wf_utils/labels_ci.config \ - --input_states "$DATASETS_DIR/**/state.yaml" \ - --rename_keys 'input_train:output_train,input_test:output_test,input_solution:output_solution' \ - --settings '{"output_scores": "scores.yaml", "output_dataset_info": "dataset_info.yaml", "output_method_configs": "method_configs.yaml", "output_metric_configs": "metric_configs.yaml", "output_task_info": "task_info.yaml"}' \ - --publish_dir "$OUTPUT_DIR" \ - --output_state "state.yaml" diff --git a/src/tasks/match_modalities/README.md b/src/tasks/match_modalities/README.md index 777f367507..c09ef0f8a8 100644 --- a/src/tasks/match_modalities/README.md +++ b/src/tasks/match_modalities/README.md @@ -1,499 +1,3 @@ # Match Modalities - -Match cells across datasets of the same set of samples on different -technologies / modalities. - -Path: -[`src/tasks/match_modalities`](https://github.com/openproblems-bio/openproblems/tree/main/src/tasks/match_modalities) - -## Motivation - -Cellular function is regulated by the complex interplay of different -types of biological molecules (DNA, RNA, proteins, etc.), which -determine the state of a cell. Several recently described technologies -allow for simultaneous measurement of different aspects of cellular -state. For example, sci-CAR \[@cao2018joint\] jointly profiles RNA -expression and chromatin accessibility on the same cell and CITE-seq -\[@stoeckius2017simultaneous\] measures surface protein abundance and -RNA expression from each cell. These technologies enable us to better -understand cellular function, however datasets are still rare and there -are tradeoffs that these measurements make for to profile multiple -modalities. - -Joint methods can be more expensive or lower throughput or more noisy -than measuring a single modality at a time. Therefore it is useful to -develop methods that are capable of integrating measurements of the same -biological system but obtained using different technologies on different -cells. - -## Description - -In this task, the goal is to learn a latent space where cells profiled -by different technologies in different modalities are matched if they -have the same state. We use jointly profiled data as ground truth so -that we can evaluate when the observations from the same cell acquired -using different modalities are similar. A perfect result has each of the -paired observations sharing the same coordinates in the latent space. A -method that can achieve this would be able to match datasets across -modalities to enable multimodal cellular analysis from separately -measured profiles. 
- -## Authors & contributors - -| name | roles | -|:------------------|:-------------------| -| Scott Gigante | author, maintainer | -| Alex Tong | author | -| Robrecht Cannoodt | author | -| Kai Waldrant | contributor | - -## API - -``` mermaid -flowchart LR - file_common_dataset_mod1("Common dataset mod1") - comp_process_dataset[/"Data processor"/] - file_dataset_mod1("Modality 1") - file_dataset_mod2("Modality 2") - file_solution_mod1("Solution mod1") - file_solution_mod2("Solution mod1") - comp_control_method[/"Control method"/] - comp_method[/"Method"/] - comp_metric[/"Metric"/] - file_integrated_mod1("Integrated mod1") - file_integrated_mod2("Integrated mod2") - file_score("Score") - file_common_dataset_mod2("Common dataset mod2") - file_common_dataset_mod1---comp_process_dataset - comp_process_dataset-->file_dataset_mod1 - comp_process_dataset-->file_dataset_mod2 - comp_process_dataset-->file_solution_mod1 - comp_process_dataset-->file_solution_mod2 - file_dataset_mod1---comp_control_method - file_dataset_mod1---comp_method - file_dataset_mod2---comp_control_method - file_dataset_mod2---comp_method - file_solution_mod1---comp_control_method - file_solution_mod1---comp_metric - file_solution_mod2---comp_control_method - file_solution_mod2---comp_metric - comp_control_method-->file_integrated_mod1 - comp_control_method-->file_integrated_mod2 - comp_method-->file_integrated_mod1 - comp_method-->file_integrated_mod2 - comp_metric-->file_score - file_integrated_mod1---comp_metric - file_integrated_mod2---comp_metric - file_common_dataset_mod2---comp_process_dataset -``` - -## File format: Common dataset mod1 - -The first modality (RNA) of a dataset processed by the common multimodal -dataset processing pipeline. - -Example file: -`resources_test/common/scicar_cell_lines/dataset_mod1.h5ad` - -Description: - -This dataset contains both raw counts and normalized data matrices, as -well as a PCA embedding, HVG selection and a kNN graph. - -Format: - -
- - AnnData object - obsm: 'X_svd' - layers: 'counts', 'normalized' - uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'normalization_id' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:-----------------------------|:----------|:-------------------------------------------------------------------------------| -| `obsm["X_svd"]` | `double` | The resulting SVD PCA embedding. | -| `layers["counts"]` | `integer` | Raw counts. | -| `layers["normalized"]` | `double` | Normalized counts. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["dataset_name"]` | `string` | Nicely formatted name. | -| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | -| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | -| `uns["dataset_summary"]` | `string` | Short description of the dataset. | -| `uns["dataset_description"]` | `string` | Long description of the dataset. | -| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | -| `uns["normalization_id"]` | `string` | Which normalization was used. | - -
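As an illustration of this format (not part of the original pipeline), a minimal Python sketch that loads the example file listed above and checks that the slots marked as required are present; the path is the example file from this section, and `anndata` is the only assumed dependency.

``` python
import anndata as ad

# Load the first modality of a common dataset (example file from this section).
adata = ad.read_h5ad("resources_test/common/scicar_cell_lines/dataset_mod1.h5ad")

# Verify the slots that this format marks as required.
assert "counts" in adata.layers and "normalized" in adata.layers
assert "X_svd" in adata.obsm
for key in ["dataset_id", "dataset_name", "dataset_summary",
            "dataset_description", "normalization_id"]:
    assert key in adata.uns, f"missing uns['{key}']"
```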
- -## Component type: Data processor - -Path: -[`src/match_modalities`](https://github.com/openproblems-bio/openproblems/tree/main/src/match_modalities) - -A match modalities dataset processor. - -Arguments: - -
- -| Name | Type | Description | -|:-------------------------|:-------|:---------------------------------------------------------------------------------------------------------------| -| `--input_mod1` | `file` | The first modality (RNA) of a dataset processed by the common multimodal dataset processing pipeline. | -| `--input_mod2` | `file` | The second modality (ADT or ATAC) of a dataset processed by the common multimodal dataset processing pipeline. | -| `--output_mod1` | `file` | (*Output*) The first modality of a multimodal dataset. The cells of this dataset are randomly permuted. | -| `--output_mod2` | `file` | (*Output*) The second modality of a multimodal dataset. The cells of this dataset are randomly permuted. | -| `--output_solution_mod1` | `file` | (*Output*) The ground truth information for the first modality. | -| `--output_solution_mod2` | `file` | (*Output*) The ground truth information for the second modality. | - -
- -## File format: Modality 1 - -The first modality of a multimodal dataset. The cells of this dataset -are randomly permuted. - -Example file: -`resources_test/match_modalities/scicar_cell_lines/dataset_mod1.h5ad` - -Format: - -
- - AnnData object - obsm: 'X_svd' - layers: 'counts', 'normalized' - uns: 'dataset_id', 'normalization_id' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:--------------------------|:----------|:-------------------------------------| -| `obsm["X_svd"]` | `double` | The resulting SVD PCA embedding. | -| `layers["counts"]` | `integer` | Raw counts. | -| `layers["normalized"]` | `double` | Normalized counts. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["normalization_id"]` | `string` | Which normalization was used. | - -
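For illustration only, a short sketch of how a method component might read this file. It assumes the example path above and works from the normalized layer and the precomputed SVD embedding rather than `.X`, which is not listed in this format.

``` python
import anndata as ad

adata_mod1 = ad.read_h5ad("resources_test/match_modalities/scicar_cell_lines/dataset_mod1.h5ad")

# Methods typically work from the normalized layer or the precomputed
# SVD embedding; raw counts are also available in layers["counts"].
normalized = adata_mod1.layers["normalized"]
embedding = adata_mod1.obsm["X_svd"]
print(embedding.shape, adata_mod1.uns["dataset_id"], adata_mod1.uns["normalization_id"])
```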
- -## File format: Modality 2 - -The second modality of a multimodal dataset. The cells of this dataset -are randomly permuted. - -Example file: -`resources_test/match_modalities/scicar_cell_lines/dataset_mod2.h5ad` - -Format: - -
- - AnnData object - obsm: 'X_svd' - layers: 'counts', 'normalized' - uns: 'dataset_id', 'normalization_id' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:--------------------------|:----------|:-------------------------------------| -| `obsm["X_svd"]` | `double` | The resulting SVD PCA embedding. | -| `layers["counts"]` | `integer` | Raw counts. | -| `layers["normalized"]` | `double` | Normalized counts. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["normalization_id"]` | `string` | Which normalization was used. | - -
- -## File format: Solution mod1 - -The ground truth information for the first modality - -Example file: -`resources_test/match_modalities/scicar_cell_lines/solution_mod1.h5ad` - -Format: - -
- - AnnData object - obs: 'permutation_indices' - obsm: 'X_svd' - layers: 'counts', 'normalized' - uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'normalization_id' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:-----------------------------|:----------|:-------------------------------------------------------------------------------| -| `obs["permutation_indices"]` | `integer` | Indices with which to revert the permutation of the cells. | -| `obsm["X_svd"]` | `double` | The resulting SVD PCA embedding. | -| `layers["counts"]` | `integer` | Raw counts. | -| `layers["normalized"]` | `double` | Normalized counts. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["dataset_name"]` | `string` | Nicely formatted name. | -| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | -| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | -| `uns["dataset_summary"]` | `string` | Short description of the dataset. | -| `uns["dataset_description"]` | `string` | Long description of the dataset. | -| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | -| `uns["normalization_id"]` | `string` | Which normalization was used. | - -
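A hypothetical sketch of how the permutation indices could be used, under the assumption that they are positional indices such that subsetting the AnnData with them restores the original (paired) cell ordering:

``` python
import anndata as ad

solution_mod1 = ad.read_h5ad("resources_test/match_modalities/scicar_cell_lines/solution_mod1.h5ad")

# Assumption: obs["permutation_indices"] holds positional indices that undo
# the shuffling applied by the dataset processor.
indices = solution_mod1.obs["permutation_indices"].to_numpy()
unpermuted = solution_mod1[indices].copy()
```

After applying the same operation to both solution files, row *i* of modality 1 and row *i* of modality 2 would be expected to refer to the same cell, which is what the metrics rely on.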
- -## File format: Solution mod2 - -The ground truth information for the second modality - -Example file: -`resources_test/match_modalities/scicar_cell_lines/solution_mod2.h5ad` - -Format: - -
- - AnnData object - obs: 'permutation_indices' - obsm: 'X_svd' - layers: 'counts', 'normalized' - uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'normalization_id' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:-----------------------------|:----------|:-------------------------------------------------------------------------------| -| `obs["permutation_indices"]` | `integer` | Indices with which to revert the permutation of the cells. | -| `obsm["X_svd"]` | `double` | The resulting SVD PCA embedding. | -| `layers["counts"]` | `integer` | Raw counts. | -| `layers["normalized"]` | `double` | Normalized counts. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["dataset_name"]` | `string` | Nicely formatted name. | -| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | -| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | -| `uns["dataset_summary"]` | `string` | Short description of the dataset. | -| `uns["dataset_description"]` | `string` | Long description of the dataset. | -| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | -| `uns["normalization_id"]` | `string` | Which normalization was used. | - -
- -## Component type: Control method - -Path: -[`src/match_modalities/control_methods`](https://github.com/openproblems-bio/openproblems/tree/main/src/match_modalities/control_methods) - -A multimodal data integration control method. - -Arguments: - -
- -| Name | Type | Description | -|:------------------------|:-------|:----------------------------------------------------------------------------------------------| -| `--input_mod1` | `file` | The first modality of a multimodal dataset. The cells of this dataset are randomly permuted. | -| `--input_mod2` | `file` | The second modality of a multimodal dataset. The cells of this dataset are randomly permuted. | -| `--input_solution_mod1` | `file` | The ground truth information for the first modality. | -| `--input_solution_mod2` | `file` | The ground truth information for the second modality. | -| `--output_mod1` | `file` | (*Output*) The integrated embedding for the first modality. | -| `--output_mod2` | `file` | (*Output*) The integrated embedding for the second modality. | - -
- -## Component type: Method - -Path: -[`src/match_modalities/methods`](https://github.com/openproblems-bio/openproblems/tree/main/src/match_modalities/methods) - -A multimodal data integration method. - -Arguments: - -
- -| Name | Type | Description | -|:----------------|:-------|:----------------------------------------------------------------------------------------------| -| `--input_mod1` | `file` | The first modality of a multimodal dataset. The cells of this dataset are randomly permuted. | -| `--input_mod2` | `file` | The second modality of a multimodal dataset. The cells of this dataset are randomly permuted. | -| `--output_mod1` | `file` | (*Output*) The integrated embedding for the first modality. | -| `--output_mod2` | `file` | (*Output*) The integrated embedding for the second modality. | - -
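To make this interface concrete, here is a minimal, hypothetical method script written in the same style as the other `script.py` components in this repository. The component name `my_method` and the output paths are placeholders, and the "integration" simply reuses each modality's SVD embedding, so this is a skeleton rather than a real method.

``` python
import anndata as ad

## VIASH START
par = {
    "input_mod1": "resources_test/match_modalities/scicar_cell_lines/dataset_mod1.h5ad",
    "input_mod2": "resources_test/match_modalities/scicar_cell_lines/dataset_mod2.h5ad",
    "output_mod1": "integrated_mod1.h5ad",
    "output_mod2": "integrated_mod2.h5ad",
}
meta = {"functionality_name": "my_method"}
## VIASH END

print("Load input data", flush=True)
adata_mod1 = ad.read_h5ad(par["input_mod1"])
adata_mod2 = ad.read_h5ad(par["input_mod2"])

print("Compute integrated embeddings", flush=True)
# Placeholder: reuse each modality's own SVD embedding unchanged.
adata_mod1.obsm["integrated"] = adata_mod1.obsm["X_svd"]
adata_mod2.obsm["integrated"] = adata_mod2.obsm["X_svd"]

print("Write output to file", flush=True)
for adata, out_key in [(adata_mod1, "output_mod1"), (adata_mod2, "output_mod2")]:
    adata.uns["method_id"] = meta["functionality_name"]
    adata.write_h5ad(par[out_key], compression="gzip")
```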
- -## Component type: Metric - -Path: -[`src/match_modalities/metrics`](https://github.com/openproblems-bio/openproblems/tree/main/src/match_modalities/metrics) - -A multimodal data integration metric. - -Arguments: - -
- -| Name | Type | Description | -|:--------------------------|:-------|:------------------------------------------------------| -| `--input_integrated_mod1` | `file` | The integrated embedding for the first modality. | -| `--input_integrated_mod2` | `file` | The integrated embedding for the second modality. | -| `--input_solution_mod1` | `file` | The ground truth information for the first modality. | -| `--input_solution_mod2` | `file` | The ground truth information for the second modality. | -| `--output` | `file` | (*Output*) Metric score file. | - -
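Similarly, a hypothetical skeleton of a metric component. The metric value itself is a placeholder (a real metric would compare the integrated embeddings against the ground-truth pairing in the solution files), and the output follows the Score file format described below.

``` python
import anndata as ad

## VIASH START
par = {
    "input_integrated_mod1": "resources_test/match_modalities/scicar_cell_lines/integrated_mod1.h5ad",
    "input_integrated_mod2": "resources_test/match_modalities/scicar_cell_lines/integrated_mod2.h5ad",
    "input_solution_mod1": "resources_test/match_modalities/scicar_cell_lines/solution_mod1.h5ad",
    "input_solution_mod2": "resources_test/match_modalities/scicar_cell_lines/solution_mod2.h5ad",
    "output": "score.h5ad",
}
meta = {"functionality_name": "my_metric"}
## VIASH END

print("Load data", flush=True)
mod1 = ad.read_h5ad(par["input_integrated_mod1"])

print("Compute metric", flush=True)
# Placeholder value; a real metric would score how well the two integrated
# embeddings recover the known cell pairing.
score = 0.0

print("Store metric value", flush=True)
output = ad.AnnData(
    uns={
        "dataset_id": mod1.uns["dataset_id"],
        "normalization_id": mod1.uns["normalization_id"],
        "method_id": mod1.uns["method_id"],
        "metric_ids": [meta["functionality_name"]],
        "metric_values": [score],
    }
)
output.write_h5ad(par["output"], compression="gzip")
```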
- -## File format: Integrated mod1 - -The integrated embedding for the first modality - -Example file: -`resources_test/match_modalities/scicar_cell_lines/integrated_mod1.h5ad` - -Format: - -
- - AnnData object - obsm: 'integrated' - uns: 'dataset_id', 'normalization_id', 'method_id' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:--------------------------|:---------|:-------------------------------------| -| `obsm["integrated"]` | `double` | An integrated embedding. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["normalization_id"]` | `string` | Which normalization was used. | -| `uns["method_id"]` | `string` | Which method was used. | - -
- -## File format: Integrated mod2 - -The integrated embedding for the second modality - -Example file: -`resources_test/match_modalities/scicar_cell_lines/integrated_mod2.h5ad` - -Format: - -
- - AnnData object - obsm: 'integrated' - uns: 'dataset_id', 'normalization_id', 'method_id' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:--------------------------|:---------|:-------------------------------------| -| `obsm["integrated"]` | `double` | An integrated embedding. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["normalization_id"]` | `string` | Which normalization was used. | -| `uns["method_id"]` | `string` | Which method was used. | - -
- -## File format: Score - -Metric score file - -Example file: -`resources_test/match_modalities/scicar_cell_lines/score.h5ad` - -Format: - -
- - AnnData object - uns: 'dataset_id', 'normalization_id', 'method_id', 'metric_ids', 'metric_values' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:--------------------------|:---------|:---------------------------------------------------------------------------------------------| -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["normalization_id"]` | `string` | Which normalization was used. | -| `uns["method_id"]` | `string` | A unique identifier for the method. | -| `uns["metric_ids"]` | `string` | One or more unique metric identifiers. | -| `uns["metric_values"]` | `double` | The metric values obtained for the given prediction. Must be of same length as ‘metric_ids’. | - -
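For illustration, reading such a score file back is straightforward; this sketch assumes `metric_ids` and `metric_values` are stored as equal-length lists, as required above.

``` python
import anndata as ad

score = ad.read_h5ad("resources_test/match_modalities/scicar_cell_lines/score.h5ad")

# metric_ids and metric_values are parallel lists of the same length.
for metric_id, value in zip(score.uns["metric_ids"], score.uns["metric_values"]):
    print(f"{score.uns['method_id']} / {metric_id}: {value}")
```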
- -## File format: Common dataset mod2 - -The second modality (ADT or ATAC) of a dataset processed by the common -multimodal dataset processing pipeline. - -Example file: -`resources_test/common/scicar_cell_lines/dataset_mod2.h5ad` - -Description: - -This dataset contains both raw counts and normalized data matrices, as -well as a PCA embedding, HVG selection and a kNN graph. - -Format: - -
- - AnnData object - obsm: 'X_svd' - layers: 'counts', 'normalized' - uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'normalization_id' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:-----------------------------|:----------|:-------------------------------------------------------------------------------| -| `obsm["X_svd"]` | `double` | The resulting SVD PCA embedding. | -| `layers["counts"]` | `integer` | Raw counts. | -| `layers["normalized"]` | `double` | Normalized counts. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["dataset_name"]` | `string` | Nicely formatted name. | -| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | -| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | -| `uns["dataset_summary"]` | `string` | Short description of the dataset. | -| `uns["dataset_description"]` | `string` | Long description of the dataset. | -| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | -| `uns["normalization_id"]` | `string` | Which normalization was used. | - -
- +# This task has been moved to [https://github.com/openproblems-bio/task_label_projection](https://github.com/openproblems-bio/task_label_projection)! diff --git a/src/tasks/match_modalities/api/comp_control_method.yaml b/src/tasks/match_modalities/api/comp_control_method.yaml deleted file mode 100644 index 446ee8a41a..0000000000 --- a/src/tasks/match_modalities/api/comp_control_method.yaml +++ /dev/null @@ -1,47 +0,0 @@ -functionality: - namespace: "match_modalities/control_methods" - info: - type: control_method - type_info: - label: Control method - summary: A multimodal data integration control method. - description: | - This folder contains control components for the task. - These components have the same interface as the regular methods - but also receive the solution object as input. It serves as a - starting point to test the relative accuracy of new methods in - the task, and also as a quality control for the metrics defined - in the task. - arguments: - - name: "--input_mod1" - __merge__: file_dataset_mod1.yaml - direction: input - required: true - - name: "--input_mod2" - __merge__: file_dataset_mod2.yaml - direction: input - required: true - - name: "--input_solution_mod1" - __merge__: file_solution_mod1.yaml - direction: input - required: true - - name: "--input_solution_mod2" - __merge__: file_solution_mod2.yaml - direction: input - required: true - - name: "--output_mod1" - __merge__: file_integrated_mod1.yaml - direction: output - required: true - - name: "--output_mod2" - __merge__: file_integrated_mod2.yaml - direction: output - required: true - test_resources: - - path: /resources_test/match_modalities/scicar_cell_lines - dest: resources_test/match_modalities/scicar_cell_lines - - type: python_script - path: /src/common/comp_tests/check_method_config.py - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py - - path: /src/common/library.bib \ No newline at end of file diff --git a/src/tasks/match_modalities/api/comp_method.yaml b/src/tasks/match_modalities/api/comp_method.yaml deleted file mode 100644 index 37a5e90b0e..0000000000 --- a/src/tasks/match_modalities/api/comp_method.yaml +++ /dev/null @@ -1,34 +0,0 @@ -functionality: - namespace: "match_modalities/methods" - info: - type: method - type_info: - label: Method - summary: A multimodal data integration method. - description: | - A multimodal method to integrate data. - arguments: - - name: "--input_mod1" - __merge__: file_dataset_mod1.yaml - direction: input - required: true - - name: "--input_mod2" - __merge__: file_dataset_mod2.yaml - direction: input - required: true - - name: "--output_mod1" - __merge__: file_integrated_mod1.yaml - direction: output - required: true - - name: "--output_mod2" - __merge__: file_integrated_mod2.yaml - direction: output - required: true - test_resources: - - path: /resources_test/match_modalities/scicar_cell_lines - dest: resources_test/match_modalities/scicar_cell_lines - - type: python_script - path: /src/common/comp_tests/check_method_config.py - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py - - path: /src/common/library.bib diff --git a/src/tasks/match_modalities/api/comp_metric.yaml b/src/tasks/match_modalities/api/comp_metric.yaml deleted file mode 100644 index 220598bbbf..0000000000 --- a/src/tasks/match_modalities/api/comp_metric.yaml +++ /dev/null @@ -1,39 +0,0 @@ -functionality: - namespace: "match_modalities/metrics" - info: - type: metric - type_info: - label: Metric - summary: A multimodal data integration metric. 
- description: | - A metric for evaluating integrated data. - arguments: - - name: "--input_integrated_mod1" - __merge__: file_integrated_mod1.yaml - direction: input - required: true - - name: "--input_integrated_mod2" - __merge__: file_integrated_mod2.yaml - direction: input - required: true - - name: "--input_solution_mod1" - __merge__: file_solution_mod1.yaml - direction: input - required: true - - name: "--input_solution_mod2" - __merge__: file_solution_mod2.yaml - direction: input - required: true - - name: "--output" - __merge__: file_score.yaml - required: true - direction: output - test_resources: - - path: /resources_test/match_modalities/scicar_cell_lines - dest: resources_test/match_modalities/scicar_cell_lines - - type: python_script - path: /src/common/comp_tests/check_metric_config.py - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py - - path: /src/common/library.bib - diff --git a/src/tasks/match_modalities/api/comp_process_dataset.yaml b/src/tasks/match_modalities/api/comp_process_dataset.yaml deleted file mode 100644 index a48a0957b1..0000000000 --- a/src/tasks/match_modalities/api/comp_process_dataset.yaml +++ /dev/null @@ -1,40 +0,0 @@ -functionality: - namespace: "match_modalities" - info: - type: process_dataset - type_info: - label: Data processor - summary: A match modalities dataset processor. - description: | - A component for processing a Common Dataset into a task-specific dataset. - arguments: - - name: "--input_mod1" - __merge__: file_common_dataset_mod1.yaml - direction: input - required: true - - name: "--input_mod2" - __merge__: file_common_dataset_mod2.yaml - direction: input - required: true - - name: "--output_mod1" - __merge__: file_dataset_mod1.yaml - direction: output - required: true - - name: "--output_mod2" - __merge__: file_dataset_mod2.yaml - direction: output - required: true - - name: "--output_solution_mod1" - __merge__: file_solution_mod1.yaml - direction: output - required: true - - name: "--output_solution_mod2" - __merge__: file_solution_mod2.yaml - direction: output - required: true - test_resources: - - path: /resources_test/common/scicar_cell_lines - dest: resources_test/common/scicar_cell_lines - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py - diff --git a/src/tasks/match_modalities/api/file_common_dataset_mod1.yaml b/src/tasks/match_modalities/api/file_common_dataset_mod1.yaml deleted file mode 100644 index cfb98e04ea..0000000000 --- a/src/tasks/match_modalities/api/file_common_dataset_mod1.yaml +++ /dev/null @@ -1,56 +0,0 @@ -type: file -example: "resources_test/common/scicar_cell_lines/dataset_mod1.h5ad" -info: - label: "Common dataset mod1" - summary: The first modality (RNA) of a dataset processed by the common multimodal dataset processing pipeline. - description: | - This dataset contains both raw counts and normalized data matrices, - as well as a PCA embedding, HVG selection and a kNN graph. - slots: - layers: - - type: integer - name: counts - description: Raw counts - required: true - - type: double - name: normalized - description: Normalized counts - required: true - obsm: - - type: double - name: X_svd - description: The resulting SVD PCA embedding. - required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - name: dataset_name - type: string - description: Nicely formatted name. - required: true - - type: string - name: dataset_url - description: Link to the original source of the dataset. 
- required: false - - name: dataset_reference - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: dataset_summary - type: string - description: Short description of the dataset. - required: true - - name: dataset_description - type: string - description: Long description of the dataset. - required: true - - name: dataset_organism - type: string - description: The organism of the sample in the dataset. - required: false - - type: string - name: normalization_id - description: "Which normalization was used" - required: true diff --git a/src/tasks/match_modalities/api/file_common_dataset_mod2.yaml b/src/tasks/match_modalities/api/file_common_dataset_mod2.yaml deleted file mode 100644 index c42fbf525c..0000000000 --- a/src/tasks/match_modalities/api/file_common_dataset_mod2.yaml +++ /dev/null @@ -1,56 +0,0 @@ -type: file -example: "resources_test/common/scicar_cell_lines/dataset_mod2.h5ad" -info: - label: "Common dataset mod2" - summary: The second modality (ADT or ATAC) of a dataset processed by the common multimodal dataset processing pipeline. - description: | - This dataset contains both raw counts and normalized data matrices, - as well as a PCA embedding, HVG selection and a kNN graph. - slots: - layers: - - type: integer - name: counts - description: Raw counts - required: true - - type: double - name: normalized - description: Normalized counts - required: true - obsm: - - type: double - name: X_svd - description: The resulting SVD PCA embedding. - required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - name: dataset_name - type: string - description: Nicely formatted name. - required: true - - type: string - name: dataset_url - description: Link to the original source of the dataset. - required: false - - name: dataset_reference - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: dataset_summary - type: string - description: Short description of the dataset. - required: true - - name: dataset_description - type: string - description: Long description of the dataset. - required: true - - name: dataset_organism - type: string - description: The organism of the sample in the dataset. - required: false - - type: string - name: normalization_id - description: "Which normalization was used" - required: true diff --git a/src/tasks/match_modalities/api/file_dataset_mod1.yaml b/src/tasks/match_modalities/api/file_dataset_mod1.yaml deleted file mode 100644 index aece4dc975..0000000000 --- a/src/tasks/match_modalities/api/file_dataset_mod1.yaml +++ /dev/null @@ -1,29 +0,0 @@ -type: file -example: "resources_test/match_modalities/scicar_cell_lines/dataset_mod1.h5ad" -info: - label: "Modality 1" - summary: "The first modality of a multimodal dataset. The cells of this dataset are randomly permuted." - slots: - layers: - - type: integer - name: counts - description: Raw counts - required: true - - type: double - name: normalized - description: Normalized counts - required: true - obsm: - - type: double - name: X_svd - description: The resulting SVD PCA embedding. 
- required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - type: string - name: normalization_id - description: "Which normalization was used" - required: true diff --git a/src/tasks/match_modalities/api/file_dataset_mod2.yaml b/src/tasks/match_modalities/api/file_dataset_mod2.yaml deleted file mode 100644 index 9c140e3de8..0000000000 --- a/src/tasks/match_modalities/api/file_dataset_mod2.yaml +++ /dev/null @@ -1,29 +0,0 @@ -type: file -example: "resources_test/match_modalities/scicar_cell_lines/dataset_mod2.h5ad" -info: - label: "Modality 2" - summary: "The second modality of a multimodal dataset. The cells of this dataset are randomly permuted." - slots: - layers: - - type: integer - name: counts - description: Raw counts - required: true - - type: double - name: normalized - description: Normalized counts - required: true - obsm: - - type: double - name: X_svd - description: The resulting SVD PCA embedding. - required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - type: string - name: normalization_id - description: "Which normalization was used" - required: true diff --git a/src/tasks/match_modalities/api/file_integrated_mod1.yaml b/src/tasks/match_modalities/api/file_integrated_mod1.yaml deleted file mode 100644 index 72f363de1f..0000000000 --- a/src/tasks/match_modalities/api/file_integrated_mod1.yaml +++ /dev/null @@ -1,24 +0,0 @@ -type: file -example: "resources_test/match_modalities/scicar_cell_lines/integrated_mod1.h5ad" -info: - label: "Integrated mod1" - summary: "The integrated embedding for the first modality" - slots: - obsm: - - type: double - name: integrated - description: An integrated embedding. - required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - type: string - name: normalization_id - description: "Which normalization was used" - required: true - - type: string - name: method_id - description: "Which method was used" - required: true diff --git a/src/tasks/match_modalities/api/file_integrated_mod2.yaml b/src/tasks/match_modalities/api/file_integrated_mod2.yaml deleted file mode 100644 index 644bf052d4..0000000000 --- a/src/tasks/match_modalities/api/file_integrated_mod2.yaml +++ /dev/null @@ -1,24 +0,0 @@ -type: file -example: "resources_test/match_modalities/scicar_cell_lines/integrated_mod2.h5ad" -info: - label: "Integrated mod2" - summary: "The integrated embedding for the second modality" - slots: - obsm: - - type: double - name: integrated - description: An integrated embedding. 
- required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - type: string - name: normalization_id - description: "Which normalization was used" - required: true - - type: string - name: method_id - description: "Which method was used" - required: true diff --git a/src/tasks/match_modalities/api/file_score.yaml b/src/tasks/match_modalities/api/file_score.yaml deleted file mode 100644 index 7d66bde3c3..0000000000 --- a/src/tasks/match_modalities/api/file_score.yaml +++ /dev/null @@ -1,29 +0,0 @@ -type: file -example: "resources_test/match_modalities/scicar_cell_lines/score.h5ad" -info: - label: "Score" - summary: "Metric score file" - slots: - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - type: string - name: normalization_id - description: "Which normalization was used" - required: true - - type: string - name: method_id - description: "A unique identifier for the method" - required: true - - type: string - name: metric_ids - description: "One or more unique metric identifiers" - multiple: true - required: true - - type: double - name: metric_values - description: "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'." - multiple: true - required: true diff --git a/src/tasks/match_modalities/api/file_solution_mod1.yaml b/src/tasks/match_modalities/api/file_solution_mod1.yaml deleted file mode 100644 index 490e005e0a..0000000000 --- a/src/tasks/match_modalities/api/file_solution_mod1.yaml +++ /dev/null @@ -1,58 +0,0 @@ -type: file -example: "resources_test/match_modalities/scicar_cell_lines/solution_mod1.h5ad" -info: - label: "Solution mod1" - summary: "The ground truth information for the first modality" - slots: - layers: - - type: integer - name: counts - description: Raw counts - required: true - - type: double - name: normalized - description: Normalized counts - required: true - obs: - - type: integer - name: permutation_indices - description: "Indices with which to revert the permutation of the cells" - required: true - obsm: - - type: double - name: X_svd - description: The resulting SVD PCA embedding. - required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - name: dataset_name - type: string - description: Nicely formatted name. - required: true - - type: string - name: dataset_url - description: Link to the original source of the dataset. - required: false - - name: dataset_reference - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: dataset_summary - type: string - description: Short description of the dataset. - required: true - - name: dataset_description - type: string - description: Long description of the dataset. - required: true - - name: dataset_organism - type: string - description: The organism of the sample in the dataset. 
- required: false - type: string - name: normalization_id - description: "Which normalization was used" - required: true diff --git a/src/tasks/match_modalities/api/file_solution_mod2.yaml b/src/tasks/match_modalities/api/file_solution_mod2.yaml deleted file mode 100644 index 7cb21fef8e..0000000000 --- a/src/tasks/match_modalities/api/file_solution_mod2.yaml +++ /dev/null @@ -1,58 +0,0 @@ -type: file -example: "resources_test/match_modalities/scicar_cell_lines/solution_mod2.h5ad" -info: - label: "Solution mod2" - summary: "The ground truth information for the second modality" - slots: - layers: - - type: integer - name: counts - description: Raw counts - required: true - - type: double - name: normalized - description: Normalized counts - required: true - obs: - - type: integer - name: permutation_indices - description: "Indices with which to revert the permutation of the cells" - required: true - obsm: - - type: double - name: X_svd - description: The resulting SVD PCA embedding. - required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - name: dataset_name - type: string - description: Nicely formatted name. - required: true - - type: string - name: dataset_url - description: Link to the original source of the dataset. - required: false - - name: dataset_reference - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: dataset_summary - type: string - description: Short description of the dataset. - required: true - - name: dataset_description - type: string - description: Long description of the dataset. - required: true - - name: dataset_organism - type: string - description: The organism of the sample in the dataset. - required: false - - type: string - name: normalization_id - description: "Which normalization was used" - required: true diff --git a/src/tasks/match_modalities/api/task_info.yaml b/src/tasks/match_modalities/api/task_info.yaml deleted file mode 100644 index bc5550df16..0000000000 --- a/src/tasks/match_modalities/api/task_info.yaml +++ /dev/null @@ -1,47 +0,0 @@ -name: match_modalities -label: Match Modalities -summary: | - Match cells across datasets of the same set of samples on different technologies / modalities. -image: "thumbnail.svg" -motivation: | - Cellular function is regulated by the complex interplay of different types of biological - molecules (DNA, RNA, proteins, etc.), which determine the state of a cell. Several - recently described technologies allow for simultaneous measurement of different aspects - of cellular state. For example, sci-CAR [@cao2018joint] - jointly profiles RNA expression and chromatin accessibility on the same cell and - CITE-seq [@stoeckius2017simultaneous] measures - surface protein abundance and RNA expression from each cell. These technologies enable - us to better understand cellular function; however, such datasets are still rare, and there are - trade-offs that these measurements make in order to profile multiple modalities. - - Joint methods can be more expensive, lower throughput, or noisier than measuring a - single modality at a time. Therefore, it is useful to develop methods that are capable - of integrating measurements of the same biological system but obtained using different - technologies on different cells.
-description: | - In this task, the goal is to learn a latent space where cells profiled by different - technologies in different modalities are matched if they have the same state. We use - jointly profiled data as ground truth so that we can evaluate when the observations - from the same cell acquired using different modalities are similar. A perfect result - has each of the paired observations sharing the same coordinates in the latent space. - A method that can achieve this would be able to match datasets across modalities to - enable multimodal cellular analysis from separately measured profiles. -authors: - - name: "Scott Gigante" - roles: [ author, maintainer ] - info: - github: scottgigante - orcid: "0000-0002-4544-2764" - - name: Alex Tong - roles: [ author ] - info: - github: atong01 - - name: Robrecht Cannoodt - roles: [ author ] - info: - github: rcannood - orcid: "0000-0003-3641-729X" - - name: Kai Waldrant - roles: [ contributor ] - info: - github: KaiWaldrant \ No newline at end of file diff --git a/src/tasks/match_modalities/api/thumbnail.svg b/src/tasks/match_modalities/api/thumbnail.svg deleted file mode 100644 index 07e326bc4a..0000000000 --- a/src/tasks/match_modalities/api/thumbnail.svg +++ /dev/null @@ -1 +0,0 @@ -RNAATACdim-2dim-1dim-2dim-1 \ No newline at end of file diff --git a/src/tasks/match_modalities/control_methods/random_features/config.vsh.yaml b/src/tasks/match_modalities/control_methods/random_features/config.vsh.yaml deleted file mode 100644 index 8c021c3bdf..0000000000 --- a/src/tasks/match_modalities/control_methods/random_features/config.vsh.yaml +++ /dev/null @@ -1,25 +0,0 @@ -__merge__: ../../api/comp_control_method.yaml -functionality: - name: "random_features" - info: - label: Random Features - summary: "Randomly permutated features" - description: | - "Randomly permuted twice, once for use as the output for each modality, producing random features with no correlation between modalities." 
- preferred_normalization: log_cp10k - v1: - path: openproblems/tasks/matching_modalities/methods/baseline.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - packages: - - numpy - - type: nextflow - directives: - label: [midtime, lowmem, lowcpu] \ No newline at end of file diff --git a/src/tasks/match_modalities/control_methods/random_features/script.py b/src/tasks/match_modalities/control_methods/random_features/script.py deleted file mode 100644 index d10bb72b27..0000000000 --- a/src/tasks/match_modalities/control_methods/random_features/script.py +++ /dev/null @@ -1,32 +0,0 @@ -import anndata as ad -import numpy as np - -## VIASH START - -par = { - "input_mod1": "resources_test/common/scicar_cell_lines/dataset_mod1.h5ad", - "input_mod2": "resources_test/common/scicar_cell_lines/dataset_mod2.h5ad", - "output_mod1": "output.mod1.h5ad", - "output_mod2": "output.mod2.h5ad", -} - -meta = { - "functionality_name": "random_features" -} - -## VIASH END - -print("Reading input h5ad file", flush=True) -adata_mod1 = ad.read_h5ad(par["input_mod1"]) -adata_mod2 = ad.read_h5ad(par["input_mod2"]) - -print("Generating random features", flush=True) -# todo: do we actually need to permute this once more -adata_mod1.obsm["integrated"] = adata_mod1.obsm["X_svd"][np.random.permutation(np.arange(adata_mod1.shape[0]))] -adata_mod2.obsm["integrated"] = adata_mod1.obsm["X_svd"][np.random.permutation(np.arange(adata_mod1.shape[0]))] - -print("Write output to file", flush=True) -adata_mod1.uns["method_id"] = meta["functionality_name"] -adata_mod2.uns["method_id"] = meta["functionality_name"] -adata_mod1.write_h5ad(par["output_mod1"], compression="gzip") -adata_mod2.write_h5ad(par["output_mod2"], compression="gzip") \ No newline at end of file diff --git a/src/tasks/match_modalities/control_methods/true_features/config.vsh.yaml b/src/tasks/match_modalities/control_methods/true_features/config.vsh.yaml deleted file mode 100644 index bc897dd821..0000000000 --- a/src/tasks/match_modalities/control_methods/true_features/config.vsh.yaml +++ /dev/null @@ -1,21 +0,0 @@ -__merge__: ../../api/comp_control_method.yaml -functionality: - name: "true_features" - info: - label: True Features - summary: "A 1 to 1 mapping of features between modalities" - description: | - "use the same features for both modalities" - preferred_normalization: log_cp10k - v1: - path: openproblems/tasks/matching_modalities/methods/baseline.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [midtime, lowmem, lowcpu] \ No newline at end of file diff --git a/src/tasks/match_modalities/control_methods/true_features/script.py b/src/tasks/match_modalities/control_methods/true_features/script.py deleted file mode 100644 index cf7abac8e5..0000000000 --- a/src/tasks/match_modalities/control_methods/true_features/script.py +++ /dev/null @@ -1,59 +0,0 @@ -import anndata as ad -import numpy as np - -## VIASH START -par = { - "input_mod1": "resources_test/match_modalities/scicar_cell_lines/dataset_mod1.h5ad", - "input_mod2": "resources_test/match_modalities/scicar_cell_lines/dataset_mod2.h5ad", - "input_solution_mod1": "resources_test/match_modalities/scicar_cell_lines/solution_mod1.h5ad", - "input_solution_mod2": 
"resources_test/match_modalities/scicar_cell_lines/solution_mod2.h5ad", - "output_mod1": "output.mod1.h5ad", - "output_mod2": "output.mod2.h5ad", -} -meta = { - "functionality_name": "true_features" -} -## VIASH END - -print("Reading input h5ad file", flush=True) -adata_mod1 = ad.read_h5ad(par["input_mod1"]) -adata_mod2 = ad.read_h5ad(par["input_mod2"]) - -solution_mod1 = ad.read_h5ad(par["input_solution_mod1"]) -solution_mod2 = ad.read_h5ad(par["input_solution_mod2"]) - -print("Storing true features", flush=True) -output_mod1 = ad.AnnData( - obs=adata_mod1.obs[[]], - var=adata_mod1.var[[]], - obsm={ - "integrated": adata_mod1.obsm["X_svd"] - }, - uns={ - "dataset_id": adata_mod1.uns["dataset_id"], - "normalization_id": adata_mod1.uns["normalization_id"], - "method_id": meta["functionality_name"] - } -) - -# Permutate mod1 according to mod2 -mod2_obsm = adata_mod1.obsm["X_svd"][solution_mod1.obs["permutation_indices"]] -reverse_indices_mod2 = np.argsort(solution_mod2.obs["permutation_indices"]) -mod2_obsm = mod2_obsm[reverse_indices_mod2] - -output_mod2 = ad.AnnData( - obs=adata_mod2.obs[[]], - var=adata_mod2.var[[]], - obsm={ - "integrated": mod2_obsm - }, - uns={ - "dataset_id": adata_mod2.uns["dataset_id"], - "normalization_id": adata_mod2.uns["normalization_id"], - "method_id": meta["functionality_name"] - } -) - -print("Write output to file", flush=True) -output_mod1.write_h5ad(par["output_mod1"], compression="gzip") -output_mod2.write_h5ad(par["output_mod2"], compression="gzip") diff --git a/src/tasks/match_modalities/methods/fastmnn/config.vsh.yaml b/src/tasks/match_modalities/methods/fastmnn/config.vsh.yaml deleted file mode 100644 index 4e143ec67b..0000000000 --- a/src/tasks/match_modalities/methods/fastmnn/config.vsh.yaml +++ /dev/null @@ -1,29 +0,0 @@ -__merge__: ../../api/comp_method.yaml -functionality: - name: "fastmnn" - info: - label: "fastMNN" - summary: "A simpler version of the original mnnCorrect algorithm." - description: | - FastMNN is a simplified version of the mnnCorrect algorithm. Both use Mutual Nearest Neighbors to integrate multimodal single-cell data. 
- preferred_normalization: "log_cp10k" - variants: - mnn_log_cp10k: - mnn_log_scran_pooling: - # "The normalization only changes for the first modality dataset, the second still uses log_cp10k" - preferred_normalization: "log_scran_pooling" - reference: "haghverdi2018batch" - repository_url: "https://github.com/LTLA/batchelor" - documentation_url: "https://github.com/LTLA/batchelor#readme" - resources: - - type: r_script - path: script.R -platforms: - - type: docker - image: openproblems/base_r:1.0.0 - setup: - - type: r - bioc: batchelor - - type: nextflow - directives: - label: [midtime, lowmem, lowcpu] diff --git a/src/tasks/match_modalities/methods/fastmnn/script.R b/src/tasks/match_modalities/methods/fastmnn/script.R deleted file mode 100644 index 129f134e16..0000000000 --- a/src/tasks/match_modalities/methods/fastmnn/script.R +++ /dev/null @@ -1,37 +0,0 @@ -library(anndata, warn.conflicts = FALSE) -library(Matrix, warn.conflicts = FALSE) -requireNamespace("batchelor", quietly = TRUE) - -## VIASH START -par <- list( - input_mod1 = "resources_test/common/scicar_cell_lines/dataset_mod1.h5ad", - input_mod2 = "resources_test/common/scicar_cell_lines/dataset_mod2.h5ad", - output_mod1 = "output_mod1.h5ad", - output_mod2 = "output_mod2.h5ad" -) -## VIASH END - -cat("Reading input h5ad file\n") -adata_mod1 <- read_h5ad(par$input_mod1) -adata_mod2 <- read_h5ad(par$input_mod2) - -cat("Running MNN\n") -sce_mnn <- batchelor::fastMNN( - t(adata_mod1$obsm[["X_svd"]]), - t(adata_mod2$obsm[["X_svd"]]) -) - -cat("Storing output\n") -combined_recons <- t(SummarizedExperiment::assay(sce_mnn, "reconstructed")) -mode1_recons <- combined_recons[seq_len(nrow(adata_mod1$obsm[["X_svd"]])), , drop = FALSE] -mode2_recons <- combined_recons[-seq_len(nrow(adata_mod1$obsm[["X_svd"]])), , drop = FALSE] - -adata_mod1$obsm[["integrated"]] <- as.matrix(mode1_recons) -adata_mod2$obsm[["integrated"]] <- as.matrix(mode2_recons) - -cat("Writing to file\n") -adata_mod1$uns["method_id"] <- meta$functionality_name -adata_mod2$uns["method_id"] <- meta$functionality_name - -yyy <- adata_mod1$write_h5ad(par$output_mod1, compression = "gzip") -zzz <- adata_mod2$write_h5ad(par$output_mod2, compression = "gzip") diff --git a/src/tasks/match_modalities/methods/harmonic_alignment/config.vsh.yaml b/src/tasks/match_modalities/methods/harmonic_alignment/config.vsh.yaml deleted file mode 100644 index 3146db56e0..0000000000 --- a/src/tasks/match_modalities/methods/harmonic_alignment/config.vsh.yaml +++ /dev/null @@ -1,38 +0,0 @@ -__merge__: ../../api/comp_method.yaml -functionality: - name: "harmonic_alignment" - info: - label: "Harmonic Alignment" - summary: "Harmonic Alignment" - description: | - Harmonic Alignment is a method for integrating multimodal single-cell data. It is based on the idea of aligning the eigenvectors of the Laplacian matrices of the two modalities. The alignment is achieved by solving a generalized eigenvalue problem. The method is described in the following paper: https://doi.org/10.1137/1.9781611976236.36 - preferred_normalization: "log_cp10k" - v1: - path: openproblems/tasks/matching_modalities/methods/harmonic_alignment.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - reference: "stanley2020harmonic" - documentation_url: "https://github.com/KrishnaswamyLab/harmonic-alignment#readme" - repository_url: "https://github.com/KrishnaswamyLab/harmonic-alignment" - arguments: - - name: "--n_pca_XY" - type: "integer" - default: 100 - description: "Default number of principal components on which to build graph." 
- - name: "--n_eigenvectors" - type: "integer" - default: 100 - description: "Number of eigenvectors of the normalized Laplacian on which to perform alignment." - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - github: - - KrishnaswamyLab/harmonic-alignment#subdirectory=python - - type: nextflow - directives: - label: [midtime, lowmem, lowcpu] - diff --git a/src/tasks/match_modalities/methods/harmonic_alignment/script.py b/src/tasks/match_modalities/methods/harmonic_alignment/script.py deleted file mode 100644 index abe2eece7c..0000000000 --- a/src/tasks/match_modalities/methods/harmonic_alignment/script.py +++ /dev/null @@ -1,48 +0,0 @@ -import anndata as ad -import harmonicalignment - -## VIASH START -par = { - "input_mod1" : "resources_test/common/scicar_cell_lines/dataset_mod1.h5ad", - "input_mod2" : "resources_test/common/scicar_cell_lines/dataset_mod2.h5ad", - "output_mod1" : "output.mod1.h5ad", - "output_mod2" : "output.mod2.h5ad", - "n_pca_XY" : 100, - "n_eigenvectors" : 100 -} -meta = { - "functionality_name" : "harmonic_alignment" -} -## VIASH END - - -print("Reading input h5ad file", flush=True) -adata_mod1 = ad.read_h5ad(par["input_mod1"]) -adata_mod2 = ad.read_h5ad(par["input_mod2"]) - -print("Check parameters", flush=True) -n_eigenvectors = par["n_eigenvectors"] -n_pca_XY = par["n_pca_XY"] - -if adata_mod1.layers["normalized"].shape[0] <= n_eigenvectors: - n_eigenvectors = None -if adata_mod1.layers["normalized"].shape[0] <= n_pca_XY: - n_pca_XY = None - - -print("Running Harmonic Alignment", flush=True) -ha_op = harmonicalignment.HarmonicAlignment( - n_filters=8, n_pca_XY=n_pca_XY, n_eigenvectors=n_eigenvectors -) -ha_op.align(adata_mod1.obsm["X_svd"], adata_mod2.obsm["X_svd"]) -XY_aligned = ha_op.diffusion_map(n_eigenvectors=n_eigenvectors) - -print("Storing output data structures", flush=True) - -adata_mod1.obsm["integrated"] = XY_aligned[: adata_mod1.obsm["X_svd"].shape[0]] -adata_mod2.obsm["integrated"] = XY_aligned[-adata_mod2.obsm["X_svd"].shape[0] :] - -print("Write output to file", flush=True) -adata_mod1.uns["method_id"] = meta["functionality_name"] -adata_mod2.uns["method_id"] = meta["functionality_name"] -adata_mod1.write_h5ad(par["output_mod1"], compression = "gzip") -adata_mod2.write_h5ad(par["output_mod2"], compression = "gzip") diff --git a/src/tasks/match_modalities/methods/procrustes/config.vsh.yaml b/src/tasks/match_modalities/methods/procrustes/config.vsh.yaml deleted file mode 100644 index db7b49383b..0000000000 --- a/src/tasks/match_modalities/methods/procrustes/config.vsh.yaml +++ /dev/null @@ -1,29 +0,0 @@ -__merge__: ../../api/comp_method.yaml -functionality: - name: "procrustes" - info: - label: Procrustes - summary: | - "Procrustes superimposition embeds cellular data from each modality into a common space." - description: | - "Procrustes superimposition embeds cellular data from each modality into a common space by aligning the 100-dimensional SVD embeddings to one another using an isomorphic transformation that minimizes the root mean squared distance between points. The unmodified SVD embedding and the transformed second modality are used as output for the task."
- v1: - path: openproblems/tasks/matching_modalities/methods/procrustes.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - reference: gower1975generalized - documentation_url: https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.procrustes.html - repository_url: https://github.com/scipy/scipy - preferred_normalization: "log_cp10k" - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - pypi: - - scipy - - type: nextflow - directives: - label: [midtime, midmem, midcpu] \ No newline at end of file diff --git a/src/tasks/match_modalities/methods/procrustes/script.py b/src/tasks/match_modalities/methods/procrustes/script.py deleted file mode 100644 index fad63fa658..0000000000 --- a/src/tasks/match_modalities/methods/procrustes/script.py +++ /dev/null @@ -1,34 +0,0 @@ -import anndata as ad -import scipy.spatial - -## VIASH START - -par = { - "input_mod1" : "resources_test/common/scicar_cell_lines/dataset_mod1.h5ad", - "input_mod2" : "resources_test/common/scicar_cell_lines/dataset_mod2.h5ad", - "output_mod1" : "output.mod1.h5ad", - "output_mod2" : "output.mod2.h5ad", -} - -meta = { - "functionality_name" : "procrustes" -} - -## VIASH END - -print("Reading input h5ad file", flush=True) -adata_mod1 = ad.read_h5ad(par["input_mod1"]) -adata_mod2 = ad.read_h5ad(par["input_mod2"]) - -print("procrustes alignment", flush=True) -X_proc, Y_proc, _ = scipy.spatial.procrustes(adata_mod1.obsm["X_svd"], adata_mod2.obsm["X_svd"]) - -print("Storing output data", flush=True) -adata_mod1.obsm["integrated"] = X_proc -adata_mod2.obsm["integrated"] = Y_proc - -print("Write output to file", flush=True) -adata_mod1.uns["method_id"] = meta["functionality_name"] -adata_mod2.uns["method_id"] = meta["functionality_name"] -adata_mod1.write_h5ad(par["output_mod1"], compression = "gzip") -adata_mod2.write_h5ad(par["output_mod2"], compression = "gzip") diff --git a/src/tasks/match_modalities/methods/scot/config.vsh.yaml b/src/tasks/match_modalities/methods/scot/config.vsh.yaml deleted file mode 100644 index e86fe4438a..0000000000 --- a/src/tasks/match_modalities/methods/scot/config.vsh.yaml +++ /dev/null @@ -1,30 +0,0 @@ -__merge__: ../../api/comp_method.yaml -functionality: - name: "scot" - info: - label: "Single Cell Optimal Transport" - description: | - Single Cell Optimal Transport (SCOT) is a method for integrating multimodal single-cell data. It is based on the idea of aligning the distributions of the two modalities using optimal transport. - summary: "Run Single Cell Optimal Transport" - preferred_normalization: "log_cp10k" - reference: Demetci2020scot - documentation_url: "https://github.com/rsinghlab/SCOT#readme" - repository_url: "https://github.com/rsinghlab/SCOT" - arguments: - - name: "--balanced" - type: "boolean_true" - description: "Determines whether balanced or unbalanced optimal transport. In the balanced case, the target and source distributions are assumed to have equal mass." 
- resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: apt - packages: git - - type: docker - run: "cd /opt && git clone --depth 1 https://github.com/rsinghlab/SCOT.git && cd SCOT && pip install -r requirements.txt" - - type: nextflow - directives: - label: [midtime, lowmem, lowcpu] diff --git a/src/tasks/match_modalities/methods/scot/script.py b/src/tasks/match_modalities/methods/scot/script.py deleted file mode 100644 index d6e629c565..0000000000 --- a/src/tasks/match_modalities/methods/scot/script.py +++ /dev/null @@ -1,45 +0,0 @@ -import anndata as ad -import sys -sys.path.append("/opt/SCOT/src/") -import scotv1 - - -## VIASH START -par = { - "input_mod1" : "resources_test/common/scicar_cell_lines/dataset_mod1.h5ad", - "input_mod2" : "resources_test/common/scicar_cell_lines/dataset_mod2.h5ad", - "output_mod1" : "integrated_mod1.h5ad", - "output_mod2" : "integrated_mod2.h5ad", - "balanced": False, -} -meta = { - "functionality_name": "scot" -} -## VIASH END - - -print("Reading input h5ad file", flush=True) -adata_mod1 = ad.read_h5ad(par["input_mod1"]) -adata_mod2 = ad.read_h5ad(par["input_mod2"]) - - -print("Initialize SCOT", flush=True) -scot = scotv1.SCOT(adata_mod1.obsm["X_svd"], adata_mod2.obsm["X_svd"]) - -print("Call the unbalanced alignment", flush=True) -# From https://github.com/rsinghlab/SCOT/blob/master/examples/unbalanced_GW_SNAREseq.ipynb # noqa: 501 -X_new_unbal, y_new_unbal = scot.align( - k=50, e=1e-3, normalize=True -) - - -print("store output", flush=True) -adata_mod1.obsm["integrated"] = X_new_unbal -adata_mod2.obsm["integrated"] = y_new_unbal - -print("Write output to file", flush=True) -adata_mod1.uns["method_id"] = meta["functionality_name"] -adata_mod2.uns["method_id"] = meta["functionality_name"] -adata_mod1.write_h5ad(par["output_mod1"], compression = "gzip") -adata_mod2.write_h5ad(par["output_mod2"], compression = "gzip") diff --git a/src/tasks/match_modalities/metrics/knn_auc/config.vsh.yaml b/src/tasks/match_modalities/metrics/knn_auc/config.vsh.yaml deleted file mode 100644 index e7067a20b5..0000000000 --- a/src/tasks/match_modalities/metrics/knn_auc/config.vsh.yaml +++ /dev/null @@ -1,36 +0,0 @@ -__merge__: ../../api/comp_metric.yaml -functionality: - name: "knn_auc" - info: - metrics: - - label: kNN Area Under the Curve - name: knn_auc - summary: "Compute the kNN Area Under the Curve" - description: | - Let $f(i) \in F$ be the scRNA-seq measurement of cell $i$, and $g(i) \in G$ be the scATAC-seq measurement of cell $i$. kNN-AUC calculates the average percentage overlap of neighborhoods of $f(i)$ in $F$ with neighborhoods of $g(i)$ in $G$. Higher is better. - reference: "lance2022multimodal" - min: 0 - max: 1 - maximize: true - v1: - path: openproblems/tasks/matching_modalities/metrics/knn_auc.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - arguments: - - name: "--proportion_neighbors" - type: "double" - default: 0.1 - description: The proportion of neighbours to use in computing the KNN.
- resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - packages: - - numpy - - scikit-learn - - type: nextflow - directives: - label: [midtime, lowmem, lowcpu] diff --git a/src/tasks/match_modalities/metrics/knn_auc/script.py b/src/tasks/match_modalities/metrics/knn_auc/script.py deleted file mode 100644 index cf5c14b473..0000000000 --- a/src/tasks/match_modalities/metrics/knn_auc/script.py +++ /dev/null @@ -1,75 +0,0 @@ -import anndata as ad -import numpy as np -import sklearn.decomposition -import sklearn.neighbors - -## VIASH START -par = { - "input_integrated_mod1": "resources_test/match_modalities/scicar_cell_lines/integrated_mod1.h5ad", - "input_integrated_mod2": "resources_test/match_modalities/scicar_cell_lines/integrated_mod2.h5ad", - "input_solution_mod1": "resources_test/match_modalities/scicar_cell_lines/solution_mod1.h5ad", - "input_solution_mod2": "resources_test/match_modalities/scicar_cell_lines/solution_mod2.h5ad", - "output": "resources_test/multimodal/score.h5ad", - "proportion_neighbors": 0.1, -} -meta = { - "functionality_name": "knn_auc" -} -## VIASH END - -print("Reading adata file", flush=True) -input_solution_mod1 = ad.read_h5ad(par["input_solution_mod1"]) -input_solution_mod2 = ad.read_h5ad(par["input_solution_mod2"]) - -input_integrated_mod1 = ad.read_h5ad(par["input_integrated_mod1"])[input_solution_mod1.obs["permutation_indices"]] -input_integrated_mod2 = ad.read_h5ad(par["input_integrated_mod2"])[input_solution_mod2.obs["permutation_indices"]] - -print("Checking parameters", flush=True) -n_neighbors = int(np.ceil(par["proportion_neighbors"] * input_solution_mod1.n_obs)) - -print("Compute KNN on PCA", flush=True) -_, indices_true = ( - sklearn.neighbors.NearestNeighbors(n_neighbors=n_neighbors) - .fit(input_solution_mod1.obsm["X_svd"]) - .kneighbors(input_solution_mod1.obsm["X_svd"]) -) - -_, indices_pred = ( - sklearn.neighbors.NearestNeighbors(n_neighbors=n_neighbors) - .fit(input_integrated_mod1.obsm["integrated"]) - .kneighbors(input_integrated_mod2.obsm["integrated"]) -) - -print("Check which neighbours match", flush=True) -neighbors_match = np.zeros(n_neighbors, dtype=int) -for i in range(input_solution_mod1.n_obs): - _, pred_matches, true_matches = np.intersect1d( - indices_pred[i], indices_true[i], return_indices=True - ) - neighbors_match_idx = np.maximum(pred_matches, true_matches) - neighbors_match += np.sum( - np.arange(n_neighbors) >= neighbors_match_idx[:, None], - axis=0, - ) - -print("Compute area under neighbours match curve", flush=True) -neighbors_match_curve = neighbors_match / ( - np.arange(1, n_neighbors + 1) * input_solution_mod1.n_obs -) -area_under_curve = np.mean(neighbors_match_curve) - -print("Store metric value", flush=True) -uns = { - "dataset_id": input_solution_mod1.uns["dataset_id"], - "normalization_id": input_solution_mod1.uns["normalization_id"], - "method_id": input_integrated_mod1.uns["method_id"], - "metric_ids": "knn_auc", - "metric_values": area_under_curve -} -output_metric = ad.AnnData( - shape=(0,0), - uns=uns -) - -print("Writing adata to file", flush=True) -output_metric.write_h5ad(par["output"], compression = "gzip") diff --git a/src/tasks/match_modalities/metrics/mse/config.vsh.yaml b/src/tasks/match_modalities/metrics/mse/config.vsh.yaml deleted file mode 100644 index b1dfc15746..0000000000 --- a/src/tasks/match_modalities/metrics/mse/config.vsh.yaml +++ /dev/null @@ -1,32 +0,0 @@ -__merge__: 
../../api/comp_metric.yaml -functionality: - name: "mse" - info: - metrics: - - label: "Mean Squared Error" - name: "mse" - summary: Compute the mean squared error. - description: | - Mean squared error (MSE) is the average distance between each pair of matched observations of the same cell in the learned latent space. Lower is better. - reference: "lance2022multimodal" - maximize: false - min: 0 - max: "+.inf" - v1: - path: openproblems/tasks/matching_modalities/metrics/mse.py - commit: b3456fd73c04c28516f6df34c57e6e3e8b0dab32 - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - packages: - - numpy<2 - - scipy - - scprep - - type: nextflow - directives: - label: [midtime, lowmem, lowcpu] diff --git a/src/tasks/match_modalities/metrics/mse/script.py b/src/tasks/match_modalities/metrics/mse/script.py deleted file mode 100644 index b03487c6eb..0000000000 --- a/src/tasks/match_modalities/metrics/mse/script.py +++ /dev/null @@ -1,56 +0,0 @@ -import anndata as ad -import numpy as np -from scipy import sparse - -## VIASH START -par = { - "input_integrated_mod1": "resources_test/match_modalities/scicar_cell_lines/integrated_mod1.h5ad", - "input_integrated_mod2": "resources_test/match_modalities/scicar_cell_lines/integrated_mod2.h5ad", - "input_solution_mod1": "resources_test/match_modalities/scicar_cell_lines/solution_mod1.h5ad", - "input_solution_mod2": "resources_test/match_modalities/scicar_cell_lines/solution_mod2.h5ad", - "output": "resources_test/multimodal/score.h5ad", -} -meta = { - "functionality_name": "knn_auc" -} -## VIASH END - -print("Reading adata file", flush=True) -input_solution_mod1 = ad.read_h5ad(par["input_solution_mod1"]) -input_solution_mod2 = ad.read_h5ad(par["input_solution_mod2"]) - -input_integrated_mod1 = ad.read_h5ad(par["input_integrated_mod1"])[input_solution_mod1.obs["permutation_indices"]] -input_integrated_mod2 = ad.read_h5ad(par["input_integrated_mod2"])[input_solution_mod2.obs["permutation_indices"]] - -print("Computing MSE", flush=True) -def _square(X): - if sparse.issparse(X): - X.data = X.data ** 2 - return X - else: - return X ** 2 - - -X = input_integrated_mod1.obsm["integrated"].toarray() -Y = input_integrated_mod2.obsm["integrated"].toarray() - -X_shuffled = X[np.random.permutation(np.arange(X.shape[0])), :] -error_random = np.mean(np.sum(_square(X_shuffled - Y))) -error_abs = np.mean(np.sum(_square(X - Y))) -metric_value = (error_abs / error_random).item() - -print("Store metric value", flush=True) -uns = { - "dataset_id": input_solution_mod1.uns["dataset_id"], - "normalization_id": input_solution_mod1.uns["normalization_id"], - "method_id": input_integrated_mod1.uns["method_id"], - "metric_ids": "mse", - "metric_values": metric_value -} -output_metric = ad.AnnData( - shape=(0,0), - uns=uns -) - -print("Writing adata to file", flush=True) -output_metric.write_h5ad(par["output"], compression = "gzip") diff --git a/src/tasks/match_modalities/process_dataset/config.vsh.yaml b/src/tasks/match_modalities/process_dataset/config.vsh.yaml deleted file mode 100644 index 35dc757809..0000000000 --- a/src/tasks/match_modalities/process_dataset/config.vsh.yaml +++ /dev/null @@ -1,18 +0,0 @@ -__merge__: ../api/comp_process_dataset.yaml -functionality: - name: "process_dataset" - arguments: - - name: "--seed" - type: "integer" - description: "A seed for the subsampling." 
- example: 123 - resources: - - type: python_script - path: script.py - - path: /src/common/helper_functions/subset_anndata.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [highmem, midcpu , midtime] diff --git a/src/tasks/match_modalities/process_dataset/script.py b/src/tasks/match_modalities/process_dataset/script.py deleted file mode 100644 index d90d5e3965..0000000000 --- a/src/tasks/match_modalities/process_dataset/script.py +++ /dev/null @@ -1,64 +0,0 @@ -import sys -import random -import numpy as np -import anndata as ad - -## VIASH START -par = { - "input_mod1": "resources_test/common/scicar_cell_lines/dataset_mod1.h5ad", - "input_mod2": "resources_test/common/scicar_cell_lines/dataset_mod2.h5ad", - "output_mod1": "output_mod1.h5ad", - "output_mod2": "output_mod2.h5ad", - "output_solution_mod1": "output_solution_mod1.h5ad", - "output_solution_mod2": "output_solution_mod2.h5ad", - "seed": 123 -} -meta = { - "resources_dir": "src/common/helper_functions/", - "config": "src/tasks/match_modalities/process_dataset/.config.vsh.yaml" -} -## VIASH END - -# import helper functions -sys.path.append(meta["resources_dir"]) -from subset_anndata import read_config_slots_info, subset_anndata - -# set seed if need be -if par["seed"]: - print(f">> Setting seed to {par['seed']}") - random.seed(par["seed"]) - -print(">> Load data", flush=True) -input_mod1 = ad.read_h5ad(par["input_mod1"]) -input_mod2 = ad.read_h5ad(par["input_mod2"]) - -print(f">> Permute input data") -mod1_perm = np.random.permutation(np.arange(input_mod1.n_obs)) -mod2_perm = np.random.permutation(np.arange(input_mod2.n_obs)) - -output_mod1 = input_mod1[mod1_perm] -output_mod1.obs_names = [f"cell_mod1_{i}" for i in range(output_mod1.n_obs)] -output_mod2 = input_mod2[mod2_perm] -output_mod2.obs_names = [f"cell_mod2_{i}" for i in range(output_mod2.n_obs)] - -print(f">> Create solution objects") -output_solution_mod1 = input_mod1.copy() -output_solution_mod1.obs["permutation_indices"] = np.argsort(mod1_perm) -output_solution_mod2 = input_mod2.copy() -output_solution_mod2.obs["permutation_indices"] = np.argsort(mod2_perm) - -# subset the different adatas -print(">> Read slot info from config file", flush=True) -slot_info = read_config_slots_info(meta["config"]) - -print(">> Subset anndatas", flush=True) -output_mod1 = subset_anndata(output_mod1, slot_info["output_mod1"]) -output_mod2 = subset_anndata(output_mod2, slot_info["output_mod2"]) -output_solution_mod1 = subset_anndata(output_solution_mod1, slot_info["output_solution_mod1"]) -output_solution_mod2 = subset_anndata(output_solution_mod2, slot_info["output_solution_mod2"]) - -print(">> Writing data", flush=True) -output_mod1.write_h5ad(par["output_mod1"]) -output_mod2.write_h5ad(par["output_mod2"]) -output_solution_mod1.write_h5ad(par["output_solution_mod1"]) -output_solution_mod2.write_h5ad(par["output_solution_mod2"]) diff --git a/src/tasks/match_modalities/resources_scripts/process_datasets.sh b/src/tasks/match_modalities/resources_scripts/process_datasets.sh deleted file mode 100755 index e5796bd641..0000000000 --- a/src/tasks/match_modalities/resources_scripts/process_datasets.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash - -cat > /tmp/params.yaml << 'HERE' -id: match_modalities_process_datasets -input_states: s3://openproblems-data/resources/datasets/openproblems_v1_multimodal/**/state.yaml -rename_keys: 'input_mod1:output_mod1,input_mod2:output_mod2' -settings: '{"output_mod1": "$id/output_mod1.h5ad", 
"output_mod2": "$id/output_mod2.h5ad", "output_solution_mod1": "$id/output_solution_mod1.h5ad", "output_solution_mod2": "$id/output_solution_mod2.h5ad"}' -output_state: "$id/state.yaml" -publish_dir: s3://openproblems-data/resources/match_modalities/datasets/openproblems_v1_multimodal -HERE - -cat > /tmp/nextflow.config << HERE -process { - executor = 'awsbatch' - withName:'.*publishStatesProc' { - memory = '16GB' - disk = '100GB' - } - withLabel:highmem { - memory = '350GB' - } -} -HERE - -tw launch https://github.com/openproblems-bio/openproblems.git \ - --revision main_build \ - --pull-latest \ - --main-script target/nextflow/match_modalities/workflows/process_datasets/main.nf \ - --workspace 53907369739130 \ - --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ - --params-file /tmp/params.yaml \ - --entry-name auto \ - --config /tmp/nextflow.config \ - --labels match_modalities,process_datasets diff --git a/src/tasks/match_modalities/resources_scripts/run_benchmark.sh b/src/tasks/match_modalities/resources_scripts/run_benchmark.sh deleted file mode 100755 index 41789c6a0f..0000000000 --- a/src/tasks/match_modalities/resources_scripts/run_benchmark.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)" -publish_dir="s3://openproblems-data/resources/match_modalities/results/${RUN_ID}" - -cat > /tmp/params.yaml << HERE -id: match_modalities -input_states: s3://openproblems-data/resources/match_modalities/datasets/**/state.yaml -rename_keys: 'input_mod1:output_mod1,input_mod2:output_mod2,input_solution_mod1:output_solution_mod1,input_solution_mod2:output_solution_mod2' -output_state: "state.yaml" -publish_dir: "$publish_dir" -HERE - -tw launch https://github.com/openproblems-bio/openproblems.git \ - --revision main_build \ - --pull-latest \ - --main-script target/nextflow/match_modalities/workflows/run_benchmark/main.nf \ - --workspace 53907369739130 \ - --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ - --params-file /tmp/params.yaml \ - --entry-name auto \ - --config src/wf_utils/labels_tw.config \ - --labels match_modalities,full \ No newline at end of file diff --git a/src/tasks/match_modalities/resources_test_scripts/scicar_cell_lines.sh b/src/tasks/match_modalities/resources_test_scripts/scicar_cell_lines.sh deleted file mode 100755 index 6a35138815..0000000000 --- a/src/tasks/match_modalities/resources_test_scripts/scicar_cell_lines.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash - -# get the root of the directory -REPO_ROOT=$(git rev-parse --show-toplevel) - -# ensure that the command below is run from the root of the repository -cd "$REPO_ROOT" - -set -e - -RAW_DATA=resources_test/common -DATASET_DIR=resources_test/match_modalities - -mkdir -p $DATASET_DIR - -# process dataset -echo Running process_dataset -nextflow run . 
\ - -main-script target/nextflow/match_modalities/workflows/process_datasets/main.nf \ - -profile docker \ - -entry auto \ - --input_states "$RAW_DATA/**/state.yaml" \ - --rename_keys 'input_mod1:output_mod1,input_mod2:output_mod2' \ - --settings '{"output_mod1": "$id/dataset_mod1.h5ad", "output_mod2": "$id/dataset_mod2.h5ad", "output_solution_mod1": "$id/solution_mod1.h5ad", "output_solution_mod2": "$id/solution_mod2.h5ad"}' \ - --publish_dir "$DATASET_DIR" \ - --output_state '$id/state.yaml' -# output_state should be moved to settings once workaround is solved - -# run one method -viash run src/tasks/match_modalities/methods/fastmnn/config.vsh.yaml -- \ - --input_mod1 $DATASET_DIR/scicar_cell_lines/dataset_mod1.h5ad \ - --input_mod2 $DATASET_DIR/scicar_cell_lines/dataset_mod2.h5ad \ - --output_mod1 $DATASET_DIR/scicar_cell_lines/integrated_mod1.h5ad \ - --output_mod2 $DATASET_DIR/scicar_cell_lines/integrated_mod2.h5ad diff --git a/src/tasks/match_modalities/workflows/process_datasets/config.vsh.yaml b/src/tasks/match_modalities/workflows/process_datasets/config.vsh.yaml deleted file mode 100644 index 5427343f9f..0000000000 --- a/src/tasks/match_modalities/workflows/process_datasets/config.vsh.yaml +++ /dev/null @@ -1,42 +0,0 @@ -functionality: - name: "process_datasets" - namespace: "match_modalities/workflows" - argument_groups: - - name: Inputs - arguments: - - name: "--input_mod1" - __merge__: "/src/tasks/match_modalities/api/file_common_dataset_mod1.yaml" - required: true - direction: input - - name: "--input_mod2" - __merge__: "/src/tasks/match_modalities/api/file_common_dataset_mod2.yaml" - required: true - direction: input - - name: Outputs - arguments: - - name: "--output_mod1" - __merge__: /src/tasks/match_modalities/api/file_dataset_mod1.yaml - required: true - direction: output - - name: "--output_mod2" - __merge__: /src/tasks/match_modalities/api/file_dataset_mod2.yaml - required: true - direction: output - - name: "--output_solution_mod1" - __merge__: /src/tasks/match_modalities/api/file_solution_mod1.yaml - required: true - direction: output - - name: "--output_solution_mod2" - __merge__: /src/tasks/match_modalities/api/file_solution_mod2.yaml - required: true - direction: output - resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - path: /src/wf_utils/helper.nf - dependencies: - - name: common/check_dataset_schema - - name: match_modalities/process_dataset -platforms: - - type: nextflow diff --git a/src/tasks/match_modalities/workflows/process_datasets/main.nf b/src/tasks/match_modalities/workflows/process_datasets/main.nf deleted file mode 100644 index ab5e9a83b0..0000000000 --- a/src/tasks/match_modalities/workflows/process_datasets/main.nf +++ /dev/null @@ -1,82 +0,0 @@ -include { findArgumentSchema } from "${meta.resources_dir}/helper.nf" - -workflow auto { - findStates(params, meta.config) - | meta.workflow.run( - auto: [publish: "state"] - ) -} - -workflow run_wf { - take: - input_ch - - main: - output_ch = input_ch - - | check_dataset_schema.run( - key: "check_dataset_schema_mod1", - fromState: { id, state -> - def schema = findArgumentSchema(meta.config, "input_mod1") - def schemaYaml = tempFile("schema.yaml") - writeYaml(schema, schemaYaml) - [ - "input": state.input_mod1, - "schema": schemaYaml - ] - }, - toState: { id, output, state -> - // read the output to see if dataset passed the qc - def checks = readYaml(output.output) - state + [ - "dataset_mod1": checks["exit_code"] == 0 ? 
state.input_mod1 : null, - ] - } - ) - - | check_dataset_schema.run( - key: "check_dataset_schema_mod2", - fromState: { id, state -> - def schema = findArgumentSchema(meta.config, "input_mod2") - def schemaYaml = tempFile("schema.yaml") - writeYaml(schema, schemaYaml) - [ - "input": state.input_mod2, - "schema": schemaYaml - ] - }, - toState: { id, output, state -> - // read the output to see if dataset passed the qc - def checks = readYaml(output.output) - state + [ - "dataset_mod2": checks["exit_code"] == 0 ? state.input_mod2 : null, - ] - } - ) - - // remove datasets which didn't pass the schema check - | filter { id, state -> - state.dataset_mod1 != null && state.dataset_mod2 != null - } - - | process_dataset.run( - fromState: [ input_mod1: "dataset_mod1", input_mod2: "dataset_mod2" ], - toState: [ - "output_mod1", - "output_mod2", - "output_solution_mod1", - "output_solution_mod2" - ] - ) - - // only output the files for which an output file was specified - | setState([ - "output_mod1", - "output_mod2", - "output_solution_mod1", - "output_solution_mod2" - ]) - - emit: - output_ch -} diff --git a/src/tasks/match_modalities/workflows/run_benchmark/config.vsh.yaml b/src/tasks/match_modalities/workflows/run_benchmark/config.vsh.yaml deleted file mode 100644 index 89da796600..0000000000 --- a/src/tasks/match_modalities/workflows/run_benchmark/config.vsh.yaml +++ /dev/null @@ -1,75 +0,0 @@ -functionality: - name: "run_benchmark" - namespace: "match_modalities/workflows" - argument_groups: - - name: Inputs - arguments: - - name: "--input_mod1" - __merge__: /src/tasks/match_modalities/api/file_dataset_mod1.yaml - direction: input - required: true - - name: "--input_mod2" - __merge__: /src/tasks/match_modalities/api/file_dataset_mod2.yaml - direction: input - required: true - - name: "--input_solution_mod1" - __merge__: /src/tasks/match_modalities/api/file_solution_mod1.yaml - direction: input - required: true - - name: "--input_solution_mod2" - __merge__: /src/tasks/match_modalities/api/file_solution_mod2.yaml - direction: input - required: true - - name: Outputs - arguments: - - name: "--output_scores" - type: file - required: true - direction: output - description: A yaml file containing the scores of each of the methods - default: score_uns.yaml - - name: "--output_method_configs" - type: file - required: true - direction: output - default: method_configs.yaml - - name: "--output_metric_configs" - type: file - required: true - direction: output - default: metric_configs.yaml - - name: "--output_dataset_info" - type: file - required: true - direction: output - default: dataset_uns.yaml - - name: "--output_task_info" - type: file - required: true - direction: output - default: task_info.yaml - - name: Methods - arguments: - - name: "--method_ids" - type: string - multiple: true - description: A list of method ids to run. If not specified, all methods will be run. 
- resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - type: file - path: "/src/tasks/match_modalities/api/task_info.yaml" - dependencies: - - name: common/check_dataset_schema - - name: common/extract_metadata - - name: match_modalities/control_methods/random_features - - name: match_modalities/control_methods/true_features - - name: match_modalities/methods/fastmnn - - name: match_modalities/methods/scot - - name: match_modalities/methods/harmonic_alignment - - name: match_modalities/methods/procrustes - - name: match_modalities/metrics/knn_auc - - name: match_modalities/metrics/mse -platforms: - - type: nextflow \ No newline at end of file diff --git a/src/tasks/match_modalities/workflows/run_benchmark/main.nf b/src/tasks/match_modalities/workflows/run_benchmark/main.nf deleted file mode 100644 index 53753f3981..0000000000 --- a/src/tasks/match_modalities/workflows/run_benchmark/main.nf +++ /dev/null @@ -1,202 +0,0 @@ -workflow auto { - findStates(params, meta.config) - | meta.workflow.run( - auto: [publish: "state"] - ) -} - -workflow run_wf { - take: - input_ch - - main: - - // construct list of methods - methods = [ - random_features, - true_features, - scot, - harmonic_alignment, - fastmnn, - procrustes - ] - - // construct list of metrics - metrics = [ - knn_auc, - mse - ] - - /**************************** - * EXTRACT DATASET METADATA * - ****************************/ - dataset_ch = input_ch - // store join id - | map{ id, state -> - [id, state + ["_meta": [join_id: id]]] - } - - // extract the dataset metadata - | extract_metadata.run( - fromState: [input: "input_solution_mod1"], - toState: { id, output, state -> - state + [ - dataset_uns: readYaml(output.output).uns - ] - } - ) - - /*************************** - * RUN METHODS AND METRICS * - ***************************/ - score_ch = dataset_ch - - // run all methods - | runEach( - components: methods, - - // use the 'filter' argument to only run a method on the normalisation the component is asking for - filter: { id, state, comp -> - def norm = state.dataset_uns.normalization_id - def pref = comp.config.functionality.info.preferred_normalization - // if the preferred normalisation is none at all, - // we can pass whichever dataset we want - def norm_check = (norm == "log_cp10k" && pref == "counts") || norm == pref - def method_check = !state.method_ids || state.method_ids.contains(comp.config.functionality.name) - - method_check && norm_check - }, - - // define a new 'id' by appending the method name to the dataset id - id: { id, state, comp -> - id + "." + comp.config.functionality.name - }, - - // use 'fromState' to fetch the arguments the component requires from the overall state - fromState: { id, state, comp -> - def new_args = [ - input_mod1: state.input_mod1, - input_mod2: state.input_mod2 - ] - if (comp.config.functionality.info.type == "control_method") { - new_args.input_solution_mod1 = state.input_solution_mod1 - new_args.input_solution_mod2 = state.input_solution_mod2 - } - new_args - }, - - // use 'toState' to publish that component's outputs to the overall state - toState: { id, output, state, comp -> - state + [ - method_id: comp.config.functionality.name, - method_output_mod1: output.output_mod1, - method_output_mod2: output.output_mod2 - ] - } - ) - - // run all metrics - | runEach( - components: metrics, - id: { id, state, comp -> - id + "." 
+ comp.config.functionality.name - }, - // use 'fromState' to fetch the arguments the component requires from the overall state - fromState: [ - input_integrated_mod1: "method_output_mod1", - input_integrated_mod2: "method_output_mod2", - input_solution_mod1: "input_solution_mod1", - input_solution_mod2: "input_solution_mod2" - ], - // use 'toState' to publish that component's outputs to the overall state - toState: { id, output, state, comp -> - state + [ - metric_id: comp.config.functionality.name, - metric_output: output.output - ] - } - ) - - /****************************** - * GENERATE OUTPUT YAML FILES * - ******************************/ - // TODO: can we store everything below in a separate helper function? - - // extract the dataset metadata - dataset_meta_ch = dataset_ch - // only keep one of the normalization methods - | filter{ id, state -> - state.dataset_uns.normalization_id == "log_cp10k" - } - - | joinStates { ids, states -> - // store the dataset metadata in a file - def dataset_uns = states.collect{state -> - def uns = state.dataset_uns.clone() - uns.remove("normalization_id") - uns - } - def dataset_uns_yaml_blob = toYamlBlob(dataset_uns) - def dataset_uns_file = tempFile("dataset_uns.yaml") - dataset_uns_file.write(dataset_uns_yaml_blob) - - ["output", [output_dataset_info: dataset_uns_file]] - } - - output_ch = score_ch - - // extract the scores - | extract_metadata.run( - key: "extract_scores", - fromState: [input: "metric_output"], - toState: { id, output, state -> - state + [ - score_uns: readYaml(output.output).uns - ] - } - ) - - | joinStates { ids, states -> - - // store the method configs in a file - def method_configs = methods.collect{it.config} - def method_configs_yaml_blob = toYamlBlob(method_configs) - def method_configs_file = tempFile("method_configs.yaml") - method_configs_file.write(method_configs_yaml_blob) - - // store the metric configs in a file - def metric_configs = metrics.collect{it.config} - def metric_configs_yaml_blob = toYamlBlob(metric_configs) - def metric_configs_file = tempFile("metric_configs.yaml") - metric_configs_file.write(metric_configs_yaml_blob) - - def task_info_file = meta.resources_dir.resolve("task_info.yaml") - - // store the scores in a file - def score_uns = states.collect{it.score_uns} - def score_uns_yaml_blob = toYamlBlob(score_uns) - def score_uns_file = tempFile("score_uns.yaml") - score_uns_file.write(score_uns_yaml_blob) - - def new_state = [ - output_method_configs: method_configs_file, - output_metric_configs: metric_configs_file, - output_task_info: task_info_file, - output_scores: score_uns_file, - _meta: states[0]._meta - ] - - ["output", new_state] - } - - // merge all of the output data - | mix(dataset_meta_ch) - | joinStates{ ids, states -> - def mergedStates = states.inject([:]) { acc, m -> acc + m } - [ids[0], mergedStates] - } - - emit: - output_ch - -} diff --git a/src/tasks/match_modalities/workflows/run_benchmark/run_test.sh b/src/tasks/match_modalities/workflows/run_benchmark/run_test.sh deleted file mode 100644 index ee7c4c9909..0000000000 --- a/src/tasks/match_modalities/workflows/run_benchmark/run_test.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash - -# get the root of the directory -REPO_ROOT=$(git rev-parse --show-toplevel) - -# ensure that the command below is run from the root of the repository -cd "$REPO_ROOT" - -set -e - -# export TOWER_WORKSPACE_ID=53907369739130 - -DATASETS_DIR="resources_test/match_modalities" -OUTPUT_DIR="resources_test/match_modalities/benchmarks/openproblems_v1" - -if [ 
! -d "$OUTPUT_DIR" ]; then - mkdir -p "$OUTPUT_DIR" -fi - -export NXF_VER=22.04.5 -nextflow run . \ - -main-script target/nextflow/match_modalities/workflows/run_benchmark/main.nf \ - -profile docker \ - -resume \ - -entry auto \ - -c src/wf_utils/labels_ci.config \ - --id resources_test \ - --input_states "$DATASETS_DIR/**/state.yaml" \ - --rename_keys 'input_mod1:output_mod1,input_mod2:output_mod2,input_solution_mod1:output_solution_mod1,input_solution_mod2:output_solution_mod2' \ - --settings '{"output_scores": "scores.yaml", "output_dataset_info": "dataset_info.yaml", "output_method_configs": "method_configs.yaml", "output_metric_configs": "metric_configs.yaml", "output_task_info": "task_info.yaml"}' \ - --publish_dir "$OUTPUT_DIR" \ No newline at end of file diff --git a/src/tasks/predict_modality/README.md b/src/tasks/predict_modality/README.md index 4b361c52fb..80178393cc 100644 --- a/src/tasks/predict_modality/README.md +++ b/src/tasks/predict_modality/README.md @@ -1,486 +1,3 @@ # Predict Modality - -Predicting the profiles of one modality (e.g. protein abundance) from -another (e.g. mRNA expression). - -Path: -[`src/tasks/predict_modality`](https://github.com/openproblems-bio/openproblems/tree/main/src/tasks/predict_modality) - -## Motivation - -Experimental techniques to measure multiple modalities within the same -single cell are increasingly becoming available. The demand for these -measurements is driven by the promise to provide a deeper insight into -the state of a cell. Yet, the modalities are also intrinsically linked. -We know that DNA must be accessible (ATAC data) to produce mRNA -(expression data), and mRNA in turn is used as a template to produce -protein (protein abundance). These processes are regulated often by the -same molecules that they produce: for example, a protein may bind DNA to -prevent the production of more mRNA. Understanding these regulatory -processes would be transformative for synthetic biology and drug target -discovery. Any method that can predict a modality from another must have -accounted for these regulatory processes, but the demand for multi-modal -data shows that this is not trivial. - -## Description - -In this task, the goal is to take one modality and predict the other -modality for all features in each cell. This task requires translating -information between multiple layers of gene regulation. In some ways, -this is similar to the task of machine translation. In machine -translation, the same sentiment is expressed in multiple languages and -the goal is to train a model to represent the same meaning in a -different language. In this context, the same cellular state is measured -in two different feature sets and the goal of this task is to translate -the information about cellular state from one modality to the other. 
- -## Authors & contributors - -| name | roles | -|:-------------------|:-------------------| -| Robrecht Cannoodt | author, maintainer | -| Kai Waldrant | contributor | -| Louise Deconinck | author | -| Alex Tong | author | -| Bastian Rieck | author | -| Daniel Burkhardt | author | -| Alejandro Granados | author | - -## API - -``` mermaid -flowchart LR - file_common_dataset_mod1("Raw dataset RNA") - comp_process_dataset[/"Data processor"/] - file_train_mod1("Train mod1") - file_train_mod2("Train mod2") - file_test_mod1("Test mod1") - file_test_mod2("Test mod2") - comp_control_method[/"Control method"/] - comp_method[/"Method"/] - comp_metric[/"Metric"/] - file_prediction("Prediction") - file_score("Score") - file_common_dataset_mod2("Raw dataset mod2") - file_common_dataset_mod1---comp_process_dataset - comp_process_dataset-->file_train_mod1 - comp_process_dataset-->file_train_mod2 - comp_process_dataset-->file_test_mod1 - comp_process_dataset-->file_test_mod2 - file_train_mod1---comp_control_method - file_train_mod1---comp_method - file_train_mod2---comp_control_method - file_train_mod2---comp_method - file_test_mod1---comp_control_method - file_test_mod1---comp_method - file_test_mod2---comp_control_method - file_test_mod2---comp_metric - comp_control_method-->file_prediction - comp_method-->file_prediction - comp_metric-->file_score - file_prediction---comp_metric - file_common_dataset_mod2---comp_process_dataset -``` - -## File format: Raw dataset RNA - -The RNA modality of the raw dataset. - -Example file: -`resources_test/common/openproblems_neurips2021/bmmc_cite/dataset_mod1.h5ad` - -Format: - -
- - AnnData object - obs: 'batch', 'size_factors' - var: 'feature_id', 'feature_name' - obsm: 'gene_activity' - layers: 'counts', 'normalized' - uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'normalization_id', 'gene_activity_var_names' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:---------------------------------|:----------|:-------------------------------------------------------------------------------| -| `obs["batch"]` | `string` | Batch information. | -| `obs["size_factors"]` | `double` | (*Optional*) The size factors of the cells prior to normalization. | -| `var["feature_id"]` | `string` | Unique identifier for the feature, usually a ENSEMBL gene id. | -| `var["feature_name"]` | `string` | A human-readable name for the feature, usually a gene symbol. | -| `obsm["gene_activity"]` | `double` | (*Optional*) ATAC gene activity. | -| `layers["counts"]` | `integer` | Raw counts. | -| `layers["normalized"]` | `double` | Normalized expression values. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["dataset_name"]` | `string` | Nicely formatted name. | -| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | -| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | -| `uns["dataset_summary"]` | `string` | Short description of the dataset. | -| `uns["dataset_description"]` | `string` | Long description of the dataset. | -| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | -| `uns["normalization_id"]` | `string` | The unique identifier of the normalization method used. | -| `uns["gene_activity_var_names"]` | `string` | (*Optional*) Names of the gene activity matrix. | - -
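For readers less familiar with the AnnData layout documented above, below is a minimal sketch of loading the example file and accessing the slots listed in the table. It assumes the `resources_test` data have been synced locally and is an illustration only, not part of the task's pipeline.

```python
import anndata as ad

# Load the example "Raw dataset RNA" file listed above (assumes the
# resources_test folder has been downloaded locally).
adata = ad.read_h5ad(
    "resources_test/common/openproblems_neurips2021/bmmc_cite/dataset_mod1.h5ad"
)

# Required slots from the table above.
print(adata.uns["dataset_id"], adata.uns["normalization_id"])
print(adata.obs["batch"].value_counts())
counts = adata.layers["counts"]          # raw counts
normalized = adata.layers["normalized"]  # normalized expression values

# Optional slots may be absent depending on the dataset.
if "gene_activity" in adata.obsm:
    print("gene activity matrix:", adata.obsm["gene_activity"].shape)
```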
- -## Component type: Data processor - -Path: -[`src/predict_modality`](https://github.com/openproblems-bio/openproblems/tree/main/src/predict_modality) - -A predict modality dataset processor. - -Arguments: - -
- -| Name | Type | Description | -|:----------------------|:----------|:---------------------------------------------------------------------------| -| `--input_mod1` | `file` | The RNA modality of the raw dataset. | -| `--input_mod2` | `file` | The second modality of the raw dataset. Must be an ADT or an ATAC dataset. | -| `--output_train_mod1` | `file` | (*Output*) The mod1 expression values of the train cells. | -| `--output_train_mod2` | `file` | (*Output*) The mod2 expression values of the train cells. | -| `--output_test_mod1` | `file` | (*Output*) The mod1 expression values of the test cells. | -| `--output_test_mod2` | `file` | (*Output*) The mod2 expression values of the test cells. | -| `--seed` | `integer` | (*Optional*) The seed for determining the train/test split. Default: `1`. | - -
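To make the split semantics concrete, here is a rough, hypothetical sketch of what the processor's interface amounts to: both raw modalities go in, four files come out, and `--seed` (default `1`) controls the split. This is not the actual `process_dataset` implementation, which may well split by batch rather than uniformly at random; the train fraction used below is an assumption.

``` python
import anndata as ad
import numpy as np

# Placeholder inputs; the real component receives these via --input_mod1 / --input_mod2.
mod1 = ad.read_h5ad("resources_test/common/openproblems_neurips2021/bmmc_cite/dataset_mod1.h5ad")
mod2 = ad.read_h5ad("resources_test/common/openproblems_neurips2021/bmmc_cite/dataset_mod2.h5ad")

rng = np.random.default_rng(seed=1)        # --seed, default 1
is_train = rng.random(mod1.n_obs) < 0.66   # illustrative train fraction (assumption)

# Both modalities are split with the same mask so train/test cells stay paired.
mod1[is_train].copy().write_h5ad("train_mod1.h5ad")
mod2[is_train].copy().write_h5ad("train_mod2.h5ad")
mod1[~is_train].copy().write_h5ad("test_mod1.h5ad")
mod2[~is_train].copy().write_h5ad("test_mod2.h5ad")
```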
- -## File format: Train mod1 - -The mod1 expression values of the train cells. - -Example file: -`resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap/train_mod1.h5ad` - -Format: - -
- - AnnData object - obs: 'batch', 'size_factors' - var: 'gene_ids' - obsm: 'gene_activity' - layers: 'counts', 'normalized' - uns: 'dataset_id', 'common_dataset_id', 'dataset_organism', 'normalization_id', 'gene_activity_var_names' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:---------------------------------|:----------|:-------------------------------------------------------------------| -| `obs["batch"]` | `string` | Batch information. | -| `obs["size_factors"]` | `double` | (*Optional*) The size factors of the cells prior to normalization. | -| `var["gene_ids"]` | `string` | (*Optional*) The gene identifiers (if available). | -| `obsm["gene_activity"]` | `double` | (*Optional*) ATAC gene activity. | -| `layers["counts"]` | `integer` | Raw counts. | -| `layers["normalized"]` | `double` | Normalized expression values. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["common_dataset_id"]` | `string` | (*Optional*) A common identifier for the dataset. | -| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | -| `uns["normalization_id"]` | `string` | The unique identifier of the normalization method used. | -| `uns["gene_activity_var_names"]` | `string` | (*Optional*) Names of the gene activity matrix. | - -
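Train mod1 and Train mod2 (described next) contain the same training cells in two different feature spaces, so the two files are expected to pair up row for row. A quick check, assuming the example files above:

``` python
import anndata as ad

base = "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap"
train_mod1 = ad.read_h5ad(f"{base}/train_mod1.h5ad")
train_mod2 = ad.read_h5ad(f"{base}/train_mod2.h5ad")

# Same cells in the same order; only the feature axis differs between modalities.
assert train_mod1.n_obs == train_mod2.n_obs
assert (train_mod1.obs_names == train_mod2.obs_names).all()
print(train_mod1.shape, train_mod2.shape)
```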
- -## File format: Train mod2 - -The mod2 expression values of the train cells. - -Example file: -`resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap/train_mod2.h5ad` - -Format: - -
- - AnnData object - obs: 'batch', 'size_factors' - var: 'gene_ids' - obsm: 'gene_activity' - layers: 'counts', 'normalized' - uns: 'dataset_id', 'common_dataset_id', 'dataset_organism', 'normalization_id', 'gene_activity_var_names' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:---------------------------------|:----------|:-------------------------------------------------------------------| -| `obs["batch"]` | `string` | Batch information. | -| `obs["size_factors"]` | `double` | (*Optional*) The size factors of the cells prior to normalization. | -| `var["gene_ids"]` | `string` | (*Optional*) The gene identifiers (if available). | -| `obsm["gene_activity"]` | `double` | (*Optional*) ATAC gene activity. | -| `layers["counts"]` | `integer` | Raw counts. | -| `layers["normalized"]` | `double` | Normalized expression values. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["common_dataset_id"]` | `string` | (*Optional*) A common identifier for the dataset. | -| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | -| `uns["normalization_id"]` | `string` | The unique identifier of the normalization method used. | -| `uns["gene_activity_var_names"]` | `string` | (*Optional*) Names of the gene activity matrix. | - -
- -## File format: Test mod1 - -The mod1 expression values of the test cells. - -Example file: -`resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap/test_mod1.h5ad` - -Format: - -
- - AnnData object - obs: 'batch', 'size_factors' - var: 'gene_ids' - obsm: 'gene_activity' - layers: 'counts', 'normalized' - uns: 'dataset_id', 'common_dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'normalization_id', 'gene_activity_var_names' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:---------------------------------|:----------|:-------------------------------------------------------------------------------| -| `obs["batch"]` | `string` | Batch information. | -| `obs["size_factors"]` | `double` | (*Optional*) The size factors of the cells prior to normalization. | -| `var["gene_ids"]` | `string` | (*Optional*) The gene identifiers (if available). | -| `obsm["gene_activity"]` | `double` | (*Optional*) ATAC gene activity. | -| `layers["counts"]` | `integer` | Raw counts. | -| `layers["normalized"]` | `double` | Normalized expression values. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["common_dataset_id"]` | `string` | (*Optional*) A common identifier for the dataset. | -| `uns["dataset_name"]` | `string` | Nicely formatted name. | -| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | -| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | -| `uns["dataset_summary"]` | `string` | Short description of the dataset. | -| `uns["dataset_description"]` | `string` | Long description of the dataset. | -| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | -| `uns["normalization_id"]` | `string` | The unique identifier of the normalization method used. | -| `uns["gene_activity_var_names"]` | `string` | (*Optional*) Names of the gene activity matrix. | - -
- -## File format: Test mod2 - -The mod2 expression values of the test cells. - -Example file: -`resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap/test_mod2.h5ad` - -Format: - -
- - AnnData object - obs: 'batch', 'size_factors' - var: 'gene_ids' - obsm: 'gene_activity' - layers: 'counts', 'normalized' - uns: 'dataset_id', 'common_dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'gene_activity_var_names' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:---------------------------------|:----------|:-------------------------------------------------------------------------------| -| `obs["batch"]` | `string` | Batch information. | -| `obs["size_factors"]` | `double` | (*Optional*) The size factors of the cells prior to normalization. | -| `var["gene_ids"]` | `string` | (*Optional*) The gene identifiers (if available). | -| `obsm["gene_activity"]` | `double` | (*Optional*) ATAC gene activity. | -| `layers["counts"]` | `integer` | Raw counts. | -| `layers["normalized"]` | `double` | Normalized expression values. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["common_dataset_id"]` | `string` | (*Optional*) A common identifier for the dataset. | -| `uns["dataset_name"]` | `string` | Nicely formatted name. | -| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | -| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | -| `uns["dataset_summary"]` | `string` | Short description of the dataset. | -| `uns["dataset_description"]` | `string` | Long description of the dataset. | -| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | -| `uns["gene_activity_var_names"]` | `string` | (*Optional*) Names of the gene activity matrix. | - -
- -## Component type: Control method - -Path: -[`src/predict_modality/control_methods`](https://github.com/openproblems-bio/openproblems/tree/main/src/predict_modality/control_methods) - -Quality control methods for verifying the pipeline. - -Arguments: - -
- -| Name | Type | Description | -|:---------------------|:-------|:-------------------------------------------------------------------------| -| `--input_train_mod1` | `file` | The mod1 expression values of the train cells. | -| `--input_train_mod2` | `file` | The mod2 expression values of the train cells. | -| `--input_test_mod1` | `file` | The mod1 expression values of the test cells. | -| `--input_test_mod2` | `file` | The mod2 expression values of the test cells. | -| `--output` | `file` | (*Output*) A prediction of the mod2 expression values of the test cells. | - -
- -## Component type: Method - -Path: -[`src/predict_modality/methods`](https://github.com/openproblems-bio/openproblems/tree/main/src/predict_modality/methods) - -A regression method. - -Arguments: - -
- -| Name | Type | Description | -|:---------------------|:-------|:-------------------------------------------------------------------------| -| `--input_train_mod1` | `file` | The mod1 expression values of the train cells. | -| `--input_train_mod2` | `file` | The mod2 expression values of the train cells. | -| `--input_test_mod1` | `file` | The mod1 expression values of the test cells. | -| `--output` | `file` | (*Output*) A prediction of the mod2 expression values of the test cells. | - -
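A method component therefore only has to map these three inputs onto a prediction file. The toy skeleton below uses ridge regression on the normalized layers purely as a stand-in model; the model choice and the `method_id` are illustrative, not one of the task's methods, and it assumes the layers are stored as sparse matrices.

``` python
import anndata as ad
from scipy.sparse import csc_matrix
from sklearn.linear_model import Ridge

base = "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap"
train_mod1 = ad.read_h5ad(f"{base}/train_mod1.h5ad")
train_mod2 = ad.read_h5ad(f"{base}/train_mod2.h5ad")
test_mod1 = ad.read_h5ad(f"{base}/test_mod1.h5ad")

# Stand-in model: any regressor mapping mod1 profiles to mod2 profiles would do here.
model = Ridge(alpha=1.0)
model.fit(train_mod1.layers["normalized"].toarray(),
          train_mod2.layers["normalized"].toarray())
prediction = csc_matrix(model.predict(test_mod1.layers["normalized"].toarray()))

out = ad.AnnData(
    layers={"normalized": prediction},
    shape=prediction.shape,
    obs=test_mod1.obs[[]],
    var=train_mod2.var[[]],
    uns={
        "dataset_id": train_mod1.uns["dataset_id"],
        "method_id": "ridge_sketch",  # hypothetical identifier
    },
)
out.write_h5ad("prediction.h5ad", compression="gzip")
```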
- -## Component type: Metric - -Path: -[`src/predict_modality/metrics`](https://github.com/openproblems-bio/openproblems/tree/main/src/predict_modality/metrics) - -A predict modality metric. - -Arguments: - -
- -| Name | Type | Description | -|:---------------------|:-------|:--------------------------------------------------------------| -| `--input_prediction` | `file` | A prediction of the mod2 expression values of the test cells. | -| `--input_test_mod2` | `file` | The mod2 expression values of the test cells. | -| `--output` | `file` | (*Output*) Metric score file. | - -
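Concretely, a metric component reads a prediction and the held-out mod2 values and writes a small scores file. The sketch below uses root mean square error (the metric pictured in the task thumbnail) as an example; the task's actual metric components may differ.

``` python
import anndata as ad
import numpy as np

base = "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap"
prediction = ad.read_h5ad(f"{base}/prediction.h5ad")
test_mod2 = ad.read_h5ad(f"{base}/test_mod2.h5ad")

def densify(x):
    # Layers may be stored sparse or dense depending on how they were written.
    return x.toarray() if hasattr(x, "toarray") else np.asarray(x)

rmse = float(np.sqrt(np.mean(
    (densify(prediction.layers["normalized"]) - densify(test_mod2.layers["normalized"])) ** 2
)))

score = ad.AnnData(uns={
    "dataset_id": prediction.uns["dataset_id"],
    "method_id": prediction.uns["method_id"],
    "metric_ids": ["rmse"],        # hypothetical metric identifier
    "metric_values": [rmse],
})
score.write_h5ad("score.h5ad", compression="gzip")
```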
- -## File format: Prediction - -A prediction of the mod2 expression values of the test cells - -Example file: -`resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap/prediction.h5ad` - -Format: - -
- - AnnData object - layers: 'normalized' - uns: 'dataset_id', 'method_id' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:-----------------------|:---------|:----------------------------------------| -| `layers["normalized"]` | `double` | Predicted normalized expression values. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["method_id"]` | `string` | A unique identifier for the method. | - -
- -## File format: Score - -Metric score file - -Example file: -`resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap/score.h5ad` - -Format: - -
- - AnnData object - uns: 'dataset_id', 'method_id', 'metric_ids', 'metric_values' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:-----------------------|:---------|:---------------------------------------------------------------------------------------------| -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["method_id"]` | `string` | A unique identifier for the method. | -| `uns["metric_ids"]` | `string` | One or more unique metric identifiers. | -| `uns["metric_values"]` | `double` | The metric values obtained for the given prediction. Must be of same length as ‘metric_ids’. | - -
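When a benchmark run produces many such score files, they can be flattened into one table for comparison. A hypothetical helper follows; the `scores/*.h5ad` glob and the use of pandas are assumptions, not part of the task API.

``` python
import glob
import anndata as ad
import numpy as np
import pandas as pd

rows = []
for path in glob.glob("scores/*.h5ad"):   # assumed location of collected score files
    score = ad.read_h5ad(path)
    # metric_ids/metric_values may be stored as scalars or arrays; normalize to 1-D.
    metric_ids = np.atleast_1d(score.uns["metric_ids"])
    metric_values = np.atleast_1d(score.uns["metric_values"])
    for metric_id, value in zip(metric_ids, metric_values):
        rows.append({
            "dataset_id": score.uns["dataset_id"],
            "method_id": score.uns["method_id"],
            "metric_id": str(metric_id),
            "value": float(value),
        })

print(pd.DataFrame(rows).pivot(index="method_id", columns="metric_id", values="value"))
```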
- -## File format: Raw dataset mod2 - -The second modality of the raw dataset. Must be an ADT or an ATAC -dataset - -Example file: -`resources_test/common/openproblems_neurips2021/bmmc_cite/dataset_mod2.h5ad` - -Format: - -
- - AnnData object - obs: 'batch', 'size_factors' - var: 'feature_id', 'feature_name' - obsm: 'gene_activity' - layers: 'counts', 'normalized' - uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'normalization_id', 'gene_activity_var_names' - -
- -Slot description: - -
- -| Slot | Type | Description | -|:---------------------------------|:----------|:-------------------------------------------------------------------------------| -| `obs["batch"]` | `string` | Batch information. | -| `obs["size_factors"]` | `double` | (*Optional*) The size factors of the cells prior to normalization. | -| `var["feature_id"]` | `string` | Unique identifier for the feature, usually a ENSEMBL gene id. | -| `var["feature_name"]` | `string` | A human-readable name for the feature, usually a gene symbol. | -| `obsm["gene_activity"]` | `double` | (*Optional*) ATAC gene activity. | -| `layers["counts"]` | `integer` | Raw counts. | -| `layers["normalized"]` | `double` | Normalized expression values. | -| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | -| `uns["dataset_name"]` | `string` | Nicely formatted name. | -| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | -| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | -| `uns["dataset_summary"]` | `string` | Short description of the dataset. | -| `uns["dataset_description"]` | `string` | Long description of the dataset. | -| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | -| `uns["normalization_id"]` | `string` | The unique identifier of the normalization method used. | -| `uns["gene_activity_var_names"]` | `string` | (*Optional*) Names of the gene activity matrix. | - -
- +# This task has been moved to [https://github.com/openproblems-bio/task_predict_modality](https://github.com/openproblems-bio/task_predict_modality)! diff --git a/src/tasks/predict_modality/api/comp_control_method.yaml b/src/tasks/predict_modality/api/comp_control_method.yaml deleted file mode 100644 index 82ab6e441f..0000000000 --- a/src/tasks/predict_modality/api/comp_control_method.yaml +++ /dev/null @@ -1,42 +0,0 @@ -functionality: - namespace: "predict_modality/control_methods" - info: - type: control_method - preferred_normalization: counts # there is currently only one type of normalization - type_info: - label: Control method - summary: Quality control methods for verifying the pipeline. - description: | - These components have the same interface as the regular methods - but also receive the solution object as input. It serves as a - starting point to test the relative accuracy of new methods in - the task, and also as a quality control for the metrics defined - in the task. - arguments: - - name: "--input_train_mod1" - __merge__: file_train_mod1.yaml - direction: input - required: true - - name: "--input_train_mod2" - __merge__: file_train_mod2.yaml - direction: input - required: true - - name: "--input_test_mod1" - __merge__: file_test_mod1.yaml - direction: input - required: true - - name: "--input_test_mod2" - __merge__: file_test_mod2.yaml - direction: input - required: true - - name: "--output" - __merge__: file_prediction.yaml - direction: output - required: true - test_resources: - - type: python_script - path: /src/common/comp_tests/check_method_config.py - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py - - path: /resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap - dest: resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap \ No newline at end of file diff --git a/src/tasks/predict_modality/api/comp_method.yaml b/src/tasks/predict_modality/api/comp_method.yaml deleted file mode 100644 index 49ccc1e27b..0000000000 --- a/src/tasks/predict_modality/api/comp_method.yaml +++ /dev/null @@ -1,34 +0,0 @@ -functionality: - namespace: "predict_modality/methods" - info: - type: method - type_info: - label: Method - summary: A regression method. - description: | - A regression method to predict the expression of one modality from another. 
- arguments: - - name: "--input_train_mod1" - __merge__: file_train_mod1.yaml - direction: input - required: true - - name: "--input_train_mod2" - __merge__: file_train_mod2.yaml - direction: input - required: true - - name: "--input_test_mod1" - __merge__: file_test_mod1.yaml - direction: input - required: true - - name: "--output" - __merge__: file_prediction.yaml - direction: output - required: true - test_resources: - - type: python_script - path: /src/common/comp_tests/check_method_config.py - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py - - path: /resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap - dest: resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap - - path: /src/common/library.bib \ No newline at end of file diff --git a/src/tasks/predict_modality/api/comp_method_predict.yaml b/src/tasks/predict_modality/api/comp_method_predict.yaml deleted file mode 100644 index a43cd1e5c5..0000000000 --- a/src/tasks/predict_modality/api/comp_method_predict.yaml +++ /dev/null @@ -1,30 +0,0 @@ -functionality: - namespace: "predict_modality/methods" - info: - type: method_predict - type_info: - label: Predict - summary: Make predictions using a trained model. - description: | - This method makes predictions using a trained model. - arguments: - - name: "--input_train_mod1" - __merge__: file_train_mod1.yaml - direction: input - required: false - - name: "--input_train_mod2" - __merge__: file_train_mod2.yaml - direction: input - required: false - - name: "--input_test_mod1" - __merge__: file_test_mod1.yaml - direction: input - required: true - - name: "--input_model" - __merge__: file_pretrained_model.yaml - direction: input - required: true - - name: "--output" - __merge__: file_prediction.yaml - direction: output - required: true \ No newline at end of file diff --git a/src/tasks/predict_modality/api/comp_method_train.yaml b/src/tasks/predict_modality/api/comp_method_train.yaml deleted file mode 100644 index 3f07c1efcf..0000000000 --- a/src/tasks/predict_modality/api/comp_method_train.yaml +++ /dev/null @@ -1,26 +0,0 @@ -functionality: - namespace: "predict_modality/methods" - info: - type: method_train - type_info: - label: Train - summary: Train a model to predict the expression of one modality from another. - description: | - This method trains a model to predict the expression of one modality from another. - arguments: - - name: "--input_train_mod1" - __merge__: file_train_mod1.yaml - direction: input - required: true - - name: "--input_train_mod2" - __merge__: file_train_mod2.yaml - direction: input - required: true - - name: "--input_test_mod1" - __merge__: file_test_mod1.yaml - direction: input - required: false - - name: "--output" - __merge__: file_pretrained_model.yaml - direction: output - required: true \ No newline at end of file diff --git a/src/tasks/predict_modality/api/comp_metric.yaml b/src/tasks/predict_modality/api/comp_metric.yaml deleted file mode 100644 index c85f900e46..0000000000 --- a/src/tasks/predict_modality/api/comp_metric.yaml +++ /dev/null @@ -1,30 +0,0 @@ -functionality: - namespace: "predict_modality/metrics" - info: - type: metric - type_info: - label: Metric - summary: A predict modality metric. - description: | - A metric for evaluating predicted expression. 
- arguments: - - name: --input_prediction - __merge__: file_prediction.yaml - direction: input - required: true - - name: --input_test_mod2 - __merge__: file_test_mod2.yaml - direction: input - required: true - - name: --output - __merge__: file_score.yaml - direction: output - required: true - test_resources: - - type: python_script - path: /src/common/comp_tests/check_metric_config.py - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py - - path: /resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap - dest: resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap - - path: /src/common/library.bib \ No newline at end of file diff --git a/src/tasks/predict_modality/api/comp_process_dataset.yaml b/src/tasks/predict_modality/api/comp_process_dataset.yaml deleted file mode 100644 index c2c5feb2eb..0000000000 --- a/src/tasks/predict_modality/api/comp_process_dataset.yaml +++ /dev/null @@ -1,43 +0,0 @@ -functionality: - namespace: "predict_modality" - info: - type: process_dataset - type_info: - label: Data processor - summary: A predict modality dataset processor. - description: | - A component for processing a Common Dataset into a task-specific dataset. - arguments: - - name: "--input_mod1" - __merge__: file_common_dataset_mod1.yaml - direction: input - required: true - - name: "--input_mod2" - __merge__: file_common_dataset_mod2.yaml - direction: input - required: true - - name: "--output_train_mod1" - __merge__: file_train_mod1.yaml - direction: output - required: true - - name: "--output_train_mod2" - __merge__: file_train_mod2.yaml - direction: output - required: true - - name: "--output_test_mod1" - __merge__: file_test_mod1.yaml - direction: "output" - required: true - - name: "--output_test_mod2" - __merge__: file_test_mod2.yaml - direction: output - required: true - - name: "--seed" - type: integer - default: 1 - description: "The seed for determining the train/test split." - test_resources: - - type: python_script - path: /src/common/comp_tests/run_and_check_adata.py - - path: /resources_test/common/openproblems_neurips2021/bmmc_cite - dest: resources_test/common/openproblems_neurips2021/bmmc_cite \ No newline at end of file diff --git a/src/tasks/predict_modality/api/file_common_dataset_mod1.yaml b/src/tasks/predict_modality/api/file_common_dataset_mod1.yaml deleted file mode 100644 index 4824a05c46..0000000000 --- a/src/tasks/predict_modality/api/file_common_dataset_mod1.yaml +++ /dev/null @@ -1,98 +0,0 @@ -type: file -example: "resources_test/common/openproblems_neurips2021/bmmc_cite/dataset_mod1.h5ad" -info: - label: "Raw dataset RNA" - summary: "The RNA modality of the raw dataset." - slots: - layers: - - type: integer - name: counts - description: Raw counts - required: true - - type: double - name: normalized - description: Normalized expression values - required: true - obs: - - type: string - name: batch - description: Batch information - required: true - - type: double - name: size_factors - description: The size factors of the cells prior to normalization. - required: false - var: - - type: string - name: feature_id - description: Unique identifier for the feature, usually a ENSEMBL gene id. - # TODO: make this required once openproblems_v1 dataloader supports it - required: true - - - type: string - name: feature_name - description: A human-readable name for the feature, usually a gene symbol. 
- # TODO: make this required once the dataloader supports it - required: false - - - type: boolean - name: hvg - description: Whether or not the feature is considered to be a 'highly variable gene' - required: true - - - type: double - name: hvg_score - description: A score for the feature indicating how highly variable it is. - required: true - - - type: boolean - name: hvg - description: Whether or not the feature is considered to be a 'highly variable gene' - required: true - - - type: double - name: hvg_score - description: A ranking of the features by hvg. - required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - name: dataset_name - type: string - description: Nicely formatted name. - required: true - - type: string - name: dataset_url - description: Link to the original source of the dataset. - required: false - - name: dataset_reference - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: dataset_summary - type: string - description: Short description of the dataset. - required: true - - name: dataset_description - type: string - description: Long description of the dataset. - required: true - - name: dataset_organism - type: string - description: The organism of the sample in the dataset. - required: false - - name: normalization_id - type: string - description: The unique identifier of the normalization method used. - required: true - - type: string - name: gene_activity_var_names - description: "Names of the gene activity matrix" - required: false - obsm: - - type: double - name: gene_activity - description: ATAC gene activity - required: false \ No newline at end of file diff --git a/src/tasks/predict_modality/api/file_common_dataset_mod2.yaml b/src/tasks/predict_modality/api/file_common_dataset_mod2.yaml deleted file mode 100644 index e0b1b3bae9..0000000000 --- a/src/tasks/predict_modality/api/file_common_dataset_mod2.yaml +++ /dev/null @@ -1,98 +0,0 @@ -type: file -example: "resources_test/common/openproblems_neurips2021/bmmc_cite/dataset_mod2.h5ad" -info: - label: "Raw dataset mod2" - summary: "The second modality of the raw dataset. Must be an ADT or an ATAC dataset" - slots: - layers: - - type: integer - name: counts - description: Raw counts - required: true - - type: double - name: normalized - description: Normalized expression values - required: true - obs: - - type: string - name: batch - description: Batch information - required: true - - type: double - name: size_factors - description: The size factors of the cells prior to normalization. - required: false - var: - - type: string - name: feature_id - description: Unique identifier for the feature, usually a ENSEMBL gene id. - # TODO: make this required once openproblems_v1 dataloader supports it - required: true - - - type: string - name: feature_name - description: A human-readable name for the feature, usually a gene symbol. - # TODO: make this required once the dataloader supports it - required: false - - - type: boolean - name: hvg - description: Whether or not the feature is considered to be a 'highly variable gene' - required: true - - - type: double - name: hvg_score - description: A score for the feature indicating how highly variable it is. 
- required: true - - - type: boolean - name: hvg - description: Whether or not the feature is considered to be a 'highly variable gene' - required: true - - - type: double - name: hvg_score - description: A ranking of the features by hvg. - required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - name: dataset_name - type: string - description: Nicely formatted name. - required: true - - type: string - name: dataset_url - description: Link to the original source of the dataset. - required: false - - name: dataset_reference - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: dataset_summary - type: string - description: Short description of the dataset. - required: true - - name: dataset_description - type: string - description: Long description of the dataset. - required: true - - name: dataset_organism - type: string - description: The organism of the sample in the dataset. - required: false - - name: normalization_id - type: string - description: The unique identifier of the normalization method used. - required: true - - type: string - name: gene_activity_var_names - description: "Names of the gene activity matrix" - required: false - obsm: - - type: double - name: gene_activity - description: ATAC gene activity - required: false \ No newline at end of file diff --git a/src/tasks/predict_modality/api/file_prediction.yaml b/src/tasks/predict_modality/api/file_prediction.yaml deleted file mode 100644 index 0464b323d1..0000000000 --- a/src/tasks/predict_modality/api/file_prediction.yaml +++ /dev/null @@ -1,20 +0,0 @@ -type: file -example: "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap/prediction.h5ad" -info: - label: "Prediction" - summary: "A prediction of the mod2 expression values of the test cells" - slots: - layers: - - type: double - name: normalized - description: Predicted normalized expression values - required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - type: string - name: method_id - description: "A unique identifier for the method" - required: true \ No newline at end of file diff --git a/src/tasks/predict_modality/api/file_pretrained_model.yaml b/src/tasks/predict_modality/api/file_pretrained_model.yaml deleted file mode 100644 index f8c4a717ac..0000000000 --- a/src/tasks/predict_modality/api/file_pretrained_model.yaml +++ /dev/null @@ -1,4 +0,0 @@ -type: file -info: - label: "Pretrained model" - summary: "A pretrained model for predicting the expression of one modality from another." 
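The `comp_method_train.yaml` / `comp_method_predict.yaml` pair above splits a method into a training step that emits this pretrained-model file and a prediction step that consumes it; the spec leaves the file's on-disk format unspecified. Below is a hedged sketch of how a method might bridge the two steps: pickling a fitted scikit-learn model is just one possible choice, the model and `method_id` are illustrative, and sparse normalized layers are assumed.

``` python
import pickle
import anndata as ad
from sklearn.linear_model import Ridge

base = "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap"

# --- train component: --input_train_mod1 / --input_train_mod2 -> --output (pretrained model) ---
train_mod1 = ad.read_h5ad(f"{base}/train_mod1.h5ad")
train_mod2 = ad.read_h5ad(f"{base}/train_mod2.h5ad")
model = Ridge(alpha=1.0).fit(
    train_mod1.layers["normalized"].toarray(),
    train_mod2.layers["normalized"].toarray(),
)
with open("pretrained_model.pkl", "wb") as f:   # file format is up to the method
    pickle.dump({
        "model": model,
        "mod2_var": train_mod2.var[[]],
        "dataset_id": train_mod1.uns["dataset_id"],
    }, f)

# --- predict component: --input_test_mod1 + --input_model -> --output (prediction) ---
test_mod1 = ad.read_h5ad(f"{base}/test_mod1.h5ad")
with open("pretrained_model.pkl", "rb") as f:
    bundle = pickle.load(f)
out = ad.AnnData(
    layers={"normalized": bundle["model"].predict(test_mod1.layers["normalized"].toarray())},
    obs=test_mod1.obs[[]],
    var=bundle["mod2_var"],
    uns={"dataset_id": bundle["dataset_id"], "method_id": "train_predict_sketch"},  # hypothetical id
)
out.write_h5ad("prediction.h5ad", compression="gzip")
```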
diff --git a/src/tasks/predict_modality/api/file_score.yaml b/src/tasks/predict_modality/api/file_score.yaml deleted file mode 100644 index 928e18eebf..0000000000 --- a/src/tasks/predict_modality/api/file_score.yaml +++ /dev/null @@ -1,25 +0,0 @@ -type: file -example: "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap/score.h5ad" -info: - label: "Score" - summary: "Metric score file" - slots: - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - type: string - name: method_id - description: "A unique identifier for the method" - required: true - - type: string - name: metric_ids - description: "One or more unique metric identifiers" - multiple: true - required: true - - type: double - name: metric_values - description: "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'." - multiple: true - required: true diff --git a/src/tasks/predict_modality/api/file_test_mod1.yaml b/src/tasks/predict_modality/api/file_test_mod1.yaml deleted file mode 100644 index fa67672104..0000000000 --- a/src/tasks/predict_modality/api/file_test_mod1.yaml +++ /dev/null @@ -1,85 +0,0 @@ -type: file -example: "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap/test_mod1.h5ad" -info: - label: "Test mod1" - summary: "The mod1 expression values of the test cells." - slots: - layers: - - type: integer - name: counts - description: Raw counts - required: true - - type: double - name: normalized - description: Normalized expression values - required: true - obs: - - type: string - name: batch - description: Batch information - required: true - - type: double - name: size_factors - description: The size factors of the cells prior to normalization. - required: false - var: - - type: string - name: gene_ids - description: The gene identifiers (if available) - required: false - - - type: boolean - name: hvg - description: Whether or not the feature is considered to be a 'highly variable gene' - required: true - - - type: double - name: hvg_score - description: A score for the feature indicating how highly variable it is. - required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - type: string - name: common_dataset_id - description: "A common identifier for the dataset" - required: false - - name: dataset_name - type: string - description: Nicely formatted name. - required: true - - type: string - name: dataset_url - description: Link to the original source of the dataset. - required: false - - name: dataset_reference - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: dataset_summary - type: string - description: Short description of the dataset. - required: true - - name: dataset_description - type: string - description: Long description of the dataset. - required: true - - name: dataset_organism - type: string - description: The organism of the sample in the dataset. - required: false - - name: normalization_id - type: string - description: The unique identifier of the normalization method used. 
- required: true - - type: string - name: gene_activity_var_names - description: "Names of the gene activity matrix" - required: false - obsm: - - type: double - name: gene_activity - description: ATAC gene activity - required: false diff --git a/src/tasks/predict_modality/api/file_test_mod2.yaml b/src/tasks/predict_modality/api/file_test_mod2.yaml deleted file mode 100644 index 417edf6162..0000000000 --- a/src/tasks/predict_modality/api/file_test_mod2.yaml +++ /dev/null @@ -1,81 +0,0 @@ -type: file -example: "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap/test_mod2.h5ad" -info: - label: "Test mod2" - summary: "The mod2 expression values of the test cells." - slots: - layers: - - type: integer - name: counts - description: Raw counts - required: true - - type: double - name: normalized - description: Normalized expression values - required: true - obs: - - type: string - name: batch - description: Batch information - required: true - - type: double - name: size_factors - description: The size factors of the cells prior to normalization. - required: false - var: - - type: string - name: gene_ids - description: The gene identifiers (if available) - required: false - - - type: boolean - name: hvg - description: Whether or not the feature is considered to be a 'highly variable gene' - required: true - - - type: double - name: hvg_score - description: A score for the feature indicating how highly variable it is. - required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - type: string - name: common_dataset_id - description: "A common identifier for the dataset" - required: false - - name: dataset_name - type: string - description: Nicely formatted name. - required: true - - type: string - name: dataset_url - description: Link to the original source of the dataset. - required: false - - name: dataset_reference - type: string - description: Bibtex reference of the paper in which the dataset was published. - required: false - - name: dataset_summary - type: string - description: Short description of the dataset. - required: true - - name: dataset_description - type: string - description: Long description of the dataset. - required: true - - name: dataset_organism - type: string - description: The organism of the sample in the dataset. - required: false - - type: string - name: gene_activity_var_names - description: "Names of the gene activity matrix" - required: false - obsm: - - type: double - name: gene_activity - description: ATAC gene activity - required: false \ No newline at end of file diff --git a/src/tasks/predict_modality/api/file_train_mod1.yaml b/src/tasks/predict_modality/api/file_train_mod1.yaml deleted file mode 100644 index a4919ee7bd..0000000000 --- a/src/tasks/predict_modality/api/file_train_mod1.yaml +++ /dev/null @@ -1,65 +0,0 @@ -type: file -example: "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap/train_mod1.h5ad" -info: - label: "Train mod1" - summary: "The mod1 expression values of the train cells." - slots: - layers: - - type: integer - name: counts - description: Raw counts - required: true - - type: double - name: normalized - description: Normalized expression values - required: true - obs: - - type: string - name: batch - description: Batch information - required: true - - type: double - name: size_factors - description: The size factors of the cells prior to normalization. 
- required: false - var: - - type: string - name: gene_ids - description: The gene identifiers (if available) - required: false - - - type: boolean - name: hvg - description: Whether or not the feature is considered to be a 'highly variable gene' - required: true - - - type: double - name: hvg_score - description: A score for the feature indicating how highly variable it is. - required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - type: string - name: common_dataset_id - description: "A common identifier for the dataset" - required: false - - name: dataset_organism - type: string - description: The organism of the sample in the dataset. - required: false - - name: normalization_id - type: string - description: The unique identifier of the normalization method used. - required: true - - type: string - name: gene_activity_var_names - description: "Names of the gene activity matrix" - required: false - obsm: - - type: double - name: gene_activity - description: ATAC gene activity - required: false \ No newline at end of file diff --git a/src/tasks/predict_modality/api/file_train_mod2.yaml b/src/tasks/predict_modality/api/file_train_mod2.yaml deleted file mode 100644 index dcbfae45de..0000000000 --- a/src/tasks/predict_modality/api/file_train_mod2.yaml +++ /dev/null @@ -1,65 +0,0 @@ -type: file -example: "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap/train_mod2.h5ad" -info: - label: "Train mod2" - summary: "The mod2 expression values of the train cells." - slots: - layers: - - type: integer - name: counts - description: Raw counts - required: true - - type: double - name: normalized - description: Normalized expression values - required: true - obs: - - type: string - name: batch - description: Batch information - required: true - - type: double - name: size_factors - description: The size factors of the cells prior to normalization. - required: false - var: - - type: string - name: gene_ids - description: The gene identifiers (if available) - required: false - - - type: boolean - name: hvg - description: Whether or not the feature is considered to be a 'highly variable gene' - required: true - - - type: double - name: hvg_score - description: A score for the feature indicating how highly variable it is. - required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - type: string - name: common_dataset_id - description: "A common identifier for the dataset" - required: false - - name: dataset_organism - type: string - description: The organism of the sample in the dataset. - required: false - - name: normalization_id - type: string - description: The unique identifier of the normalization method used. - required: true - - type: string - name: gene_activity_var_names - description: "Names of the gene activity matrix" - required: false - obsm: - - type: double - name: gene_activity - description: ATAC gene activity - required: false \ No newline at end of file diff --git a/src/tasks/predict_modality/api/task_info.yaml b/src/tasks/predict_modality/api/task_info.yaml deleted file mode 100644 index e0d1ed9da7..0000000000 --- a/src/tasks/predict_modality/api/task_info.yaml +++ /dev/null @@ -1,67 +0,0 @@ -name: predict_modality -label: Predict Modality -summary: "Predicting the profiles of one modality (e.g. protein abundance) from another (e.g. mRNA expression)." 
-image: "thumbnail.svg" -motivation: | - Experimental techniques to measure multiple modalities within the same single cell are increasingly becoming available. - The demand for these measurements is driven by the promise to provide a deeper insight into the state of a cell. - Yet, the modalities are also intrinsically linked. We know that DNA must be accessible (ATAC data) to produce mRNA - (expression data), and mRNA in turn is used as a template to produce protein (protein abundance). These processes - are regulated often by the same molecules that they produce: for example, a protein may bind DNA to prevent the production - of more mRNA. Understanding these regulatory processes would be transformative for synthetic biology and drug target discovery. - Any method that can predict a modality from another must have accounted for these regulatory processes, but the demand for - multi-modal data shows that this is not trivial. -description: | - In this task, the goal is to take one modality and predict the other modality for all - features in each cell. This task requires translating information between multiple layers of - gene regulation. In some ways, this is similar to the task of machine translation. In machine translation, the same - sentiment is expressed in multiple languages and the goal is to train a model to represent the same meaning in a different - language. In this context, the same cellular state is measured in two different feature sets and the goal of this task - is to translate the information about cellular state from one modality to the other. -authors: - - name: Robrecht Cannoodt - roles: [ author, maintainer ] - info: - github: rcannood - orcid: "0000-0003-3641-729X" - - name: Kai Waldrant - roles: [ contributor ] - info: - github: KaiWaldrant - orcid: "0009-0003-8555-1361" - - name: Louise Deconinck - roles: [ author ] - info: - github: LouiseDck - - name: Alex Tong - roles: [ author ] - info: - github: atong01 - - name: Bastian Rieck - roles: [ author ] - info: - github: Pseudomanifold - - name: Daniel Burkhardt - roles: [ author ] - info: - github: dburkhardt - - name: Alejandro Granados - roles: [ author ] - info: - github: agranado - - name: Kaiwen Deng - roles: [ contributor ] - info: - email: dengkw@umich.edu - github: nonztalk - - name: Xueer Chen - roles: [ contributor ] - info: - github: xuerchen - email: xc2579@columbia.edu - - name: Jiwei Liu - roles: [ contributor ] - info: - github: daxiongshu - email: jiweil@nvidia.com - orcid: "0000-0002-8799-9763" diff --git a/src/tasks/predict_modality/api/thumbnail.svg b/src/tasks/predict_modality/api/thumbnail.svg deleted file mode 100644 index 59436e6187..0000000000 --- a/src/tasks/predict_modality/api/thumbnail.svg +++ /dev/null @@ -1,666 +0,0 @@ - - - - - - - - Gene - Expression - A - B - C - - - - - - True - Predicted - - - - - - - - - - - Chromatin Accessibility - Gene Expression - - Cell 1 - Cell 2 - - - - - - - - - - - - - - - Cell 3 - - - - - - - - A - B - C - Gene - - - - - - - - Task - Metric - Root mean square error - - - - - - - - - - - - - - A - B - C - Gene - - - - - - - - - - - - - - Ground-truth - Predicted - - Value Type - - - - - - Gene A - Genes - Gene B - Gene C - - diff --git a/src/tasks/predict_modality/control_methods/meanpergene/config.vsh.yaml b/src/tasks/predict_modality/control_methods/meanpergene/config.vsh.yaml deleted file mode 100644 index 9521b90508..0000000000 --- a/src/tasks/predict_modality/control_methods/meanpergene/config.vsh.yaml +++ /dev/null @@ -1,17 +0,0 @@ -__merge__: 
../../api/comp_control_method.yaml -functionality: - name: mean_per_gene - info: - label: Mean per gene - summary: Returns the mean expression value per gene. - description: Returns the mean expression value per gene. - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [midtime, lowmem, lowcpu] - \ No newline at end of file diff --git a/src/tasks/predict_modality/control_methods/meanpergene/script.py b/src/tasks/predict_modality/control_methods/meanpergene/script.py deleted file mode 100644 index 043f19d42a..0000000000 --- a/src/tasks/predict_modality/control_methods/meanpergene/script.py +++ /dev/null @@ -1,37 +0,0 @@ -import anndata as ad -from scipy.sparse import csc_matrix -import numpy as np - -# VIASH START -par = { - "input_train_mod1": "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/train_mod1.h5ad", - "input_test_mod1": "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/test_mod1.h5ad", - "input_train_mod2": "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/train_mod2.h5ad", - "output": "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/prediction.h5ad", -} - -meta = { - "functionality_name": "foo" -} -# VIASH END - -input_test_mod1 = ad.read_h5ad(par["input_test_mod1"]) -input_train_mod2 = ad.read_h5ad(par["input_train_mod2"]) - - -# Find the correct shape -mean = np.array(input_train_mod2.layers["normalized"].mean(axis=0)).flatten() -prediction = csc_matrix(np.tile(mean, (input_test_mod1.shape[0], 1))) - -# Write out prediction -out = ad.AnnData( - layers={"normalized": prediction}, - shape=prediction.shape, - obs=input_test_mod1.obs, - var=input_train_mod2.var, - uns={ - "dataset_id": input_test_mod1.uns["dataset_id"], - "method_id": meta["functionality_name"], - } -) -out.write_h5ad(par["output"], compression="gzip") diff --git a/src/tasks/predict_modality/control_methods/random_predict/config.vsh.yaml b/src/tasks/predict_modality/control_methods/random_predict/config.vsh.yaml deleted file mode 100644 index 3324c53a91..0000000000 --- a/src/tasks/predict_modality/control_methods/random_predict/config.vsh.yaml +++ /dev/null @@ -1,16 +0,0 @@ -__merge__: ../../api/comp_control_method.yaml -functionality: - name: random_predict - info: - label: Random predictions - summary: Returns random training profiles. - description: Returns random training profiles. 
- resources: - - type: r_script - path: script.R -platforms: - - type: docker - image: openproblems/base_r:1.0.0 - - type: nextflow - directives: - label: [midtime, lowmem, lowcpu] diff --git a/src/tasks/predict_modality/control_methods/random_predict/script.R b/src/tasks/predict_modality/control_methods/random_predict/script.R deleted file mode 100644 index ab96dcc26a..0000000000 --- a/src/tasks/predict_modality/control_methods/random_predict/script.R +++ /dev/null @@ -1,34 +0,0 @@ -cat("Loading dependencies\n") -requireNamespace("anndata", quietly = TRUE) -library(Matrix, warn.conflicts = FALSE, quietly = TRUE) - -## VIASH START -par <- list( - input_train_mod1 = "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/train_mod1.h5ad", - input_test_mod1 = "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/test_mod1.h5ad", - input_train_mod2 = "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/train_mod2.h5ad", - output = "output.h5ad" -) -meta <- list(functionality_name = "foo") -## VIASH END - -cat("Reading h5ad files\n") -input_train_mod2 <- anndata::read_h5ad(par$input_train_mod2) -input_test_mod1 <- anndata::read_h5ad(par$input_test_mod1) - -cat("Creating outputs object\n") -sample_ix <- sample.int(nrow(input_train_mod2), nrow(input_test_mod1), replace = TRUE) -prediction <- input_train_mod2$layers[["normalized"]][sample_ix, , drop = FALSE] -rownames(prediction) <- rownames(input_test_mod1) - -out <- anndata::AnnData( - layers = list(normalized = prediction), - shape = dim(prediction), - uns = list( - dataset_id = input_train_mod2$uns[["dataset_id"]], - method_id = meta[["functionality_name"]] - ) -) - -cat("Writing predictions to file\n") -zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/tasks/predict_modality/control_methods/solution/config.vsh.yaml b/src/tasks/predict_modality/control_methods/solution/config.vsh.yaml deleted file mode 100644 index 350b0e79ea..0000000000 --- a/src/tasks/predict_modality/control_methods/solution/config.vsh.yaml +++ /dev/null @@ -1,16 +0,0 @@ -__merge__: ../../api/comp_control_method.yaml -functionality: - name: solution - info: - label: Solution - summary: Returns the ground-truth solution. - description: Returns the ground-truth solution. 
- resources: - - type: r_script - path: script.R -platforms: - - type: docker - image: openproblems/base_r:1.0.0 - - type: nextflow - directives: - label: [midtime, lowmem, lowcpu] diff --git a/src/tasks/predict_modality/control_methods/solution/script.R b/src/tasks/predict_modality/control_methods/solution/script.R deleted file mode 100644 index ae7c288e29..0000000000 --- a/src/tasks/predict_modality/control_methods/solution/script.R +++ /dev/null @@ -1,20 +0,0 @@ -cat("Loading dependencies\n") -requireNamespace("anndata", quietly = TRUE) - -## VIASH START -par <- list( - input_test_mod2 = "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/test_mod2.h5ad", - output = "output.h5ad" -) - -meta <- list( - functionality_name = "foo" -) -## VIASH END - -cat("Reading h5ad files\n") -ad2_test <- anndata::read_h5ad(par$input_test_mod2) -ad2_test$uns[["method_id"]] <- meta$functionality_name - -cat("Writing predictions to file\n") -zzz <- ad2_test$write_h5ad(par$output, compression = "gzip") diff --git a/src/tasks/predict_modality/control_methods/zeros/config.vsh.yaml b/src/tasks/predict_modality/control_methods/zeros/config.vsh.yaml deleted file mode 100644 index 344df9c338..0000000000 --- a/src/tasks/predict_modality/control_methods/zeros/config.vsh.yaml +++ /dev/null @@ -1,16 +0,0 @@ -__merge__: ../../api/comp_control_method.yaml -functionality: - name: zeros - info: - label: Zeros - summary: Returns a prediction consisting of all zeros. - description: Returns a prediction consisting of all zeros. - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [midtime, lowmem, lowcpu] diff --git a/src/tasks/predict_modality/control_methods/zeros/script.py b/src/tasks/predict_modality/control_methods/zeros/script.py deleted file mode 100644 index 600b5c696c..0000000000 --- a/src/tasks/predict_modality/control_methods/zeros/script.py +++ /dev/null @@ -1,37 +0,0 @@ -import anndata -from scipy.sparse import csc_matrix -import numpy as np - -# VIASH START -par = { - "input_train_mod1": "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/train_mod1.h5ad", - "input_test_mod1": "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/test_mod1.h5ad", - "input_train_mod2": "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/train_mod2.h5ad", - "output": "output.h5ad", -} - -meta = { - "functionality_name": "foo" -} -# VIASH END - -print("Reading h5ad files", flush=True) -ad_mod1_test = anndata.read_h5ad(par["input_test_mod1"]) -ad_mod2 = anndata.read_h5ad(par["input_train_mod2"]) - -print("create output objects", flush=True) -prediction = csc_matrix((ad_mod1_test.n_obs, ad_mod2.n_vars), dtype = np.float32) - -out = anndata.AnnData( - layers={"normalized": prediction}, - shape=prediction.shape, - obs=ad_mod1_test.obs, - var=ad_mod2.var, - uns={ - "dataset_id": ad_mod2.uns["dataset_id"], - "method_id": meta["functionality_name"], - } -) - -print("write predictions to file", flush=True) -out.write_h5ad(par["output"], compression="gzip") diff --git a/src/tasks/predict_modality/methods/guanlab_dengkw_pm/config.vsh.yaml b/src/tasks/predict_modality/methods/guanlab_dengkw_pm/config.vsh.yaml deleted file mode 100644 index 8663123ad9..0000000000 --- a/src/tasks/predict_modality/methods/guanlab_dengkw_pm/config.vsh.yaml +++ /dev/null @@ -1,43 +0,0 @@ -__merge__: ../../api/comp_method.yaml -functionality: - name: guanlab_dengkw_pm - info: - label: 
Guanlab-dengkw - summary: A kernel ridge regression method with RBF kernel. - description: | - This is a solution developed by Team Guanlab - dengkw in the Neurips 2021 competition to predict one modality - from another using kernel ridge regression (KRR) with RBF kernel. Truncated SVD is applied on the combined - training and test data from modality 1 followed by row-wise z-score normalization on the reduced matrix. The - truncated SVD of modality 2 is predicted by training a KRR model on the normalized training matrix of modality 1. - Predictions on the normalized test matrix are then re-mapped to the modality 2 feature space via the right - singular vectors. - preferred_normalization: log_cp10k - reference: lance2022multimodal - documentation_url: https://github.com/openproblems-bio/neurips2021_multimodal_topmethods/tree/main/src/predict_modality/methods/Guanlab-dengkw - repository_url: https://github.com/openproblems-bio/neurips2021_multimodal_topmethods/tree/main/src/predict_modality/methods/Guanlab-dengkw - competition_submission_id: 170636 - arguments: - - name: "--distance_method" - type: "string" - default: "minkowski" - description: The distance metric to use. Possible values include `euclidean` and `minkowski`. - choices: [euclidean, minkowski] - - name: "--n_pcs" - type: "integer" - default: 50 - description: Number of components to use for dimensionality reduction. - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - setup: - - type: python - packages: - - scikit-learn - - pandas - - numpy - - type: nextflow - directives: - label: [hightime, highmem, highcpu] diff --git a/src/tasks/predict_modality/methods/guanlab_dengkw_pm/script.py b/src/tasks/predict_modality/methods/guanlab_dengkw_pm/script.py deleted file mode 100644 index aafd2948c8..0000000000 --- a/src/tasks/predict_modality/methods/guanlab_dengkw_pm/script.py +++ /dev/null @@ -1,136 +0,0 @@ -import anndata as ad -import numpy as np -from scipy.sparse import csc_matrix -from sklearn.decomposition import TruncatedSVD -from sklearn.gaussian_process.kernels import RBF -from sklearn.kernel_ridge import KernelRidge - -## VIASH START -par = { - 'input_train_mod1': 'resources_test/predict_modality/openproblems_neurips2021/bmmc_multiome/normal/train_mod1.h5ad', - 'input_train_mod2': 'resources_test/predict_modality/openproblems_neurips2021/bmmc_multiome/normal/train_mod2.h5ad', - 'input_test_mod1': 'resources_test/predict_modality/openproblems_neurips2021/bmmc_multiome/normal/test_mod1.h5ad', - 'output': 'output.h5ad', - 'distance_method': 'minkowski', - 'n_pcs': 50 -} -meta = { - 'functionality_name': 'guanlab_dengkw_pm' -} -## VIASH END - - -## Removed PCA and normalization steps, as they arr already performed with the input data -print('Reading input files', flush=True) -input_train_mod1 = ad.read_h5ad(par['input_train_mod1']) -input_train_mod2 = ad.read_h5ad(par['input_train_mod2']) -input_test_mod1 = ad.read_h5ad(par['input_test_mod1']) - -batches = input_train_mod1.obs.batch.unique().tolist() -batch_len = len(batches) - -# combine the train and test data -input_train = ad.concat( - {"train": input_train_mod1, "test": input_test_mod1}, - axis=0, - join="outer", - label="group", - fill_value=0, - index_unique="-" -) - -print('Determine parameters by the modalities', flush=True) -mod1_type = input_train_mod1.uns["modality"].upper() -mod2_type = input_train_mod2.uns["modality"].upper() -n_comp_dict = { - ("GEX", "ADT"): (300, 70, 10, 0.2), - ("ADT", 
"GEX"): (None, 50, 10, 0.2), - ("GEX", "ATAC"): (1000, 50, 10, 0.1), - ("ATAC", "GEX"): (100, 70, 10, 0.1) -} -print(f"{mod1_type}, {mod2_type}", flush=True) -n_mod1, n_mod2, scale, alpha = n_comp_dict[(mod1_type, mod2_type)] -print(f"{n_mod1}, {n_mod2}, {scale}, {alpha}", flush=True) - -# Perform PCA on the input data -print('Models using the Truncated SVD to reduce the dimension', flush=True) - -if n_mod1 is not None and n_mod1 < input_train.n_vars: - embedder_mod1 = TruncatedSVD(n_components=n_mod1) - mod1_pca = embedder_mod1.fit_transform(input_train.layers["normalized"]).astype(np.float32) - train_matrix = mod1_pca[input_train.obs['group'] == 'train'] - test_matrix = mod1_pca[input_train.obs['group'] == 'test'] -else: - train_matrix = input_train_mod1.to_df(layer="normalized").values.astype(np.float32) - test_matrix = input_test_mod1.to_df(layer="normalized").values.astype(np.float32) - -if n_mod2 is not None and n_mod2 < input_train_mod2.n_vars: - embedder_mod2 = TruncatedSVD(n_components=n_mod2) - train_gs = embedder_mod2.fit_transform(input_train_mod2.layers["normalized"]).astype(np.float32) -else: - train_gs = input_train_mod2.to_df(layer="normalized").values.astype(np.float32) - -del input_train - -print('Running normalization ...', flush=True) -train_sd = np.std(train_matrix, axis=1).reshape(-1, 1) -train_sd[train_sd == 0] = 1 -train_norm = (train_matrix - np.mean(train_matrix, axis=1).reshape(-1, 1)) / train_sd -train_norm = train_norm.astype(np.float32) -del train_matrix - -test_sd = np.std(test_matrix, axis=1).reshape(-1, 1) -test_sd[test_sd == 0] = 1 -test_norm = (test_matrix - np.mean(test_matrix, axis=1).reshape(-1, 1)) / test_sd -test_norm = test_norm.astype(np.float32) -del test_matrix - -print('Running KRR model ...', flush=True) -if batch_len == 1: - # just in case there is only one batch - batch_subsets = [batches] -elif mod1_type == "ADT" or mod2_type == "ADT": - # two fold consensus predictions - batch_subsets = [ - batches[:batch_len//2], - batches[batch_len//2:] - ] -else: - # leave-one-batch-out consensus predictions - batch_subsets = [ - batches[:i] + batches[i+1:] - for i in range(batch_len) - ] - -y_pred = np.zeros((input_test_mod1.n_obs, input_train_mod2.n_vars), dtype=np.float32) -for batch in batch_subsets: - print(batch, flush=True) - kernel = RBF(length_scale = scale) - krr = KernelRidge(alpha=alpha, kernel=kernel) - print('Fitting KRR ... ', flush=True) - krr.fit( - train_norm[input_train_mod1.obs.batch.isin(batch)], - train_gs[input_train_mod2.obs.batch.isin(batch)] - ) - y_pred += (krr.predict(test_norm) @ embedder_mod2.components_) - -np.clip(y_pred, a_min=0, a_max=None, out=y_pred) -y_pred /= len(batch_subsets) - -# Store as sparse matrix to be efficient. -# Note that this might require different classifiers/embedders before-hand. -# Not every class is able to support such data structures. -## Changed from csr to csc matrix as this is more supported. 
-y_pred = csc_matrix(y_pred) - -print("Write output AnnData to file", flush=True) -output = ad.AnnData( - layers = { 'normalized': y_pred }, - obs = input_test_mod1.obs[[]], - var = input_train_mod2.var[[]], - uns = { - 'dataset_id': input_train_mod1.uns['dataset_id'], - 'method_id': meta['functionality_name'] - } -) -output.write_h5ad(par['output'], compression='gzip') diff --git a/src/tasks/predict_modality/methods/knnr_py/config.vsh.yaml b/src/tasks/predict_modality/methods/knnr_py/config.vsh.yaml deleted file mode 100644 index 543ee71fa1..0000000000 --- a/src/tasks/predict_modality/methods/knnr_py/config.vsh.yaml +++ /dev/null @@ -1,33 +0,0 @@ -__merge__: ../../api/comp_method.yaml -functionality: - name: knnr_py - info: - label: KNNR (Py) - summary: K-nearest neighbor regression in Python. - description: K-nearest neighbor regression in Python. - reference: fix1989discriminatory - documentation_url: https://scikit-learn.org/stable/modules/neighbors.html - repository_url: https://github.com/scikit-learn/scikit-learn - preferred_normalization: log_cp10k - arguments: - - name: "--distance_method" - type: "string" - default: "minkowski" - description: The distance metric to use. Possible values include `euclidean` and `minkowski`. - - name: "--n_pcs" - type: "integer" - default: 50 - description: Number of components to use for dimensionality reduction. - - name: "--n_neighbors" - type: "integer" - default: 100 - description: Number of neighbors to use. - resources: - - type: python_script - path: script.py -platforms: - - type: docker - image: openproblems/base_python:1.0.0 - - type: nextflow - directives: - label: [hightime, lowmem, lowcpu] diff --git a/src/tasks/predict_modality/methods/knnr_py/script.py b/src/tasks/predict_modality/methods/knnr_py/script.py deleted file mode 100644 index f08c335ffe..0000000000 --- a/src/tasks/predict_modality/methods/knnr_py/script.py +++ /dev/null @@ -1,67 +0,0 @@ -import anndata as ad -from scipy.sparse import csc_matrix -from sklearn.decomposition import TruncatedSVD -from sklearn.neighbors import KNeighborsRegressor - -## VIASH START -par = { - 'input_train_mod1': 'resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/train_mod1.h5ad', - 'input_train_mod2': 'resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/train_mod2.h5ad', - 'input_test_mod1': 'resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/test_mod1.h5ad', - 'distance_method': 'minkowski', - 'output': 'output.h5ad', - 'n_pcs': 4, - 'n_neighbors': 5, -} -meta = { 'functionality_name': 'foo' } -## VIASH END - -print('Reading `h5ad` files...', flush=True) -input_train_mod1 = ad.read_h5ad(par['input_train_mod1']) -input_train_mod2 = ad.read_h5ad(par['input_train_mod2']) -input_test_mod1 = ad.read_h5ad(par['input_test_mod1']) - -input_train = ad.concat( - {"train": input_train_mod1, "test": input_test_mod1}, - axis=0, - join="outer", - label="group", - fill_value=0, - index_unique="-" -) - -print('Performing dimensionality reduction on modality 1 values...', flush=True) -embedder = TruncatedSVD(n_components=par['n_pcs']) -X = embedder.fit_transform(input_train.layers["normalized"]) - -# split dimred back up -X_train = X[input_train.obs['group'] == 'train'] -X_test = X[input_train.obs['group'] == 'test'] -y_train = input_train_mod2.layers["normalized"].toarray() - -assert len(X_train) + len(X_test) == len(X) - -print('Running KNN regression...', flush=True) - -reg = KNeighborsRegressor( - n_neighbors=par['n_neighbors'], - metric=par['distance_method'] 
-) - -reg.fit(X_train, y_train) -y_pred = reg.predict(X_test) - -y_pred = csc_matrix(y_pred) - -adata = ad.AnnData( - layers={"normalized": y_pred}, - obs=input_test_mod1.obs, - var=input_train_mod2.var, - uns={ - 'dataset_id': input_train_mod1.uns['dataset_id'], - 'method_id': meta["functionality_name"], - }, -) - -print('Storing annotated data...', flush=True) -adata.write_h5ad(par['output'], compression = "gzip") diff --git a/src/tasks/predict_modality/methods/knnr_r/config.vsh.yaml b/src/tasks/predict_modality/methods/knnr_r/config.vsh.yaml deleted file mode 100644 index 448b3ca0b8..0000000000 --- a/src/tasks/predict_modality/methods/knnr_r/config.vsh.yaml +++ /dev/null @@ -1,36 +0,0 @@ -__merge__: ../../api/comp_method.yaml -functionality: - name: knnr_r - info: - label: KNNR (R) - summary: K-nearest neighbor regression in R. - description: K-nearest neighbor regression in R. - reference: fix1989discriminatory - documentation_url: https://cran.r-project.org/package=FNN - repository_url: https://github.com/cran/FNN - preferred_normalization: log_cp10k - arguments: - - name: "--distance_method" - type: "string" - default: "spearman" - description: The distance method to use. Possible values are euclidean, pearson, spearman and others. - - name: "--n_pcs" - type: "integer" - default: 50 - description: Number of principal components to use. - - name: "--n_neighbors" - type: "integer" - default: 20 - description: Number of neighbors to use in the knn regression. - resources: - - type: r_script - path: script.R -platforms: - - type: docker - image: openproblems/base_r:1.0.0 - setup: - - type: r - cran: [ lmds, FNN, proxyC] - - type: nextflow - directives: - label: [hightime, lowmem, lowcpu] diff --git a/src/tasks/predict_modality/methods/knnr_r/script.R b/src/tasks/predict_modality/methods/knnr_r/script.R deleted file mode 100644 index 5679f8dd2d..0000000000 --- a/src/tasks/predict_modality/methods/knnr_r/script.R +++ /dev/null @@ -1,81 +0,0 @@ -cat("Loading dependencies\n") -requireNamespace("anndata", quietly = TRUE) -library(Matrix, warn.conflicts = FALSE, quietly = TRUE) - -## VIASH START -path <- "output/datasets/predict_modality/openproblems_bmmc_multiome_phase1_mod1/openproblems_bmmc_multiome_phase1_mod1.censor_dataset.output_" -par <- list( - input_train_mod1 = paste0(path, "train_mod1.h5ad"), - input_test_mod1 = paste0(path, "test_mod1.h5ad"), - input_train_mod2 = paste0(path, "train_mod2.h5ad"), - output = "output.h5ad", - n_pcs = 4L, - n_neighbors = 3, - distance_method = "pearson" -) -## VIASH END - -cat("Reading mod1 h5ad files\n") -input_train_mod1 <- anndata::read_h5ad(par$input_train_mod1) -dataset_id <- input_train_mod1$uns[["dataset_id"]] - -# subset to HVG to reduce memory consumption -train_mod1_sd <- proxyC::colSds(input_train_mod1$layers[["normalized"]]) -ix <- order(train_mod1_sd, decreasing = TRUE)[seq_len(min(1000, length(train_mod1_sd)))] -input_train_mod1 <- input_train_mod1[,ix]$copy() -gc() - -# subset to HVG to reduce memory consumption -input_test_mod1 <- anndata::read_h5ad(par$input_test_mod1) -input_test_mod1 <- input_test_mod1[,ix]$copy() -gc() - -cat("Performing DR on the mod1 values\n") -# LMDS is more efficient than regular MDS because -# it does not compure a square distance matrix. 
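# ---------------------------------------------------------------------------
# Editor's note (not part of the patch): the two KNNR components in this hunk
# (knnr_py above, knnr_r here) share one idea -- embed modality 1 (train and
# test together), then average the modality-2 profiles of each test cell's
# nearest training neighbours. A minimal Python sketch with toy data and
# assumed hyperparameters:
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import KNeighborsRegressor

rng = np.random.default_rng(0)
X_train, X_test = rng.poisson(1.0, (200, 400)), rng.poisson(1.0, (50, 400))
Y_train = rng.poisson(1.0, (200, 100)).astype(float)

# shared low-dimensional embedding of modality 1 (the R variant uses LMDS instead)
Z = TruncatedSVD(n_components=20, random_state=0).fit_transform(
    np.vstack([X_train, X_test]).astype(float)
)
Z_train, Z_test = Z[:200], Z[200:]

# k-nearest-neighbour regression: mean of the neighbours' modality-2 values
knn = KNeighborsRegressor(n_neighbors=15, metric="minkowski")
knn.fit(Z_train, Y_train)
Y_pred = knn.predict(Z_test)
print(Y_pred.shape)  # (50, 100)
# ---------------------------------------------------------------------------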
-dr_mod1 <- lmds::lmds( - rbind(input_train_mod1$layers[["normalized"]], input_test_mod1$layers[["normalized"]]), - ndim = par$n_pcs, - distance_method = par$distance_method -) - -ix <- seq_len(nrow(input_train_mod1)) -dr_mod1_train <- dr_mod1[ix, , drop = FALSE] -dr_mod1_test <- dr_mod1[-ix, , drop = FALSE] - -# remove previous objects to save memory -rm(input_train_mod1, input_test_mod1) -gc() - -cat("Reading mod2 h5ad files\n") -input_train_mod2 <- anndata::read_h5ad(par$input_train_mod2) - -cat("Predicting for each column in modality 2\n") -# precompute knn indices -knn_ix <- FNN::get.knnx( - dr_mod1_train, - dr_mod1_test, - k = par$n_neighbors -)$nn.index - -# perform knn regression. -pred <- input_train_mod2$layers[["normalized"]][knn_ix[, 1], , drop = FALSE] -if (par$n_neighbors > 1) { - for (k in seq(2, par$n_neighbors)) { - pred <- pred + input_train_mod2$layers[["normalized"]][knn_ix[, k], , drop = FALSE] - } -} -pred <- pred / par$n_neighbors -rownames(pred) <- rownames(dr_mod1_test) - -out <- anndata::AnnData( - layers = list(normalized = pred), - shape = dim(pred), - uns = list( - dataset_id = dataset_id, - method_id = meta$functionality_name - ) -) - -cat("Writing predictions to file\n") -zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/tasks/predict_modality/methods/lm/config.vsh.yaml b/src/tasks/predict_modality/methods/lm/config.vsh.yaml deleted file mode 100644 index 3fdbc0f243..0000000000 --- a/src/tasks/predict_modality/methods/lm/config.vsh.yaml +++ /dev/null @@ -1,32 +0,0 @@ -__merge__: ../../api/comp_method.yaml -functionality: - name: lm - info: - label: Linear Model - summary: Linear model regression. - description: A linear model regression method. - reference: wilkinson1973symbolic - repository_url: https://github.com/RcppCore/RcppArmadillo - documentation_url: https://cran.r-project.org/package=RcppArmadillo - preferred_normalization: log_cp10k - arguments: - - name: "--distance_method" - type: "string" - default: "spearman" - description: The distance method to use. Possible values are euclidean, pearson, spearman and others. - - name: "--n_pcs" - type: "integer" - default: 50 - description: Number of principal components to use. 
- resources: - - type: r_script - path: script.R -platforms: - - type: docker - image: openproblems/base_r:1.0.0 - setup: - - type: r - cran: [ lmds, RcppArmadillo, pbapply] - - type: nextflow - directives: - label: [hightime, highmem, highcpu] diff --git a/src/tasks/predict_modality/methods/lm/script.R b/src/tasks/predict_modality/methods/lm/script.R deleted file mode 100644 index 58d3febfb5..0000000000 --- a/src/tasks/predict_modality/methods/lm/script.R +++ /dev/null @@ -1,74 +0,0 @@ -cat("Loading dependencies\n") -requireNamespace("anndata", quietly = TRUE) -requireNamespace("pbapply", quietly = TRUE) -library(Matrix, warn.conflicts = FALSE, quietly = TRUE) - -## VIASH START -path <- "output/datasets/predict_modality/openproblems_bmmc_multiome_phase1_mod1/openproblems_bmmc_multiome_phase1_mod1.censor_dataset.output_" -par <- list( - input_train_mod1 = paste0(path, "train_mod1.h5ad"), - input_test_mod1 = paste0(path, "test_mod1.h5ad"), - input_train_mod2 = paste0(path, "train_mod2.h5ad"), - output = "output.h5ad", - n_pcs = 4L -) -meta <- list(functionality_name = "foo") -## VIASH END - -n_cores <- parallel::detectCores(all.tests = FALSE, logical = TRUE) - -cat("Reading mod1 files\n") -input_train_mod1 <- anndata::read_h5ad(par$input_train_mod1) -input_test_mod1 <- anndata::read_h5ad(par$input_test_mod1) - - -cat("Performing DR on the mod1 values\n") -dr <- lmds::lmds( - rbind(input_train_mod1$layers[["normalized"]], input_test_mod1$layers[["normalized"]]), - ndim = par$n_pcs, - distance_method = par$distance_method -) - -ix <- seq_len(nrow(input_train_mod1)) -dr_train <- dr[ix, , drop = FALSE] -dr_test <- dr[-ix, , drop = FALSE] - -rm(input_test_mod1) -gc() - - -cat("Reading mod2 files\n") -X_mod2 <- anndata::read_h5ad(par$input_train_mod2)$layers[["normalized"]] - -cat("Predicting for each column in modality 2\n") -preds <- pbapply::pblapply( - seq_len(ncol(X_mod2)), - function(i) { - y <- X_mod2[, i] - uy <- unique(y) - if (length(uy) > 1) { - fit <- RcppArmadillo::fastLm(dr_train, y) - # fit <- lm(y ~ ., dr_train) - stats::predict(fit, dr_test) - } else { - rep(uy, nrow(dr_test)) - } - } -) - -cat("Creating outputs object\n") -prediction <- Matrix::Matrix(do.call(cbind, preds), sparse = TRUE) -rownames(prediction) <- rownames(dr_test) -colnames(prediction) <- colnames(X_mod2) - -out <- anndata::AnnData( - layers = list(normalized = prediction), - shape = dim(prediction), - uns = list( - dataset_id = input_train_mod1$uns[["dataset_id"]], - method_id = meta$functionality_name - ) -) - -cat("Writing predictions to file\n") -zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/tasks/predict_modality/methods/lmds_irlba_rf/config.vsh.yaml b/src/tasks/predict_modality/methods/lmds_irlba_rf/config.vsh.yaml deleted file mode 100644 index 0ed08b89aa..0000000000 --- a/src/tasks/predict_modality/methods/lmds_irlba_rf/config.vsh.yaml +++ /dev/null @@ -1,37 +0,0 @@ -__merge__: ../../api/comp_method.yaml -functionality: - name: lmds_irlba_rf - info: - label: LMDS + IRLBA + RF - summary: A random forest regression using LMDS of modality 1 to predict a PCA embedding of modality 2, which is then reversed to predict the original modality 2. - description: | - A random forest regression using LMDS of modality 1 to predict a PCA embedding of modality 2, which is then reversed to predict the original modality 2. 
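# ---------------------------------------------------------------------------
# Editor's note (not part of the patch): the `lm` method's script.R above fits
# one ordinary least-squares model per modality-2 feature (fastLm) on an LMDS
# embedding of modality 1. The same per-column regression can be expressed as
# a single least-squares solve; this NumPy sketch adds an explicit intercept
# column and uses toy data with assumed sizes.
import numpy as np

rng = np.random.default_rng(0)
dr_train, dr_test = rng.normal(size=(200, 10)), rng.normal(size=(50, 10))  # mod1 embedding
Y_train = rng.normal(size=(200, 300))                                       # mod2 (normalized)

# add an intercept column and solve all 300 regressions at once
A_train = np.hstack([np.ones((dr_train.shape[0], 1)), dr_train])
A_test = np.hstack([np.ones((dr_test.shape[0], 1)), dr_test])
coef, *_ = np.linalg.lstsq(A_train, Y_train, rcond=None)
Y_pred = A_test @ coef
print(Y_pred.shape)  # (50, 300)
# ---------------------------------------------------------------------------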
- reference: lance2022multimodal - documentation_url: https://github.com/openproblems-bio/openproblems/tree/main/src/tasks/predict_modality/methods #/lmds_irlba_rf - repository_url: https://github.com/openproblems-bio/openproblems - preferred_normalization: log_cp10k - arguments: - - name: "--distance_method" - type: "string" - default: "pearson" - description: The distance method to use. Possible values are euclidean, pearson, spearman and others. - - name: "--n_pcs" - type: "integer" - default: 20 - description: Number of principal components to use. - - name: "--n_trees" - type: "integer" - default: 500 - description: Number of trees to use. - resources: - - type: r_script - path: script.R -platforms: - - type: docker - image: openproblems/base_r:1.0.0 - setup: - - type: r - cran: [lmds, ranger, pbapply, irlba] - - type: nextflow - directives: - label: [hightime, highmem, highcpu] \ No newline at end of file diff --git a/src/tasks/predict_modality/methods/lmds_irlba_rf/script.R b/src/tasks/predict_modality/methods/lmds_irlba_rf/script.R deleted file mode 100644 index 6a5b7ed595..0000000000 --- a/src/tasks/predict_modality/methods/lmds_irlba_rf/script.R +++ /dev/null @@ -1,93 +0,0 @@ -cat("Loading dependencies\n") -requireNamespace("anndata", quietly = TRUE) -requireNamespace("pbapply", quietly = TRUE) -library(Matrix, warn.conflicts = FALSE, quietly = TRUE) - -## VIASH START -path <- "resources_test/predict_modality/openproblems_neurips2021/bmmc_multiome/normal/" -par <- list( - input_train_mod1 = paste0(path, "train_mod1.h5ad"), - input_test_mod1 = paste0(path, "test_mod1.h5ad"), - input_train_mod2 = paste0(path, "train_mod2.h5ad"), - output = "output.h5ad", - n_pcs = 20L, - n_trees = 50L -) -meta <- list(functionality_name = "foo") -## VIASH END - -n_cores <- parallel::detectCores(all.tests = FALSE, logical = TRUE) - -cat("Reading mod1 files\n") -input_train_mod1 <- anndata::read_h5ad(par$input_train_mod1) -input_test_mod1 <- anndata::read_h5ad(par$input_test_mod1) - -dataset_id <- input_train_mod1$uns[["dataset_id"]] - -cat("Performing DR on the mod1 values\n") -dr <- lmds::lmds( - rbind(input_train_mod1$layers[["normalized"]], input_test_mod1$layers[["normalized"]]), - ndim = par$n_pcs, - distance_method = par$distance_method -) -# alternative: -# pr_out <- irlba::prcomp_irlba( -# rbind(input_train_mod1$layers[["normalized"]], input_test_mod1$layers[["normalized"]]), -# n = par$n_pcs -# ) -# dr <- pr_out$x - -# split up dr data -ix <- seq_len(nrow(input_train_mod1)) -dr_train <- as.data.frame(dr[ix, , drop = FALSE]) -dr_test <- as.data.frame(dr[-ix, , drop = FALSE]) -dr_train <- dr[ix, , drop = FALSE] -dr_test <- dr[-ix, , drop = FALSE] - -rm(input_train_mod1, input_test_mod1) -gc() - - -cat("Reading mod2 files\n") -X_mod2 <- anndata::read_h5ad(par$input_train_mod2)$layers[["normalized"]] -prcomp_mod2 <- irlba::prcomp_irlba(X_mod2, n = par$n_pcs) -dr_mod2 <- prcomp_mod2$x - -cat("Predicting for each column in modality 2\n") -pred_drs <- pbapply::pblapply( - seq_len(ncol(dr_mod2)), - function(i) { - y <- dr_mod2[, i] - uy <- unique(y) - if (length(uy) > 1) { - rf <- ranger::ranger( - x = dr_train, - y = y, - num.trees = par$n_trees, - num.threads = n_cores - ) - stats::predict(rf, dr_test)$prediction - } else { - rep(uy, nrow(dr_test)) - } - } -) - -cat("Creating outputs object\n") -pred_dr <- Matrix::Matrix(do.call(cbind, pred_drs), sparse = TRUE) -prediction <- pred_dr %*% t(prcomp_mod2$rotation) -rownames(prediction) <- rownames(dr_test) -colnames(prediction) <- colnames(X_mod2) - 
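# ---------------------------------------------------------------------------
# Editor's note (not part of the patch): the lmds_irlba_rf step above predicts
# each component of an irlba PCA embedding of modality 2 with a random forest
# and then maps the predictions back through the rotation matrix. A compact
# sketch using scikit-learn equivalents (PCA + RandomForestRegressor) on toy
# data with assumed dimensions:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor

rng = np.random.default_rng(0)
Z_train, Z_test = rng.normal(size=(200, 20)), rng.normal(size=(50, 20))  # mod1 embeddings
Y_train = rng.normal(size=(200, 300))                                     # mod2 (normalized)

# compress modality 2, fit one forest per component, then invert the PCA
pca = PCA(n_components=10, random_state=0)
Y_red = pca.fit_transform(Y_train)
pred_red = np.column_stack([
    RandomForestRegressor(n_estimators=100, random_state=0)
    .fit(Z_train, Y_red[:, i])
    .predict(Z_test)
    for i in range(Y_red.shape[1])
])
Y_pred = pca.inverse_transform(pred_red)  # back to the mod2 feature space
print(Y_pred.shape)  # (50, 300)
# ---------------------------------------------------------------------------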
-out <- anndata::AnnData( - layers = list(normalized = as(prediction, "CsparseMatrix")), - shape = dim(prediction), - uns = list( - dataset_id = dataset_id, - method_id = meta$functionality_name - ) -) - - -cat("Writing predictions to file\n") -zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/tasks/predict_modality/methods/newwave_knnr/config.vsh.yaml b/src/tasks/predict_modality/methods/newwave_knnr/config.vsh.yaml deleted file mode 100644 index 385f1234bb..0000000000 --- a/src/tasks/predict_modality/methods/newwave_knnr/config.vsh.yaml +++ /dev/null @@ -1,42 +0,0 @@ -__merge__: ../../api/comp_method.yaml -functionality: - name: newwave_knnr - status: disabled # disabled due to poor performance and long execution times - info: - label: NewWave+KNNR - summary: Perform DR with NewWave, predict modality with KNN regression. - description: Perform DR with NewWave, predict modality with KNN regression. - reference: agostinis2022newwave - repository_url: https://github.com/fedeago/NewWave - documentation_url: https://bioconductor.org/packages/release/bioc/html/NewWave.html - preferred_normalization: log_cp10k - arguments: - - name: "--newwave_maxiter" - type: "integer" - default: 40 - description: Maximum number of NewWave iterations. - - name: "--newwave_ngene" - type: "integer" - default: 200 - description: Setting of the n_gene_par NewWave parameter. - - name: "--newwave_ncell" - type: "integer" - default: 200 - description: Setting of the n_cell_par NewWave parameter. - - name: "--n_neighbors" - type: "integer" - default: 20 - description: Number of neighbors to use in the knn regression. - resources: - - type: r_script - path: script.R -platforms: - - type: docker - image: openproblems/base_r:1.0.0 - setup: - - type: r - cran: [ lmds, FNN, proxy, proxyC ] - bioc: [ SingleCellExperiment, NewWave ] - - type: nextflow - directives: - label: [hightime, highmem, highcpu, highsharedmem] diff --git a/src/tasks/predict_modality/methods/newwave_knnr/script.R b/src/tasks/predict_modality/methods/newwave_knnr/script.R deleted file mode 100644 index 84f8a0b469..0000000000 --- a/src/tasks/predict_modality/methods/newwave_knnr/script.R +++ /dev/null @@ -1,107 +0,0 @@ -cat("Loading dependencies\n") -requireNamespace("anndata", quietly = TRUE) -library(Matrix, warn.conflicts = FALSE, quietly = TRUE) -requireNamespace("NewWave", quietly = TRUE) -requireNamespace("FNN", quietly = TRUE) -requireNamespace("SingleCellExperiment", quietly = TRUE) - -## VIASH START -path <- "resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/" -par <- list( - input_train_mod1 = paste0(path, "train_mod1.h5ad"), - input_test_mod1 = paste0(path, "test_mod1.h5ad"), - input_train_mod2 = paste0(path, "train_mod2.h5ad"), - output = "output.h5ad", - newwave_maxiter = 40L, - newwave_ngene = 200L, - newwave_ncell = 200L, - n_neighbors = 20L -) -meta <- list(functionality_name = "foo") -## VIASH END - -print(par) - -n_cores <- parallel::detectCores(all.tests = FALSE, logical = TRUE) - -method_id <- meta$functionality_name - -cat("Reading h5ad files\n") -input_train_mod1 <- anndata::read_h5ad(par$input_train_mod1) -input_test_mod1 <- anndata::read_h5ad(par$input_test_mod1) - -# fetch batch labels -batch1 <- c(as.character(input_train_mod1$obs$batch), as.character(input_test_mod1$obs$batch)) -batch2 <- as.character(input_train_mod1$obs$batch) - -# create SummarizedExperiment object -data1 <- SummarizedExperiment::SummarizedExperiment( - assays = list( - counts = as( - cbind( - 
t(input_train_mod1$layers[["counts"]]), - t(input_test_mod1$layers[["counts"]]) - ), - "CsparseMatrix" - ) - ), - colData = data.frame(batch = factor(batch1)) -) -data1 <- data1[Matrix::rowSums(SummarizedExperiment::assay(data1)) > 0, ] -rm(input_train_mod1, input_test_mod1) -gc() - -cat("Running NewWave on mod1\n") -res1 <- NewWave::newWave( - data1, - X = "~batch", - verbose = TRUE, - K = 10, - maxiter_optimize = par$newwave_maxiter, - n_gene_par = min(par$newwave_ngene, nrow(data1)), - n_cell_par = min(par$newwave_ncell, ncol(data1)), - commondispersion = FALSE -) -dr_mod1 <- SingleCellExperiment::reducedDim(res1) -colnames(dr_mod1) <- paste0("comp_", seq_len(ncol(dr_mod1))) -rm(data1) -gc() - -# split DR matrices -train_ix <- seq_along(batch2) -dr_mod1_train <- dr_mod1[train_ix, , drop = FALSE] -dr_mod1_test <- dr_mod1[-train_ix, , drop = FALSE] - - -cat("Predicting for each column in modality 2\n") -input_train_mod2 <- anndata::read_h5ad(par$input_train_mod2) - -# precompute knn indices -knn_ix <- FNN::get.knnx( - dr_mod1_train, - dr_mod1_test, - k = min(nrow(dr_mod1_train), par$n_neighbors) -)$nn.index - -# perform knn regression. -pred <- input_train_mod2$layers[["normalized"]][knn_ix[, 1], , drop = FALSE] -if (par$n_neighbors > 1) { - for (k in seq(2, par$n_neighbors)) { - pred <- pred + input_train_mod2$layers[["normalized"]][knn_ix[, k], , drop = FALSE] - } -} -pred <- pred / par$n_neighbors -rownames(pred) <- rownames(dr_mod1_test) - -cat("Creating outputs object\n") -out <- anndata::AnnData( - layers = list(normalized = pred), - shape = dim(pred), - uns = list( - dataset_id = input_train_mod2$uns[["dataset_id"]], - method_id = meta$functionality_name - ) -) - -cat("Writing predictions to file\n") -zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/tasks/predict_modality/methods/novel/helper_functions.py b/src/tasks/predict_modality/methods/novel/helper_functions.py deleted file mode 100644 index 17c57c9b3b..0000000000 --- a/src/tasks/predict_modality/methods/novel/helper_functions.py +++ /dev/null @@ -1,247 +0,0 @@ -import torch - -from torch import nn -import torch.nn.functional as F - -from torch.utils.data import Dataset - -from typing import Optional - -import anndata -import numpy as np -import pandas as pd -import scipy.sparse -import sklearn.decomposition -import sklearn.feature_extraction.text -import sklearn.preprocessing -import sklearn.neighbors -import sklearn.utils.extmath - -class tfidfTransformer(): - def __init__(self): - self.idf = None - self.fitted = False - - def fit(self, X): - self.idf = X.shape[0] / X.sum(axis=0) - self.fitted = True - - def transform(self, X): - if not self.fitted: - raise RuntimeError('Transformer was not fitted on any data') - if scipy.sparse.issparse(X): - tf = X.multiply(1 / X.sum(axis=1)) - return tf.multiply(self.idf) - else: - tf = X / X.sum(axis=1, keepdims=True) - return tf * self.idf - - def fit_transform(self, X): - self.fit(X) - return self.transform(X) - -class lsiTransformer(): - def __init__(self, - n_components: int = 20, - use_highly_variable = None - ): - self.n_components = n_components - self.use_highly_variable = use_highly_variable - self.tfidfTransformer = tfidfTransformer() - self.normalizer = sklearn.preprocessing.Normalizer(norm="l1") - self.pcaTransformer = sklearn.decomposition.TruncatedSVD(n_components = self.n_components, random_state=777) - # self.lsi_mean = None - # self.lsi_std = None - self.fitted = None - - def fit(self, adata: anndata.AnnData): - if self.use_highly_variable is 
None: - self.use_highly_variable = "hvg" in adata.var - adata_use = adata[:, adata.var["hvg"]] if self.use_highly_variable else adata - X = self.tfidfTransformer.fit_transform(adata_use.X) - X_norm = self.normalizer.fit_transform(X) - X_norm = np.log1p(X_norm * 1e4) - X_lsi = self.pcaTransformer.fit_transform(X_norm) - # self.lsi_mean = X_lsi.mean(axis=1, keepdims=True) - # self.lsi_std = X_lsi.std(axis=1, ddof=1, keepdims=True) - self.fitted = True - - def transform(self, adata): - if not self.fitted: - raise RuntimeError('Transformer was not fitted on any data') - adata_use = adata[:, adata.var["hvg"]] if self.use_highly_variable else adata - X = self.tfidfTransformer.transform(adata_use.X) - X_norm = self.normalizer.transform(X) - X_norm = np.log1p(X_norm * 1e4) - X_lsi = self.pcaTransformer.transform(X_norm) - X_lsi -= X_lsi.mean(axis=1, keepdims=True) - X_lsi /= X_lsi.std(axis=1, ddof=1, keepdims=True) - lsi_df = pd.DataFrame(X_lsi, index = adata_use.obs_names) - return lsi_df - - def fit_transform(self, adata): - self.fit(adata) - return self.transform(adata) - -class ModalityMatchingDataset(Dataset): - def __init__( - self, df_modality1, df_modality2, is_train=True - ): - super().__init__() - self.df_modality1 = df_modality1 - self.df_modality2 = df_modality2 - self.is_train = is_train - def __len__(self): - return self.df_modality1.shape[0] - - def __getitem__(self, index: int): - if self.is_train == True: - x = self.df_modality1.iloc[index].values - y = self.df_modality2.iloc[index].values - return x, y - else: - x = self.df_modality1.iloc[index].values - return x - -class Swish(torch.autograd.Function): - @staticmethod - def forward(ctx, i): - result = i * sigmoid(i) - ctx.save_for_backward(i) - return result - @staticmethod - def backward(ctx, grad_output): - i = ctx.saved_variables[0] - sigmoid_i = sigmoid(i) - return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i))) - -class Swish_module(nn.Module): - def forward(self, x): - return Swish.apply(x) - -sigmoid = torch.nn.Sigmoid() - -class ModelRegressionGex2Atac(nn.Module): - def __init__(self, dim_mod1, dim_mod2): - super(ModelRegressionGex2Atac, self).__init__() - #self.bn = torch.nn.BatchNorm1d(1024) - self.input_ = nn.Linear(dim_mod1, 1024) - self.fc = nn.Linear(1024, 256) - self.fc1 = nn.Linear(256, 2048) - self.dropout1 = nn.Dropout(p=0.298885630228993) - self.dropout2 = nn.Dropout(p=0.11289717442776658) - self.dropout3 = nn.Dropout(p=0.13523634924414762) - self.output = nn.Linear(2048, dim_mod2) - def forward(self, x): - x = F.gelu(self.input_(x)) - x = self.dropout1(x) - x = F.gelu(self.fc(x)) - x = self.dropout2(x) - x = F.gelu(self.fc1(x)) - x = self.dropout3(x) - x = F.gelu(self.output(x)) - return x - -class ModelRegressionAtac2Gex(nn.Module): # - def __init__(self, dim_mod1, dim_mod2): - super(ModelRegressionAtac2Gex, self).__init__() - self.input_ = nn.Linear(dim_mod1, 2048) - self.fc = nn.Linear(2048, 2048) - self.fc1 = nn.Linear(2048, 512) - self.dropout1 = nn.Dropout(p=0.2649138776004753) - self.dropout2 = nn.Dropout(p=0.1769628308148758) - self.dropout3 = nn.Dropout(p=0.2516791883012817) - self.output = nn.Linear(512, dim_mod2) - def forward(self, x): - x = F.gelu(self.input_(x)) - x = self.dropout1(x) - x = F.gelu(self.fc(x)) - x = self.dropout2(x) - x = F.gelu(self.fc1(x)) - x = self.dropout3(x) - x = F.gelu(self.output(x)) - return x - -class ModelRegressionAdt2Gex(nn.Module): - def __init__(self, dim_mod1, dim_mod2): - super(ModelRegressionAdt2Gex, self).__init__() - self.input_ = nn.Linear(dim_mod1, 
512) - self.dropout1 = nn.Dropout(p=0.0) - self.swish = Swish_module() - self.fc = nn.Linear(512, 512) - self.fc1 = nn.Linear(512, 512) - self.fc2 = nn.Linear(512, 512) - self.output = nn.Linear(512, dim_mod2) - def forward(self, x): - x = F.gelu(self.input_(x)) - x = F.gelu(self.fc(x)) - x = F.gelu(self.fc1(x)) - x = F.gelu(self.fc2(x)) - x = F.gelu(self.output(x)) - return x - -class ModelRegressionGex2Adt(nn.Module): - def __init__(self, dim_mod1, dim_mod2): - super(ModelRegressionGex2Adt, self).__init__() - self.input_ = nn.Linear(dim_mod1, 512) - self.dropout1 = nn.Dropout(p=0.20335661386636347) - self.dropout2 = nn.Dropout(p=0.15395289261127876) - self.dropout3 = nn.Dropout(p=0.16902655078832815) - self.fc = nn.Linear(512, 512) - self.fc1 = nn.Linear(512, 2048) - self.output = nn.Linear(2048, dim_mod2) - def forward(self, x): - # x = self.batchswap_noise(x) - x = F.gelu(self.input_(x)) - x = self.dropout1(x) - x = F.gelu(self.fc(x)) - x = self.dropout2(x) - x = F.gelu(self.fc1(x)) - x = self.dropout3(x) - x = F.gelu(self.output(x)) - return x - -def rmse(y, y_pred): - return np.sqrt(np.mean(np.square(y - y_pred))) - -def train_and_valid(model, optimizer, loss_fn, dataloader_train, dataloader_test, name_model, device): - best_score = 100000 - for i in range(100): - train_losses = [] - test_losses = [] - model.train() - - for x, y in dataloader_train: - optimizer.zero_grad() - output = model(x.float().to(device)) - loss = torch.sqrt(loss_fn(output, y.float().to(device))) - loss.backward() - train_losses.append(loss.item()) - optimizer.step() - - model.eval() - with torch.no_grad(): - for x, y in dataloader_test: - output = model(x.float().to(device)) - output[output<0] = 0.0 - loss = torch.sqrt(loss_fn(output, y.float().to(device))) - test_losses.append(loss.item()) - - outputs = [] - targets = [] - model.eval() - with torch.no_grad(): - for x, y in dataloader_test: - output = model(x.float().to(device)) - - outputs.append(output.detach().cpu().numpy()) - targets.append(y.float().detach().cpu().numpy()) - cat_outputs = np.concatenate(outputs) - cat_targets = np.concatenate(targets) - cat_outputs[cat_outputs<0.0] = 0 - - if best_score > rmse(cat_targets,cat_outputs): - torch.save(model.state_dict(), name_model) - best_score = rmse(cat_targets,cat_outputs) - print("best rmse: ", best_score) - diff --git a/src/tasks/predict_modality/methods/novel/predict/config.vsh.yaml b/src/tasks/predict_modality/methods/novel/predict/config.vsh.yaml deleted file mode 100644 index 72e3292407..0000000000 --- a/src/tasks/predict_modality/methods/novel/predict/config.vsh.yaml +++ /dev/null @@ -1,25 +0,0 @@ -__merge__: ../../../api/comp_method_predict.yaml -functionality: - name: novel_predict - arguments: - - name: "--input_transform" - type: file - direction: input - required: false - example: "lsi_transformer.pickle" - resources: - - type: python_script - path: script.py - - path: ../helper_functions.py -platforms: - - type: docker - image: openproblems/base_pytorch_nvidia:1.0.0 - setup: - - type: python - packages: - - scikit-learn - - networkx - - type: nextflow - directives: - label: [highmem, hightime, midcpu, highsharedmem, gpu] - diff --git a/src/tasks/predict_modality/methods/novel/predict/run_test.sh b/src/tasks/predict_modality/methods/novel/predict/run_test.sh deleted file mode 100644 index af5550e5d7..0000000000 --- a/src/tasks/predict_modality/methods/novel/predict/run_test.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -viash run 
src/tasks/predict_modality/methods/novel/predict/config.vsh.yaml -- \ - --input_train_mod2 'resources/predict_modality/datasets/openproblems_neurips2021/bmmc_cite/normal/log_cp10k/train_mod2.h5ad' \ - --input_test_mod1 'resources/predict_modality/datasets/openproblems_neurips2021/bmmc_cite/normal/log_cp10k/test_mod1.h5ad' \ - --input_model output/novel/model.pt \ - --input_transform output/novel/lsi_transform.pickle \ - --output 'output/novel/novel_test.h5ad' \ No newline at end of file diff --git a/src/tasks/predict_modality/methods/novel/predict/script.py b/src/tasks/predict_modality/methods/novel/predict/script.py deleted file mode 100644 index 5f336ce7b0..0000000000 --- a/src/tasks/predict_modality/methods/novel/predict/script.py +++ /dev/null @@ -1,119 +0,0 @@ -import sys -import torch -from torch.utils.data import DataLoader - -import anndata as ad -import pickle -import numpy as np -from scipy.sparse import csc_matrix - -#check gpu available -if (torch.cuda.is_available()): - device = 'cuda:0' #switch to current device - print('current device: gpu', flush=True) -else: - device = 'cpu' - print('current device: cpu', flush=True) - - -## VIASH START - -par = { - 'input_train_mod2': 'resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/normal/train_mod2.h5ad', - 'input_test_mod1': 'resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/normal/test_mod1.h5ad', - 'input_model': 'resources_test/predict_modality/neurips2021_bmmc_cite/model.pt', - 'input_transform': 'transformer.pickle' -} -meta = { - 'resources_dir': 'src/tasks/predict_modality/methods/novel', - 'functionality_name': '171129' -} -## VIASH END - -sys.path.append(meta['resources_dir']) -from helper_functions import ModelRegressionAtac2Gex, ModelRegressionAdt2Gex, ModelRegressionGex2Adt, ModelRegressionGex2Atac, ModalityMatchingDataset - -print("Load data", flush=True) - -input_test_mod1 = ad.read_h5ad(par['input_test_mod1']) -input_train_mod2 = ad.read_h5ad(par['input_train_mod2']) - -mod1 = input_test_mod1.uns['modality'] -mod2 = input_train_mod2.uns['modality'] - -n_vars_mod1 = input_train_mod2.uns["model_dim"]["mod1"] -n_vars_mod2 = input_train_mod2.uns["model_dim"]["mod2"] - -input_test_mod1.X = input_test_mod1.layers['normalized'].tocsr() - -# Remove vars that were removed from training set. Mostlyy only applicable for testing. 
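# ---------------------------------------------------------------------------
# Editor's note (not part of the patch): the `lsi_transformer_gex` pickle
# loaded below applies the LSI recipe defined in helper_functions.py above:
# TF-IDF, L1 normalisation, log1p scaling by 1e4, TruncatedSVD, then per-cell
# standardisation. A condensed NumPy/scikit-learn sketch of that recipe on
# dense toy counts (20 components assumed):
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer

rng = np.random.default_rng(0)
counts = rng.poisson(0.5, (300, 1000)).astype(float)  # cells x peaks/genes

idf = counts.shape[0] / counts.sum(axis=0).clip(min=1)        # inverse document frequency
tf = counts / counts.sum(axis=1, keepdims=True).clip(min=1)   # term frequency per cell
tfidf = tf * idf

x = np.log1p(Normalizer(norm="l1").fit_transform(tfidf) * 1e4)  # L1-normalise, rescale, log
x_lsi = TruncatedSVD(n_components=20, random_state=777).fit_transform(x)

# standardise each cell across components, as in lsiTransformer.transform
x_lsi -= x_lsi.mean(axis=1, keepdims=True)
x_lsi /= x_lsi.std(axis=1, ddof=1, keepdims=True)
print(x_lsi.shape)  # (300, 20)
# ---------------------------------------------------------------------------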
-if input_train_mod2.uns.get("removed_vars"): - rem_var = input_train_mod2.uns["removed_vars"] - input_test_mod1 = input_test_mod1[:, ~input_test_mod1.var_names.isin(rem_var)] - -del input_train_mod2 - - -model_fp = par['input_model'] - -print("Start predict", flush=True) - -if mod1 == 'GEX' and mod2 == 'ADT': - model = ModelRegressionGex2Adt(n_vars_mod1,n_vars_mod2) - weight = torch.load(model_fp, map_location='cpu') - with open(par['input_transform'], 'rb') as f: - lsi_transformer_gex = pickle.load(f) - - model.load_state_dict(weight) - input_test_mod1_ = lsi_transformer_gex.transform(input_test_mod1) - -elif mod1 == 'GEX' and mod2 == 'ATAC': - model = ModelRegressionGex2Atac(n_vars_mod1,n_vars_mod2) - weight = torch.load(model_fp, map_location='cpu') - with open(par['input_transform'], 'rb') as f: - lsi_transformer_gex = pickle.load(f) - - model.load_state_dict(weight) - input_test_mod1_ = lsi_transformer_gex.transform(input_test_mod1) - -elif mod1 == 'ATAC' and mod2 == 'GEX': - model = ModelRegressionAtac2Gex(n_vars_mod1,n_vars_mod2) - weight = torch.load(model_fp, map_location='cpu') - with open(par['input_transform'], 'rb') as f: - lsi_transformer_gex = pickle.load(f) - - model.load_state_dict(weight) - input_test_mod1_ = lsi_transformer_gex.transform(input_test_mod1) - -elif mod1 == 'ADT' and mod2 == 'GEX': - model = ModelRegressionAdt2Gex(n_vars_mod1,n_vars_mod2) - weight = torch.load(model_fp, map_location='cpu') - - model.load_state_dict(weight) - input_test_mod1_ = input_test_mod1.to_df() - -dataset_test = ModalityMatchingDataset(input_test_mod1_, None, is_train=False) -dataloader_test = DataLoader(dataset_test, 32, shuffle = False, num_workers = 4) - -outputs = [] -model.eval() -with torch.no_grad(): - for x in dataloader_test: - output = model(x.float()) - outputs.append(output.detach().cpu().numpy()) - -outputs = np.concatenate(outputs) -outputs[outputs<0] = 0 -outputs = csc_matrix(outputs) - -adata = ad.AnnData( - layers={"normalized": outputs}, - shape=outputs.shape, - uns={ - 'dataset_id': input_test_mod1.uns['dataset_id'], - 'method_id': meta['functionality_name'], - }, -) -adata.write_h5ad(par['output'], compression = "gzip") - - diff --git a/src/tasks/predict_modality/methods/novel/run/config.vsh.yaml b/src/tasks/predict_modality/methods/novel/run/config.vsh.yaml deleted file mode 100644 index 682782e059..0000000000 --- a/src/tasks/predict_modality/methods/novel/run/config.vsh.yaml +++ /dev/null @@ -1,21 +0,0 @@ -__merge__: ../../../api/comp_method.yaml -functionality: - name: novel - info: - label: Novel - summary: A method using encoder-decoder MLP model - description: This method trains an encoder-decoder MLP model with one output neuron per component in the target. As an input, the encoders use representations obtained from ATAC and GEX data via LSI transform and raw ADT data. The hyperparameters of the models were found via broad hyperparameter search using the Optuna framework. 
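# ---------------------------------------------------------------------------
# Editor's note (not part of the patch): the description above states that the
# hard-coded learning rates, weight decays and dropout rates in the training
# script were found with an Optuna hyperparameter search. A minimal sketch of
# such a search; the objective shown is a stand-in, whereas the real one would
# train the MLP and return its validation RMSE.
import optuna

def objective(trial):
    lr = trial.suggest_float("lr", 1e-5, 1e-3, log=True)
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-3, log=True)
    dropout = trial.suggest_float("dropout", 0.0, 0.5)
    # placeholder score standing in for the validation RMSE of a trained model
    return (lr - 1e-4) ** 2 + weight_decay + dropout * 0.01

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)
print(study.best_params)
# ---------------------------------------------------------------------------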
- documentation_url: https://github.com/openproblems-bio/neurips2021_multimodal_topmethods/tree/main/src/predict_modality/methods/novel#readme - repository_url: https://github.com/openproblems-bio/neurips2021_multimodal_topmethods/tree/main/src/predict_modality/methods/novel - reference: pmlr-v176-lance2022multimodal - submission_id: "169769" - preferred_normalization: log_cp10k - resources: - - path: main.nf - type: nextflow_script - entrypoint: run_wf - dependencies: - - name: predict_modality/methods/novel_train - - name: predict_modality/methods/novel_predict -platforms: - - type: nextflow \ No newline at end of file diff --git a/src/tasks/predict_modality/methods/novel/run/main.nf b/src/tasks/predict_modality/methods/novel/run/main.nf deleted file mode 100644 index 59111194cb..0000000000 --- a/src/tasks/predict_modality/methods/novel/run/main.nf +++ /dev/null @@ -1,25 +0,0 @@ -workflow run_wf { - take: input_ch - main: - output_ch = input_ch - | novel_train.run( - fromState: ["input_train_mod1", "input_train_mod2"], - toState: ["input_model": "output", "input_transform": "output_transform", "output_train_mod2": "output_train_mod2"] - ) - | novel_predict.run( - fromState: { id, state -> - [ - "input_train_mod2": state.output_train_mod2, - "input_test_mod1": state.input_test_mod1, - "input_model": state.input_model, - "input_transform": state.input_transform, - "output": state.output]}, - toState: ["output": "output"] - ) - - | map { tup -> - [tup[0], [output: tup[1].output]] - } - - emit: output_ch -} \ No newline at end of file diff --git a/src/tasks/predict_modality/methods/novel/run/run_test.sh b/src/tasks/predict_modality/methods/novel/run/run_test.sh deleted file mode 100644 index f6da6b0863..0000000000 --- a/src/tasks/predict_modality/methods/novel/run/run_test.sh +++ /dev/null @@ -1,15 +0,0 @@ -REPO_ROOT=$(git rev-parse --show-toplevel) - -# ensure that the command below is run from the root of the repository -cd "$REPO_ROOT" - -set -e - -nextflow run . 
\ - -main-script target/nextflow/predict_modality/methods/novel/main.nf \ - -profile docker \ - -c src/wf_utils/labels_ci.config \ - --input_train_mod1 resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/normal/train_mod1.h5ad \ - --input_train_mod2 resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/normal/train_mod2.h5ad \ - --input_test_mod1 resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/normal/test_mod1.h5ad \ - --publish_dir output/novel/nextflow diff --git a/src/tasks/predict_modality/methods/novel/train/config.vsh.yaml b/src/tasks/predict_modality/methods/novel/train/config.vsh.yaml deleted file mode 100644 index 87ea471301..0000000000 --- a/src/tasks/predict_modality/methods/novel/train/config.vsh.yaml +++ /dev/null @@ -1,31 +0,0 @@ -__merge__: ../../../api/comp_method_train.yaml -functionality: - name: novel_train - arguments: - - name: --output_transform - type: file - description: "The output transform file" - required: false - default: "lsi_transformer.pickle" - direction: output - - name: --output_train_mod2 - type: file - description: copy of the input with model dim in `.uns` - direction: output - default: "train_mod2.h5ad" - required: false - resources: - - path: script.py - type: python_script - - path: ../helper_functions.py -platforms: - - type: docker - image: openproblems/base_pytorch_nvidia:1.0.0 - setup: - - type: python - packages: - - scikit-learn - - networkx - - type: nextflow - directives: - label: [highmem, hightime, midcpu, highsharedmem, gpu] \ No newline at end of file diff --git a/src/tasks/predict_modality/methods/novel/train/run_test.sh b/src/tasks/predict_modality/methods/novel/train/run_test.sh deleted file mode 100644 index 08630b1ac0..0000000000 --- a/src/tasks/predict_modality/methods/novel/train/run_test.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -# Run script for all test resources - -echo "GEX2ADT" -viash run src/tasks/predict_modality/methods/novel/train/config.vsh.yaml -- \ - --input_train_mod1 resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/normal/train_mod1.h5ad \ - --input_train_mod2 resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/normal/train_mod2.h5ad \ - --output output/model.pt - -# echo "ADT2GEX" -# viash run src/tasks/predict_modality/methods/novel/train/config.vsh.yaml -- \ -# --input_train_mod1 resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap/train_mod1.h5ad \ -# --input_train_mod2 resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/swap/train_mod2.h5ad \ -# --output output/model.pt - -# echo "GEX2ATAC" -# viash run src/tasks/predict_modality/methods/novel/train/config.vsh.yaml -- \ -# --input_train_mod1 resources_test/predict_modality/openproblems_neurips2021/bmmc_multiome/normal/train_mod1.h5ad \ -# --input_train_mod2 resources_test/predict_modality/openproblems_neurips2021/bmmc_multiome/normal/train_mod2.h5ad \ -# --output output/model.pt - -# echo "ATAC2GEX" -# viash run src/tasks/predict_modality/methods/novel/train/config.vsh.yaml -- \ -# --input_train_mod1 resources_test/predict_modality/openproblems_neurips2021/bmmc_multiome/swap/train_mod1.h5ad \ -# --input_train_mod2 resources_test/predict_modality/openproblems_neurips2021/bmmc_multiome/swap/train_mod2.h5ad \ -# --output output/model.pt - - diff --git a/src/tasks/predict_modality/methods/novel/train/script.py b/src/tasks/predict_modality/methods/novel/train/script.py deleted file mode 100644 index 39ea8b4778..0000000000 --- 
a/src/tasks/predict_modality/methods/novel/train/script.py +++ /dev/null @@ -1,148 +0,0 @@ -import sys - -import torch -from torch.utils.data import DataLoader -# from sklearn.model_selection import train_test_split - -import anndata as ad -import pickle - -#check gpu available -if (torch.cuda.is_available()): - device = 'cuda:0' #switch to current device - print('current device: gpu', flush=True) -else: - device = 'cpu' - print('current device: cpu', flush=True) - - -## VIASH START - -par = { - 'input_train_mod1': 'resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/normal/train_mod1.h5ad', - 'input_train_mod2': 'resources_test/predict_modality/openproblems_neurips2021/bmmc_cite/normal/train_mod2.h5ad', - 'output_train_mod2': 'train_mod2.h5ad', - 'output': 'model.pt' -} - -meta = { - 'resources_dir': 'src/tasks/predict_modality/methods/novel', -} -## VIASH END - - -sys.path.append(meta['resources_dir']) -from helper_functions import train_and_valid, lsiTransformer, ModalityMatchingDataset -from helper_functions import ModelRegressionAtac2Gex, ModelRegressionAdt2Gex, ModelRegressionGex2Adt, ModelRegressionGex2Atac - -print('Load data', flush=True) - -input_train_mod1 = ad.read_h5ad(par['input_train_mod1']) -input_train_mod2 = ad.read_h5ad(par['input_train_mod2']) - -adata = input_train_mod2.copy() - -mod1 = input_train_mod1.uns['modality'] -mod2 = input_train_mod2.uns['modality'] - -input_train_mod1.X = input_train_mod1.layers['normalized'] -input_train_mod2.X = input_train_mod2.layers['normalized'] - -input_train_mod2_df = input_train_mod2.to_df() - -del input_train_mod2 - -print('Start train', flush=True) - - -# Check for zero divide -zero_row = input_train_mod1.X.sum(axis=0) == 0 - -rem_var = None -if True in zero_row: - rem_var = input_train_mod1[:, zero_row].var_names - input_train_mod1 = input_train_mod1[:, ~zero_row] - - -# select number of variables for LSI -n_comp = input_train_mod1.n_vars -1 if input_train_mod1.n_vars < 256 else 256 - -if mod1 != 'ADT': - lsi_transformer_gex = lsiTransformer(n_components=n_comp) - input_train_mod1_df = lsi_transformer_gex.fit_transform(input_train_mod1) -else: - input_train_mod1_df = input_train_mod1.to_df() - -# reproduce train/test split from phase 1 -batch = input_train_mod1.obs["batch"] -train_ix = [ k for k,v in enumerate(batch) if v not in {'s1d2', 's3d7'} ] -test_ix = [ k for k,v in enumerate(batch) if v in {'s1d2', 's3d7'} ] - -train_mod1 = input_train_mod1_df.iloc[train_ix, :] -train_mod2 = input_train_mod2_df.iloc[train_ix, :] -test_mod1 = input_train_mod1_df.iloc[test_ix, :] -test_mod2 = input_train_mod2_df.iloc[test_ix, :] - -n_vars_train_mod1 = train_mod1.shape[1] -n_vars_train_mod2 = train_mod2.shape[1] -n_vars_test_mod1 = test_mod1.shape[1] -n_vars_test_mod2 = test_mod2.shape[1] - -n_vars_mod1 = input_train_mod1_df.shape[1] -n_vars_mod2 = input_train_mod2_df.shape[1] - -if mod1 == 'ATAC' and mod2 == 'GEX': - dataset_train = ModalityMatchingDataset(train_mod1, train_mod2) - dataloader_train = DataLoader(dataset_train, 256, shuffle = True, num_workers = 8) - - dataset_test = ModalityMatchingDataset(test_mod1, test_mod2) - dataloader_test = DataLoader(dataset_test, 64, shuffle = False, num_workers = 8) - - model = ModelRegressionAtac2Gex(n_vars_mod1,n_vars_mod2).to(device) - optimizer = torch.optim.AdamW(model.parameters(), lr=0.00008386597445284492,weight_decay=0.000684887347727808) - -elif mod1 == 'ADT' and mod2 == 'GEX': - dataset_train = ModalityMatchingDataset(train_mod1, train_mod2) - dataloader_train = 
DataLoader(dataset_train, 64, shuffle = True, num_workers = 4) - - dataset_test = ModalityMatchingDataset(test_mod1, test_mod2) - dataloader_test = DataLoader(dataset_test, 32, shuffle = False, num_workers = 4) - - model = ModelRegressionAdt2Gex(n_vars_mod1,n_vars_mod2).to(device) - optimizer = torch.optim.Adam(model.parameters(), lr=0.00041, weight_decay=0.0000139) - - -elif mod1 == 'GEX' and mod2 == 'ADT': - dataset_train = ModalityMatchingDataset(train_mod1, train_mod2) - dataloader_train = DataLoader(dataset_train, 32, shuffle = True, num_workers = 8) - - dataset_test = ModalityMatchingDataset(test_mod1, test_mod2) - dataloader_test = DataLoader(dataset_test, 64, shuffle = False, num_workers = 8) - - model = ModelRegressionGex2Adt(n_vars_mod1,n_vars_mod2).to(device) - optimizer = torch.optim.AdamW(model.parameters(), lr=0.000034609210829678734, weight_decay=0.0009965881574697426) - - -elif mod1 == 'GEX' and mod2 == 'ATAC': - dataset_train = ModalityMatchingDataset(train_mod1, train_mod2) - dataloader_train = DataLoader(dataset_train, 64, shuffle = True, num_workers = 8) - - dataset_test = ModalityMatchingDataset(test_mod1, test_mod2) - dataloader_test = DataLoader(dataset_test, 64, shuffle = False, num_workers = 8) - - model = ModelRegressionGex2Atac(n_vars_mod1,n_vars_mod2).to(device) - optimizer = torch.optim.AdamW(model.parameters(), lr=0.00001806762345275399, weight_decay=0.0004084171379280058) - -loss_fn = torch.nn.MSELoss() -train_and_valid(model, optimizer, loss_fn, dataloader_train, dataloader_test, par['output'], device) - -# Add model dim for use in predict part -adata.uns["model_dim"] = {"mod1": n_vars_mod1, "mod2": n_vars_mod2} -if rem_var: - adata.uns["removed_vars"] = [rem_var[0]] -adata.write_h5ad(par['output_train_mod2'], compression="gzip") - -if mod1 != 'ADT': - with open(par['output_transform'], 'wb') as f: - pickle.dump(lsi_transformer_gex, f) - diff --git a/src/tasks/predict_modality/methods/random_forest/config.vsh.yaml b/src/tasks/predict_modality/methods/random_forest/config.vsh.yaml deleted file mode 100644 index a1ee69041d..0000000000 --- a/src/tasks/predict_modality/methods/random_forest/config.vsh.yaml +++ /dev/null @@ -1,37 +0,0 @@ -__merge__: ../../api/comp_method.yaml -functionality: - name: random_forest - status: disabled # disabled due to long execution times - info: - label: Random Forests - summary: Random forest regression. - description: A random forest regression method. - reference: breiman2001random - documentation_url: https://www.stat.berkeley.edu/~breiman/RandomForests/reg_home.htm - repository_url: https://github.com/cran/randomForest - preferred_normalization: log_cp10k - arguments: - - name: "--distance_method" - type: "string" - default: "pearson" - description: The distance method to use. Possible values are euclidean, pearson, spearman and others. - - name: "--n_pcs" - type: "integer" - default: 20 - description: Number of principal components to use. - - name: "--n_trees" - type: "integer" - default: 50 - description: Number of trees to use. 
- resources: - - type: r_script - path: script.R -platforms: - - type: docker - image: openproblems/base_r:1.0.0 - setup: - - type: r - cran: [ lmds, ranger, pbapply] - - type: nextflow - directives: - label: [hightime, highmem, highcpu] \ No newline at end of file diff --git a/src/tasks/predict_modality/methods/random_forest/script.R b/src/tasks/predict_modality/methods/random_forest/script.R deleted file mode 100644 index e148eefbf7..0000000000 --- a/src/tasks/predict_modality/methods/random_forest/script.R +++ /dev/null @@ -1,83 +0,0 @@ -cat("Loading dependencies\n") -requireNamespace("anndata", quietly = TRUE) -requireNamespace("pbapply", quietly = TRUE) -library(Matrix, warn.conflicts = FALSE, quietly = TRUE) - -## VIASH START -path <- "output/datasets/predict_modality/openproblems_bmmc_multiome_phase1_mod1/openproblems_bmmc_multiome_phase1_mod1.censor_dataset.output_" -par <- list( - input_train_mod1 = paste0(path, "train_mod1.h5ad"), - input_test_mod1 = paste0(path, "test_mod1.h5ad"), - input_train_mod2 = paste0(path, "train_mod2.h5ad"), - output = "output.h5ad", - n_pcs = 20L, - n_trees = 50L -) -meta <- list(functionality_name = "foo") -## VIASH END - -n_cores <- parallel::detectCores(all.tests = FALSE, logical = TRUE) - -cat("Reading mod1 files\n") -input_train_mod1 <- anndata::read_h5ad(par$input_train_mod1) -input_test_mod1 <- anndata::read_h5ad(par$input_test_mod1) - -dataset_id <- input_train_mod1$uns[["dataset_id"]] - -cat("Performing DR on the mod1 values\n") -dr <- lmds::lmds( - rbind(input_train_mod1$layers[["normalized"]], input_test_mod1$layers[["normalized"]]), - ndim = par$n_pcs, - distance_method = par$distance_method -) - -ix <- seq_len(nrow(input_train_mod1)) -dr_train <- as.data.frame(dr[ix, , drop = FALSE]) -dr_test <- as.data.frame(dr[-ix, , drop = FALSE]) -dr_train <- dr[ix, , drop = FALSE] -dr_test <- dr[-ix, , drop = FALSE] - -rm(input_train_mod1, input_test_mod1) -gc() - - -cat("Reading mod2 files\n") -X_mod2 <- anndata::read_h5ad(par$input_train_mod2)$layers[["normalized"]] - -cat("Predicting for each column in modality 2\n") -preds <- pbapply::pblapply( - seq_len(ncol(X_mod2)), - cl = n_cores, - function(i) { - y <- X_mod2[, i] - uy <- unique(y) - if (length(uy) > 1) { - rf <- ranger::ranger( - x = dr_train, - y = y, - num.trees = par$n_trees - ) - stats::predict(rf, dr_test)$prediction - } else { - rep(uy, nrow(dr_test)) - } - } -) - -cat("Creating outputs object\n") -prediction <- Matrix::Matrix(do.call(cbind, preds), sparse = TRUE) -rownames(prediction) <- rownames(dr_test) -colnames(prediction) <- colnames(X_mod2) - -out <- anndata::AnnData( - layers = list(normalized = prediction), - shape = dim(prediction), - uns = list( - dataset_id = dataset_id, - method_id = meta$functionality_name - ) -) - - -cat("Writing predictions to file\n") -zzz <- out$write_h5ad(par$output, compression = "gzip") diff --git a/src/tasks/predict_modality/methods/simple_mlp/predict/config.vsh.yaml b/src/tasks/predict_modality/methods/simple_mlp/predict/config.vsh.yaml deleted file mode 100644 index ef972e416f..0000000000 --- a/src/tasks/predict_modality/methods/simple_mlp/predict/config.vsh.yaml +++ /dev/null @@ -1,21 +0,0 @@ -__merge__: ../../../api/comp_method_predict.yaml -functionality: - name: simplemlp_predict - resources: - - type: python_script - path: script.py - - path: ../resources/ -platforms: - - type: docker - # image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime - image: openproblems/base_pytorch_nvidia:1.0.0 - # run_args: ["--gpus all --ipc=host"] - setup: - - 
type: python - pypi: - - scikit-learn - - scanpy - - pytorch-lightning - - type: nextflow - directives: - label: [highmem, hightime, midcpu, gpu, highsharedmem] \ No newline at end of file diff --git a/src/tasks/predict_modality/methods/simple_mlp/predict/script.py b/src/tasks/predict_modality/methods/simple_mlp/predict/script.py deleted file mode 100644 index b67284e348..0000000000 --- a/src/tasks/predict_modality/methods/simple_mlp/predict/script.py +++ /dev/null @@ -1,104 +0,0 @@ -from glob import glob -import sys -import numpy as np -from scipy.sparse import csc_matrix -import anndata as ad -import torch -from torch.utils.data import TensorDataset,DataLoader - -## VIASH START -par = { - 'input_train_mod1': 'resources_test/predict_modality/openproblems_neurips2021/bmmc_multiome/swap/train_mod1.h5ad', - 'input_train_mod2': 'resources_test/predict_modality/openproblems_neurips2021/bmmc_multiome/swap/train_mod2.h5ad', - 'input_test_mod1': 'resources_test/predict_modality/openproblems_neurips2021/bmmc_multiome/swap/test_mod1.h5ad', - 'input_model': 'output/model', - 'output': 'output/prediction' -} -meta = { - 'resources_dir': 'src/tasks/predict_modality/methods/simple_mlp', - 'cpus': 10 -} -## VIASH END - -resources_dir = f"{meta['resources_dir']}/resources" -sys.path.append(resources_dir) -from models import MLP -import utils - -def _predict(model,dl): - model = model.cuda() - model.eval() - yps = [] - for x in dl: - with torch.no_grad(): - yp = model(x[0].cuda()) - yps.append(yp.detach().cpu().numpy()) - yp = np.vstack(yps) - return yp - - -print('Load data', flush=True) -input_train_mod2 = ad.read_h5ad(par['input_train_mod2']) -input_test_mod1 = ad.read_h5ad(par['input_test_mod1']) - -# determine variables -mod_1 = input_test_mod1.uns['modality'] -mod_2 = input_train_mod2.uns['modality'] - -task = f'{mod_1}2{mod_2}' - -print('Load ymean', flush=True) -ymean_path = f"{par['input_model']}/{task}_ymean.npy" -ymean = np.load(ymean_path) - -print('Start predict', flush=True) -if task == 'GEX2ATAC': - y_pred = ymean*np.ones([input_test_mod1.n_obs, input_test_mod1.n_vars]) -else: - folds = [0, 1, 2] - - ymean = torch.from_numpy(ymean).float() - yaml_path=f"{resources_dir}/yaml/mlp_{task}.yaml" - config = utils.load_yaml(yaml_path) - X = input_test_mod1.layers["normalized"].toarray() - X = torch.from_numpy(X).float() - - te_ds = TensorDataset(X) - - yp = 0 - for fold in folds: - # load_path = f"{par['input_model']}/{task}_fold_{fold}/version_0/checkpoints/*" - load_path = f"{par['input_model']}/{task}_fold_{fold}/**.ckpt" - print(load_path) - ckpt = glob(load_path)[0] - model_inf = MLP.load_from_checkpoint( - ckpt, - in_dim=X.shape[1], - out_dim=input_test_mod1.n_vars, - ymean=ymean, - config=config - ) - te_loader = DataLoader( - te_ds, - batch_size=config.batch_size, - num_workers=0, - shuffle=False, - drop_last=False - ) - yp = yp + _predict(model_inf, te_loader) - - y_pred = yp/len(folds) - -y_pred = csc_matrix(y_pred) - -adata = ad.AnnData( - layers={"normalized": y_pred}, - shape=y_pred.shape, - uns={ - 'dataset_id': input_test_mod1.uns['dataset_id'], - 'method_id': meta['functionality_name'], - }, -) - -print('Write data', flush=True) -adata.write_h5ad(par['output'], compression = "gzip") \ No newline at end of file diff --git a/src/tasks/predict_modality/methods/simple_mlp/resources/models.py b/src/tasks/predict_modality/methods/simple_mlp/resources/models.py deleted file mode 100644 index 25ce9b2995..0000000000 --- a/src/tasks/predict_modality/methods/simple_mlp/resources/models.py +++ 
/dev/null @@ -1,68 +0,0 @@ -import torch -import pytorch_lightning as pl -import torch.nn as nn -import torch.nn.functional as F - -class MLP(pl.LightningModule): - def __init__(self,in_dim,out_dim,ymean,config): - super(MLP, self).__init__() - self.ymean = ymean.cuda() - H1 = config.H1 - H2 = config.H2 - p = config.dropout - self.config = config - self.fc1 = nn.Linear(in_dim, H1) - self.fc2 = nn.Linear(H1,H2) - self.fc3 = nn.Linear(H1+H2, out_dim) - self.dp2 = nn.Dropout(p=p) - - def forward(self, x): - x0 = x - x1 = F.relu(self.fc1(x)) - x1 = self.dp2(x1) - x = F.relu(self.fc2(x1)) - x = torch.cat([x,x1],dim=1) - x = self.fc3(x) - x = self.apply_mask(x) - return x - - def apply_mask(self,yp): - tmp = torch.ones_like(yp).float()*self.ymean - mask = tmp
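# ---------------------------------------------------------------------------
# Editor's note (not part of the patch): the simple_mlp model above uses a
# small skip connection -- the first hidden layer's activations are
# concatenated with the second hidden layer's before the output projection.
# A plain-PyTorch sketch of that forward pass, leaving out the Lightning
# plumbing and the ymean-based masking (which continues past this excerpt);
# layer sizes are illustrative assumptions.
import torch
import torch.nn as nn
import torch.nn.functional as F

class SkipMLP(nn.Module):
    def __init__(self, in_dim, out_dim, h1=256, h2=128, dropout=0.1):
        super().__init__()
        self.fc1 = nn.Linear(in_dim, h1)
        self.fc2 = nn.Linear(h1, h2)
        self.fc3 = nn.Linear(h1 + h2, out_dim)  # takes [fc2 output, fc1 output]
        self.dp = nn.Dropout(p=dropout)

    def forward(self, x):
        x1 = self.dp(F.relu(self.fc1(x)))
        x2 = F.relu(self.fc2(x1))
        return self.fc3(torch.cat([x2, x1], dim=1))

y = SkipMLP(in_dim=50, out_dim=10)(torch.randn(4, 50))
print(y.shape)  # torch.Size([4, 10])
# ---------------------------------------------------------------------------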