add harmony and harmonypy components

openproblems-bio · Sep 24, 2024 · ed4a0a5 · ed4a0a5
1 parent ec1dc7b
commit ed4a0a5
Show file tree

Hide file tree

Showing 4 changed files with 159 additions and 0 deletions.
diff --git a/src/methods/harmony/config.vsh.yaml b/src/methods/harmony/config.vsh.yaml
@@ -0,0 +1,33 @@
+__merge__: /src/api/comp_method.yaml
+name: harmony
+label: Harmony
+summary: Fast, sensitive and accurate integration of single-cell data with Harmony
+description: |
+  Harmony is a general-purpose R package with an efficient algorithm for integrating multiple data sets. 
+  It is especially useful for large single-cell datasets such as single-cell RNA-seq.
+references:
+  # Korsunsky, I., Millard, N., Fan, J. et al.
+  # Fast, sensitive and accurate integration of single-cell data with Harmony.
+  # Nat Methods 16, 1289–1296 (2019). https://doi.org/10.1038/s41592-019-0619-0
+  doi: 10.1038/s41592-019-0619-0
+links:
+  repository: https://github.com/immunogenomics/harmony
+  documentation: https://portals.broadinstitute.org/harmony
+info:
+  method_types: [embedding]
+  preferred_normalization: log_cp10k
+resources:
+  - type: r_script
+    path: script.R
+engines:
+  - type: docker
+    image: openproblems/base_r:1.0.0
+    setup:
+      - type: r
+        cran:
+          - harmony
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [lowcpu, highmem, midtime]
diff --git a/src/methods/harmony/script.R b/src/methods/harmony/script.R
@@ -0,0 +1,40 @@
+cat("Loading dependencies\n")
+# Load each required namespace up front; the return values are ignored.
+deps <- c("anndata", "Matrix", "harmony")
+invisible(lapply(deps, requireNamespace, quietly = TRUE))
+
+## VIASH START
+par <- list(
+  input = 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad',
+  output = 'output.h5ad'
+)
+meta <- list(
+  name = "harmony"
+)
+## VIASH END
+
+cat("Read input\n")
+adata <- anndata::read_h5ad(par$input)
+
+cat("Run harmony\n")
+# Run Harmony on the `X_pca` embedding, with the `batch` column of obs as metadata.
+pca_embedding <- adata$obsm[["X_pca"]]
+batch_labels <- adata$obs[["batch"]]
+out <- harmony::RunHarmony(data_mat = pca_embedding, meta_data = batch_labels)
+
+cat("Store outputs\n")
+output <- anndata::AnnData(
+  obs = adata$obs[, c()],  # `[, c()]` selects zero columns of obs
+  var = adata$var[, c()],  # `[, c()]` selects zero columns of var
+  obsm = list(
+    X_emb = out  # no trailing comma here: list() rejects an empty argument
+  ),
+  uns = list(
+    dataset_id = adata$uns[["dataset_id"]],
+    normalization_id = adata$uns[["normalization_id"]],
+    method_id = meta$name  # method identifier taken from `meta`
+  )
+)
+
+cat("Write output to file\n")
+zzz <- output$write_h5ad(par$output, compression = "gzip")  # presumably assigned to avoid auto-printing
diff --git a/src/methods/harmonypy/config.vsh.yaml b/src/methods/harmonypy/config.vsh.yaml
@@ -0,0 +1,34 @@
+__merge__: /src/api/comp_method.yaml
+name: harmonypy
+label: Harmonypy
+summary: harmonypy is a port of the harmony R package by Ilya Korsunsky.
+description: |
+  harmonypy is a Python port of Harmony, an efficient algorithm (originally released as an R package) for integrating multiple data sets.
+  It is especially useful for large single-cell datasets such as single-cell RNA-seq.
+references:
+  # Korsunsky, I., Millard, N., Fan, J. et al.
+  # Fast, sensitive and accurate integration of single-cell data with Harmony.
+  # Nat Methods 16, 1289–1296 (2019). https://doi.org/10.1038/s41592-019-0619-0
+  doi: 10.1038/s41592-019-0619-0
+links:
+  repository: https://github.com/slowkow/harmonypy
+  documentation: https://portals.broadinstitute.org/harmony
+info:
+  method_types: [embedding]
+  preferred_normalization: log_cp10k
+resources:
+  - type: python_script
+    path: script.py
+  - path: /src/utils/read_anndata_partial.py
+engines:
+  - type: docker
+    image: openproblems/base_python:1.0.0
+    setup:
+      - type: python
+        pypi:
+          - harmonypy
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [lowcpu, highmem, midtime]
diff --git a/src/methods/harmonypy/script.py b/src/methods/harmonypy/script.py
@@ -0,0 +1,52 @@
+import sys
+import anndata as ad
+import numpy as np
+import harmonypy as hm
+
+## VIASH START
+par = {  # input and output file paths assigned in this placeholder section
+    "input": "resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad",
+    "output": "output.h5ad"
+}
+meta = {  # component name and the directory that holds helper modules
+    "name": "harmonypy",
+    "resources_dir": "src/utils"
+}
+## VIASH END
+
+sys.path.append(meta["resources_dir"])  # lets the import below search resources_dir
+from read_anndata_partial import read_anndata
+
+print(">> Read input", flush=True)
+adata = read_anndata(
+    par["input"],
+    obs="obs",
+    obsm="obsm",
+    var="var",
+    uns="uns"
+)
+
+print(">> Run harmonypy", flush=True)
+out = hm.run_harmony(
+  adata.obsm["X_pca"],
+  adata.obs,
+  "batch"
+)
+
+print("Store output", flush=True)
+output = ad.AnnData(
+    obs=adata.obs[[]],
+    var=adata.var[[]],
+    obsm={
+        "X_emb": out.Z_corr.transpose()
+    },
+    shape=adata.shape,
+    uns={
+        "dataset_id": adata.uns["dataset_id"],
+        "normalization_id": adata.uns["normalization_id"],
+        "method_id": meta["name"],
+    }
+)
+
+print("Write output to file", flush=True)
+output.write_h5ad(par["output"], compression="gzip")