Merge pull request #123 from Reed-CompBio/oi2
Update Omics Integrator 2 container and usage
agitter authored Sep 16, 2023
2 parents bbfe3f3 + 7b31043 commit 17af0e9
Showing 7 changed files with 87 additions and 86 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/test-spras.yml
@@ -77,7 +77,7 @@ jobs:
- name: Pull Docker images
run: |
docker pull reedcompbio/omics-integrator-1:latest
docker pull reedcompbio/omics-integrator-2:latest
docker pull reedcompbio/omics-integrator-2:v2
docker pull reedcompbio/pathlinker:latest
docker pull reedcompbio/meo:latest
docker pull reedcompbio/mincostflow:latest
@@ -98,8 +98,8 @@ jobs:
path: docker-wrappers/OmicsIntegrator2/.
dockerfile: docker-wrappers/OmicsIntegrator2/Dockerfile
repository: reedcompbio/omics-integrator-2
tags: latest
cache_froms: reedcompbio/omics-integrator-2:latest
tags: v2
cache_froms: reedcompbio/omics-integrator-2:v2
push: false
- name: Build PathLinker Docker image
uses: docker/build-push-action@v1
7 changes: 1 addition & 6 deletions docker-wrappers/OmicsIntegrator2/Dockerfile
@@ -1,11 +1,6 @@
# Omics Integrator 2 wrapper
# https://github.com/fraenkel-lab/OmicsIntegrator2
# Activates the conda environment before running command inside container
# Uses the strategy from https://pythonspeed.com/articles/activate-conda-dockerfile/
# by Itamar Turner-Trauring
FROM continuumio/miniconda3:4.9.2

COPY environment.yml .
RUN conda env create -f environment.yml

ENTRYPOINT ["conda", "run", "--no-capture-output", "-n", "oi2"]
RUN conda env update --name base --file environment.yml --prune
12 changes: 5 additions & 7 deletions docker-wrappers/OmicsIntegrator2/README.md
@@ -2,12 +2,7 @@

A Docker image for [Omics Integrator 2](https://github.com/fraenkel-lab/OmicsIntegrator2) that is available on [DockerHub](https://hub.docker.com/repository/docker/reedcompbio/omics-integrator-2).

## Activating conda inside a Docker container

By default, an installed conda environment will not be activated inside the Docker container.
Docker does not invoke Bash as a login shell.
[This blog post](https://pythonspeed.com/articles/activate-conda-dockerfile/) provides a workaround demonstrated here in `Dockerfile` and `env.yml`.
It defines a custom ENTRYPOINT that uses `conda run` to run the command inside the conda environment.
## Building the Docker image

To create the Docker image run:
```
@@ -27,8 +22,11 @@ Test code is located in `test/OmicsIntegrator2`.
The `input` subdirectory contains test files `oi2-edges.txt` and `oi2-prizes.txt`.
The Docker wrapper can be tested with `pytest`.

## Versions:
- v1: Created a named conda environment in the container and used `ENTRYPOINT` to execute commands inside that environment. Not compatible with Singularity.
- v2: Used the environment file to update the base conda environment so the `ENTRYPOINT` command was no longer needed. Compatible with Singularity (see the sketch below).
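
The practical effect of the v2 change is that any command can be passed straight to the container, which is what makes the same image usable from Docker and from Singularity. Below is a minimal sketch (not part of this commit) of invoking the image through SPRAS's `run_container` helper, mirroring the call added to `src/omicsintegrator2.py` later in this diff; the `--help` flag and the empty volume list are illustrative assumptions.

```
# Minimal sketch: run a command in the v2 image through SPRAS's run_container helper.
# No `conda run` wrapper is needed because the base conda environment holds Omics Integrator 2.
from src.util import run_container

command = ['OmicsIntegrator', '--help']  # assumed help flag; any OmicsIntegrator command works
framework = 'docker'                     # or 'singularity' if the binary is available on the host
out = run_container(framework, 'reedcompbio/omics-integrator-2:v2', command, [], '/spras')
print(out)
```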

## TODO
- Attribute https://github.com/fraenkel-lab/OmicsIntegrator2
- Modify environment to use fraenkel-lab or [PyPI](https://pypi.org/project/OmicsIntegrator/) version instead of fork
- Document usage
- Consider `continuumio/miniconda3:4.9.2-alpine` base image
93 changes: 36 additions & 57 deletions src/omicsintegrator2.py
@@ -1,52 +1,49 @@
import os
from pathlib import Path

import docker
import pandas as pd

from src.dataset import Dataset
from src.prm import PRM
from src.util import prepare_path_docker
from src.util import add_rank_column, prepare_volume, run_container

__all__ = ['OmicsIntegrator2']


class OmicsIntegrator2(PRM):
required_inputs = ['prizes', 'edges']

def generate_inputs(data, filename_map):
def generate_inputs(data: Dataset, filename_map):
"""
Access fields from the dataset and write the required input files
Access fields from the dataset and write the required input files.
Automatically converts edge weights to edge costs.
@param data: dataset
@param filename_map: a dict mapping file types in the required_inputs to the filename for that type
@return:
"""
for input_type in OmicsIntegrator2.required_inputs:
if input_type not in filename_map:
raise ValueError(f"{input_type} filename is missing")

if data.contains_node_columns('prize'):
#NODEID is always included in the node table
# NODEID is always included in the node table
node_df = data.request_node_columns(['prize'])
elif data.contains_node_columns(['sources','targets']):
#If there aren't prizes but are sources and targets, make prizes based on them
node_df = data.request_node_columns(['sources','targets'])
elif data.contains_node_columns(['sources', 'targets']):
# If there aren't prizes but are sources and targets, make prizes based on them
node_df = data.request_node_columns(['sources', 'targets'])
node_df.loc[node_df['sources']==True, 'prize'] = 1.0
node_df.loc[node_df['targets']==True, 'prize'] = 1.0
else:
raise ValueError("Omics Integrator 2 requires node prizes or sources and targets")

#Omics Integrator already gives warnings for strange prize values, so we won't here
node_df.to_csv(filename_map['prizes'],sep='\t',index=False,columns=['NODEID','prize'],header=['name','prize'])
# Omics Integrator already gives warnings for strange prize values, so we won't here
node_df.to_csv(filename_map['prizes'], sep='\t', index=False, columns=['NODEID', 'prize'], header=['name', 'prize'])
edges_df = data.get_interactome()

#We'll have to update this when we make iteractomes more proper, but for now
# We'll have to update this when we make iteractomes more proper, but for now
# assume we always get a weight and turn it into a cost.
# use the same approach as omicsintegrator2 by adding half the max cost as the base cost.
# use the same approach as OmicsIntegrator2 by adding half the max cost as the base cost.
# if everything is less than 1 assume that these are confidences and set the max to 1
edges_df['cost'] = (max(edges_df['Weight'].max(),1.0)*1.5) - edges_df['Weight']
edges_df.to_csv(filename_map['edges'],sep='\t',index=False,columns=['Interactor1','Interactor2','cost'],header=['protein1','protein2','cost'])


edges_df['cost'] = (max(edges_df['Weight'].max(), 1.0)*1.5) - edges_df['Weight']
edges_df.to_csv(filename_map['edges'], sep='\t', index=False, columns=['Interactor1', 'Interactor2', 'cost'], header=['protein1', 'protein2', 'cost'])

# TODO add parameter validation
# TODO add reasonable default values
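
As a concrete check of the weight-to-cost conversion in `generate_inputs` above: the maximum weight is clamped to at least 1.0, the base cost is 1.5 times that maximum, and each edge's weight is subtracted from it, so higher-confidence edges receive lower costs. A small worked example with illustrative data:

```
import pandas as pd

# Illustrative interactome with confidence-style weights (all <= 1.0).
edges_df = pd.DataFrame({'Interactor1': ['A', 'B'],
                         'Interactor2': ['B', 'C'],
                         'Weight': [0.2, 0.9]})

# Same conversion as generate_inputs: clamp the max to 1.0 and use 1.5 * max as the base cost.
edges_df['cost'] = (max(edges_df['Weight'].max(), 1.0) * 1.5) - edges_df['Weight']
print(edges_df[['Weight', 'cost']])
# Weight 0.2 -> cost 1.3, Weight 0.9 -> cost 0.6: higher confidence, lower cost.
```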
@@ -63,22 +60,25 @@ def run(edges=None, prizes=None, output_file=None, w=None, b=None, g=None, noise
if edges is None or prizes is None or output_file is None:
raise ValueError('Required Omics Integrator 2 arguments are missing')

if singularity:
raise NotImplementedError('Omics Integrator 2 does not yet support Singularity')
work_dir = '/spras'

# Each volume is a tuple (src, dest)
volumes = list()

# Initialize a Docker client using environment variables
client = docker.from_env()
work_dir = Path(__file__).parent.parent.absolute()
bind_path, edge_file = prepare_volume(edges, work_dir)
volumes.append(bind_path)

edge_file = Path(edges)
prize_file = Path(prizes)
bind_path, prize_file = prepare_volume(prizes, work_dir)
volumes.append(bind_path)

out_dir = Path(output_file).parent
# Omics Integrator 2 requires that the output directory exist
Path(work_dir, out_dir).mkdir(parents=True, exist_ok=True)
out_dir.mkdir(parents=True, exist_ok=True)
bind_path, mapped_out_dir = prepare_volume(out_dir, work_dir)
volumes.append(bind_path)

command = ['OmicsIntegrator', '-e', edge_file.as_posix(), '-p', prize_file.as_posix(),
'-o', out_dir.as_posix(), '--filename', 'oi2']
command = ['OmicsIntegrator', '-e', edge_file, '-p', prize_file,
'-o', mapped_out_dir, '--filename', 'oi2']

# Add optional arguments
if w is not None:
@@ -101,34 +101,13 @@ def run(edges=None, prizes=None, output_file=None, w=None, b=None, g=None, noise

print('Running Omics Integrator 2 with arguments: {}'.format(' '.join(command)), flush=True)

#Don't perform this step on systems where permissions aren't an issue like windows
need_chown = True
try:
uid = os.getuid()
except AttributeError:
need_chown = False

try:
out = client.containers.run('reedcompbio/omics-integrator-2',
command,
stderr=True,
volumes={
prepare_path_docker(work_dir): {'bind': '/OmicsIntegrator2', 'mode': 'rw'}},
working_dir='/OmicsIntegrator2')
if need_chown:
#This command changes the ownership of output files so we don't
# get a permissions error when snakemake tries to touch the files
chown_command = " ".join(["chown",str(uid),out_dir.as_posix()+"/oi2*"])
client.containers.run('reedcompbio/omics-integrator-2',
chown_command,
stderr=True,
volumes={prepare_path_docker(work_dir): {'bind': '/OmicsIntegrator2', 'mode': 'rw'}},
working_dir='/OmicsIntegrator2')

print(out.decode('utf-8'))
finally:
# Not sure whether this is needed
client.close()
container_framework = 'singularity' if singularity else 'docker'
out = run_container(container_framework,
'reedcompbio/omics-integrator-2:v2',
command,
volumes,
work_dir)
print(out)

# TODO do we want to retain other output files?
# TODO if deleting other output files, write them all to a tmp directory and copy
@@ -157,5 +136,5 @@ def parse_output(raw_pathway_file, standardized_pathway_file):
df = pd.read_csv(raw_pathway_file, sep='\t')
df = df[df['in_solution'] == True] # Check whether this column can be empty before revising this line
df = df.take([0, 1], axis=1)
df[3] = [1 for _ in range(len(df.index))]
df = add_rank_column(df)
df.to_csv(standardized_pathway_file, header=False, index=False, sep='\t')
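
For reference, the replaced column assignment and the new `add_rank_column` call are intended to produce the same standardized output shape. A minimal sketch of the `parse_output` transformation on illustrative data, assuming `add_rank_column` appends a constant rank of 1 as the old `df[3] = [1 for _ in range(len(df.index))]` line did:

```
import pandas as pd

# Illustrative Omics Integrator 2 output table.
df = pd.DataFrame({'protein1': ['A', 'B'],
                   'protein2': ['B', 'C'],
                   'in_solution': [True, False]})

df = df[df['in_solution'] == True]  # keep only edges in the solution
df = df.take([0, 1], axis=1)        # keep the two node columns

# Assumed behavior of src.util.add_rank_column: append a constant rank of 1.
df['Rank'] = 1
print(df.to_csv(header=False, index=False, sep='\t'))
```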
4 changes: 3 additions & 1 deletion src/util.py
@@ -230,7 +230,7 @@ def hash_filename(filename: str, length: Optional[int] = None) -> str:


# Because this is called independently for each file, the same local path can be mounted to multiple volumes
def prepare_volume(filename: str, volume_base: str) -> Tuple[Tuple[PurePath, PurePath], str]:
def prepare_volume(filename: Union[str, PurePath], volume_base: Union[str, PurePath]) -> Tuple[Tuple[PurePath, PurePath], str]:
"""
Makes a file on the local file system accessible within a container by mapping the local (source) path to a new
container (destination) path and renaming the file to be relative to the destination path.
@@ -246,6 +246,8 @@ def prepare_volume(filename: str, volume_base: str) -> Tuple[Tuple[PurePath, Pur
if not base_path.is_absolute():
raise ValueError(f'Volume base must be an absolute path: {volume_base}')

if isinstance(filename, PurePath):
filename = str(filename)
filename_hash = hash_filename(filename, DEFAULT_HASH_LENGTH)
dest = PurePosixPath(base_path, filename_hash)
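
A short usage sketch of the widened signature, based on the parametrized cases added to `test/test_util.py` later in this diff; the hashed directory name is shown schematically because it depends on `hash_filename`.

```
from pathlib import PurePosixPath

from src.util import prepare_volume

# str and PurePath inputs are both accepted and map to the same container path.
for source in ('test/OmicsIntegrator2/input/oi2-prizes.txt',
               PurePosixPath('test/OmicsIntegrator2/input/oi2-prizes.txt')):
    bind_path, container_file = prepare_volume(source, '/spras')
    # bind_path is a (local absolute path, container path) tuple to mount;
    # container_file looks like '/spras/<7-character hash>/oi2-prizes.txt'.
    print(bind_path, container_file)
```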

49 changes: 37 additions & 12 deletions test/OmicsIntegrator2/test_oi2.py
@@ -1,8 +1,14 @@
import shutil
from pathlib import Path

import pytest

from src.omicsintegrator2 import OmicsIntegrator2

TEST_DIR = 'test/OmicsIntegrator2/'
EDGE_FILE = TEST_DIR+'input/oi2-edges.txt'
PRIZE_FILE = TEST_DIR+'input/oi2-prizes.txt'
OUT_FILE = Path(TEST_DIR, 'output', 'test.tsv')


class TestOmicsIntegrator2:
@@ -11,34 +17,53 @@ class TestOmicsIntegrator2:
"""
def test_oi2_required(self):
# Only include required arguments
OmicsIntegrator2.run(edges=TEST_DIR+'input/oi2-edges.txt',
prizes=TEST_DIR+'input/oi2-prizes.txt',
output_file=TEST_DIR+'output/test.tsv')
OUT_FILE.unlink(missing_ok=True)
OmicsIntegrator2.run(edges=EDGE_FILE,
prizes=PRIZE_FILE,
output_file=OUT_FILE)
assert OUT_FILE.exists()

def test_oi2_some_optional(self):
# Include optional argument
OmicsIntegrator2.run(edges=TEST_DIR+'input/oi2-edges.txt',
prizes=TEST_DIR+'input/oi2-prizes.txt',
output_file=TEST_DIR+'output/test.tsv',
OUT_FILE.unlink(missing_ok=True)
OmicsIntegrator2.run(edges=EDGE_FILE,
prizes=PRIZE_FILE,
output_file=OUT_FILE,
g=0)
assert OUT_FILE.exists()

def test_oi2_all_optional(self):
# Include all optional arguments
OmicsIntegrator2.run(edges=TEST_DIR+'input/oi2-edges.txt',
prizes=TEST_DIR+'input/oi2-prizes.txt',
output_file=TEST_DIR+'output/test.tsv',
OUT_FILE.unlink(missing_ok=True)
OmicsIntegrator2.run(edges=EDGE_FILE,
prizes=PRIZE_FILE,
output_file=OUT_FILE,
w=5,
b=1,
g=3,
noise=0.1,
noisy_edges=0,
random_terminals=0,
dummy_mode='terminals',
seed=2)
seed=2,
singularity=False)
assert OUT_FILE.exists()

def test_oi2_missing(self):
# Test the expected error is raised when required arguments are missing
with pytest.raises(ValueError):
# No output_file
OmicsIntegrator2.run(edges=TEST_DIR+'input/oi2-edges.txt',
prizes=TEST_DIR+'input/oi2-prizes.txt')
OmicsIntegrator2.run(edges=EDGE_FILE,
prizes=PRIZE_FILE)

# Only run Singularity test if the binary is available on the system
# spython is only available on Unix, but do not explicitly skip non-Unix platforms
@pytest.mark.skipif(not shutil.which('singularity'), reason='Singularity not found on system')
def test_oi2_singularity(self):
# Only include required arguments
OUT_FILE.unlink(missing_ok=True)
OmicsIntegrator2.run(edges=EDGE_FILE,
prizes=PRIZE_FILE,
output_file=OUT_FILE,
singularity=True)
assert OUT_FILE.exists()
2 changes: 2 additions & 0 deletions test/test_util.py
@@ -39,6 +39,8 @@ def test_hash_params_sha1_base32(self):
[('oi1-edges.txt', '/spras', '/spras/MG4YPNK/oi1-edges.txt'),
('test/OmicsIntegrator1/input/oi1-edges.txt', '/spras', '/spras/ZNNT3GR/oi1-edges.txt'),
('test/OmicsIntegrator1/output/', '/spras', '/spras/DPCSFJV/output'),
(PurePosixPath('test/OmicsIntegrator1/output/'), '/spras', '/spras/TNDO5TR/output'),
('test/OmicsIntegrator1/output', PurePosixPath('/spras'), '/spras/TNDO5TR/output'),
('../src', '/spras', '/spras/NNBVZ6X/src')])
def test_prepare_volume(self, filename, volume_base, expected_filename):
_, container_filename = prepare_volume(filename, volume_base)
