diff --git a/.github/workflows/test-spras.yml b/.github/workflows/test-spras.yml
index e2dbf3dd..177401dd 100644
--- a/.github/workflows/test-spras.yml
+++ b/.github/workflows/test-spras.yml
@@ -77,7 +77,7 @@ jobs:
     - name: Pull Docker images
      run: |
        docker pull reedcompbio/omics-integrator-1:latest
-        docker pull reedcompbio/omics-integrator-2:latest
+        docker pull reedcompbio/omics-integrator-2:v2
        docker pull reedcompbio/pathlinker:latest
        docker pull reedcompbio/meo:latest
        docker pull reedcompbio/mincostflow:latest
@@ -98,8 +98,8 @@ jobs:
        path: docker-wrappers/OmicsIntegrator2/.
        dockerfile: docker-wrappers/OmicsIntegrator2/Dockerfile
        repository: reedcompbio/omics-integrator-2
-        tags: latest
-        cache_froms: reedcompbio/omics-integrator-2:latest
+        tags: v2
+        cache_froms: reedcompbio/omics-integrator-2:v2
        push: false
    - name: Build PathLinker Docker image
      uses: docker/build-push-action@v1
diff --git a/docker-wrappers/OmicsIntegrator2/Dockerfile b/docker-wrappers/OmicsIntegrator2/Dockerfile
index 3bab7dd9..e0df2d2d 100644
--- a/docker-wrappers/OmicsIntegrator2/Dockerfile
+++ b/docker-wrappers/OmicsIntegrator2/Dockerfile
@@ -1,11 +1,6 @@
 # Omics Integrator 2 wrapper
 # https://github.com/fraenkel-lab/OmicsIntegrator2
-# Activates the conda environment before running command inside container
-# Uses the strategy from https://pythonspeed.com/articles/activate-conda-dockerfile/
-# by Itamar Turner-Trauring
 FROM continuumio/miniconda3:4.9.2

 COPY environment.yml .
-RUN conda env create -f environment.yml
-
-ENTRYPOINT ["conda", "run", "--no-capture-output", "-n", "oi2"]
+RUN conda env update --name base --file environment.yml --prune
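The Dockerfile change installs the packages into conda's base environment rather than a named `oi2` environment hidden behind an `ENTRYPOINT`, so the Omics Integrator 2 executables sit on the default `PATH`; that is what makes the same image usable from Singularity, since `singularity exec` does not apply the Docker `ENTRYPOINT`. A minimal smoke test of the rebuilt image could look like the following (a hypothetical check, not part of this diff; assumes Docker is installed and the image has been pulled):

```python
import subprocess

# With the environment installed into base, the OmicsIntegrator CLI should be
# directly on PATH inside the container, with no `conda run` wrapper needed
result = subprocess.run(['docker', 'run', '--rm', 'reedcompbio/omics-integrator-2:v2',
                         'OmicsIntegrator', '-h'],
                        capture_output=True, text=True, check=True)
print(result.stdout)
```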
diff --git a/docker-wrappers/OmicsIntegrator2/README.md b/docker-wrappers/OmicsIntegrator2/README.md
index e1678826..c7c7d11f 100644
--- a/docker-wrappers/OmicsIntegrator2/README.md
+++ b/docker-wrappers/OmicsIntegrator2/README.md
@@ -2,12 +2,7 @@

 A Docker image for [Omics Integrator 2](https://github.com/fraenkel-lab/OmicsIntegrator2) that is available on [DockerHub](https://hub.docker.com/repository/docker/reedcompbio/omics-integrator-2).

-## Activating conda inside a Docker container
-
-By default, an installed conda environment will not be activated inside the Docker container.
-Docker does not invoke Bash as a login shell.
-[This blog post](https://pythonspeed.com/articles/activate-conda-dockerfile/) provides a workaround demonstrated here in `Dockerfile` and `env.yml`.
-It defines a custom ENTRYPOINT that uses `conda run` to run the command inside the conda environment.
+## Building the Docker image

 To create the Docker image run:
 ```
@@ -27,8 +22,11 @@ Test code is located in `test/OmicsIntegrator2`.
 The `input` subdirectory contains test files `oi2-edges.txt` and `oi2-prizes.txt`.
 The Docker wrapper can be tested with `pytest`.

+## Versions
+- v1: Created a named conda environment in the container and used `ENTRYPOINT` to execute commands inside that environment. Not compatible with Singularity.
+- v2: Used the environment file to update the base conda environment so the `ENTRYPOINT` command was no longer needed. Compatible with Singularity.
+
 ## TODO
 - Attribute https://github.com/fraenkel-lab/OmicsIntegrator2
 - Modify environment to use fraenkel-lab or [PyPI](https://pypi.org/project/OmicsIntegrator/) version instead of fork
 - Document usage
-- Consider `continuumio/miniconda3:4.9.2-alpine` base image
\ No newline at end of file
diff --git a/src/omicsintegrator2.py b/src/omicsintegrator2.py
index e76f9b4c..0147dd49 100644
--- a/src/omicsintegrator2.py
+++ b/src/omicsintegrator2.py
@@ -1,11 +1,10 @@
-import os
 from pathlib import Path

-import docker
 import pandas as pd

+from src.dataset import Dataset
 from src.prm import PRM
-from src.util import prepare_path_docker
+from src.util import add_rank_column, prepare_volume, run_container

 __all__ = ['OmicsIntegrator2']

@@ -13,40 +12,38 @@ class OmicsIntegrator2(PRM):
     required_inputs = ['prizes', 'edges']

-    def generate_inputs(data, filename_map):
+    def generate_inputs(data: Dataset, filename_map):
         """
-        Access fields from the dataset and write the required input files
+        Access fields from the dataset and write the required input files.
+        Automatically converts edge weights to edge costs.
         @param data: dataset
         @param filename_map: a dict mapping file types in the required_inputs to the filename for that type
-        @return:
         """
         for input_type in OmicsIntegrator2.required_inputs:
             if input_type not in filename_map:
                 raise ValueError(f"{input_type} filename is missing")

         if data.contains_node_columns('prize'):
-            #NODEID is always included in the node table
+            # NODEID is always included in the node table
             node_df = data.request_node_columns(['prize'])
-        elif data.contains_node_columns(['sources','targets']):
-            #If there aren't prizes but are sources and targets, make prizes based on them
-            node_df = data.request_node_columns(['sources','targets'])
+        elif data.contains_node_columns(['sources', 'targets']):
+            # If there aren't prizes but there are sources and targets, make prizes based on them
+            node_df = data.request_node_columns(['sources', 'targets'])
             node_df.loc[node_df['sources']==True, 'prize'] = 1.0
             node_df.loc[node_df['targets']==True, 'prize'] = 1.0
         else:
             raise ValueError("Omics Integrator 2 requires node prizes or sources and targets")

-        #Omics Integrator already gives warnings for strange prize values, so we won't here
-        node_df.to_csv(filename_map['prizes'],sep='\t',index=False,columns=['NODEID','prize'],header=['name','prize'])
+        # Omics Integrator already gives warnings for strange prize values, so we won't here
+        node_df.to_csv(filename_map['prizes'], sep='\t', index=False, columns=['NODEID', 'prize'], header=['name', 'prize'])

         edges_df = data.get_interactome()

-        #We'll have to update this when we make iteractomes more proper, but for now
+        # We'll have to update this when we make interactomes more proper, but for now
         # assume we always get a weight and turn it into a cost.
-        # use the same approach as omicsintegrator2 by adding half the max cost as the base cost.
+        # use the same approach as OmicsIntegrator2 by adding half the max cost as the base cost.
         # if everything is less than 1 assume that these are confidences and set the max to 1
-        edges_df['cost'] = (max(edges_df['Weight'].max(),1.0)*1.5) - edges_df['Weight']
-        edges_df.to_csv(filename_map['edges'],sep='\t',index=False,columns=['Interactor1','Interactor2','cost'],header=['protein1','protein2','cost'])
-
-
+        edges_df['cost'] = (max(edges_df['Weight'].max(), 1.0)*1.5) - edges_df['Weight']
+        edges_df.to_csv(filename_map['edges'], sep='\t', index=False, columns=['Interactor1', 'Interactor2', 'cost'], header=['protein1', 'protein2', 'cost'])

 # TODO add parameter validation
 # TODO add reasonable default values
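The cost conversion in `generate_inputs` sets the maximum cost to 1.5 times the maximum weight (clamped to at least 1.0), which is equivalent to adding a base cost of half that maximum to every inverted weight. A quick worked example with toy data (illustrative only, not part of the diff):

```python
import pandas as pd

# Toy interactome; weights <= 1 are treated as confidences, so the max is clamped to 1.0
edges_df = pd.DataFrame({'Interactor1': ['A', 'B'],
                         'Interactor2': ['B', 'C'],
                         'Weight': [0.9, 0.4]})
# cost = 1.5 * max weight - weight, so here cost = 1.5 - weight
edges_df['cost'] = (max(edges_df['Weight'].max(), 1.0)*1.5) - edges_df['Weight']
print(edges_df['cost'].tolist())  # approximately [0.6, 1.1]
```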
@@ -63,22 +60,25 @@ def run(edges=None, prizes=None, output_file=None, w=None, b=None, g=None, noise
         if edges is None or prizes is None or output_file is None:
             raise ValueError('Required Omics Integrator 2 arguments are missing')

-        if singularity:
-            raise NotImplementedError('Omics Integrator 2 does not yet support Singularity')
+        work_dir = '/spras'
+
+        # Each volume is a tuple (src, dest)
+        volumes = list()

-        # Initialize a Docker client using environment variables
-        client = docker.from_env()
-        work_dir = Path(__file__).parent.parent.absolute()
+        bind_path, edge_file = prepare_volume(edges, work_dir)
+        volumes.append(bind_path)

-        edge_file = Path(edges)
-        prize_file = Path(prizes)
+        bind_path, prize_file = prepare_volume(prizes, work_dir)
+        volumes.append(bind_path)

         out_dir = Path(output_file).parent
         # Omics Integrator 2 requires that the output directory exist
-        Path(work_dir, out_dir).mkdir(parents=True, exist_ok=True)
+        out_dir.mkdir(parents=True, exist_ok=True)
+        bind_path, mapped_out_dir = prepare_volume(out_dir, work_dir)
+        volumes.append(bind_path)

-        command = ['OmicsIntegrator', '-e', edge_file.as_posix(), '-p', prize_file.as_posix(),
-                   '-o', out_dir.as_posix(), '--filename', 'oi2']
+        command = ['OmicsIntegrator', '-e', edge_file, '-p', prize_file,
+                   '-o', mapped_out_dir, '--filename', 'oi2']

         # Add optional arguments
         if w is not None:
@@ -101,34 +101,13 @@ def run(edges=None, prizes=None, output_file=None, w=None, b=None, g=None, noise

         print('Running Omics Integrator 2 with arguments: {}'.format(' '.join(command)), flush=True)

-        #Don't perform this step on systems where permissions aren't an issue like windows
-        need_chown = True
-        try:
-            uid = os.getuid()
-        except AttributeError:
-            need_chown = False
-
-        try:
-            out = client.containers.run('reedcompbio/omics-integrator-2',
-                                        command,
-                                        stderr=True,
-                                        volumes={
-                                            prepare_path_docker(work_dir): {'bind': '/OmicsIntegrator2', 'mode': 'rw'}},
-                                        working_dir='/OmicsIntegrator2')
-            if need_chown:
-                #This command changes the ownership of output files so we don't
-                # get a permissions error when snakemake tries to touch the files
-                chown_command = " ".join(["chown",str(uid),out_dir.as_posix()+"/oi2*"])
-                client.containers.run('reedcompbio/omics-integrator-2',
-                                      chown_command,
-                                      stderr=True,
-                                      volumes={prepare_path_docker(work_dir): {'bind': '/OmicsIntegrator2', 'mode': 'rw'}},
-                                      working_dir='/OmicsIntegrator2')
-
-            print(out.decode('utf-8'))
-        finally:
-            # Not sure whether this is needed
-            client.close()
+        container_framework = 'singularity' if singularity else 'docker'
+        out = run_container(container_framework,
+                            'reedcompbio/omics-integrator-2:v2',
+                            command,
+                            volumes,
+                            work_dir)
+        print(out)

     # TODO do we want to retain other output files?
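The refactored `run` builds its container invocation from `prepare_volume` and `run_container` instead of the `docker` Python package. Each `prepare_volume` call returns a `(src, dest)` bind tuple plus the file's path inside the container under a hashed mount point, and `run_container` dispatches the same command to Docker or Singularity. A sketch of the flow (the file path and hash are taken from the `prepare_volume` test cases further below; the command here is illustrative):

```python
from src.util import prepare_volume, run_container

# Map a local file into the container; the returned path is relative to the hashed mount
bind_path, edge_file = prepare_volume('test/OmicsIntegrator1/input/oi1-edges.txt', '/spras')
print(edge_file)  # /spras/ZNNT3GR/oi1-edges.txt

# Accumulate one bind tuple per input, then hand everything to the chosen framework
volumes = [bind_path]
out = run_container('docker',                             # or 'singularity'
                    'reedcompbio/omics-integrator-2:v2',  # image
                    ['OmicsIntegrator', '-h'],            # illustrative command
                    volumes,
                    '/spras')                             # container working directory
print(out)
```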
    # TODO if deleting other output files, write them all to a tmp directory and copy
@@ -157,5 +136,5 @@ def parse_output(raw_pathway_file, standardized_pathway_file):
         df = pd.read_csv(raw_pathway_file, sep='\t')
         df = df[df['in_solution'] == True]  # Check whether this column can be empty before revising this line
         df = df.take([0, 1], axis=1)
-        df[3] = [1 for _ in range(len(df.index))]
+        df = add_rank_column(df)
         df.to_csv(standardized_pathway_file, header=False, index=False, sep='\t')
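`add_rank_column` replaces the hand-built constant column above. Omics Integrator 2 does not rank its output edges, so every edge in the solution receives the same placeholder rank. The real helper lives in `src/util.py`; a minimal sketch of the intended effect (the column name is illustrative, and the standardized file is written without a header anyway):

```python
import pandas as pd

def add_rank_column(df: pd.DataFrame) -> pd.DataFrame:
    # Give every edge the placeholder rank 1 since OI2 output is unranked
    df['rank'] = 1
    return df
```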
diff --git a/src/util.py b/src/util.py
index 9d3fae06..05a35a01 100644
--- a/src/util.py
+++ b/src/util.py
@@ -230,7 +230,7 @@ def hash_filename(filename: str, length: Optional[int] = None) -> str:


 # Because this is called independently for each file, the same local path can be mounted to multiple volumes
-def prepare_volume(filename: str, volume_base: str) -> Tuple[Tuple[PurePath, PurePath], str]:
+def prepare_volume(filename: Union[str, PurePath], volume_base: Union[str, PurePath]) -> Tuple[Tuple[PurePath, PurePath], str]:
     """
     Makes a file on the local file system accessible within a container by mapping the local (source) path to a new
     container (destination) path and renaming the file to be relative to the destination path.
@@ -246,6 +246,8 @@ def prepare_volume(filename: str, volume_base: str) -> Tuple[Tuple[PurePath, Pur
     if not base_path.is_absolute():
         raise ValueError(f'Volume base must be an absolute path: {volume_base}')

+    if isinstance(filename, PurePath):
+        filename = str(filename)
     filename_hash = hash_filename(filename, DEFAULT_HASH_LENGTH)
     dest = PurePosixPath(base_path, filename_hash)
diff --git a/test/OmicsIntegrator2/test_oi2.py b/test/OmicsIntegrator2/test_oi2.py
index 91d90816..9470649d 100644
--- a/test/OmicsIntegrator2/test_oi2.py
+++ b/test/OmicsIntegrator2/test_oi2.py
@@ -1,8 +1,14 @@
+import shutil
+from pathlib import Path
+
 import pytest

 from src.omicsintegrator2 import OmicsIntegrator2

 TEST_DIR = 'test/OmicsIntegrator2/'
+EDGE_FILE = TEST_DIR+'input/oi2-edges.txt'
+PRIZE_FILE = TEST_DIR+'input/oi2-prizes.txt'
+OUT_FILE = Path(TEST_DIR, 'output', 'test.tsv')


 class TestOmicsIntegrator2:
@@ -11,22 +17,27 @@ class TestOmicsIntegrator2:
     """
     def test_oi2_required(self):
         # Only include required arguments
-        OmicsIntegrator2.run(edges=TEST_DIR+'input/oi2-edges.txt',
-                             prizes=TEST_DIR+'input/oi2-prizes.txt',
-                             output_file=TEST_DIR+'output/test.tsv')
+        OUT_FILE.unlink(missing_ok=True)
+        OmicsIntegrator2.run(edges=EDGE_FILE,
+                             prizes=PRIZE_FILE,
+                             output_file=OUT_FILE)
+        assert OUT_FILE.exists()

     def test_oi2_some_optional(self):
         # Include optional argument
-        OmicsIntegrator2.run(edges=TEST_DIR+'input/oi2-edges.txt',
-                             prizes=TEST_DIR+'input/oi2-prizes.txt',
-                             output_file=TEST_DIR+'output/test.tsv',
+        OUT_FILE.unlink(missing_ok=True)
+        OmicsIntegrator2.run(edges=EDGE_FILE,
+                             prizes=PRIZE_FILE,
+                             output_file=OUT_FILE,
                              g=0)
+        assert OUT_FILE.exists()

     def test_oi2_all_optional(self):
         # Include all optional arguments
-        OmicsIntegrator2.run(edges=TEST_DIR+'input/oi2-edges.txt',
-                             prizes=TEST_DIR+'input/oi2-prizes.txt',
-                             output_file=TEST_DIR+'output/test.tsv',
+        OUT_FILE.unlink(missing_ok=True)
+        OmicsIntegrator2.run(edges=EDGE_FILE,
+                             prizes=PRIZE_FILE,
+                             output_file=OUT_FILE,
                              w=5,
                              b=1,
                              g=3,
@@ -34,11 +45,25 @@ def test_oi2_all_optional(self):
                              noisy_edges=0,
                              random_terminals=0,
                              dummy_mode='terminals',
-                             seed=2)
+                             seed=2,
+                             singularity=False)
+        assert OUT_FILE.exists()

     def test_oi2_missing(self):
         # Test the expected error is raised when required arguments are missing
         with pytest.raises(ValueError):
             # No output_file
-            OmicsIntegrator2.run(edges=TEST_DIR+'input/oi2-edges.txt',
-                                 prizes=TEST_DIR+'input/oi2-prizes.txt')
+            OmicsIntegrator2.run(edges=EDGE_FILE,
+                                 prizes=PRIZE_FILE)
+
+    # Only run Singularity test if the binary is available on the system
+    # spython is only available on Unix, but do not explicitly skip non-Unix platforms
+    @pytest.mark.skipif(not shutil.which('singularity'), reason='Singularity not found on system')
+    def test_oi2_singularity(self):
+        # Only include required arguments
+        OUT_FILE.unlink(missing_ok=True)
+        OmicsIntegrator2.run(edges=EDGE_FILE,
+                             prizes=PRIZE_FILE,
+                             output_file=OUT_FILE,
+                             singularity=True)
+        assert OUT_FILE.exists()
diff --git a/test/test_util.py b/test/test_util.py
index 88ae0eb7..968ec716 100644
--- a/test/test_util.py
+++ b/test/test_util.py
@@ -39,6 +39,8 @@ def test_hash_params_sha1_base32(self):
                              [('oi1-edges.txt', '/spras', '/spras/MG4YPNK/oi1-edges.txt'),
                               ('test/OmicsIntegrator1/input/oi1-edges.txt', '/spras', '/spras/ZNNT3GR/oi1-edges.txt'),
                               ('test/OmicsIntegrator1/output/', '/spras', '/spras/DPCSFJV/output'),
+                              (PurePosixPath('test/OmicsIntegrator1/output/'), '/spras', '/spras/TNDO5TR/output'),
+                              ('test/OmicsIntegrator1/output', PurePosixPath('/spras'), '/spras/TNDO5TR/output'),
                               ('../src', '/spras', '/spras/NNBVZ6X/src')])
     def test_prepare_volume(self, filename, volume_base, expected_filename):
         _, container_filename = prepare_volume(filename, volume_base)
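One subtlety the new `test_prepare_volume` cases pin down: converting a `PurePath` to `str` drops any trailing separator, which is why the `PurePosixPath('test/OmicsIntegrator1/output/')` case hashes to `TNDO5TR`, matching the plain `'test/OmicsIntegrator1/output'` string, while the existing slash-terminated string case hashes to `DPCSFJV`:

```python
from pathlib import PurePosixPath

# str() normalizes away the trailing slash, so the hashed container path differs
# from the one produced by the slash-terminated string spelling
print(str(PurePosixPath('test/OmicsIntegrator1/output/')))  # test/OmicsIntegrator1/output
```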