From 7a7164048a16a8556538e99e0004c779707c5d08 Mon Sep 17 00:00:00 2001
From: Adrian Altenhoff
Date: Thu, 2 Nov 2023 14:32:55 +0100
Subject: [PATCH 01/22] manage python part of FastOMA with hatch

---
 pyproject.toml | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 pyproject.toml

diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..fb8a4fa
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,48 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "FastOMA"
+dynamic = ["version"]
+description = "FastOMA - a package to infer orthology information among proteomes"
+readme = "README.md"
+license = "MIT"
+requires-python = ">=3.8"
+authors = [
+    { name = "Sina Majidian", email = "sina.majidian@gmail.com" },
+    { name = "Adrian Altenhoff", email = "adrian.altenhoff@inf.ethz.ch" }
+]
+dependencies = [
+    "biopython ~=1.81",
+    "DendroPy ~=4.3",
+    "ete3 ~=3.1",
+    "lxml ~=4.3",
+    "omamer ~=2.0",
+    "pyham ~=1.1",
+    "pyparsing",
+]
+
+[project.optional-dependencies]
+nextflow = [
+    "nextflow"
+]
+
+
+[project.scripts]
+batch-roothogs = "FastOMA.batch_roothogs:batch_roothogs"
+check-input-fastoma = "FastOMA.check_input_fastoma:check_input_fastoma"
+collect-subhogs = "FastOMA.collect_subhogs:collect_subhogs"
+infer-roothogs = "FastOMA.infer_roothogs:infer_roothogs"
+infer-subhogs = "FastOMA.infer_subhogs:infer_subhogs"
+
+[project.urls]
+Homepage = "https://github.com/DessimozLab/FastOMA"
+
+[tool.hatch.version]
+path = "FastOMA/__init__.py"
+
+[tool.hatch.build.targets.sdist]
+include = [
+    "/FastOMA",
+]

From 579ab46da378c36a142b7ca5bb2c86b2752dcbec Mon Sep 17 00:00:00 2001
From: Adrian Altenhoff
Date: Thu, 2 Nov 2023 14:33:32 +0100
Subject: [PATCH 02/22] initial Dockerfile to generate container with necessary dependencies

---
 Dockerfile | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 Dockerfile

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..99ac18e
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,42 @@
+FROM python:3.11-slim as basis
+
+# set environment variables
+ENV PYTHONDONTWRITEBYTECODE 1
+ENV PYTHONUNBUFFERED 1
+
+
+FROM basis as builder
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+       build-essential \
+       fasttree \
+       libxml2 \
+       mafft \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /src
+RUN pip install --upgrade hatch pip
+COPY pyproject.toml .
+RUN python -m venv /app \
+    && hatch dep show requirements > requirements.txt \
+    && /app/bin/pip install -r requirements.txt
+
+COPY . .
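+# build the FastOMA wheel with hatch and install it into the /app virtualenv
+# that already holds the dependencies installed in the layer above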
+RUN ls -la \
+    && hatch build \
+    && ls -la dist/ \
+    && /app/bin/pip install dist/*.whl
+
+
+FROM basis as runtime
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+       fasttree \
+       libxml2 \
+       mafft \
+    && apt-get -y autoremove \
+    && apt-get -y autoclean \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY --from=builder /app /app
+ENV PATH="/app/bin:$PATH"

From 7b9d0d52ef85ded6d881c9b8d1a5bf9294860de3 Mon Sep 17 00:00:00 2001
From: Adrian Altenhoff
Date: Mon, 6 Nov 2023 07:48:51 +0100
Subject: [PATCH 03/22] adding .gitignore and .dockerignore

---
 .dockerignore | 7 +++++++
 .gitignore    | 7 +++++++
 2 files changed, 14 insertions(+)
 create mode 100644 .dockerignore
 create mode 100644 .gitignore

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..466368d
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,7 @@
+work
+.nextflow*
+.idea
+.git
+output
+testdata
+dist

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..35f96f5
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+.nextflow*
+work/
+.idea/
+dist/
+archive
+.git
+.gitignore

From ce3b3a4650ca920acc125444c23958e61714923b Mon Sep 17 00:00:00 2001
From: Adrian Altenhoff
Date: Mon, 6 Nov 2023 09:35:45 +0100
Subject: [PATCH 04/22] adding github workflow to build docker image

---
 .github/workflows/docker-image.yml | 73 ++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)
 create mode 100644 .github/workflows/docker-image.yml

diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml
new file mode 100644
index 0000000..73e5fb6
--- /dev/null
+++ b/.github/workflows/docker-image.yml
@@ -0,0 +1,73 @@
+name: Docker Image CI
+
+on:
+  push:
+  pull_request:
+  release:
+    types: [published]
+
+env:
+  TEST_TAG: dessimozlab/fastoma:test
+
+jobs:
+
+  build:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v4
+      with:
+        submodules: recursive
+
+    - name: Docker meta
+      id: meta
+      uses: docker/metadata-action@v4
+      with:
+        # list of Docker images to use as base name for tags
+        images: |
+          dessimozlab/fastoma
+        # generate Docker tags based on the following events/attributes
+        tags: |
+          type=schedule
+          type=ref,event=branch
+          type=ref,event=pr
+          type=semver,pattern={{version}}
+          type=semver,pattern={{major}}.{{minor}}
+          type=semver,pattern={{major}}
+          type=sha
+
+    - name: Set up QEMU
+      uses: docker/setup-qemu-action@v2
+
+    - name: Set up Docker Buildx
+      uses: docker/setup-buildx-action@v2
+
+    - name: Build and export to docker for testing
+      uses: docker/build-push-action@v3
+      with:
+        context: .
+        load: true
+        tags: ${{ env.TEST_TAG }}
+
+    #- name: Test
+    #  run: |
+    #    docker run --rm -i -v $PWD/tests:/input -v $PWD/tests/:/reads -v $PWD/output:/out -v $PWD/run:/run ${{ env.TEST_TAG }} --tree --standalone_path /input/marker_genes --dna_reference /input/cds-marker_genes.fasta.gz --reads /reads/sample_1.fastq --output_path /out
+    #    if [ ! -f output/tree_sample_1.nwk ] ; then exit 1; fi
+
+    - name: Login to DockerHub
+      uses: docker/login-action@v2
+      with:
+        username: ${{ secrets.DOCKER_HUB_USERNAME }}
+        password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }}
+
+    - name: Build and push
+      uses: docker/build-push-action@v3
+      with:
+        context: .
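+        # build multi-arch images; the QEMU and Buildx setup steps above make
+        # the non-native arm64 build possible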
+ platforms: linux/amd64,linux/arm64 + push: true + #${{ github.event_name != 'push' && github.event_name != 'pull_request' }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} From 62edd3b125d89786155d42e3a3c9b0b8e84c71a0 Mon Sep 17 00:00:00 2001 From: Adrian Altenhoff Date: Mon, 13 Nov 2023 16:36:43 +0100 Subject: [PATCH 05/22] [WIP] restructure pipeline such that no signaling is needed and data properly mounted --- FastOMA/batch_roothogs.py | 148 +++++++++++++++----------------------- FastOMA_light.nf | 66 +++++++++-------- README.md | 17 ++++- nextflow.config | 32 +++++++++ 4 files changed, 136 insertions(+), 127 deletions(-) create mode 100644 nextflow.config diff --git a/FastOMA/batch_roothogs.py b/FastOMA/batch_roothogs.py index 59a3ddb..55e04d2 100644 --- a/FastOMA/batch_roothogs.py +++ b/FastOMA/batch_roothogs.py @@ -1,98 +1,64 @@ - - -# from Bio import SeqIO -import os -from os import makedirs import shutil -from os import listdir from . import _config - - -def list_rhog_fastas(address_rhogs_folder): - """ - create orthoxml_to_newick.py list of rootHOG IDs stored in the folder of rHOG . - input: folder address - output: list of rhog Id (integer) - """ - rhog_files = listdir(address_rhogs_folder) - rhogid_list= [] - for rhog_file in rhog_files: - if rhog_file.split(".")[-1] == "fa" and rhog_file.split("_")[0] == "HOG": - rhogid = rhog_file.split(".")[0].split("_")[1] - rhogid_list.append(rhogid) - - return rhogid_list - - - -def folder_1h_rhog(address_rhogs_folder, output_folder_big, output_folder_rest): - - # in_folder = "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/gethog3_qfo/working_nfp/" - - #work_fldor_out = in_folder - - rhogid_list = list_rhog_fastas(address_rhogs_folder) - len(rhogid_list) - dic_rhognum_size = {} - for rhogid in rhogid_list[2:]: - rhog_i_prot_address = address_rhogs_folder + "/HOG_" + rhogid + ".fa" - # rhog_i = list(SeqIO.parse(rhog_i_prot_address, "fasta")) - rhog_i_size = os.path.getsize(rhog_i_prot_address) - dic_rhognum_size[rhogid] = rhog_i_size - len(dic_rhognum_size) - - - #list_list_rest_rhog = [[]] - list_list_rest_size = [[]] - #list_list_big = [] - - # to have at least one big and one rest rhog, this makes nextflow pipline easier, - # otherwise we need if condition collect_subhogs won't satisfy if one of them is empty - - list_list_rest_rhog = [[rhogid_list[0]],[]] # each insid list should take 1h to - list_list_rest_size = [[],[]] - list_list_big = [rhogid_list[1]] - - - - for rhognum, size in dic_rhognum_size.items(): - # print(rhognum, size) - if size > _config.big_rhog_filesize_thresh: - list_list_big.append(rhognum) - else: - if sum(list_list_rest_size[-1]) < _config.sum_list_rhogs_filesize_thresh: - list_list_rest_rhog[-1].append(rhognum) - list_list_rest_size[-1].append(size) +from pathlib import Path +logger = _config.logger_hog + + +class BatchBuilder: + def __init__(self, outdir: Path, max_size: int): + self.outdir = outdir + self.max_size = max_size + + def __enter__(self): + self.cur_batch = [] + self.cur_size = 0 + self.counter = 0 + self.outdir.mkdir(parents=True, exist_ok=True) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if len(self.cur_batch) > 0: + self._flush() + + def add_hog(self, hog_file: Path): + self.cur_batch.append(hog_file) + self.cur_size += hog_file.stat().st_size + if self.cur_size > self.max_size: + self._flush() + + def _flush(self): + batch_dir = self.outdir / str(self.counter) + batch_dir.mkdir() + for fn in self.cur_batch: + shutil.copy(fn, batch_dir) + 
logger.debug("creating batch %s with %d families; total size of files is %d", + batch_dir, len(self.cur_batch), self.cur_size) + self.cur_size = 0 + self.cur_batch = [] + + +def folder_1h_rhog(roothog_path:Path, output_folder_big:Path, output_folder_rest:Path): + # create a list of hogs in descending filesize order + hog_size_tuples = sorted([(f, f.stat().st_size) for f in roothog_path.rglob("*.fa")], key=lambda x: -x[1]) + with BatchBuilder(output_folder_big, 1) as big_hogs, \ + BatchBuilder(output_folder_rest, _config.sum_list_rhogs_filesize_thresh) as rest_hogs: + for hog, fsize in hog_size_tuples: + if fsize > _config.big_rhog_filesize_thresh: + big_hogs.add_hog(hog) else: - list_list_rest_rhog.append([rhognum]) - list_list_rest_size.append([size]) - - # makedirs(work_fldor_out+"rhogs_rest") - for folder_id, list_rhog in enumerate(list_list_rest_rhog): - # print(folder_id) - # output_folder_rest = work_fldor_out + "/rhogs_rest/" - makedirs( output_folder_rest + str(folder_id)) - for rhogid in list_rhog: - name = "HOG_" + rhogid + ".fa" - folder_rest =output_folder_rest + str(folder_id) + "/" - shutil.copy(address_rhogs_folder + name, folder_rest + name) - - # makedirs(work_fldor_out+"rhogs_big") - # output_folder_big = work_fldor_out + "rhogs_big/" - for folder_id, rhogid in enumerate(list_list_big): - name = "HOG_" + rhogid + ".fa" - folder_big = output_folder_big + "b" + str(folder_id) + "/" - makedirs(folder_big) - shutil.copy(address_rhogs_folder + name, folder_big + name) - - return 1 - + rest_hogs.add_hog(hog) def batch_roothogs(): - - input_rhog = "./omamer_rhogs/" # - output_folder_big = "./rhogs_big/" - output_folder_rest = "./rhogs_rest/" - folder_1h_rhog(input_rhog, output_folder_big, output_folder_rest) + import argparse + parser = argparse.ArgumentParser(description="Analyse roothog families and create batches for analysis") + parser.add_argument('--input-roothogs', required=True, help="folder where input roothogs are stored") + parser.add_argument('--out-big', required=True, help="folder where the big single family hogs should be stored") + parser.add_argument('--out-rest', required=True, help="folder where the remaining families should be stored in" + "batch subfolder structure.") + parser.add_argument('-v', default=0, action="count", help="incrase verbosity") + conf = parser.parse_args() + logger.setLevel(level=30 - 10 * min(conf.v, 2)) + + folder_1h_rhog(Path(conf.input_roothogs), Path(conf.out_big), Path(conf.out_rest)) diff --git a/FastOMA_light.nf b/FastOMA_light.nf index 7c1df26..009825f 100644 --- a/FastOMA_light.nf +++ b/FastOMA_light.nf @@ -17,20 +17,22 @@ params.genetrees_folder = params.output_folder + "/genetrees" process omamer_run{ time {4.h} publishDir params.hogmap_folder + input: - path proteomes_omamerdb_inputhog + tuple path(proteome), path(omamer_db), path(precomputed_hogmap_folder) + output: - path "*.hogmap" - val true + path "*.hogmap" + val true + script: """ - if [ -f ${proteomes_omamerdb_inputhog[2]}/${proteomes_omamerdb_inputhog[0]}.hogmap ] - then - cp ${proteomes_omamerdb_inputhog[2]}/${proteomes_omamerdb_inputhog[0]}.hogmap ${proteomes_omamerdb_inputhog[0]}.hogmap + if [ -f ${precomputed_hogmap_folder}/${proteome}.hogmap ] ; then + cp ${precomputed_hogmap_folder}/${proteome}.hogmap ${proteome}.hogmap else - omamer search --db ${proteomes_omamerdb_inputhog[1]} --query ${proteomes_omamerdb_inputhog[0]} --out ${proteomes_omamerdb_inputhog[0]}.hogmap + omamer search --db ${omamer_db} --query ${proteome} --out ${proteome}.hogmap fi - """ // 
--nthreads 10 + """ } @@ -41,9 +43,8 @@ process infer_roothogs{ path proteome_folder path splice_folder output: - path "omamer_rhogs" + path "omamer_rhogs/*" path "gene_id_dic_xml.pickle" - val true // nextflow-io.github.io/patterns/state-dependency/ script: """ infer-roothogs --logger-level DEBUG @@ -53,15 +54,13 @@ process infer_roothogs{ process batch_roothogs{ input: - val ready_infer_roothogs - path "omamer_rhogs" + path rhogs, stageAs: "omamer_rhogs/*" output: path "rhogs_rest/*", optional: true path "rhogs_big/*" , optional: true - val true script: """ - batch-roothogs + batch-roothogs --input-roothogs omamer_rhogs/ --out-big rhogs_big --out-rest rhogs_rest -vvv """ } @@ -85,7 +84,7 @@ process hog_big{ process hog_rest{ publishDir params.pickles_temp input: - val rhogsrest_tree_ready + tuple path(rhogsrest), path(species_tree) output: path "*.pickle" path "*.fa" , optional: true // msa if write True @@ -93,8 +92,8 @@ process hog_rest{ val true script: """ - infer-subhogs --input-rhog-folder ${rhogsrest_tree_ready[0]} --species-tree ${rhogsrest_tree_ready[1]} --fragment-detection --low-so-detection - """ // --parrallel False + infer-subhogs --input-rhog-folder ${rhogsrest} --species-tree ${species_tree} --fragment-detection --low-so-detection + """ } @@ -118,37 +117,36 @@ process collect_subhogs{ } workflow { - proteomes = Channel.fromPath(params.proteomes, type:'any' ,checkIfExists:true) + proteomes = Channel.fromPath(params.proteomes, type:'any', checkIfExists:true) proteome_folder = Channel.fromPath(params.proteome_folder) hogmap_folder = Channel.fromPath(params.hogmap_folder) splice_folder = Channel.fromPath(params.splice_folder) genetrees_folder = Channel.fromPath(params.genetrees_folder) - hogmap_in = Channel.fromPath(params.hogmap_in) + hogmap_in = Channel.fromPath(params.hogmap_in, type:'dir') pickles_temp = Channel.fromPath(params.pickles_temp) - omamerdb = Channel.fromPath(params.input_folder+"/omamerdb.h5") + omamerdb = Channel.fromPath(params.omamer_db) proteomes_omamerdb = proteomes.combine(omamerdb) - proteomes_omamerdb_inputhog = proteomes_omamerdb.combine(hogmap_in) // proteomes_omamerdb_inputhog.view{" rhogsbig ${it}"} - (hogmap, ready_omamer_run)= omamer_run(proteomes_omamerdb_inputhog) + proteomes_omamerdb_inputhog = proteomes_omamerdb.combine(hogmap_in) + + (hogmap, ready_omamer_run) = omamer_run(proteomes_omamerdb_inputhog) ready_omamer_run_c = ready_omamer_run.collect() (omamer_rhogs, gene_id_dic_xml, ready_infer_roothogs) = infer_roothogs(ready_omamer_run_c, hogmap_folder, proteome_folder, splice_folder) - ready_infer_roothogs_c = ready_infer_roothogs.collect() - (rhogs_rest_list, rhogs_big_list, ready_batch_roothogs) = batch_roothogs(ready_infer_roothogs_c, omamer_rhogs) - ready_batch_roothogs_c = ready_batch_roothogs.collect() + (rhogs_rest_list, rhogs_big_list) = batch_roothogs(omamer_rhogs) + species_tree = Channel.fromPath(params.species_tree) - rhogsbig = rhogs_big_list.flatten() - rhogsbig_tree = rhogsbig.combine(species_tree) - rhogsbig_tree_ready = rhogsbig_tree.combine(ready_batch_roothogs) // rhogsbig_tree_ready.view{"rhogsbig_tree_ready ${it}"} - (pickle_big_rhog, msas_out, genetrees_out, ready_hog_big) = hog_big(rhogsbig_tree_ready) - - rhogsrest = rhogs_rest_list.flatten() - rhogsrest_tree = rhogsrest.combine(species_tree) - rhogsrest_tree_ready = rhogsrest_tree.combine(ready_batch_roothogs_c) - (pickle_rest_rhog, msas_out_rest, genetrees_out_test, ready_hog_rest) = hog_rest(rhogsrest_tree_ready) + //rhogsbig = rhogs_big_list.flatten() + 
//rhogsbig_tree = rhogsbig.combine(species_tree)
+    //rhogsbig_tree_ready = rhogsbig_tree.combine(ready_batch_roothogs) // rhogsbig_tree_ready.view{"rhogsbig_tree_ready ${it}"}
+    //(pickle_big_rhog, msas_out, genetrees_out, ready_hog_big) = hog_big(rhogsbig_tree)
+
+    rhogsrest_tree = rhogs_rest_list.combine(species_tree)
+    //rhogsrest_tree_ready = rhogsrest_tree.combine(ready_batch_roothogs_c)
+    (pickle_rest_rhog, msas_out_rest, genetrees_out_test, ready_hog_rest) = hog_rest(rhogsrest_tree)
 
     (orthoxml_file, OrthologousGroupsFasta, OrthologousGroups_tsv, rootHOGs_tsv) = collect_subhogs(ready_hog_rest.collect(), ready_hog_big.collect(), pickles_temp, gene_id_dic_xml, omamer_rhogs)
     orthoxml_file.view{" output orthoxml file ${it}"}
 }
diff --git a/README.md b/README.md
index f9354dc..1ad49bf 100644
--- a/README.md
+++ b/README.md
@@ -40,9 +40,22 @@ which can be used with [PyHAM](https://github.com/DessimozLab/pyham).
 In summary, you need to 1) install FastOMA and its prerequisites (below), and 2) put the input files in the folder `in_folder` and 3) run FastOMA using the nextflow recipe `FastOMA_light.nf`.
 ```
-nextflow FastOMA_light.nf --input_folder /path/to/in_folder --output_folder /path/to/out_folder
+nextflow run FastOMA_light.nf -profile docker --input_folder /path/to/in_folder --output_folder /path/to/out_folder
 ```
-The script `FastOMA_light.nf` is tailored for a few species. To run FastOMA with hundreds of species, please use `FastOMA.nf`.
+The script `FastOMA_light.nf` is tailored for a few species. To run FastOMA with hundreds of species, please use `FastOMA.nf`.
+
+## More details on how to run
+We provide a Docker image for FastOMA on Docker Hub for every commit of the repository. You can specify the container
+as part of the nextflow command with the parameter `container_version`. If you want to use the container matching the
+currently checked-out commit, you can specify it in the following way:
+
+```bash
+nextflow run FastOMA.nf -profile docker \
+    --container_version "sha-$(git rev-list --max-count=1 --abbrev-commit HEAD)" \
+    --input_folder testdata/in_folder \
+    --output_folder myresult/
+```
+
 
 # How to install FastOMA
diff --git a/nextflow.config b/nextflow.config
new file mode 100644
index 0000000..866c569
--- /dev/null
+++ b/nextflow.config
@@ -0,0 +1,32 @@
+// General configuration used in all profiles
+manifest {
+    description = 'FastOMA - Infer orthology'
+    nextflowVersion = '22.10.4'
+}
+
+params {
+    container_name = "dessimozlab/fastoma"
+    container_version = "latest"
+    omamer_db = "https://omabrowser.org/All/Primates-v2.0.0.h5"
+}
+
+// Profiles configure nextflow depending on the environment (local, integration, live, etc.)
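+// For example, a profile can be selected at runtime like this (hypothetical invocation):
+//   nextflow run FastOMA_light.nf -profile singularity --input_folder in_folder --output_folder out_folder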
+profiles { + docker { + process { + container = "$params.container_name:$params.container_version" + } + docker.enabled = true + } + singularity { + process { + container = "$params.container_name:$params.container_version" + } + singularity.enabled = true + singularity.autoMounts = true + } + standard { + process.executor = 'local' + } +} + From 40950c04fec943086a73689d32812a3d30e75267 Mon Sep 17 00:00:00 2001 From: Adrian Altenhoff Date: Tue, 14 Nov 2023 10:09:24 +0100 Subject: [PATCH 06/22] [WIP] correct batching, missing collection phase --- FastOMA/_config.py | 2 +- FastOMA/batch_roothogs.py | 1 + FastOMA/infer_subhogs.py | 9 +-------- FastOMA_light.nf | 35 +++++++++++++++++++---------------- 4 files changed, 22 insertions(+), 25 deletions(-) diff --git a/FastOMA/_config.py b/FastOMA/_config.py index fe778f2..46fdf62 100644 --- a/FastOMA/_config.py +++ b/FastOMA/_config.py @@ -93,7 +93,7 @@ # batch_roothogs big_rhog_filesize_thresh = 400 * 1000 -sum_list_rhogs_filesize_thresh = 2 * 1e6 +sum_list_rhogs_filesize_thresh = 2 * 1e3 # big_rhog_filesize_thresh = 1.6 * 1000 # 600 would be better diff --git a/FastOMA/batch_roothogs.py b/FastOMA/batch_roothogs.py index 55e04d2..86de8b8 100644 --- a/FastOMA/batch_roothogs.py +++ b/FastOMA/batch_roothogs.py @@ -25,6 +25,7 @@ def add_hog(self, hog_file: Path): self.cur_size += hog_file.stat().st_size if self.cur_size > self.max_size: self._flush() + self.counter += 1 def _flush(self): batch_dir = self.outdir / str(self.counter) diff --git a/FastOMA/infer_subhogs.py b/FastOMA/infer_subhogs.py index 05670c7..2c685c9 100644 --- a/FastOMA/infer_subhogs.py +++ b/FastOMA/infer_subhogs.py @@ -3,13 +3,6 @@ from . import _utils_subhog from . import _infer_subhog from . import _config -# import os -# from os import listdir -# import os -import sys -# from os import listdir -# import _config - from ._config import logger_hog import os @@ -53,7 +46,7 @@ def infer_subhogs(): if inferhog_concurrent_on: print("parallelization for subhog inference is on.") - pickles_rhog_folder = "./" # pickles_temp/ pickle_rhogs + pickles_rhog_folder = "./pickle_hogs" # pickles_temp/ pickle_rhogs if not os.path.exists(pickles_rhog_folder): os.makedirs(pickles_rhog_folder) diff --git a/FastOMA_light.nf b/FastOMA_light.nf index 009825f..48882f3 100644 --- a/FastOMA_light.nf +++ b/FastOMA_light.nf @@ -65,7 +65,6 @@ process batch_roothogs{ } process hog_big{ - publishDir params.pickles_temp cpus 2 time {20.h} // for very big rhog it might need more, or you could re-run and add `-resume` input: @@ -82,17 +81,21 @@ process hog_big{ } process hog_rest{ - publishDir params.pickles_temp input: - tuple path(rhogsrest), path(species_tree) + each rhogsrest + path species_tree output: - path "*.pickle" - path "*.fa" , optional: true // msa if write True - path "*.nwk" , optional: true // gene trees if write True + path "pickle_hogs" + path "msa/*.fa" , optional: true // msa if write True + path "gene_trees/*.nwk" , optional: true // gene trees if write True val true script: """ - infer-subhogs --input-rhog-folder ${rhogsrest} --species-tree ${species_tree} --fragment-detection --low-so-detection + infer-subhogs --input-rhog-folder ${rhogsrest} \ + --species-tree ${species_tree} \ + --fragment-detection \ + --low-so-detection + #--out pickle_hogs """ } @@ -100,11 +103,9 @@ process hog_rest{ process collect_subhogs{ publishDir params.output_folder, mode: 'copy' input: - val ready_hog_rest - val ready_hog_big - path "pickles_temp" // this is the folder includes pickles_rhogs + path pickles, 
stageAs: "pickle_folders/?" path "gene_id_dic_xml.pickle" - path "omamer_rhogs" + path rhogs, stageAs: "omamer_rhogs/*" output: path "output_hog.orthoxml" path "OrthologousGroupsFasta" @@ -121,6 +122,7 @@ workflow { proteome_folder = Channel.fromPath(params.proteome_folder) hogmap_folder = Channel.fromPath(params.hogmap_folder) splice_folder = Channel.fromPath(params.splice_folder) + species_tree = Channel.fromPath(params.species_tree) genetrees_folder = Channel.fromPath(params.genetrees_folder) hogmap_in = Channel.fromPath(params.hogmap_in, type:'dir') @@ -135,20 +137,21 @@ workflow { (omamer_rhogs, gene_id_dic_xml, ready_infer_roothogs) = infer_roothogs(ready_omamer_run_c, hogmap_folder, proteome_folder, splice_folder) - (rhogs_rest_list, rhogs_big_list) = batch_roothogs(omamer_rhogs) + (rhogs_rest_batches, rhogs_big_list) = batch_roothogs(omamer_rhogs) + rhogs_rest_batches.flatten().view{ "batch $it" } - species_tree = Channel.fromPath(params.species_tree) //rhogsbig = rhogs_big_list.flatten() //rhogsbig_tree = rhogsbig.combine(species_tree) //rhogsbig_tree_ready = rhogsbig_tree.combine(ready_batch_roothogs) // rhogsbig_tree_ready.view{"rhogsbig_tree_ready ${it}"} //(pickle_big_rhog, msas_out, genetrees_out, ready_hog_big) = hog_big(rhogsbig_tree) - rhogsrest_tree = rhogs_rest_list.combine(species_tree) + //rhogsrest_tree = rhogs_rest_list.combine(species_tree) //rhogsrest_tree_ready = rhogsrest_tree.combine(ready_batch_roothogs_c) - (pickle_rest_rhog, msas_out_rest, genetrees_out_test, ready_hog_rest) = hog_rest(rhogsrest_tree) + (pickle_rest_rhog, msas_out_rest, genetrees_out_test, ready_hog_rest) = hog_rest(rhogs_rest_batches.flatten(), species_tree) + pickle_rest_rhog.view() - (orthoxml_file, OrthologousGroupsFasta, OrthologousGroups_tsv, rootHOGs_tsv) = collect_subhogs(ready_hog_rest.collect(), ready_hog_big.collect(), pickles_temp, gene_id_dic_xml, omamer_rhogs) + (orthoxml_file, OrthologousGroupsFasta, OrthologousGroups_tsv, rootHOGs_tsv) = collect_subhogs(pickle_rest_rhog.collect(), gene_id_dic_xml, omamer_rhogs) orthoxml_file.view{" output orthoxml file ${it}"} } From 02eb3792422fce55d2f99b70cd156eea21044f4e Mon Sep 17 00:00:00 2001 From: Adrian Altenhoff Date: Tue, 14 Nov 2023 11:57:58 +0100 Subject: [PATCH 07/22] [WIP] correct batching, orthoxml generated --- FastOMA/batch_roothogs.py | 2 +- FastOMA/collect_subhogs.py | 70 ++++++++++++++++++++++---------------- FastOMA_light.nf | 6 +++- 3 files changed, 46 insertions(+), 32 deletions(-) diff --git a/FastOMA/batch_roothogs.py b/FastOMA/batch_roothogs.py index 86de8b8..9245219 100644 --- a/FastOMA/batch_roothogs.py +++ b/FastOMA/batch_roothogs.py @@ -38,7 +38,7 @@ def _flush(self): self.cur_batch = [] -def folder_1h_rhog(roothog_path:Path, output_folder_big:Path, output_folder_rest:Path): +def folder_1h_rhog(roothog_path: Path, output_folder_big: Path, output_folder_rest: Path): # create a list of hogs in descending filesize order hog_size_tuples = sorted([(f, f.stat().st_size) for f in roothog_path.rglob("*.fa")], key=lambda x: -x[1]) with BatchBuilder(output_folder_big, 1) as big_hogs, \ diff --git a/FastOMA/collect_subhogs.py b/FastOMA/collect_subhogs.py index c10db60..af3923e 100644 --- a/FastOMA/collect_subhogs.py +++ b/FastOMA/collect_subhogs.py @@ -13,32 +13,60 @@ import os from FastOMA.zoo.hog.convert import orthoxml_to_newick from Bio import SeqIO +from pathlib import Path # This code collect subhogs and writes outputs. 

+def load_hogs(pickle_folder: Path):
+    hogs_all = []
+    cnt = 0
+    logger_hog.info("reading pickle files from %s", pickle_folder)
+    for root, dirs, files in os.walk(pickle_folder, followlinks=True):
+        for f in files:
+            if f.startswith('file_') and f.endswith('.pickle'):
+                with open(os.path.join(root, f), 'rb') as handle:
+                    cur = pickle.load(handle)
+                    hogs_all.extend(cur)
+                    cnt += 1
+                    if cnt % 500 == 0:
+                        logger_hog.info("read %d batch pickle files so far, resulting in %d roothogs so far",
+                                        cnt, len(hogs_all))
+    logger_hog.info("number of pickle files is %d.", cnt)
+    logger_hog.debug("number of hogs in all batches is %d", len(hogs_all))
+    return hogs_all
 
-def collect_subhogs():
-
-    logger_hog.info("started collecting pickle files ")
 
+def collect_subhogs():
+    import argparse
+    parser = argparse.ArgumentParser(description="collecting all computed HOGs and combine into a single orthoxml")
+    parser.add_argument('--pickle-folder', required=True,
+                        help="folder containing the pickle files. Will be searched recursively")
+    parser.add_argument('--roothogs-folder', required=True, help="folder containing the omamer roothogs")
+    parser.add_argument('--gene-id-pickle-file', required=True,
+                        help="file containing the gene-id dictionary in pickle format")
+    parser.add_argument('--out', required=False, default="output_hog.orthoxml", help="output filename in orthoxml")
+    parser.add_argument('-v', action="count", default=0)
+    conf = parser.parse_args()
+    logger_hog.setLevel(level=30 - 10 * min(conf.v, 2))
+    logger_hog.debug(conf)
+
+    write_hog_orthoxml(conf.pickle_folder, conf.out, conf.gene_id_pickle_file)
+
+
+def write_hog_orthoxml(pickle_folder, output_xml_name, gene_id_pickle_file):
     # todo as input argument/option in nextflow
     protein_format_qfo_dataset_before2022 = True #False
     # in benchmark dataset the output prot names should be short
     # tr|A0A0N7KCI6|A0A0N7KCI6_ORYSJ  # for qfo benchmark, the middle should be written in the file
 
-    pickle_folder = "./pickles_temp/"  #pickle_rhogs
-    output_xml_name = "./output_hog.orthoxml"
-    gene_id_pickle_file = "./gene_id_dic_xml.pickle"  # this file includes integer mapping of each protein
-
     orthoxml_file = ET.Element("orthoXML", attrib={"xmlns": "http://orthoXML.org/2011/", "origin": "OMA", "originVersion": "Nov 2021", "version": "0.3"})  #
 
     with open(gene_id_pickle_file, 'rb') as handle:
         gene_id_name = pickle.load(handle)
         # gene_id_name[query_species_name] = (gene_idx_integer, query_prot_name)
-    logger_hog.debug("We read the gene_id_name dictionary with"+str(len(gene_id_name))+"items")
+    logger_hog.debug("We read the gene_id_name dictionary with %d items", len(gene_id_name))
     logger_hog.debug("Now creating the header of orthoxml")
 
     # #### create the header of orthoxml ####
 
     logger_hog.debug("gene_xml is created.")
 
     # #### create the groups of orthoxml ####
-    pickle_files_adress_raw = listdir(pickle_folder)
-    pickle_files_adress = [i for i in pickle_files_adress_raw if i.endswith(".pickle") and i.startswith("file_")]
-
-    logger_hog.info("number of pickle files is "+str(len(pickle_files_adress))+".")
-    logger_hog.debug("pickle files are " + str(len(pickle_files_adress)) + ".")
-    hogs_a_rhog_xml_all = []
-    for pickle_file_adress in pickle_files_adress:
-        with open(pickle_folder + pickle_file_adress, 'rb') as handle:
-            hogs_a_rhog_xml_batch = pickle.load(handle)  # list of hog object.
-            hogs_a_rhog_xml_all.extend(hogs_a_rhog_xml_batch)
-    # hogs_rhogs_xml_all is orthoxml_to_newick.py list of hog object.
-
 
     groups_xml = ET.SubElement(orthoxml_file, "groups")
-
-    for hogs_a_rhog_xml in hogs_a_rhog_xml_all:
+    for hogs_a_rhog_xml in load_hogs(Path(pickle_folder)):
         groups_xml.append(hogs_a_rhog_xml)
 
     logger_hog.debug("converting the xml object to string.")
 
     file_xml.write(xml_str)
     file_xml.close()
 
-    logger_hog.info("orthoxml is written in " + output_xml_name)
-
-
-
-
+    logger_hog.info("orthoxml is written in %s", output_xml_name)
 
 
+def write_fasta():
     logger_hog.info("Now writing OG fasta files ")
 
diff --git a/FastOMA_light.nf b/FastOMA_light.nf
index 48882f3..be18a13 100644
--- a/FastOMA_light.nf
+++ b/FastOMA_light.nf
@@ -113,7 +113,11 @@ process collect_subhogs{
     path "rootHOGs.tsv"
   script:
     """
-    collect-subhogs
+    collect-subhogs --pickle-folder pickle_folders/ \
+        --roothogs-folder omamer_rhogs/ \
+        --gene-id-pickle-file gene_id_dic_xml.pickle \
+        --out output_hog.orthoxml \
+        -vv
     """
 }

From 151ede885789a6d85aae69876a25d1d27c193c40 Mon Sep 17 00:00:00 2001
From: Adrian Altenhoff
Date: Tue, 14 Nov 2023 15:48:10 +0100
Subject: [PATCH 08/22] [WIP] fix remaining output - big roothogs not done yet

---
 FastOMA/collect_subhogs.py | 53 ++++++++++++++++++++++----------------
 FastOMA_light.nf           |  6 ++---
 2 files changed, 34 insertions(+), 25 deletions(-)

diff --git a/FastOMA/collect_subhogs.py b/FastOMA/collect_subhogs.py
index af3923e..60e0c50 100644
--- a/FastOMA/collect_subhogs.py
+++ b/FastOMA/collect_subhogs.py
@@ -46,11 +46,22 @@ def collect_subhogs():
                         help="file containing the gene-id dictionary in pickle format")
     parser.add_argument('--out', required=False, default="output_hog.orthoxml", help="output filename in orthoxml")
     parser.add_argument('-v', action="count", default=0)
+    parser.add_argument('--roothog-tsv', default=None, required=False,
+                        help="If specified, a tsv file with the given path will be produced containing the roothog assignments as TSV file.")
+    parser.add_argument('--marker-groups-fasta', default=None, required=False,
+                        help="If specified, a folder named OrthologousFasta and a TSV file with the name provided in "
+                             "this argument will be generated that contains single copy groups, i.e. groups which have "
+                             "at most one gene per species.
Useful as phylogenetic marker genes to reconstruct species " + "trees.") conf = parser.parse_args() logger_hog.setLevel(level=30 - 10 * min(conf.v, 2)) logger_hog.debug(conf) write_hog_orthoxml(conf.pickle_folder, conf.out, conf.gene_id_pickle_file) + if conf.roothog_tsv is not None: + write_roothogs(conf.out, conf.roothog_tsv) + if conf.marker_groups_fasta is not None: + write_group_files(conf.out, conf.roothogs_folder, conf.marker_groups_fasta) def write_hog_orthoxml(pickle_folder, output_xml_name, gene_id_pickle_file): @@ -99,10 +110,13 @@ def write_hog_orthoxml(pickle_folder, output_xml_name, gene_id_pickle_file): logger_hog.info("orthoxml is written in %s", output_xml_name) -def write_fasta(): - logger_hog.info("Now writing OG fasta files ") - +def write_group_files(orthoxml: Path, roothog_folder: Path, output_file_og_tsv=None, output_fasta_groups=None): + if output_file_og_tsv is None: + output_file_og_tsv = "OrthologousGroups.tsv" + if output_fasta_groups is None: + output_fasta_groups = "OrthologousGroupsFasta" + logger_hog.info("Start writing OG fasta files") def max_og_tree(tree): for node in tree.traverse("preorder"): # for node in xml_tree.traverse(strategy="preorder", is_leaf_fn=lambda n: hasattr(n, "attriremoved") and n.attriremoved==True): @@ -145,14 +159,12 @@ def max_og_tree(tree): return og_prot_list - input_orthoxml = output_xml_name # sys.argv[1] # "out_folder/output_hog_.orthoxml" rhog_all_folder = "./omamer_rhogs/" #sys.argv[2] + "/" # "out_folder/rhogs_all/" fasta_format = "fa" # of the rhogs - output_file_og_tsv = "OrthologousGroups.tsv" - trees, species_dic = orthoxml_to_newick(input_orthoxml, return_gene_to_species=True) # encode_levels_as_nhx=False, xref_tag="protId", - print("We extracted " + str(len(trees)) + " trees in NHX format from the input HOG orthoxml" + input_orthoxml) + trees, species_dic = orthoxml_to_newick(orthoxml, return_gene_to_species=True) + logger_hog.info("We extracted %d trees in NHX format from the input HOG orthoxml %s", len(trees), orthoxml) OGs = {} for hog_id, tree_string in trees.items(): @@ -171,10 +183,9 @@ def max_og_tree(tree): print("We wrote the protein families information in the file " + output_file_og_tsv) - out_folder_ogs = "OrthologousGroupsFasta/" - os.makedirs(out_folder_ogs) + os.makedirs(output_fasta_groups) - print("start writing " + str(len(OGs)) + " OGs as fasta files in folder " + out_folder_ogs) + print("start writing " + str(len(OGs)) + " OGs as fasta files in folder " + output_fasta_groups) for hog_id, og_prot_list in OGs.items(): # hog_id="HOG_0667494_sub10524" rhog_id = "_".join(hog_id.split("_")[:2]) @@ -190,30 +201,28 @@ def max_og_tree(tree): og_prots.append(rhogs_prot) og_id = "OG_" + hog_id # one OG per rootHOG # "/HOG_"+ rhogid - SeqIO.write(og_prots, out_folder_ogs + og_id + ".fa", "fasta") - print("writing done") - + SeqIO.write(og_prots, output_fasta_groups + og_id + ".fa", "fasta") + logger_hog.info("writing done") +def write_roothogs(orthoxml, out): # import sys # input_orthoxml = output_xml_name - output_file = "rootHOGs.tsv" + if out is None: + out = "rootHOGs.tsv" toplevel_groups = [] - for grp in extract_flat_groups_at_level(input_orthoxml): + for grp in extract_flat_groups_at_level(orthoxml): toplevel_groups.append(set(g.xref for g in grp)) - # toplevel_groups is a list of sets + logger_hog.info("We extracted %d protein families from the input HOG orthoxml %s", len(toplevel_groups), orthoxml) + logger_hog.info("The first one contain %d proteins.", len(toplevel_groups[0])) - print("We extracted 
"+str(len(toplevel_groups))+" protein families from the input HOG orthoxml"+input_orthoxml) - print("The first one contain "+str(len(toplevel_groups[0]))+" proteins.") - - with open(output_file, 'w') as handle: + with open(out, 'w') as handle: for toplevel_group_idx, toplevel_group in enumerate(toplevel_groups): line_text = str(toplevel_group_idx)+"\t"+str(toplevel_group)[1:-1]+"\n" handle.write(line_text) - handle.close() - print("We wrote the protein families information in the file "+output_file) + logger_hog.info("We wrote the protein families information in the file %s", out) diff --git a/FastOMA_light.nf b/FastOMA_light.nf index be18a13..002274f 100644 --- a/FastOMA_light.nf +++ b/FastOMA_light.nf @@ -117,6 +117,8 @@ process collect_subhogs{ --roothogs-folder omamer_rhogs/ \ --gene-id-pickle-file gene_id_dic_xml.pickle \ --out output_hog.orthoxml \ + --marker-groups-fasta OrthologousGroups.tsv \ + --roothog-tsv rootHOGs.tsv \ -vv """ } @@ -150,9 +152,7 @@ workflow { //rhogsbig_tree_ready = rhogsbig_tree.combine(ready_batch_roothogs) // rhogsbig_tree_ready.view{"rhogsbig_tree_ready ${it}"} //(pickle_big_rhog, msas_out, genetrees_out, ready_hog_big) = hog_big(rhogsbig_tree) - //rhogsrest_tree = rhogs_rest_list.combine(species_tree) - //rhogsrest_tree_ready = rhogsrest_tree.combine(ready_batch_roothogs_c) - (pickle_rest_rhog, msas_out_rest, genetrees_out_test, ready_hog_rest) = hog_rest(rhogs_rest_batches.flatten(), species_tree) + (pickle_rest_rhog, msas_out_rest, genetrees_out_test) = hog_rest(rhogs_rest_batches.flatten(), species_tree) pickle_rest_rhog.view() (orthoxml_file, OrthologousGroupsFasta, OrthologousGroups_tsv, rootHOGs_tsv) = collect_subhogs(pickle_rest_rhog.collect(), gene_id_dic_xml, omamer_rhogs) From 39fa2d9af267aa773161fa9fa65c29771f395e9e Mon Sep 17 00:00:00 2001 From: Adrian Altenhoff Date: Thu, 16 Nov 2023 10:57:20 +0100 Subject: [PATCH 09/22] [WIP] integrate big_hogs part into pipeline, write combined orthoxml file. --- FastOMA/_config.py | 2 +- FastOMA/batch_roothogs.py | 1 + FastOMA/infer_roothogs.py | 4 ++-- FastOMA_light.nf | 37 +++++++++++++++++-------------------- 4 files changed, 21 insertions(+), 23 deletions(-) diff --git a/FastOMA/_config.py b/FastOMA/_config.py index 46fdf62..29d1504 100644 --- a/FastOMA/_config.py +++ b/FastOMA/_config.py @@ -92,7 +92,7 @@ inferhog_min_hog_size_xml = 2 # by setting this as 1, pyham won't work on xml output. 
# batch_roothogs -big_rhog_filesize_thresh = 400 * 1000 +big_rhog_filesize_thresh = 1300 sum_list_rhogs_filesize_thresh = 2 * 1e3 diff --git a/FastOMA/batch_roothogs.py b/FastOMA/batch_roothogs.py index 9245219..49b314e 100644 --- a/FastOMA/batch_roothogs.py +++ b/FastOMA/batch_roothogs.py @@ -23,6 +23,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): def add_hog(self, hog_file: Path): self.cur_batch.append(hog_file) self.cur_size += hog_file.stat().st_size + logger.debug("adding %s with size %d to batch %d", hog_file, hog_file.stat().st_size, self.counter) if self.cur_size > self.max_size: self._flush() self.counter += 1 diff --git a/FastOMA/infer_roothogs.py b/FastOMA/infer_roothogs.py index 09771dd..42848ea 100644 --- a/FastOMA/infer_roothogs.py +++ b/FastOMA/infer_roothogs.py @@ -33,10 +33,10 @@ def infer_roothogs(): # sys.exit(0) #folder = "/scratch/smajidi1/euk_omamer200.dev8_2/test/hogmap/FastOMA-main/testdata/in_folder/" - species_names, prot_recs_lists,fasta_format_keep = _utils_roothog.parse_proteomes() # optional input folder + species_names, prot_recs_lists, fasta_format_keep = _utils_roothog.parse_proteomes() # optional input folder prot_recs_all = _utils_roothog.add_species_name_prot_id(species_names, prot_recs_lists) - hogmaps = _utils_roothog.parse_hogmap_omamer(species_names,fasta_format_keep) # optional input folder + hogmaps = _utils_roothog.parse_hogmap_omamer(species_names, fasta_format_keep) # optional input folder splice_files = os.path.exists("./splice/") if splice_files: diff --git a/FastOMA_light.nf b/FastOMA_light.nf index 002274f..254c613 100644 --- a/FastOMA_light.nf +++ b/FastOMA_light.nf @@ -16,7 +16,7 @@ params.genetrees_folder = params.output_folder + "/genetrees" process omamer_run{ time {4.h} - publishDir params.hogmap_folder + publishDir params.hogmap_folder, mode: 'copy' input: tuple path(proteome), path(omamer_db), path(precomputed_hogmap_folder) @@ -67,16 +67,20 @@ process batch_roothogs{ process hog_big{ cpus 2 time {20.h} // for very big rhog it might need more, or you could re-run and add `-resume` - input: - val rhogsbig_tree_ready + input: + each rhogsbig + path species_tree output: - path "*.pickle" - path "*.fa", optional: true // msa if write True - path "*.nwk", optional: true // gene trees if write True - val true + path "pickle_hogs" + path "msa/*.fa" , optional: true // msa if write True + path "gene_trees/*.nwk" , optional: true // gene trees if write True script: """ - infer-subhogs --input-rhog-folder ${rhogsbig_tree_ready[0]} --species-tree ${rhogsbig_tree_ready[1]} --parallel --fragment-detection --low-so-detection + infer-subhogs --input-rhog-folder ${rhogsbig} \ + --species-tree ${species_tree} \ + --fragment-detection \ + --low-so-detection \ + --parallel """ } @@ -86,9 +90,8 @@ process hog_rest{ path species_tree output: path "pickle_hogs" - path "msa/*.fa" , optional: true // msa if write True + path "msa/*.fa" , optional: true // msa if write True path "gene_trees/*.nwk" , optional: true // gene trees if write True - val true script: """ infer-subhogs --input-rhog-folder ${rhogsrest} \ @@ -143,19 +146,13 @@ workflow { (omamer_rhogs, gene_id_dic_xml, ready_infer_roothogs) = infer_roothogs(ready_omamer_run_c, hogmap_folder, proteome_folder, splice_folder) - (rhogs_rest_batches, rhogs_big_list) = batch_roothogs(omamer_rhogs) - rhogs_rest_batches.flatten().view{ "batch $it" } - - - //rhogsbig = rhogs_big_list.flatten() - //rhogsbig_tree = rhogsbig.combine(species_tree) - //rhogsbig_tree_ready = 
rhogsbig_tree.combine(ready_batch_roothogs) // rhogsbig_tree_ready.view{"rhogsbig_tree_ready ${it}"}
-    //(pickle_big_rhog, msas_out, genetrees_out, ready_hog_big) = hog_big(rhogsbig_tree)
+    (rhogs_rest_batches, rhogs_big_batches) = batch_roothogs(omamer_rhogs)
 
+    (pickle_big_rhog, msa_out_big, genetrees_out_rest) = hog_big(rhogs_big_batches.flatten(), species_tree)
     (pickle_rest_rhog, msas_out_rest, genetrees_out_test) = hog_rest(rhogs_rest_batches.flatten(), species_tree)
-    pickle_rest_rhog.view()
+    channel.empty().concat(pickle_big_rhog, pickle_rest_rhog).set{ all_rhog_pickle }
 
-    (orthoxml_file, OrthologousGroupsFasta, OrthologousGroups_tsv, rootHOGs_tsv) = collect_subhogs(pickle_rest_rhog.collect(), gene_id_dic_xml, omamer_rhogs)
+    (orthoxml_file, OrthologousGroupsFasta, OrthologousGroups_tsv, rootHOGs_tsv) = collect_subhogs(all_rhog_pickle.collect(), gene_id_dic_xml, omamer_rhogs)
     orthoxml_file.view{" output orthoxml file ${it}"}
 }

From 2ab83614f0e1d40cb1c0ca4fc17ae9ee03b54e01 Mon Sep 17 00:00:00 2001
From: Adrian Altenhoff
Date: Fri, 17 Nov 2023 16:39:10 +0100
Subject: [PATCH 10/22] [WIP] made pipeline work again after merge

---
 FastOMA/_utils_roothog.py      |  97 +++++++-------
 FastOMA/check_fastoma_input.py | 230 +++++++++++++++++----------------
 FastOMA/infer_roothogs.py      |  64 ++++-----
 FastOMA_light.nf               |  42 +++---
 pyproject.toml                 |   2 +-
 5 files changed, 214 insertions(+), 221 deletions(-)

diff --git a/FastOMA/_utils_roothog.py b/FastOMA/_utils_roothog.py
index d6006df..122e3d3 100644
--- a/FastOMA/_utils_roothog.py
+++ b/FastOMA/_utils_roothog.py
@@ -1,3 +1,5 @@
+import collections
+import csv
 from Bio import SeqIO
 import pickle

from ._utils_subhog import logger_hog
from . import _config

HOGMapData = collections.namedtuple("HOGMapData", ("hogid", "score", "seqlen", "subfamily_medianseqlen"))

def parse_proteomes(folder=None):  # list_oma_species
    """
    parsing fasta files of proteins located in /proteome/
    using Bio.SeqIO.parse
    input: fasta files. Each fasta file is for one species.
    All proteomes should be with the same extension.
    output: prot_recs_lists: a dic with key as species name and its value list of Biopython record of species.
    """
    if folder is None:
        folder = "./proteome"

    fasta_format_keep = ""
    project_files = listdir(folder)
    logger_hog.debug("using '%s' as proteome folder, found %d files", folder, len(project_files))
    species_names = []  # query/input species name based on the file name
    for file in project_files:
        species_name, ext = file.rsplit('.', 1)
        logger_hog.debug("%s: %s | %s", file, species_name, ext)
        if ext in ("fa", "fasta"):
            species_names.append(species_name)
            fasta_format_keep = ext  # last one is stored either fa or fasta
    # todo accept all fasta formats in the input proteome folder, fasta, fa, fna, ..

    prot_recs_lists = {}  # key: species name, value is a dic of query protein Biopython records.
    # 'MYCGE': [SeqRecord(seq=Seq('MDFDK
    for species_name in species_names:
        prot_address = os.path.join(folder, species_name + "."
+ fasta_format_keep) prots_record = list(SeqIO.parse(prot_address, "fasta")) - prot_recs_lists[species_name]=prots_record + prot_recs_lists[species_name] = prots_record - logger_hog.info("There are "+str(len(species_names))+" species in the proteome folder.") + logger_hog.info("There are %d species in the proteome folder.", len(species_names)) return species_names, prot_recs_lists, fasta_format_keep - def add_species_name_prot_id(species_names, prot_recs_lists): """ adding the name of species to each protein record @@ -77,7 +83,7 @@ def add_species_name_prot_id(species_names, prot_recs_lists): return prot_recs_all -def parse_hogmap_omamer(species_names, fasta_format_keep, folder="./"): +def parse_hogmap_omamer(species_names, fasta_format_keep, folder=None): """ function for parsing output of omamer (hogmap files) located in /hogmap/ Each hogmap file correspond to one fasta file of species, with the same name. @@ -87,41 +93,29 @@ def parse_hogmap_omamer(species_names, fasta_format_keep, folder="./"): # sp|O66907|ATPA_AQUAE HOG:C0886513.1b Eukaryota 696.5519485850672 187 0.37302594463771466 0.2643067275644273 120 504 551 0.8230616302186878 output is dic of dic for all species: """ + if folder is None: + folder = "./hogmap" + hogmaps = {} - unmapped = {} + unmapped = collections.defaultdict(list) for species_name in species_names: - hogmap_address = folder + "/hogmap/" + species_name + "."+fasta_format_keep+".hogmap" - hogmap_file = open(hogmap_address, 'r') - - for line in hogmap_file: - line_strip = line.strip() - if not line_strip.startswith('!') and not line_strip.startswith('qs'): - # qseqid hogid hoglevel family_p family_count family_normcount subfamily_score subfamily_count qseqlen subfamily_medianseqlen qseq_overlap - line_split = line_strip.split("\t") - prot_id = line_split[0] - hogid = line_split[1] - score = line_split[3] - seqlen = line_split[8] - subfamily_medianseqlen = line_split[9] - - if hogid == "N/A": - if species_name in unmapped: - unmapped[species_name].append(prot_id) - else: - unmapped[species_name] = [prot_id] + hogmap_address = os.path.join(folder, species_name + "." 
+ fasta_format_keep + ".hogmap") + cur_species_hogmap = collections.defaultdict(list) + with open(hogmap_address, 'rt') as hogmap_file: + reader = csv.DictReader((line for line in hogmap_file if not line.startswith('!')), + dialect="excel-tab") + for row in reader: + if row['hogid'] == "N/A": + unmapped[species_name].append(row['qseqid']) else: - if species_name in hogmaps: - if prot_id in hogmaps[species_name]: - hogmaps[species_name][prot_id].append((hogid, score, seqlen, subfamily_medianseqlen)) - - else: - hogmaps[species_name][prot_id] = [(hogid, score, seqlen, subfamily_medianseqlen)] - else: - hogmaps[species_name] = {prot_id: [(hogid, score, seqlen, subfamily_medianseqlen)]} - - logger_hog.info("There are " + str(len(hogmaps)) + " species in the hogmap folder.") - logger_hog.info("The first species " + species_names[0] + " contains " + str(len(hogmaps[species_names[0]])) + " proteins.") - + cur_species_hogmap[row['qseqid']].append( + HOGMapData(row['hogid'], row['family_p'], row['qseqlen'], row['subfamily_medianseqlen'])) + hogmaps[species_name] = cur_species_hogmap + logger_hog.info("hogmap %s: %d proteins mapped to %d hogs, %d not mapped", + species_name, len(cur_species_hogmap), sum(len(x) for x in cur_species_hogmap.values()), + len(unmapped[species_name])) + + logger_hog.info("There are %d species in the hogmap folder.", len(hogmaps)) return hogmaps, unmapped @@ -583,14 +577,13 @@ def write_clusters(address_rhogs_folder, min_rhog_size): return cluster_iter-1 - -def parse_isoform_file(species_names, folder="."): +def parse_isoform_file(species_names, folder=None): + if folder is None: + folder = "./splice" isoform_by_gene_all = {} for species_name in species_names: # from fasta file - - file_splice_name = folder + "/splice/" + species_name + ".splice" - + file_splice_name = os.path.join(folder, species_name + ".splice") if os.path.isfile(file_splice_name): # from OMArk isoform_by_gene = [] @@ -601,12 +594,10 @@ def parse_isoform_file(species_names, folder="."): isoform_by_gene.append(splice) else: isoform_by_gene = [] - isoform_by_gene_all[species_name] = isoform_by_gene return isoform_by_gene_all - def find_nonbest_isoform(species_names, isoform_by_gene_all, hogmaps): isoform_selected = {} isoform_not_selected = {} diff --git a/FastOMA/check_fastoma_input.py b/FastOMA/check_fastoma_input.py index ee3b7f4..7f00b3e 100644 --- a/FastOMA/check_fastoma_input.py +++ b/FastOMA/check_fastoma_input.py @@ -1,65 +1,57 @@ - import sys from ete3 import Tree import os import collections +import argparse from . import _config from . import _utils_roothog -from ._utils_subhog import logger_hog - - - -""" -cd in_folder -python check_fastoma_input.py -""" +logger = _config.logger_hog -def check_proteome_files(): +def check_proteome_files(proteome_folder): + proteome_files = os.listdir(proteome_folder) + logger.info("There are %d files in the proteome folder.", len(proteome_files)) - proteome_files = os.listdir("./proteome/") - logger_hog.info("There are " + str(len(proteome_files)) + " files in the proteome folder. ") - - not_fa = [i for i in proteome_files if not (i.endswith(".fa") or i.endswith(".fasta") )] + not_fa = [i for i in proteome_files if not (i.endswith(".fa") or i.endswith(".fasta"))] if not_fa: - logger_hog.warning("We expect that only fa/fasta files are in the proteome folder. 
Better to remove these " + str(not_fa) ) - - fa_fasta = [i.split(".")[-1] for i in proteome_files if i.endswith(".fa") or i.endswith(".fasta")] - if len(fa_fasta)<=2: - logger_hog.error("There are not enough proteomes in the folder ") - logger_hog.error("Check input failed. FastOMA halted!") - sys.exit(1) + logger.warning("We expect that only fa/fasta files are in the proteome folder. Better to remove these %s", not_fa) - if len(set(fa_fasta))>1: - logger_hog.warning("We expect that all fasta files are with the same format either fa or fasta, we won't include some of them " + str(proteome_files) ) + fa_fasta = [i.split(".")[-1] for i in proteome_files if i.endswith(".fa") or i.endswith(".fasta")] + if len(fa_fasta) <= 2: + logger.error("There are not enough proteomes in the folder ") + logger.error("Check input failed. FastOMA halted!") + return False + if len(set(fa_fasta)) > 1: + logger.warning("We expect that all fasta files are with the same format either fa or fasta, we won't include some of them %s.", proteome_files) + return False - return 1 + return True -def check_proteome(species_names,prot_recs_lists): - +def check_proteome(species_names, prot_recs_lists, proteome_folder): + isOk = True for species_name in species_names: num_prots = len(prot_recs_lists[species_name]) if num_prots <= 2: - logger_hog.error("The input proteome looks empty or too few proteins or Bio.SeqIO couldn't read it, in_folder/proteome/" + species_name + "."+fasta_format_keep) - - return 1 - + logger.error("The input proteome looks empty or too few proteins or Bio.SeqIO couldn't read it, %s/%s.fa", proteome_folder, species_name) + isOk = False + return isOk -def check_hogmap_files(): +def check_hogmap_files(hogmap_folder): - hogmap_files = os.listdir("./hogmap_in/") - logger_hog.info("There are " + str(len(hogmap_files)) + " files in the hogmap folder. ") - not_hogmap = [i for i in hogmap_files if not i.endswith(".hogmap") ] + hogmap_files = os.listdir(hogmap_folder) + logger.info("There are %d files in the hogmap folder.", len(hogmap_files)) + not_hogmap = [i for i in hogmap_files if not i.endswith(".hogmap")] if not_hogmap: - logger_hog.warning("We expect that only hogmap files are in the hogmap_in folder. Better to remove these " + str(not_hogmap) ) + logger.warning("We expect that only hogmap files are in the %s folder. Better to remove these %s", + hogmap_folder, not_hogmap) - species_hogmaps = ["".join(i.split(".")[:-2]) for i in hogmap_files if i.endswith(".hogmap")] - logger_hog.info("There are "+str(len(species_hogmaps))+" hogmaps.") + species_hogmaps = ["".join(i.split(".")[:-2]) for i in hogmap_files if i.endswith(".hogmap")] + logger.info("There are "+str(len(species_hogmaps))+" hogmaps.") return species_hogmaps @@ -72,63 +64,63 @@ def check_speciestree_internalnode(species_tree): if not node.is_leaf(): internal_node_name = [] if (not node.name) or len(node.name) < 1: - logger_hog.warning("one of the internal node in species tree doesn't have a name. we'll update the species tree.") + logger.warning("one of the internal node in species tree doesn't have a name. we'll update the species tree.") else: internal_node_name.append(node.name) if len(internal_node_name) != len(set(internal_node_name)): - logger_hog.warning("All the internal node names should be unique. 
One of the internal nodes is repeated:")
            logger.warning([item for item, count in collections.Counter(internal_node_name).items() if count > 1])
            logger.warning("We'll change the internal node names.")
    return 1


def check_speciestree_leaves(species_tree, species_names):
    leaves_name = [i.name for i in species_tree.get_leaves()]
    logger.info("The species tree has %d leaves.", len(leaves_name))
    if len(set(leaves_name)) != len(leaves_name):
        logger.error("Leaf names should be unique in the species tree. You may try ete3 prune")
        logger.error("Check input failed. FastOMA halted!")
        return False

    species_names_not_intree = [i for i in species_names if i not in leaves_name]
    if species_names_not_intree:
        logger.error("These species are not in the species tree:\n  %s", "\n  ".join(species_names_not_intree))
        logger.error("This is a tree template: ((AQUAE,CHLTR)inter1,MYCGE)inter2; for proteome folder with these three files: AQUAE.fa CHLTR.fa MYCGE.fa. ")
        logger.error("Check input failed. FastOMA halted!")
        return False

    leaves_tree_not_proteome = [i for i in leaves_name if i not in species_names]
    if leaves_tree_not_proteome:
        logger.warning("there are %d leaves in the species tree for which there is no proteome. So we will discard them.", len(leaves_tree_not_proteome))
    return True


def check_omamer_db(omamerdb_adress=None):
    if omamerdb_adress is None:
        logger.warning("omamer_db not passed. will assume you know what you are doing.")
        return True

    if os.path.exists(omamerdb_adress):
        if os.path.getsize(omamerdb_adress) > 10000:  # 3 bytes
            omamerdb = True
            # todo we can do some checks on version omamer v2
        else:
            logger.warning("The omamer db looks very small.
are you sure it is correct?"+omamerdb_adress) omamerdb = False else: omamerdb =False - logger_hog.info("OMAmer db does not exist.") + logger.info("OMAmer db does not exist.") return omamerdb -def add_internal_node_prune(species_tree,species_names): - +def add_internal_node_prune(species_tree, species_names, out_fname): # add name for the internal, if no name is provided, removintg special chars counter_internal = 0 node_names = set() for node in species_tree.traverse(strategy="postorder"): - if not node.is_leaf() : + if not node.is_leaf(): node_name = node.name if len(node_name) < 3 or node_name in node_names: @@ -136,27 +128,26 @@ def add_internal_node_prune(species_tree,species_names): node.name = "internal_" + str(counter_internal) counter_internal += 1 node_names.add(node.name) - logger_hog.debug("The internal node name was too small or repeated "+node_name+" which is changed to "+node.name) - elif not all(e.isalnum() or e=="_" or e=="-" for e in node_name): #any(not c.isalnum() for c in node_name): - node_name_new = ''.join(e for e in node_name if e.isalnum() or e=="_" or e=="-") # removing special chars + logger.debug("The internal node name was too small or repeated "+node_name+" which is changed to "+node.name) + elif not all(e.isalnum() or e in ("_", "-") for e in node_name): #any(not c.isalnum() for c in node_name): + node_name_new = ''.join(e for e in node_name if e.isalnum() or e in ("_", "-")) # removing special chars if node_name_new in node_names: - node.name =node_name_new+"_"+str(counter_internal) + node_name_new += "_" + str(counter_internal) counter_internal += 1 - else: - node.name=node_name_new - + logger.debug("The internal node name has special chars or repeated '%s' which is changed to '%s'", + node.name, node_name_new) + node.name = node_name_new node_names.add(node.name) - logger_hog.debug("The internal node name has special chars or repeated " + node_name + " which is changed to " + node.name) else: node_names.add(node_name) - logger_hog.info("The species tree has " + str(len(species_tree)) + " leaves") - species_tree.prune(species_names) # , preserve_branch_length=True) - logger_hog.info("After pruning, the species tree has " + str(len(species_tree)) + " leaves") + logger.info("The species tree has %d leaves", len(species_tree)) + species_tree.prune(species_names) # , preserve_branch_length=True) + logger.info("After pruning, the species tree has %d leaves", len(species_tree)) - species_tree_output= "./species_tree_checked.nwk" - species_tree.write(format=1, format_root_node=True, outfile=species_tree_output) - logger_hog.debug("The new species tree is written "+species_tree_output) + species_tree_output = "./species_tree_checked.nwk" + species_tree.write(format=1, format_root_node=True, outfile=out_fname) + logger.debug("The new species tree is written %s", out_fname) return species_tree @@ -174,12 +165,12 @@ def check_splice(isoform_by_gene_all): total_isoforms+=len(isoform) total_genes+=1 - logger_hog.debug("For "+str(spliced_species_num)+" species, out of "+str(len(isoform_by_gene_all))+" , we have splice files.") + logger.debug("For "+str(spliced_species_num)+" species, out of "+str(len(isoform_by_gene_all))+" , we have splice files.") if total_genes ==0: - logger_hog.debug("It seems that in all of the splice files, each line has only one item. Make sure that the splitter in each line is semicolon ; ") + logger.debug("It seems that in all of the splice files, each line has only one item. 
Make sure that the splitter in each line is semicolon ; ") sys.exit(1) - logger_hog.debug("In total, for "+ str(total_genes)+" genes, we have " + str(total_isoforms)+" splices.") + logger.debug("In total, for "+ str(total_genes)+" genes, we have " + str(total_isoforms)+" splices.") # make sys back return 1 @@ -190,63 +181,74 @@ def check_splice(isoform_by_gene_all): def check_fastoma_input(): - _config.set_configs() - - # print(_config.in_folder) - print(_config.logger_level) - - print(_config.species_tree_address) # nwk format + import argparse + parser = argparse.ArgumentParser(description="checking parameters for FastOMA") + parser.add_argument("--proteomes", required=True, help="Path to the folder containing the input proteomes") + parser.add_argument("--species-tree", required=True, help="Path to the input species tree file in newick format") + parser.add_argument("--out-tree", required=True, help="Path to output file for sanitised species tree. ") + parser.add_argument("--splice", help="Path to the folder containing the splice information files") + parser.add_argument("--hogmap", help="Path to the folder containing the hogmap files") + parser.add_argument("--omamer_db", help="Path to the omamer database") + parser.add_argument('-v', action="count", default=0, help="Increase verbosity to info/debug") + conf = parser.parse_args() + logger.setLevel(level=30 - 10 * min(conf.v, 2)) + logger.debug("Arguments: %s", conf) + + proteomes_ok = check_proteome_files(conf.proteomes) + if not proteomes_ok: + logger.error("Halting FastOMA because of invalid proteome input data") + sys.exit(1) - result = check_proteome_files() + species_names, prot_recs_lists, fasta_format_keep = _utils_roothog.parse_proteomes(conf.proteomes) + if not check_proteome(species_names, prot_recs_lists, conf.proteomes): + logger.error("Halting FastOMA because of invalid proteome input data") + sys.exit(1) - species_names, prot_recs_lists, fasta_format_keep = _utils_roothog.parse_proteomes() # optional input folder - check_proteome(species_names, prot_recs_lists) + #check species tree try: - species_tree = Tree(_config.species_tree_address, format=1) + species_tree = Tree(conf.species_tree, format=1) except: try: - species_tree = Tree(_config.species_tree_address) + species_tree = Tree(conf.species_tree) # todo add check fro Phyloxml except: - logger_hog.error("We have problem with parsing species tree "+str(_config.species_tree_address) + " using ete3 Tree. Maybe there are some special chars.") + logger.error("We have problem with parsing species tree %s using ete3 Tree. 
Maybe there are some special chars.", conf.species_tree) + sys.exit(1) check_speciestree_internalnode(species_tree) - check_speciestree_leaves(species_tree,species_names) + if not check_speciestree_leaves(species_tree, species_names): + logger.error("Halting FastOMA because of invalid species tree") + sys.exit(1) - add_internal_node_prune(species_tree,species_names) + add_internal_node_prune(species_tree, species_names, conf.out_tree) - hogmap_files = os.path.exists("./hogmap_in/") - species_hogmaps =[] + hogmap_files = conf.hogmap is not None and os.path.exists(conf.hogmap) + species_hogmaps = [] if hogmap_files: - species_hogmaps = check_hogmap_files() + species_hogmaps = check_hogmap_files(conf.hogmap) - #print("species_hogmaps",species_hogmaps) - #print("species_names",species_names) hogmap_complete = False if set(species_hogmaps) == set(species_names): hogmap_complete = True - # print("hogmap_complete", hogmap_complete) - omamerdb = check_omamer_db() + omamerdb = check_omamer_db(conf.omamer_db) if not(omamerdb or hogmap_complete): - logger_hog.error("OMAmer db does not exit and no hogmap provided. ") - logger_hog.error("Check input failed. FastOMA halted!") + logger.error("OMAmer db does not exist and no hogmap provided.") + logger.error("Check input failed. FastOMA halted!") sys.exit(1) else: - logger_hog.info("OMAmer db or hogmap exist. It looks ok. ") + logger.info("OMAmer db or hogmap exist. It looks ok.") - #todo check splice file format . if the name matches with the proteome files. - splice_files = os.path.exists("./splice/") + # todo check splice file format . if the name matches with the proteome files. + splice_files = conf.splice is not None and os.path.exists(conf.splice) if splice_files: - logger_hog.debug("splice folder exist. Let's see its inside.") - isoform_by_gene_all = _utils_roothog.parse_isoform_file(species_names) + logger.debug("splice folder exist. Let's see its inside.") + isoform_by_gene_all = _utils_roothog.parse_isoform_file(species_names, conf.splice) check_splice(isoform_by_gene_all) else: - logger_hog.info("Splice folder doesn't exist and that's ok.") - - - logger_hog.info("Input check finished ! ") + logger.info("Splice folder doesn't exist and that's ok.") + logger.info("Input check finished ! ") diff --git a/FastOMA/infer_roothogs.py b/FastOMA/infer_roothogs.py index c08cf2b..6d62b35 100644 --- a/FastOMA/infer_roothogs.py +++ b/FastOMA/infer_roothogs.py @@ -1,11 +1,9 @@ - - import os.path from shutil import which -from ._utils_subhog import logger_hog from . import _utils_roothog from . import _config +logger = _config.logger_hog """ proteomes of species as fasta files in /proteome/ @@ -14,55 +12,47 @@ """ -# if not os.path.exists(_config.in_folder): -# os.mkdir(_config.in_folder) # s - -# in_folder+"omamer_database/oma_path/OmaServer.h5" -# logger_hog.info("rHOG inferece has started. 
The oma database address is in " + _config.oma_database_address) -# (oma_db, list_oma_species) = _utils_rhog.parse_oma_db(_config.oma_database_address) -# (query_species_names, query_prot_recs) = _utils_rhog.parse_proteome(list_oma_species) - - - def infer_roothogs(): - _config.set_configs() - - # print(_config.in_folder) - print(_config.logger_level) - # import sys - # sys.exit(0) - - #folder = "/scratch/smajidi1/euk_omamer200.dev8_2/test/hogmap/FastOMA-main/testdata/in_folder/" - species_names, prot_recs_lists, fasta_format_keep = _utils_roothog.parse_proteomes() # optional input folder + import argparse + parser = argparse.ArgumentParser(description="checking parameters for FastOMA") + parser.add_argument("--proteomes", required=True, help="Path to the folder containing the input proteomes") + parser.add_argument("--splice", help="Path to the folder containing the splice information files") + parser.add_argument("--hogmap", help="Path to the folder containing the hogmap files") + parser.add_argument("--out-rhog-folder", required=True, help="Folder where the roothog fasta files are written") + parser.add_argument('-v', action="count", default=0, help="Increase verbosity to info/debug") + conf = parser.parse_args() + logger.setLevel(level=30 - 10 * min(conf.v, 2)) + logger.debug("Arguments: %s", conf) + + species_names, prot_recs_lists, fasta_format_keep = _utils_roothog.parse_proteomes(conf.proteomes) # optional input folder prot_recs_all = _utils_roothog.add_species_name_prot_id(species_names, prot_recs_lists) - hogmaps, unmapped = _utils_roothog.parse_hogmap_omamer(species_names, fasta_format_keep) # optional input folder + hogmaps, unmapped = _utils_roothog.parse_hogmap_omamer(species_names, fasta_format_keep) # optional input folder - splice_files = os.path.exists("./splice/") + splice_files = conf.splice is not None and os.path.exists(conf.splice) if splice_files: - isoform_by_gene_all = _utils_roothog.parse_isoform_file(species_names) - isoform_selected, isoform_not_selected = _utils_roothog.find_nonbest_isoform(species_names,isoform_by_gene_all,hogmaps) - _utils_roothog.write_isoform_selected(isoform_by_gene_all, isoform_selected,prot_recs_lists) + isoform_by_gene_all = _utils_roothog.parse_isoform_file(species_names, folder=conf.splice) + isoform_selected, isoform_not_selected = _utils_roothog.find_nonbest_isoform( + species_names, isoform_by_gene_all, hogmaps + ) + _utils_roothog.write_isoform_selected(isoform_by_gene_all, isoform_selected, prot_recs_lists) # for each isoform file, there will be a file ending with _selected_isoforms.tsv - hogmaps = _utils_roothog.handle_splice(hogmaps,isoform_not_selected) - + hogmaps = _utils_roothog.handle_splice(hogmaps, isoform_not_selected) rhogs_prots = _utils_roothog.group_prots_roothogs(hogmaps) - - rhogs_prots = _utils_roothog.handle_singleton(rhogs_prots,hogmaps) + rhogs_prots = _utils_roothog.handle_singleton(rhogs_prots, hogmaps) rhogs_prots = _utils_roothog.merge_rhogs(hogmaps, rhogs_prots) rhogs_prots = _utils_roothog.filter_big_roothogs(hogmaps, rhogs_prots) - address_rhogs_folder = "./temp_omamer_rhogs/" min_rhog_size = 2 - rhogid_written_list = _utils_roothog.write_rhog(rhogs_prots, prot_recs_all, address_rhogs_folder, min_rhog_size) - linclust_available=which("mmseqs") # True # + rhogid_written_list = _utils_roothog.write_rhog(rhogs_prots, prot_recs_all, conf.out_rhog_folder, min_rhog_size) + linclust_available=which("mmseqs") # True # # if memseqs is not installed the output will be empty / None if linclust_available: 
num_unmapped_singleton = _utils_roothog.collect_unmapped_singleton(rhogs_prots, unmapped, prot_recs_all, "singleton_unmapped.fa") if num_unmapped_singleton: result_linclust = _utils_roothog.run_linclust(fasta_to_cluster="singleton_unmapped.fa") - logger_hog.debug(" linclust is done "+ result_linclust) - num_clusters = _utils_roothog.write_clusters(address_rhogs_folder, min_rhog_size) - logger_hog.debug("we wrote "+str(num_clusters)+" new clusters with linclust ") + logger.debug(" linclust is done %s", result_linclust) + num_clusters = _utils_roothog.write_clusters(conf.out_rhog_folder, min_rhog_size) + logger.debug("we wrote %d new clusters with linclust ", num_clusters) diff --git a/FastOMA_light.nf b/FastOMA_light.nf index df34866..1be668f 100644 --- a/FastOMA_light.nf +++ b/FastOMA_light.nf @@ -4,22 +4,22 @@ params.input_folder = "./in_folder/" params.output_folder = "./out_folder/" params.proteome_folder = params.input_folder + "/proteome" -params.proteomes = params.proteome_folder + "/*" params.hogmap_in = params.input_folder + "/hogmap_in" - -params.hogmap_folder = params.output_folder + "/hogmap" params.splice_folder = params.input_folder + "/splice" params.species_tree = params.input_folder + "/species_tree.nwk" -params.species_tree_checked = params.output_folder + "/species_tree_checked.nwk" -params.temp_pickles = params.output_folder + "/temp_pickles" + + + +// output subfolder definition params.genetrees_folder = params.output_folder + "/genetrees" +params.hogmap_folder = params.output_folder + "/hogmap" params.temp_output = params.output_folder +"/temp_output" //"/temp_omamer_rhogs" -process check_input{ +process check_input{ publishDir params.output_folder, mode: 'copy' input: @@ -30,10 +30,15 @@ process check_input{ path splice_folder output: path "species_tree_checked.nwk" - val true script: """ - check-fastoma-input + check-fastoma-input --proteomes ${proteome_folder} \ + --species-tree ${species_tree} \ + --out-tree species_tree_checked.nwk \ + --splice ${splice_folder} \ + --hogmap ${hogmap_folder} \ + --omamer_db ${omamerdb} \ + -vv """ } @@ -77,7 +82,11 @@ process infer_roothogs{ path "selected_isoforms" , optional: true script: """ - infer-roothogs --logger-level DEBUG + infer-roothogs --proteomes ${proteome_folder} \ + --hogmap ${hogmap_folder} \ + --splice ${splice_folder} \ + --out-rhog-folder "omamer_rhogs/" \ + -vv """ } @@ -157,20 +166,21 @@ process collect_subhogs{ } workflow { - proteomes = Channel.fromPath(params.proteomes, type:'any', checkIfExists:true) - proteome_folder = Channel.fromPath(params.proteome_folder) - hogmap_folder = Channel.fromPath(params.hogmap_folder) - splice_folder = Channel.fromPath(params.splice_folder) - species_tree = Channel.fromPath(params.species_tree) + proteome_folder = Channel.fromPath(params.proteome_folder, type: "dir", checkIfExists:true).first() + proteomes = Channel.fromPath(params.proteome_folder + "/*", type:'any', checkIfExists:true) + species_tree = Channel.fromPath(params.species_tree, type: "file", checkIfExists:true).first() + + hogmap_folder = Channel.fromPath(params.hogmap_folder, type: "dir") + splice_folder = Channel.fromPath(params.splice_folder, type: "dir") - genetrees_folder = Channel.fromPath(params.genetrees_folder) + genetrees_folder = Channel.fromPath(params.genetrees_folder, type: 'dir') hogmap_in = Channel.fromPath(params.hogmap_in, type:'dir') omamerdb = Channel.fromPath(params.omamer_db) proteomes_omamerdb = proteomes.combine(omamerdb) proteomes_omamerdb_inputhog = 
proteomes_omamerdb.combine(hogmap_in) - (species_tree_checked_, ready_input_check) = check_input(proteome_folder,hogmap_in,species_tree,omamerdb,splice_folder) + (species_tree_checked_, ready_input_check) = check_input(proteome_folder, hogmap_in, species_tree, omamerdb, splice_folder) (hogmap, ready_omamer_run) = omamer_run(proteomes_omamerdb_inputhog) ready_omamer_run_c = ready_omamer_run.collect() diff --git a/pyproject.toml b/pyproject.toml index fb8a4fa..66957d5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,7 @@ nextflow = [ [project.scripts] batch-roothogs = "FastOMA.batch_roothogs:batch_roothogs" -check-input-fastoma = "FastOMA.check_input_fastoma:check_input_fastoma" +check-fastoma-input = "FastOMA.check_fastoma_input:check_fastoma_input" collect-subhogs = "FastOMA.collect_subhogs:collect_subhogs" infer-roothogs = "FastOMA.infer_roothogs:infer_roothogs" infer-subhogs = "FastOMA.infer_subhogs:infer_subhogs" From 2abc946aa3a6e95c25fb51993bdf6e59ec6ae5d0 Mon Sep 17 00:00:00 2001 From: Adrian Altenhoff Date: Mon, 20 Nov 2023 07:19:21 +0100 Subject: [PATCH 11/22] fixing pipeline, all dependencies via dataflow --- FastOMA/_utils_roothog.py | 5 +---- FastOMA/infer_roothogs.py | 2 +- FastOMA_light.nf | 25 +++++++++---------------- nextflow.config | 1 + 4 files changed, 12 insertions(+), 21 deletions(-) diff --git a/FastOMA/_utils_roothog.py b/FastOMA/_utils_roothog.py index 122e3d3..aca5c46 100644 --- a/FastOMA/_utils_roothog.py +++ b/FastOMA/_utils_roothog.py @@ -647,13 +647,10 @@ def write_isoform_selected(isoform_by_gene_all, isoform_selected, prot_recs_list write isoforms """ - selected_isoforms_folder= "selected_isoforms/" + selected_isoforms_folder = "selected_isoforms/" if not os.path.exists(selected_isoforms_folder): os.mkdir(selected_isoforms_folder) - - - for species, isoform_selected_sp in isoform_selected.items(): isoform_by_gene = isoform_by_gene_all[species] assert len(isoform_by_gene) == len(isoform_selected_sp) diff --git a/FastOMA/infer_roothogs.py b/FastOMA/infer_roothogs.py index 6d62b35..3c14a4f 100644 --- a/FastOMA/infer_roothogs.py +++ b/FastOMA/infer_roothogs.py @@ -27,7 +27,7 @@ def infer_roothogs(): species_names, prot_recs_lists, fasta_format_keep = _utils_roothog.parse_proteomes(conf.proteomes) # optional input folder prot_recs_all = _utils_roothog.add_species_name_prot_id(species_names, prot_recs_lists) - hogmaps, unmapped = _utils_roothog.parse_hogmap_omamer(species_names, fasta_format_keep) # optional input folder + hogmaps, unmapped = _utils_roothog.parse_hogmap_omamer(species_names, fasta_format_keep, folder=conf.hogmap) # optional input folder splice_files = conf.splice is not None and os.path.exists(conf.splice) if splice_files: diff --git a/FastOMA_light.nf b/FastOMA_light.nf index 1be668f..8bb68c0 100644 --- a/FastOMA_light.nf +++ b/FastOMA_light.nf @@ -10,7 +10,6 @@ params.species_tree = params.input_folder + "/species_tree.nwk" - // output subfolder definition params.genetrees_folder = params.output_folder + "/genetrees" params.hogmap_folder = params.output_folder + "/hogmap" @@ -20,7 +19,6 @@ params.temp_output = params.output_folder +"/temp_output" //"/temp_omamer_rhogs" process check_input{ - publishDir params.output_folder, mode: 'copy' input: path proteome_folder @@ -30,6 +28,7 @@ process check_input{ path splice_folder output: path "species_tree_checked.nwk" + val "check_completed" script: """ check-fastoma-input --proteomes ${proteome_folder} \ @@ -48,11 +47,10 @@ process omamer_run{ publishDir params.hogmap_folder, mode: 'copy' 
input: - tuple path(proteome), path(omamer_db), path(precomputed_hogmap_folder) + tuple path(proteome), path(omamer_db), path(precomputed_hogmap_folder), val(ready) output: path "*.hogmap" - val true script: """ @@ -68,12 +66,12 @@ process omamer_run{ process infer_roothogs{ publishDir = [ path: params.temp_output, - mode: 'copy', // pattern: "temp_output", saveAs: { filename -> filename.equals('temp_omamer_rhogs') ? null : filename } + enabled: params.debug_enabled, + //mode: 'copy', saveAs: { filename -> filename.equals('temp_omamer_rhogs') ? null : filename } ] input: - val ready_omamer_run - path hogmap_folder + path hogmaps, stageAs: "hogmaps/*" path proteome_folder path splice_folder output: @@ -83,7 +81,7 @@ process infer_roothogs{ script: """ infer-roothogs --proteomes ${proteome_folder} \ - --hogmap ${hogmap_folder} \ + --hogmap hogmaps \ --splice ${splice_folder} \ --out-rhog-folder "omamer_rhogs/" \ -vv @@ -170,21 +168,16 @@ workflow { proteomes = Channel.fromPath(params.proteome_folder + "/*", type:'any', checkIfExists:true) species_tree = Channel.fromPath(params.species_tree, type: "file", checkIfExists:true).first() - hogmap_folder = Channel.fromPath(params.hogmap_folder, type: "dir") splice_folder = Channel.fromPath(params.splice_folder, type: "dir") - genetrees_folder = Channel.fromPath(params.genetrees_folder, type: 'dir') hogmap_in = Channel.fromPath(params.hogmap_in, type:'dir') omamerdb = Channel.fromPath(params.omamer_db) - proteomes_omamerdb = proteomes.combine(omamerdb) - proteomes_omamerdb_inputhog = proteomes_omamerdb.combine(hogmap_in) - (species_tree_checked_, ready_input_check) = check_input(proteome_folder, hogmap_in, species_tree, omamerdb, splice_folder) - (hogmap, ready_omamer_run) = omamer_run(proteomes_omamerdb_inputhog) - ready_omamer_run_c = ready_omamer_run.collect() + omamer_input_channel = proteomes.combine(omamerdb).combine(hogmap_in).combine(ready_input_check) + hogmap = omamer_run(omamer_input_channel) - (omamer_rhogs, gene_id_dic_xml, ready_infer_roothogs) = infer_roothogs(ready_omamer_run_c, hogmap_folder, proteome_folder, splice_folder) + (omamer_rhogs, gene_id_dic_xml, ready_infer_roothogs) = infer_roothogs(hogmap.collect(), proteome_folder, splice_folder) (rhogs_rest_batches, rhogs_big_batches) = batch_roothogs(omamer_rhogs) diff --git a/nextflow.config b/nextflow.config index 866c569..c810e92 100644 --- a/nextflow.config +++ b/nextflow.config @@ -8,6 +8,7 @@ params { container_name = "dessimozlab/fastoma" container_version = "latest" omamer_db = "https://omabrowser.org/All/Primates-v2.0.0.h5" + debug_enabled = false } // Profiles configure nextflow depending on the environment (local, integration, live, etc.) 
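
A note on the dataflow rewiring in the patch above: check_input now emits the literal
value "check_completed" as an output channel, and that value is combined into every
omamer_run input tuple, so no OMAmer placement task can start before the input check has
succeeded. Below is a minimal, self-contained sketch of this gating pattern; the process
names, the input glob, and the echo command are illustrative placeholders, not part of
FastOMA:

    // gate.nf -- sketch of gating a fan-out process on a check step (hypothetical names)
    nextflow.enable.dsl=2

    process CHECK {
        output:
        val "check_completed"
        script:
        "true"   // stands in for any validation command; a failure stops the run here
    }

    process CONSUME {
        input:
        tuple path(f), val(ready)
        script:
        "echo processing ${f}"
    }

    workflow {
        files = Channel.fromPath("proteome/*.fa")   // placeholder input glob
        gate  = CHECK()
        // combine() appends the single gate value to every file tuple, so each
        // CONSUME task implicitly depends on CHECK having finished successfully
        CONSUME(files.combine(gate))
    }
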
From b7d5fa1cdf77f4756e6cabf6e0a8de6ed9a00a42 Mon Sep 17 00:00:00 2001 From: Adrian Altenhoff Date: Wed, 22 Nov 2023 10:26:03 +0100 Subject: [PATCH 12/22] [WIP] adding conda profile - not tested yet --- FastOMA/requirements | 166 ------------------------------------------ environment-conda.yml | 12 +++ nextflow.config | 11 ++- 3 files changed, 22 insertions(+), 167 deletions(-) delete mode 100644 FastOMA/requirements create mode 100644 environment-conda.yml diff --git a/FastOMA/requirements b/FastOMA/requirements deleted file mode 100644 index fcba280..0000000 --- a/FastOMA/requirements +++ /dev/null @@ -1,166 +0,0 @@ -$ conda list -# packages in environment at /miniconda/envs/fastoma: -# -# Name Version Build Channel -_libgcc_mutex 0.1 conda_forge conda-forge -_openmp_mutex 4.5 2_gnu conda-forge -anyio 3.7.1 pyhd8ed1ab_0 conda-forge -argon2-cffi 23.1.0 pyhd8ed1ab_0 conda-forge -argon2-cffi-bindings 21.2.0 py39hb9d737c_3 conda-forge -arrow 1.2.3 pyhd8ed1ab_0 conda-forge -asttokens 2.2.1 pyhd8ed1ab_0 conda-forge -async-lru 2.0.4 pyhd8ed1ab_0 conda-forge -attrs 23.1.0 pyh71513ae_1 conda-forge -babel 2.12.1 pyhd8ed1ab_1 conda-forge -backcall 0.2.0 pyh9f0ad1d_0 conda-forge -backports 1.0 pyhd8ed1ab_3 conda-forge -backports.functools_lru_cache 1.6.5 pyhd8ed1ab_0 conda-forge -beautifulsoup4 4.12.2 pyha770c72_0 conda-forge -biopython 1.81 pypi_0 pypi -bleach 6.0.0 pyhd8ed1ab_0 conda-forge -blosc2 2.0.0 pypi_0 pypi -brotli-python 1.0.9 py39h5a03fae_9 conda-forge -bzip2 1.0.8 h7f98852_4 conda-forge -ca-certificates 2023.7.22 hbcca054_0 conda-forge -cached-property 1.5.2 hd8ed1ab_1 conda-forge -cached_property 1.5.2 pyha770c72_1 conda-forge -certifi 2023.7.22 pyhd8ed1ab_0 conda-forge -cffi 1.15.1 py39he91dace_3 conda-forge -charset-normalizer 3.2.0 pyhd8ed1ab_0 conda-forge -comm 0.1.4 pyhd8ed1ab_0 conda-forge -cython 3.0.1 pypi_0 pypi -debugpy 1.6.8 py39h3d6467e_0 conda-forge -decorator 5.1.1 pyhd8ed1ab_0 conda-forge -defusedxml 0.7.1 pyhd8ed1ab_0 conda-forge -dendropy 4.6.1 pypi_0 pypi -entrypoints 0.4 pyhd8ed1ab_0 conda-forge -ete3 3.1.3 pypi_0 pypi -exceptiongroup 1.1.3 pyhd8ed1ab_0 conda-forge -executing 1.2.0 pyhd8ed1ab_0 conda-forge -fastoma 0.0.6 dev_0 -fasttree 2.1.11 h031d066_2 bioconda -fqdn 1.5.1 pyhd8ed1ab_0 conda-forge -future 0.18.3 pypi_0 pypi -humanfriendly 10.0 pypi_0 pypi -idna 3.4 pyhd8ed1ab_0 conda-forge -importlib-metadata 6.8.0 pyha770c72_0 conda-forge -importlib_metadata 6.8.0 hd8ed1ab_0 conda-forge -importlib_resources 6.0.1 pyhd8ed1ab_0 conda-forge -ipykernel 6.25.1 pyh71e2992_0 conda-forge -ipython 8.14.0 pyh41d4057_0 conda-forge -iqtree 2.2.3 h21ec9f0_0 bioconda -isoduration 20.11.0 pyhd8ed1ab_0 conda-forge -jedi 0.19.0 pyhd8ed1ab_0 conda-forge -jinja2 3.1.2 pyhd8ed1ab_1 conda-forge -json5 0.9.14 pyhd8ed1ab_0 conda-forge -jsonpointer 2.0 py_0 conda-forge -jsonschema 4.19.0 pyhd8ed1ab_1 conda-forge -jsonschema-specifications 2023.7.1 pyhd8ed1ab_0 conda-forge -jsonschema-with-format-nongpl 4.19.0 pyhd8ed1ab_1 conda-forge -jupyter-lsp 2.2.0 pyhd8ed1ab_0 conda-forge -jupyter_client 8.3.0 pyhd8ed1ab_0 conda-forge -jupyter_core 5.3.1 py39hf3d152e_0 conda-forge -jupyter_events 0.7.0 pyhd8ed1ab_2 conda-forge -jupyter_server 2.7.1 pyhd8ed1ab_0 conda-forge -jupyter_server_terminals 0.4.4 pyhd8ed1ab_1 conda-forge -jupyterlab 4.0.5 pyhd8ed1ab_0 conda-forge -jupyterlab_pygments 0.2.2 pyhd8ed1ab_0 conda-forge -jupyterlab_server 2.24.0 pyhd8ed1ab_0 conda-forge -ld_impl_linux-64 2.40 h41732ed_0 conda-forge -libffi 3.4.2 h7f98852_5 conda-forge -libgcc-ng 13.1.0 he5830b7_0 conda-forge 
-libgomp 13.1.0 he5830b7_0 conda-forge -libnsl 2.0.0 h7f98852_0 conda-forge -libsodium 1.0.18 h36c2ea0_1 conda-forge -libsqlite 3.43.0 h2797004_0 conda-forge -libstdcxx-ng 13.1.0 hfd8a6a1_0 conda-forge -libuuid 2.38.1 h0b41bf4_0 conda-forge -libzlib 1.2.13 hd590300_5 conda-forge -llvmlite 0.40.1 pypi_0 pypi -lxml 4.9.3 pypi_0 pypi -mafft 7.520 h031d066_2 bioconda -markupsafe 2.1.3 py39hd1e30aa_0 conda-forge -matplotlib-inline 0.1.6 pyhd8ed1ab_0 conda-forge -mistune 3.0.1 pyhd8ed1ab_0 conda-forge -msgpack 1.0.5 pypi_0 pypi -nbclient 0.8.0 pyhd8ed1ab_0 conda-forge -nbconvert-core 7.7.4 pyhd8ed1ab_0 conda-forge -nbformat 5.9.2 pyhd8ed1ab_0 conda-forge -ncurses 6.4 hcb278e6_0 conda-forge -nest-asyncio 1.5.6 pyhd8ed1ab_0 conda-forge -nextflow 23.4.3 pypi_0 pypi -notebook-shim 0.2.3 pyhd8ed1ab_0 conda-forge -numba 0.57.1 pypi_0 pypi -numexpr 2.8.5 pypi_0 pypi -numpy 1.24.4 pypi_0 pypi -omamer 0.2.6 pypi_0 pypi -openssl 3.1.2 hd590300_0 conda-forge -overrides 7.4.0 pyhd8ed1ab_0 conda-forge -packaging 23.1 pyhd8ed1ab_0 conda-forge -pandas 2.0.3 pypi_0 pypi -pandocfilters 1.5.0 pyhd8ed1ab_0 conda-forge -parso 0.8.3 pyhd8ed1ab_0 conda-forge -pexpect 4.8.0 pyh1a96a4e_2 conda-forge -pickleshare 0.7.5 py_1003 conda-forge -pip 23.2.1 pyhd8ed1ab_0 conda-forge -pkgutil-resolve-name 1.3.10 pyhd8ed1ab_0 conda-forge -platformdirs 3.10.0 pyhd8ed1ab_0 conda-forge -prometheus_client 0.17.1 pyhd8ed1ab_0 conda-forge -prompt-toolkit 3.0.39 pyha770c72_0 conda-forge -prompt_toolkit 3.0.39 hd8ed1ab_0 conda-forge -property-manager 3.0 pypi_0 pypi -psutil 5.9.5 py39h72bdee0_0 conda-forge -ptyprocess 0.7.0 pyhd3deb0d_0 conda-forge -pure_eval 0.2.2 pyhd8ed1ab_0 conda-forge -py-cpuinfo 9.0.0 pypi_0 pypi -pycparser 2.21 pyhd8ed1ab_0 conda-forge -pygments 2.16.1 pyhd8ed1ab_0 conda-forge -pyparsing 3.1.1 pypi_0 pypi -pysais 1.1.0 pypi_0 pypi -pysocks 1.7.1 pyha2e5f31_6 conda-forge -python 3.9.17 h0755675_0_cpython conda-forge -python-dateutil 2.8.2 pyhd8ed1ab_0 conda-forge -python-fastjsonschema 2.18.0 pyhd8ed1ab_0 conda-forge -python-json-logger 2.0.7 pyhd8ed1ab_0 conda-forge -python_abi 3.9 3_cp39 conda-forge -pytz 2023.3 pyhd8ed1ab_0 conda-forge -pyyaml 6.0.1 py39hd1e30aa_0 conda-forge -pyzmq 25.1.1 py39hb257651_0 conda-forge -readline 8.2 h8228510_1 conda-forge -referencing 0.30.2 pyhd8ed1ab_0 conda-forge -requests 2.31.0 pyhd8ed1ab_0 conda-forge -rfc3339-validator 0.1.4 pyhd8ed1ab_0 conda-forge -rfc3986-validator 0.1.1 pyh9f0ad1d_0 conda-forge -rpds-py 0.9.2 py39h9fdd4d6_0 conda-forge -scipy 1.11.2 pypi_0 pypi -send2trash 1.8.2 pyh41d4057_0 conda-forge -setuptools 68.1.2 pyhd8ed1ab_0 conda-forge -six 1.16.0 pyh6c4a22f_0 conda-forge -sniffio 1.3.0 pyhd8ed1ab_0 conda-forge -soupsieve 2.3.2.post1 pyhd8ed1ab_0 conda-forge -stack_data 0.6.2 pyhd8ed1ab_0 conda-forge -tables 3.8.0 pypi_0 pypi -terminado 0.17.1 pyh41d4057_0 conda-forge -tinycss2 1.2.1 pyhd8ed1ab_0 conda-forge -tk 8.6.12 h27826a3_0 conda-forge -tomli 2.0.1 pyhd8ed1ab_0 conda-forge -tornado 6.3.3 py39hd1e30aa_0 conda-forge -tqdm 4.66.1 pypi_0 pypi -traitlets 5.9.0 pyhd8ed1ab_0 conda-forge -typing-extensions 4.7.1 hd8ed1ab_0 conda-forge -typing_extensions 4.7.1 pyha770c72_0 conda-forge -typing_utils 0.1.0 pyhd8ed1ab_0 conda-forge -tzdata 2023.3 pypi_0 pypi -uri-template 1.3.0 pyhd8ed1ab_0 conda-forge -urllib3 2.0.4 pyhd8ed1ab_0 conda-forge -verboselogs 1.7 pypi_0 pypi -wcwidth 0.2.6 pyhd8ed1ab_0 conda-forge -webcolors 1.13 pyhd8ed1ab_0 conda-forge -webencodings 0.5.1 py_1 conda-forge -websocket-client 1.6.2 pyhd8ed1ab_0 conda-forge -wheel 0.41.2 pyhd8ed1ab_0 
conda-forge
-xz 5.2.6 h166bdaf_0 conda-forge
-yaml 0.2.5 h7f98852_2 conda-forge
-zeromq 4.3.4 h9c3ff4c_1 conda-forge
-zipp 3.16.2 pyhd8ed1ab_0 conda-forge
-linclust
\ No newline at end of file
diff --git a/environment-conda.yml b/environment-conda.yml
new file mode 100644
index 0000000..906210a
--- /dev/null
+++ b/environment-conda.yml
@@ -0,0 +1,12 @@
+name: fastoma-env
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - omamer
+  - mafft
+  - fasttree
+  - pip
+  - pip:
+    - .
diff --git a/nextflow.config b/nextflow.config
index c810e92..d031874 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -11,14 +11,16 @@ params {
     debug_enabled = false
 }
 
-// Profiles configure nextflow depending on the environment (local, integration, live, etc.)
+// Profiles configure nextflow depending on the environment (local, docker, singularity)
 profiles {
+
     docker {
         process {
             container = "$params.container_name:$params.container_version"
         }
         docker.enabled = true
     }
+
     singularity {
         process {
             container = "$params.container_name:$params.container_version"
@@ -26,8 +28,15 @@ profiles {
         singularity.enabled = true
         singularity.autoMounts = true
     }
+
     standard {
         process.executor = 'local'
     }
+
+    conda {
+        process.conda = "envrionment-conda.yml"
+        conda.enabled = true
+    }
+
 }

From 6c5010f5dc89f4584925650ea6e9cf0a1ae69689 Mon Sep 17 00:00:00 2001
From: Adrian Altenhoff <adrian.altenhoff@inf.ethz.ch>
Date: Wed, 22 Nov 2023 13:28:29 +0100
Subject: [PATCH 13/22] [FIX] conda environment, add help text

---
 FastOMA_light.nf | 91 +++++++++++++++++++++++++++++++++++++++++++++++-
 nextflow.config  | 27 +++++++++++++-
 2 files changed, 116 insertions(+), 2 deletions(-)

diff --git a/FastOMA_light.nf b/FastOMA_light.nf
index 8bb68c0..c6a05e7 100644
--- a/FastOMA_light.nf
+++ b/FastOMA_light.nf
@@ -1,7 +1,7 @@
 // NXF_WRAPPER_STAGE_FILE_THRESHOLD='50000'
 
 
-params.input_folder = "./in_folder/"
+params.input_folder = "test_data/in_folder"
 params.output_folder = "./out_folder/"
 params.proteome_folder = params.input_folder + "/proteome"
 params.hogmap_in = params.input_folder + "/hogmap_in"
@@ -18,6 +18,95 @@ params.temp_output = params.output_folder +"/temp_output" //"/temp_omamer_rhogs"
 
 
+if (params.help) {
+    log.info """
+    ===========================================
+      FastOMA -- PIPELINE
+    ===========================================
+    Usage:
+    Run the pipeline with default parameters:
+    nextflow run FastOMA.nf
+
+    Run with user parameters:
+    nextflow run FastOMA.nf --input_folder {input.dir} --output_folder {results.dir}
+
+    Mandatory arguments:
+      --input_folder    Input data folder. Defaults to ${params.input_folder}. This folder
+                        must contain the proteomes (in a subfolder named 'proteome') and
+                        a species tree file. Optionally the folder might contain
+                        - a sub-folder 'splice' containing splicing variant mappings
+                        - a sub-folder 'hogmap_in' containing precomputed OMAmer
+                          placement results for all proteomes
+
+                        All sub-folders and sub-files can also be placed in other
+                        locations if you provide alternative values for them (see below on
+                        optional arguments section).
+
+      --output_folder   Path where all the output should be stored. Defaults to
+                        ${params.output_folder}
+
+
+    Profile selection:
+      -profile          FastOMA can be run using several execution profiles. The default
+                        set of available profiles is
+                        - docker        Run pipeline using docker containers. Docker needs
+                                        to be installed on your system. Containers will be
+                                        fetched automatically from dockerhub. See also
+                                        additional options '--container_version' and
+                                        '--container_name'.
+
+                        - singularity   Run pipeline using singularity. Singularity needs
+                                        to be installed on your system. On HPC clusters,
+                                        it often needs to be loaded as a separate module.
+                                        Containers will be fetched automatically from
+                                        dockerhub. See also additional options
+                                        '--container_version' and '--container_name'.
+
+                        - conda         Run pipeline in a conda environment. Conda needs
+                                        to be installed on your system. The environment
+                                        will be created automatically.
+
+                        - standard      Run pipeline on your local system. Mainly intended
+                                        for development purpose. All dependencies must be
+                                        installed in the calling environment.
+
+                        - slurm-singularity
+                                        Run pipeline using SLURM job scheduler and
+                                        singularity containers. This profile can also be a
+                                        template for other HPC clusters that use different
+                                        schedulers.
+
+                        - slurm-conda   Run pipeline using SLURM job scheduler and conda
+                                        environment.
+
+                        Profiles are defined in nextflow.config and can be extended or
+                        adjusted according to your needs.
+
+
+    Additional options:
+      --proteome_folder   Overwrite location of proteomes (default ${params.proteome_folder})
+      --species_tree      Overwrite location of species tree file (newick format).
+                          Defaults to ${params.species_tree}
+      --splice_folder     Overwrite location of splice file folder. The splice files must be
+                          named <proteome_file>.splice.
+                          Defaults to ${params.splice_folder}
+      --omamer_db         Path or URL to download the OMAmer database from.
+                          Defaults to ${params.omamer_db}
+      --hogmap_in         Optional path where precomputed omamer mapping files are located.
+                          Defaults to ${params.hogmap_in}
+
+    Flags:
+      --help              Display this message
+      --debug_enabled     Store additional information that might be helpful to debug in case
+                          of a problem with FastOMA.
+
+    """.stripIndent()
+
+    exit 1
+}
+
+
+
 process check_input{
     publishDir params.output_folder, mode: 'copy'
     input:
diff --git a/nextflow.config b/nextflow.config
index d031874..e1d6889 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -9,6 +9,7 @@ params {
     container_version = "latest"
     omamer_db = "https://omabrowser.org/All/Primates-v2.0.0.h5"
     debug_enabled = false
+    help = false
 }
 
 // Profiles configure nextflow depending on the environment (local, docker, singularity)
@@ -34,9 +35,33 @@ profiles {
     }
 
     conda {
-        process.conda = "envrionment-conda.yml"
+        process.conda = "environment-conda.yml"
         conda.enabled = true
     }
+
+    slurm-singularity {
+        process {
+            container = "$params.container_name:$params.container_version"
+            executor = "slurm"
+            time = 4.h
+            memory = 20.GB
+        }
+        singularity.enabled = true
+        singularity.autoMounts = true
+    }
+
+    slurm-conda {
+        process {
+            conda = "environment-conda.yml"
+            executor = "slurm"
+            time = 4.h
+            memory = 20.GB
+        }
+        conda.enabled = true
+    }
+
+
+
 }

From 0b1a92c302886c0053e65f170c1d399aab950c84 Mon Sep 17 00:00:00 2001
From: Adrian Altenhoff <adrian.altenhoff@inf.ethz.ch>
Date: Wed, 22 Nov 2023 14:03:17 +0100
Subject: [PATCH 14/22] [FIX] rename profiles and fix default input param

---
 FastOMA_light.nf | 8 ++++----
 nextflow.config  | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/FastOMA_light.nf b/FastOMA_light.nf
index c6a05e7..a3b8606 100644
--- a/FastOMA_light.nf
+++ b/FastOMA_light.nf
@@ -1,8 +1,8 @@
 // NXF_WRAPPER_STAGE_FILE_THRESHOLD='50000'
 
 
-params.input_folder = "test_data/in_folder"
-params.output_folder = "./out_folder/"
+params.input_folder = "testdata/in_folder"
+params.output_folder = "out_folder/"
 params.proteome_folder = params.input_folder + "/proteome"
 params.hogmap_in = params.input_folder + "/hogmap_in"
 params.splice_folder = params.input_folder + "/splice"
@@ 
-70,13 +70,13 @@ if (params.help) { for development purpose. All dependencies must be installed in the calling environment. - - slurm-singularity + - slurm_singularity Run pipeline using SLURM job scheduler and singularity containers. This profile can also be a template for other HPC clusters that use different schedulers. - - slurm-conda Run pipeline using SLURM job scheduler and conda + - slurm_conda Run pipeline using SLURM job scheduler and conda environment. Profiles are defined in nextflow.config and can be extended or diff --git a/nextflow.config b/nextflow.config index e1d6889..0b4bfcd 100644 --- a/nextflow.config +++ b/nextflow.config @@ -39,7 +39,7 @@ profiles { conda.enabled = true } - slurm-singularity { + slurm_singularity { process { container = "$params.container_name:$params.container_version" executor = "slurm" @@ -50,7 +50,7 @@ profiles { singularity.autoMounts = true } - slurm-conda { + slurm_conda { process { conda = "environment-conda.yml" executor = "slurm" From 83bcde59178952854091a82663f917c4511275f8 Mon Sep 17 00:00:00 2001 From: Adrian Altenhoff Date: Mon, 27 Nov 2023 23:29:01 +0100 Subject: [PATCH 15/22] make sure __pycache__ stays out of repo --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 35f96f5..a0c6a24 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ dist/ archive .git .gitignore +__pycache__ From a9d875813f7527919391ae8ab34011acbc3196b5 Mon Sep 17 00:00:00 2001 From: Adrian Altenhoff Date: Tue, 28 Nov 2023 09:10:58 +0100 Subject: [PATCH 16/22] adding UnionFind module from zoo --- FastOMA/batch_roothogs.py | 2 +- FastOMA/zoo/unionfind.py | 124 ++++++++++++++++++++++++++++++++++++++ pyproject.toml | 10 +-- 3 files changed, 130 insertions(+), 6 deletions(-) create mode 100644 FastOMA/zoo/unionfind.py diff --git a/FastOMA/batch_roothogs.py b/FastOMA/batch_roothogs.py index 49b314e..4937071 100644 --- a/FastOMA/batch_roothogs.py +++ b/FastOMA/batch_roothogs.py @@ -51,7 +51,7 @@ def folder_1h_rhog(roothog_path: Path, output_folder_big: Path, output_folder_re rest_hogs.add_hog(hog) -def batch_roothogs(): +def fastoma_batch_roothogs(): import argparse parser = argparse.ArgumentParser(description="Analyse roothog families and create batches for analysis") parser.add_argument('--input-roothogs', required=True, help="folder where input roothogs are stored") diff --git a/FastOMA/zoo/unionfind.py b/FastOMA/zoo/unionfind.py new file mode 100644 index 0000000..83036e1 --- /dev/null +++ b/FastOMA/zoo/unionfind.py @@ -0,0 +1,124 @@ +import collections + +"""UnionFind.py + +Union-find data structure. Based on Josiah Carlson's code, +http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/215912 +with significant additional changes by D. Eppstein and +Adrian Altenhoff. +""" + + +class UnionFind(object): + """Union-find data structure. + + Each unionFind instance X maintains a family of disjoint sets of + hashable objects, supporting the following two methods: + + - X[item] returns a name for the set containing the given item. + Each set is named by an arbitrarily-chosen one of its members; as + long as the set remains unchanged it will keep the same name. If + the item is not yet part of a set in X, a new singleton set is + created for it. + + - X.union(item1, item2, ...) merges the sets containing each item + into a single larger set. If any item is not yet part of a set + in X, it is added to X as one of the members of the merged set. + """ + + def __init__(self, elements=None): + """Create a new union-find structure. 
+
+        If elements is not None, the structure gets initialized
+        with each element as a singleton component.
+
+        :param elements: an iterable to initialize the structure.
+        """
+
+        self.weights = {}
+        self.parents = {}
+        if elements is not None:
+            for elem in iter(elements):
+                self.parents[elem] = elem
+                self.weights[elem] = 1
+
+    def __getitem__(self, obj):
+        """Return the name of the set which contains obj.
+
+        :param obj: the query object
+
+        :SeeAlso: :meth:`find`"""
+        return self.find(obj)
+
+    def find(self, obj):
+        """Find and return the name of the set containing the obj.
+
+        If the object is not found in any set, a new singleton set
+        is created that holds only this object until it is further merged."""
+
+        # check for previously unknown obj. If unknown, add it
+        # as a new cluster
+        if obj not in self.parents:
+            self.parents[obj] = obj
+            self.weights[obj] = 1
+            return obj
+
+        # find path of objects leading to the root
+        path = [obj]
+        root = self.parents[obj]
+        while root != path[-1]:
+            path.append(root)
+            root = self.parents[root]
+
+        # compress the path and return
+        for ancestor in path:
+            self.parents[ancestor] = root
+        return root
+
+    def remove(self, obj):
+        """Remove an object from the sets.
+
+        Removes an object entirely from the datastructure. The
+        containing set will shrink by this one element.
+
+        :Note: If one tries to access it afterwards using
+            :meth:`find`, it will be newly created and put as a
+            singleton.
+        """
+        if obj not in self.parents:
+            return
+        comp = self.find(obj)
+        self.weights[comp] -= 1
+        self.parents.pop(obj)
+
+    def __iter__(self):
+        """Iterate through all items ever found or unioned by this structure."""
+        return iter(self.parents)
+
+    def union(self, *objects):
+        """Find the sets containing the objects and merge them.
+
+        Any number of objects can be passed to this method and
+        all of them will be merged into one set containing at
+        least these objects.
+
+        :param objects: the objects to be merged. They all have to be
+            hashable. If they haven't been initially added to the UnionFind
+            data structure at instantiation time, they are added at this point.
+ """ + roots = [self[x] for x in objects] + heaviest = max([(self.weights[r], r) for r in roots], key=lambda x: x[0])[1] + for r in roots: + if r != heaviest: + self.weights[heaviest] += self.weights[r] + self.parents[r] = heaviest + + def get_components(self): + """return a list of sets corresponding to the connected + components of the structure.""" + comp_dict = collections.defaultdict(set) + for elem in iter(self): + comp_dict[self[elem]].add(elem) + comp = list(comp_dict.values()) + return comp diff --git a/pyproject.toml b/pyproject.toml index 66957d5..d0669c0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,11 +30,11 @@ nextflow = [ [project.scripts] -batch-roothogs = "FastOMA.batch_roothogs:batch_roothogs" -check-fastoma-input = "FastOMA.check_fastoma_input:check_fastoma_input" -collect-subhogs = "FastOMA.collect_subhogs:collect_subhogs" -infer-roothogs = "FastOMA.infer_roothogs:infer_roothogs" -infer-subhogs = "FastOMA.infer_subhogs:infer_subhogs" +fastoma-batch-roothogs = "FastOMA.batch_roothogs:fastoma_batch_roothogs" +fastoma-check-fastoma-input = "FastOMA.check_fastoma_input:fastoma_check_fastoma_input" +fastoma-collect-subhogs = "FastOMA.collect_subhogs:fastoma_collect_subhogs" +fastoma-infer-roothogs = "FastOMA.infer_roothogs:fastoma_infer_roothogs" +fastoma-infer-subhogs = "FastOMA.infer_subhogs:fastoma_infer_subhogs" [project.urls] Homepage = "https://github.com/DessimozLab/FastOMA" From 93c4ea95c2d5e2cc092b4af4984c51b5d26dce74 Mon Sep 17 00:00:00 2001 From: Adrian Altenhoff Date: Tue, 28 Nov 2023 09:11:47 +0100 Subject: [PATCH 17/22] rename and rewire script --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index d0669c0..fc5e48e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,7 @@ nextflow = [ [project.scripts] fastoma-batch-roothogs = "FastOMA.batch_roothogs:fastoma_batch_roothogs" -fastoma-check-fastoma-input = "FastOMA.check_fastoma_input:fastoma_check_fastoma_input" +fastoma-check-input = "FastOMA.check_input:fastoma_check_input" fastoma-collect-subhogs = "FastOMA.collect_subhogs:fastoma_collect_subhogs" fastoma-infer-roothogs = "FastOMA.infer_roothogs:fastoma_infer_roothogs" fastoma-infer-subhogs = "FastOMA.infer_subhogs:fastoma_infer_subhogs" From 2e5d521f63eaf1f1c8048a71c3e3f9753bfb1b32 Mon Sep 17 00:00:00 2001 From: Adrian Altenhoff Date: Tue, 28 Nov 2023 11:08:23 +0100 Subject: [PATCH 18/22] [FIX] write fasta files for OrthologGroups --- FastOMA/collect_subhogs.py | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/FastOMA/collect_subhogs.py b/FastOMA/collect_subhogs.py index 8732245..6fbe840 100644 --- a/FastOMA/collect_subhogs.py +++ b/FastOMA/collect_subhogs.py @@ -16,6 +16,7 @@ # This code collect subhogs and writes outputs. 
+ def load_hogs(pickle_folder: Path): hogs_all = [] cnt = 0 @@ -78,8 +79,6 @@ def max_og_tree(tree, species_dic): return og_prot_list - - def fastoma_collect_subhogs(): import argparse parser = argparse.ArgumentParser(description="collecting all computed HOGs and combine into a single orthoxml") @@ -105,7 +104,7 @@ def fastoma_collect_subhogs(): if conf.roothog_tsv is not None: write_roothogs(conf.out, conf.roothog_tsv) if conf.marker_groups_fasta is not None: - write_group_files(conf.out, conf.roothogs_folder, conf.marker_groups_fasta) + write_group_files(Path(conf.out), Path(conf.roothogs_folder), conf.marker_groups_fasta) def write_hog_orthoxml(pickle_folder, output_xml_name, gene_id_pickle_file): @@ -159,13 +158,13 @@ def write_group_files(orthoxml: Path, roothog_folder: Path, output_file_og_tsv=N output_file_og_tsv = "OrthologousGroups.tsv" if output_fasta_groups is None: output_fasta_groups = "OrthologousGroupsFasta" + output_fasta_groups = Path(output_fasta_groups) + logger_hog.info("Start writing OG fasta files ") - rhog_all_folder = "./omamer_rhogs/" #sys.argv[2] + "/" # "out_folder/rhogs_all/" fasta_format = "fa" # of the rhogs - trees, species_dic = orthoxml_to_newick(orthoxml, return_gene_to_species=True) logger_hog.info("We extracted %d trees in NHX format from the input HOG orthoxml %s", len(trees), orthoxml) @@ -181,7 +180,7 @@ def write_group_files(orthoxml: Path, roothog_folder: Path, output_file_og_tsv=N raise og_prot_list = max_og_tree(tree, species_dic) - if len(og_prot_list) >= 2: # a group should have at least 1 member/protein + if len(og_prot_list) >= 2: # a group should have at least 1 member/protein OGs[hog_id] = og_prot_list logger_hog.info("done extracting trees") @@ -190,29 +189,31 @@ def write_group_files(orthoxml: Path, roothog_folder: Path, output_file_og_tsv=N for hog_id, og_prot_list in OGs.items(): line_text = str(hog_id) + "\t" + str(og_prot_list)[1:-1] + "\n" handle.write(line_text) - handle.close() - - logger_hog.info("We wrote the protein families information in the file " + output_file_og_tsv) - os.makedirs(output_fasta_groups) + logger_hog.info("We wrote the protein families information in the file %s", output_file_og_tsv) + output_fasta_groups.mkdir(parents=True, exist_ok=True) logger_hog.info("start writing %d OGs as fasta files in folder %s.", len(OGs), output_fasta_groups) for hog_id, og_prot_list in OGs.items(): # hog_id="HOG_0667494_sub10524" rhog_id = "_".join(hog_id.split("_")[:2]) - omamer_rhogs_all_address = rhog_all_folder + rhog_id + "." + fasta_format - omamer_rhogs_all_prots = list(SeqIO.parse(omamer_rhogs_all_address, "fasta")) + rhog_fasta = roothog_folder / (rhog_id + "." 
+ fasta_format) + omamer_rhogs_all_prots = list(SeqIO.parse(rhog_fasta, "fasta")) og_prots = [] og_prot_list = OGs[hog_id] for rhogs_prot in omamer_rhogs_all_prots: - if rhogs_prot.id.split("||")[0] in og_prot_list: + prot_id = rhogs_prot.id + if _config.protein_format_qfo_dataset_before2022: + prot_id = prot_id.split('|')[1] + + if prot_id in og_prot_list: sp = rhogs_prot.id.split("||")[1] rhogs_prot.description += " [" + sp + "]" og_prots.append(rhogs_prot) og_id = "OG_" + hog_id # one OG per rootHOG # "/HOG_"+ rhogid - SeqIO.write(og_prots, output_fasta_groups + og_id + ".fa", "fasta") + SeqIO.write(og_prots, output_fasta_groups / (og_id + ".fa"), "fasta") logger_hog.info("writing done") @@ -237,3 +238,5 @@ def write_roothogs(orthoxml, out): logger_hog.info("We wrote the protein families information in the file %s", out) +if __name__ == "__main__": + fastoma_collect_subhogs() \ No newline at end of file From 0b8d63da2f0f1d61b438f740a37f1678cb2b6cf7 Mon Sep 17 00:00:00 2001 From: Adrian Altenhoff Date: Wed, 29 Nov 2023 09:36:54 +0100 Subject: [PATCH 19/22] [WIP] towards better reporting --- .dockerignore | 1 + Dockerfile | 1 + FastOMA/_config.py | 4 ++-- FastOMA_light.nf | 31 ++++++++++++++++++++++++++++++- nextflow.config | 23 +++++++++++++++++------ 5 files changed, 51 insertions(+), 9 deletions(-) diff --git a/.dockerignore b/.dockerignore index 466368d..2021e50 100644 --- a/.dockerignore +++ b/.dockerignore @@ -5,3 +5,4 @@ work output testdata dist +archive/ diff --git a/Dockerfile b/Dockerfile index 99ac18e..ee6c1f3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -34,6 +34,7 @@ RUN apt-get update \ fasttree \ libxml2 \ mafft \ + procps \ && apt-get -y autoremove \ && apt-get -y autoclean \ && rm -rf /var/lib/apt/lists/* diff --git a/FastOMA/_config.py b/FastOMA/_config.py index ae845c6..62db7bd 100644 --- a/FastOMA/_config.py +++ b/FastOMA/_config.py @@ -78,8 +78,8 @@ inferhog_min_hog_size_xml = 2 # by setting this as 1, pyham won't work on xml output. 
# batch_roothogs -big_rhog_filesize_thresh = 1300 -sum_list_rhogs_filesize_thresh = 2 * 1e3 +big_rhog_filesize_thresh = 400 * 1000 +sum_list_rhogs_filesize_thresh = 2 * 1e6 orthoxml_v03 = True diff --git a/FastOMA_light.nf b/FastOMA_light.nf index 0023a5c..78ae58c 100644 --- a/FastOMA_light.nf +++ b/FastOMA_light.nf @@ -106,6 +106,27 @@ if (params.help) { } +log.info """ +=========================================== + FastOMA -- PIPELINE +=========================================== + + Project : ${workflow.projectDir} + Git info: ${workflow.repository} - ${workflow.revision} [${workflow.commitId}] + Cmd line: ${workflow.commandLine} + Manifest's pipeline version: ${workflow.manifest.version} + +Parameters: + input_folder ${params.input_folder} + proteome folder ${params.proteome} + species_tree ${params.species_tree} + splice_folder ${params.splice_folder} (optional) + omamer_db ${params.omamer_db} + hogmap_in ${params.hogmap_in} (optional) + + debug_enabled ${params.debug_enabled} +""".stripIndent() + process check_input{ publishDir params.output_folder, mode: 'copy' @@ -278,6 +299,14 @@ workflow { channel.empty().concat(pickle_big_rhog, pickle_rest_rhog).set{ all_rhog_pickle } (orthoxml_file, OrthologousGroupsFasta, OrthologousGroups_tsv, rootHOGs_tsv) = collect_subhogs(all_rhog_pickle.collect(), gene_id_dic_xml, omamer_rhogs) - orthoxml_file.view{" output orthoxml file ${it}"} } + +//workflow.onComplete { +// println "Completed at : $workflow.complete" +// println "Duration : $workflow.duration" +// println "Output in : $params.output_folder" +// println "Nextflow report : $params.statsdir" +// println ( workflow.success ? "Done!" : "Oops .. something went wrong" ) +//} + diff --git a/nextflow.config b/nextflow.config index 0b4bfcd..d9a0320 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,7 +1,11 @@ // General configuration used in all profiles manifest { - description = 'FastOMA - Infer orthology ' - nextflowVersion = '22.10.4' + name = "FastOMA - Orthology inference" + description = """FastOMA computes Hierarchical Orthologous Groups from proteomes.""" + author = "Sina Majidian, Adrian Altenhoff" + homePage = "https://omabrowser.org" + mainScript = "FastOMA.nf" + nextflowVersion = "22.10.4" } params { @@ -10,6 +14,9 @@ params { omamer_db = "https://omabrowser.org/All/Primates-v2.0.0.h5" debug_enabled = false help = false + + output_folder = "Output" + statsdir = "${params.output_folder}/stats" } // Profiles configure nextflow depending on the environment (local, docker, singularity) @@ -59,9 +66,13 @@ profiles { } conda.enabled = true } - - - - } +timeline { + enabled = true + file = "${params.statsdir}/timeline.html" +} +report { + enabled = false + file = "${params.statsdir}/report.html" +} \ No newline at end of file From 6ea0659e7ffa2305173e6fb841e6074b01f9bcd8 Mon Sep 17 00:00:00 2001 From: Adrian Altenhoff Date: Wed, 29 Nov 2023 12:03:28 +0100 Subject: [PATCH 20/22] [FIX] better reporting of pipeline --- FastOMA_light.nf | 27 +++++++++++++++++---------- nextflow.config | 5 +++-- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/FastOMA_light.nf b/FastOMA_light.nf index 78ae58c..e8c28a2 100644 --- a/FastOMA_light.nf +++ b/FastOMA_light.nf @@ -1,3 +1,4 @@ +import groovy.json.JsonBuilder; // NXF_WRAPPER_STAGE_FILE_THRESHOLD='50000' @@ -99,6 +100,8 @@ if (params.help) { --help Display this message --debug_enabled Store addtional information that might be helpful to debug in case of a problem with FastOMA. 
+        --report            Produce nextflow report and timeline and store it in
+                            $params.statsdir
 
     """.stripIndent()
 
@@ -118,13 +121,14 @@ log.info """
 
 Parameters:
     input_folder            ${params.input_folder}
-    proteome folder         ${params.proteome}
+    proteome folder         ${params.proteome_folder}
     species_tree            ${params.species_tree}
-    splice_folder           ${params.splice_folder} (optional)
+    splice_folder           ${params.splice_folder}
     omamer_db               ${params.omamer_db}
-    hogmap_in               ${params.hogmap_in} (optional)
+    hogmap_in               ${params.hogmap_in}
 
     debug_enabled           ${params.debug_enabled}
+    report                  ${params.report}
 """.stripIndent()
@@ -302,11 +306,14 @@ workflow {
 }
 
 
-//workflow.onComplete {
-//    println "Completed at : $workflow.complete"
-//    println "Duration     : $workflow.duration"
-//    println "Output in    : $params.output_folder"
-//    println "Nextflow report : $params.statsdir"
-//    println ( workflow.success ? "Done!" : "Oops .. something went wrong" )
-//}
+workflow.onComplete {
+    def String report = ( params.report ? "\nNextflow report : ${params.statsdir}" : "");
+    println ""
+    println "Completed at : $workflow.complete"
+    println "Duration     : $workflow.duration"
+    println "Processes    : $workflow.workflowStats.succeedCount (success), $workflow.workflowStats.failedCount (failed)"
+    println "Output in    : $params.output_folder" + report
+    println ( workflow.success ? "Done!" : "Oops .. something went wrong" )
+}
+
 
diff --git a/nextflow.config b/nextflow.config
index d9a0320..0e2720d 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -14,6 +14,7 @@ params {
     omamer_db = "https://omabrowser.org/All/Primates-v2.0.0.h5"
     debug_enabled = false
    help = false
+    report = false
 
    output_folder = "Output"
    statsdir = "${params.output_folder}/stats"
@@ -69,10 +70,10 @@ profiles {
 }
 
 timeline {
-    enabled = true
+    enabled = params.report
    file = "${params.statsdir}/timeline.html"
}
report {
-    enabled = false
+    enabled = params.report
    file = "${params.statsdir}/report.html"
}
\ No newline at end of file

From 87f419ca3008938787ca3ae68da26a4197caf54f Mon Sep 17 00:00:00 2001
From: Adrian Altenhoff <adrian.altenhoff@inf.ethz.ch>
Date: Wed, 29 Nov 2023 23:10:14 +0100
Subject: [PATCH 21/22] add resource limits to workflow

---
 FastOMA_light.nf | 55 ++++++++++++++++++++++++++----------------------
 conf/base.config | 55 ++++++++++++++++++++++++++++++++++++++++++++++++
 nextflow.config  | 53 ++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 136 insertions(+), 27 deletions(-)
 create mode 100644 conf/base.config

diff --git a/FastOMA_light.nf b/FastOMA_light.nf
index e8c28a2..aac82a6 100644
--- a/FastOMA_light.nf
+++ b/FastOMA_light.nf
@@ -133,31 +133,34 @@
 
 process check_input{
-    publishDir params.output_folder, mode: 'copy'
-    input:
-        path proteome_folder
-        path hogmap_folder
-        path species_tree
-        path omamerdb
-        path splice_folder
-    output:
-        path "species_tree_checked.nwk"
-        val "check_completed"
-    script:
-    """
-    fastoma-check-input --proteomes ${proteome_folder} \
-                        --species-tree ${species_tree} \
-                        --out-tree species_tree_checked.nwk \
-                        --splice ${splice_folder} \
-                        --hogmap ${hogmap_folder} \
-                        --omamer_db ${omamerdb} \
-                        -vv
-    """
+    label "process_single"
+    publishDir params.output_folder, mode: 'copy'
+
+    input:
+        path proteome_folder
+        path hogmap_folder
+        path species_tree
+        path omamerdb
+        path splice_folder
+    output:
+        path "species_tree_checked.nwk"
+        val "check_completed"
+    script:
+        """
+        fastoma-check-input --proteomes ${proteome_folder} \
+                            --species-tree ${species_tree} \
+                            --out-tree species_tree_checked.nwk \
+                            --splice ${splice_folder} \
+                            --hogmap ${hogmap_folder} \
+                            
--omamer_db ${omamerdb} \ + -vv + """ } process omamer_run{ - time {4.h} + label "process_single" + tag "$proteome" publishDir params.hogmap_folder, mode: 'copy' input: @@ -178,11 +181,11 @@ process omamer_run{ process infer_roothogs{ + label "process_single" publishDir = [ path: params.temp_output, enabled: params.debug_enabled, - //mode: 'copy', saveAs: { filename -> filename.equals('temp_omamer_rhogs') ? null : filename } - ] + ] input: path hogmaps, stageAs: "hogmaps/*" @@ -219,8 +222,8 @@ process batch_roothogs{ } process hog_big{ - cpus 2 - time {20.h} // for very big rhog it might need more, or you could re-run and add `-resume` + label "process_high" + input: each rhogsbig path species_tree @@ -239,6 +242,8 @@ process hog_big{ } process hog_rest{ + label "process_single" + input: each rhogsrest path species_tree diff --git a/conf/base.config b/conf/base.config new file mode 100644 index 0000000..c92b610 --- /dev/null +++ b/conf/base.config @@ -0,0 +1,55 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + dessimozlab/FastOMA Nextflow base config file +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + A 'blank slate' config file, appropriate for general use on most high performance + compute environments. Assumes that all software is installed and available on + the PATH. Runs in `local` mode - all jobs will be run on the logged in environment. +---------------------------------------------------------------------------------------- +*/ + +process { + + cpus = { check_max( 1 * task.attempt, 'cpus' ) } + memory = { check_max( 6.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } + shell = ['/bin/bash', '-euo', 'pipefail'] + + errorStrategy = { task.exitStatus in ((130..145) + 104) ? 
diff --git a/conf/base.config b/conf/base.config
new file mode 100644
index 0000000..c92b610
--- /dev/null
+++ b/conf/base.config
@@ -0,0 +1,55 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    dessimozlab/FastOMA Nextflow base config file
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    A 'blank slate' config file, appropriate for general use on most high performance
+    compute environments. Assumes that all software is installed and available on
+    the PATH. Runs in `local` mode - all jobs will be run on the logged in environment.
+----------------------------------------------------------------------------------------
+*/
+
+process {
+
+    cpus   = { check_max( 1      * task.attempt, 'cpus'   ) }
+    memory = { check_max( 6.GB   * task.attempt, 'memory' ) }
+    time   = { check_max( 4.h    * task.attempt, 'time'   ) }
+    shell  = ['/bin/bash', '-euo', 'pipefail']
+
+    errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' }
+    maxRetries    = 1
+    maxErrors     = '-1'
+
+    withLabel:process_single {
+        cpus   = { check_max( 1                   , 'cpus'   ) }
+        memory = { check_max( 12.GB * task.attempt, 'memory' ) }
+        time   = { check_max( 4.h   * task.attempt, 'time'   ) }
+    }
+    withLabel:process_low {
+        cpus   = { check_max( 2     * task.attempt, 'cpus'   ) }
+        memory = { check_max( 12.GB * task.attempt, 'memory' ) }
+        time   = { check_max( 4.h   * task.attempt, 'time'   ) }
+    }
+    withLabel:process_medium {
+        cpus   = { check_max( 6     * task.attempt, 'cpus'   ) }
+        memory = { check_max( 36.GB * task.attempt, 'memory' ) }
+        time   = { check_max( 8.h   * task.attempt, 'time'   ) }
+    }
+    withLabel:process_high {
+        cpus   = { check_max( 12    * task.attempt, 'cpus'   ) }
+        memory = { check_max( 72.GB * task.attempt, 'memory' ) }
+        time   = { check_max( 16.h  * task.attempt, 'time'   ) }
+    }
+    withLabel:process_long {
+        time   = { check_max( 20.h  * task.attempt, 'time'   ) }
+    }
+    withLabel:process_high_memory {
+        memory = { check_max( 200.GB * task.attempt, 'memory' ) }
+    }
+    withLabel:error_ignore {
+        errorStrategy = 'ignore'
+    }
+    withLabel:error_retry {
+        errorStrategy = 'retry'
+        maxRetries    = 2
+    }
+}
\ No newline at end of file
diff --git a/nextflow.config b/nextflow.config
index 0e2720d..7529c85 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -18,6 +18,12 @@ params {
 
     output_folder = "Output"
     statsdir = "${params.output_folder}/stats"
+
+    // Max resource options
+    // Defaults only, expecting to be overwritten
+    max_memory = '128.GB'
+    max_cpus = 24
+    max_time = '120.h'
 }
 
 // Profiles configure nextflow depending on the environment (local, docker, singularity)
@@ -69,11 +75,54 @@ profiles {
     }
 }
 
+def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss')
 timeline {
   enabled = params.report
-  file = "${params.statsdir}/timeline.html"
+  file = "${params.statsdir}/timeline_${trace_timestamp}.html"
 }
 report {
   enabled = params.report
-  file = "${params.statsdir}/report.html"
+  file = "${params.statsdir}/report_${trace_timestamp}.html"
+}
+trace {
+    enabled = params.report
+    file    = "${params.statsdir}/trace_${trace_timestamp}.txt"
+}
+dag {
+    enabled = params.report
+    file    = "${params.statsdir}/pipeline_dag_${trace_timestamp}.html"
+}
+
+includeConfig "conf/base.config"
+
+// function to check maximum resources
+def check_max(obj, type) {
+    if (type == 'memory') {
+        try {
+            if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1)
+                return params.max_memory as nextflow.util.MemoryUnit
+            else
+                return obj
+        } catch (all) {
+            println "   ### ERROR ###   Max memory '${params.max_memory}' is not valid! Using default value: $obj"
+            return obj
+        }
+    } else if (type == 'time') {
+        try {
+            if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1)
+                return params.max_time as nextflow.util.Duration
+            else
+                return obj
+        } catch (all) {
+            println "   ### ERROR ###   Max time '${params.max_time}' is not valid! Using default value: $obj"
+            return obj
+        }
+    } else if (type == 'cpus') {
+        try {
+            return Math.min( obj, params.max_cpus as int )
+        } catch (all) {
+            println "   ### ERROR ###   Max cpus '${params.max_cpus}' is not valid! Using default value: $obj"
+            return obj
+        }
+    }
+}
\ No newline at end of file
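The check_max helper above follows the nf-core convention: on an eligible exit code the
task is retried, task.attempt multiplies the request, and check_max clamps the result to
the params.max_* ceilings (compareTo(...) == 1 means the request exceeds the cap). A
worked example under the base.config defaults (illustrative only; the override values
are hypothetical):

    // a process_medium task, with the default max_memory = '128.GB':
    //   attempt 1: 36.GB * 1 = 36.GB  -> below the cap, granted as-is
    //   attempt 2: 36.GB * 2 = 72.GB  -> still below the cap, granted
    // with a smaller site-specific ceiling the clamp kicks in:
    params {
        max_memory = '48.GB'   // hypothetical override, e.g. supplied via -c site.config
        max_cpus   = 8
        max_time   = '24.h'
    }
    // now check_max(72.GB, 'memory') returns 48.GB on the second attempt
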
From 18a8862644c2d7106284d21c26d825ab0c5b7c1d Mon Sep 17 00:00:00 2001
From: Adrian Altenhoff
Date: Wed, 29 Nov 2023 23:18:13 +0100
Subject: [PATCH 22/22] use only FastOMA.nf, light version can go

---
 FastOMA.nf       | 448 +++++++++++++++++++++++++++++------------------
 FastOMA_light.nf | 324 ----------------------------------
 2 files changed, 278 insertions(+), 494 deletions(-)
 delete mode 100644 FastOMA_light.nf

diff --git a/FastOMA.nf b/FastOMA.nf
index bb9c546..aac82a6 100644
--- a/FastOMA.nf
+++ b/FastOMA.nf
@@ -1,216 +1,324 @@
+import groovy.json.JsonBuilder;
+
 // NXF_WRAPPER_STAGE_FILE_THRESHOLD='50000'
 
-params.input_folder = "./in_folder/"
-params.output_folder = "./out_folder/"
+params.input_folder = "testdata/in_folder"
+params.output_folder = "out_folder/"
 params.proteome_folder = params.input_folder + "/proteome"
-params.proteomes = params.proteome_folder + "/*"
 params.hogmap_in = params.input_folder + "/hogmap_in"
-
-params.hogmap_folder = params.output_folder + "/hogmap"
 params.splice_folder = params.input_folder + "/splice"
 params.species_tree = params.input_folder + "/species_tree.nwk"
-params.species_tree_checked = params.output_folder + "/species_tree_checked.nwk"
-params.temp_pickles = params.output_folder + "/temp_pickles"
+
+
+// output subfolder definition
 params.genetrees_folder = params.output_folder + "/genetrees"
+params.hogmap_folder = params.output_folder + "/hogmap"
 
 params.temp_output = params.output_folder +"/temp_output" //"/temp_omamer_rhogs"
 
+
+if (params.help) {
+    log.info """
+    ===========================================
+      FastOMA -- PIPELINE
+    ===========================================
+    Usage:
+    Run the pipeline with default parameters:
+    nextflow run FastOMA.nf
+
+    Run with user parameters:
+    nextflow run FastOMA.nf --input_folder {input.dir} --output_folder {results.dir}
+
+    Mandatory arguments:
+    --input_folder      Input data folder. Defaults to ${params.input_folder}. This folder
+                        must contain the proteomes (in a subfolder named 'proteome') and
+                        a species tree file. Optionally the folder might contain
+                         - a sub-folder 'splice' containing splicing variant mappings
+                         - a sub-folder 'hogmap_in' containing precomputed OMAmer
+                           placement results for all proteomes
+
+                        All sub-folders and sub-files can also be placed in other
+                        locations if you provide alternative values for them (see below on
+                        optional arguments section).
+
+    --output_folder     Path where all the output should be stored. Defaults to
+                        ${params.output_folder}
+
+
+    Profile selection:
+    -profile            FastOMA can be run using several execution profiles. The default
+                        set of available profiles is
+                          - docker        Run pipeline using docker containers. Docker needs
+                                          to be installed on your system. Containers will be
+                                          fetched automatically from dockerhub. See also
+                                          additional options '--container_version' and
+                                          '--container_name'.
+
+                          - singularity   Run pipeline using singularity. Singularity needs
+                                          to be installed on your system. On HPC clusters,
+                                          it often needs to be loaded as a separate module.
+                                          Containers will be fetched automatically from
+                                          dockerhub. See also additional options
+                                          '--container_version' and '--container_name'.
+
+                          - conda         Run pipeline in a conda environment. Conda needs
+                                          to be installed on your system. The environment
+                                          will be created automatically.
+
+                          - standard      Run pipeline on your local system. Mainly intended
+                                          for development purposes. All dependencies must be
+                                          installed in the calling environment.
+
+                          - slurm_singularity
+                                          Run pipeline using SLURM job scheduler and
+                                          singularity containers. This profile can also be a
+                                          template for other HPC clusters that use different
+                                          schedulers.
+
+                          - slurm_conda   Run pipeline using SLURM job scheduler and conda
+                                          environment.
+
+                        Profiles are defined in nextflow.config and can be extended or
+                        adjusted according to your needs.
+
+
+    Additional options:
+    --proteome_folder   Overwrite location of proteomes (default ${params.proteome_folder})
+    --species_tree      Overwrite location of species tree file (newick format).
+                        Defaults to ${params.species_tree}
+    --splice_folder     Overwrite location of splice file folder. The splice files must be
+                        named .splice.
+                        Defaults to ${params.splice_folder}
+    --omamer_db         Path or URL to download the OMAmer database from.
+                        Defaults to ${params.omamer_db}
+    --hogmap_in         Optional path where precomputed omamer mapping files are located.
+                        Defaults to ${params.hogmap_in}
+
+    Flags:
+    --help              Display this message
+    --debug_enabled     Store additional information that might be helpful to debug in case
+                        of a problem with FastOMA.
+    --report            Produce nextflow report and timeline and store them in
+                        ${params.statsdir}
+
+    """.stripIndent()
+
+    exit 1
+}
+
+
+log.info """
+===========================================
+  FastOMA -- PIPELINE
+===========================================
+
+    Project : ${workflow.projectDir}
+    Git info: ${workflow.repository} - ${workflow.revision} [${workflow.commitId}]
+    Cmd line: ${workflow.commandLine}
+    Manifest's pipeline version: ${workflow.manifest.version}
+
+Parameters:
+    input_folder          ${params.input_folder}
+    proteome folder       ${params.proteome_folder}
+    species_tree          ${params.species_tree}
+    splice_folder         ${params.splice_folder}
+    omamer_db             ${params.omamer_db}
+    hogmap_in             ${params.hogmap_in}
+
+    debug_enabled         ${params.debug_enabled}
+    report                ${params.report}
+""".stripIndent()
+
+
 process check_input{
-    time {5.h}
-    memory {200.GB}
+    label "process_single"
     publishDir params.output_folder, mode: 'copy'
+
     input:
         path proteome_folder
         path hogmap_folder
         path species_tree
         path omamerdb
         path splice_folder
     output:
         path "species_tree_checked.nwk"
-        val true
+        val "check_completed"
     script:
     """
-    fastoma-check-input
+    fastoma-check-input --proteomes ${proteome_folder} \
+                        --species-tree ${species_tree} \
+                        --out-tree species_tree_checked.nwk \
+                        --splice ${splice_folder} \
+                        --hogmap ${hogmap_folder} \
+                        --omamer_db ${omamerdb} \
+                        -vv
    """
 }
 
 
 process omamer_run{
-    time {5.h}
-    memory {95.GB}
+    label "process_single"
+    tag "$proteome"
     publishDir params.hogmap_folder, mode: 'copy'
+
     input:
-        path proteomes_omamerdb_inputhog
-        val ready_input_check_c
+        tuple path(proteome), path(omamer_db), path(precomputed_hogmap_folder), val(ready)
-    output:
-        path "*.hogmap"
-        val true
-    script: //todo this if condition can be done as part of nextflow, so it won't submit job for cp
-    """
-    if [ -f ${proteomes_omamerdb_inputhog[2]}/${proteomes_omamerdb_inputhog[0]}.hogmap ]
-    then
-        cp ${proteomes_omamerdb_inputhog[2]}/${proteomes_omamerdb_inputhog[0]}.hogmap  ${proteomes_omamerdb_inputhog[0]}.hogmap
-    else
-        omamer search -n 10 --db ${proteomes_omamerdb_inputhog[1]} --query ${proteomes_omamerdb_inputhog[0]} --out ${proteomes_omamerdb_inputhog[0]}.hogmap
-    fi
-    """ // --nthreads 10
+
+    output:
+        path "*.hogmap"
+
+    script:
+    """
+    if [ -f ${precomputed_hogmap_folder}/${proteome}.hogmap ] ; then
+        cp ${precomputed_hogmap_folder}/${proteome}.hogmap ${proteome}.hogmap
+    else
+        omamer search --db ${omamer_db} --query ${proteome} --out ${proteome}.hogmap
+    fi
+    """
 }
 
 
 process infer_roothogs{
-    cpus  10          // useful for linclust
-    time {15.h}       // including linclust
-    memory {350.GB}
+    label "process_single"
     publishDir = [
         path: params.temp_output,
-        mode: 'copy', // pattern: "temp_output", saveAs: { filename -> filename.equals('temp_omamer_rhogs') ? null : filename }
-    ]
+        enabled: params.debug_enabled,
+    ]
 
     input:
-        val ready_omamer_run
-        path hogmap_folder
+        path hogmaps, stageAs: "hogmaps/*"
         path proteome_folder
         path splice_folder
     output:
-        path "temp_omamer_rhogs"
+        path "omamer_rhogs/*"
         path "gene_id_dic_xml.pickle"
-        path "selected_isoforms" , optional: true   // SPECIESNAME_selected_isoforms.tsv
-        val true    // nextflow-io.github.io/patterns/state-dependency/
+        path "selected_isoforms" , optional: true
     script:
     """
-    fastoma-infer-roothogs --logger-level DEBUG
+    fastoma-infer-roothogs  --proteomes ${proteome_folder} \
+                            --hogmap hogmaps \
+                            --splice ${splice_folder} \
+                            --out-rhog-folder "omamer_rhogs/" \
+                            -vv
    """
 }
 
 
 process batch_roothogs{
-    time {15.h}       // including linclust
-    memory {95.GB}
     input:
-        val ready_infer_roothogs
-        path "temp_omamer_rhogs"
+        path rhogs, stageAs: "omamer_rhogs/*"
     output:
         path "rhogs_rest/*", optional: true
         path "rhogs_big/*" , optional: true
-        val true
     script:
     """
-    fastoma-batch-roothogs
+    fastoma-batch-roothogs  --input-roothogs omamer_rhogs/ \
+                            --out-big rhogs_big \
+                            --out-rest rhogs_rest \
+                            -vv
    """
 }
 
 process hog_big{
-    cpus  6
-    time {10.h} // for very big rhog it might need more, or you could re-run and add `-resume`
-    memory {200.GB}
-    publishDir params.temp_pickles
+    label "process_high"
+
    input:
-        val rhogsbig_tree_ready
+        each rhogsbig
+        path species_tree
    output:
-        path "*.pickle"
-        path "*.fa", optional: true    // msa          if write True
-        path "*.nwk", optional: true   // gene trees   if write True
-        val true
+        path "pickle_hogs"
+        path "msa/*.fa"         , optional: true   // msa         if write True
+        path "gene_trees/*.nwk" , optional: true   // gene trees  if write True
    script:
    """
-    fastoma-infer-subhogs --input-rhog-folder ${rhogsbig_tree_ready[0]}  --species-tree ${rhogsbig_tree_ready[1]}  --parallel
+    fastoma-infer-subhogs  --input-rhog-folder ${rhogsbig} \
+                           --species-tree ${species_tree} \
+                           --fragment-detection \
+                           --low-so-detection \
+                           --parallel
    """
 }
 
-temp_pickles =2
-
-
 process hog_rest{
-    time {3.h}  // for very big rhog it might need more, or you could re-run and add `-resume`
-    memory {95.GB}
-    publishDir params.temp_pickles
-    input:
-        val rhogsrest_tree_ready
-    output:
-        path "*.pickle"
-        path "*.fa" , optional: true    // msa if write True
-        path "*.nwk" , optional: true   // gene trees if write True
-        val true
-    script:
-    """
-    fastoma-infer-subhogs  --input-rhog-folder ${rhogsrest_tree_ready[0]}  --species-tree ${rhogsrest_tree_ready[1]}
-    """ // --parrallel False
+    label "process_single"
+
+    input:
+        each rhogsrest
+        path species_tree
+    output:
+        path "pickle_hogs"
+        path "msa/*.fa"         , optional: true   // msa         if write True
+        path "gene_trees/*.nwk" , optional: true   // gene trees  if write True
+    script:
+    """
+    fastoma-infer-subhogs  --input-rhog-folder ${rhogsrest} \
+                           --species-tree ${species_tree} \
+                           --fragment-detection \
+                           --low-so-detection
+    #--out pickle_hogs
+    """
 }
 
 
 process collect_subhogs{
-    time {10.h}
-    memory {300.GB}   // orthoxml string can be very huge in memory
     publishDir params.output_folder, mode: 'copy'
+
     input:
-        val ready_hog_rest
-        val ready_hog_big
-        path "temp_pickles"   // this is the folder includes pickles_rhogs
-        path "gene_id_dic_xml.pickle"
-        path "temp_omamer_rhogs"
+        path pickles, stageAs: "pickle_folders/?"
+        path "gene_id_dic_xml.pickle"
+        path rhogs, stageAs: "omamer_rhogs/*"
     output:
         path "output_hog.orthoxml"
         path "OrthologousGroupsFasta"
         path "OrthologousGroups.tsv"
         path "rootHOGs.tsv"
     script:
     """
-    fastoma-collect-subhogs
+    fastoma-collect-subhogs --pickle-folder pickle_folders/ \
+                            --roothogs-folder omamer_rhogs/ \
+                            --gene-id-pickle-file gene_id_dic_xml.pickle \
+                            --out output_hog.orthoxml \
+                            --marker-groups-fasta OrthologousGroups.tsv \
+                            --roothog-tsv rootHOGs.tsv \
+                            -vv
    """
 }
 
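A remark on the staging idioms in the processes above: stageAs: "hogmaps/*" gathers many
files under one directory, while in collect_subhogs the ? in stageAs: "pickle_folders/?"
is, as far as Nextflow's staging rules go, expanded to a progressive number per input
element (pickle_folders/1, pickle_folders/2, ...), so identically named pickle folders
coming from different upstream tasks do not collide. A hypothetical sketch (illustrative
only, not part of the patch):

    // demo_stage is a made-up process: each incoming item is staged
    // under its own numbered path inside pickle_folders/
    process demo_stage {
        input:
            path pickles, stageAs: "pickle_folders/?"
        output:
            stdout
        script:
            "find pickle_folders/ | sort"
    }
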
 workflow {
+    proteome_folder = Channel.fromPath(params.proteome_folder, type: "dir", checkIfExists:true).first()
+    proteomes = Channel.fromPath(params.proteome_folder + "/*", type:'any', checkIfExists:true)
+    species_tree = Channel.fromPath(params.species_tree, type: "file", checkIfExists:true).first()
+
+    splice_folder = Channel.fromPath(params.splice_folder, type: "dir")
+    genetrees_folder = Channel.fromPath(params.genetrees_folder, type: 'dir')
+    hogmap_in = Channel.fromPath(params.hogmap_in, type:'dir')
+
+    omamerdb = Channel.fromPath(params.omamer_db)
+    (species_tree_checked_, ready_input_check) = check_input(proteome_folder, hogmap_in, species_tree, omamerdb, splice_folder)
+    omamer_input_channel = proteomes.combine(omamerdb).combine(hogmap_in).combine(ready_input_check)
+    hogmap = omamer_run(omamer_input_channel)
+
-    proteomes = Channel.fromPath(params.proteomes, type:'any' ,checkIfExists:true)
-    proteome_folder = Channel.fromPath(params.proteome_folder)
-    hogmap_folder = Channel.fromPath(params.hogmap_folder)
-    splice_folder = Channel.fromPath(params.splice_folder)
-    temp_output = Channel.fromPath(params.temp_output)
-    genetrees_folder = Channel.fromPath(params.genetrees_folder)
-    hogmap_in = Channel.fromPath(params.hogmap_in)
-
-    temp_pickles = Channel.fromPath(params.temp_pickles)
-    omamerdb = Channel.fromPath(params.input_folder+"/omamerdb.h5")
-    species_tree = Channel.fromPath(params.species_tree)
-    species_tree_checked = Channel.fromPath(params.species_tree_checked)
-
-    (species_tree_checked_, ready_input_check) = check_input(proteome_folder,hogmap_in,species_tree,omamerdb,splice_folder)
-    ready_input_check_c = ready_input_check.collect()
-    //species_tree_checked.view{"species_tree_checked ${it}"}
-
-    proteomes_omamerdb = proteomes.combine(omamerdb)
-    proteomes_omamerdb_inputhog = proteomes_omamerdb.combine(hogmap_in)   // proteomes_omamerdb_inputhog.view{" rhogsbig ${it}"}
-    //proteomes_omamerdb_inputhog_inputcheck = proteomes_omamerdb_inputhog.combine(ready_input_check_c)
-    (hogmap, ready_omamer_run)= omamer_run(proteomes_omamerdb_inputhog,ready_input_check_c)
-    ready_omamer_run_c = ready_omamer_run.collect()
-    (temp_omamer_rhogs, gene_id_dic_xml,selected_isoforms, ready_infer_roothogs) = infer_roothogs(ready_omamer_run_c, hogmap_folder, proteome_folder, splice_folder)
-    ready_infer_roothogs_c = ready_infer_roothogs.collect()
-
-    (rhogs_rest_list, rhogs_big_list, ready_batch_roothogs) = batch_roothogs(ready_infer_roothogs_c, temp_omamer_rhogs)
-    ready_batch_roothogs_c = ready_batch_roothogs.collect()
-
-
-    rhogsbig = rhogs_big_list.flatten()
-    rhogsbig_tree = rhogsbig.combine(species_tree_checked)
-    rhogsbig_tree_ready = rhogsbig_tree.combine(ready_batch_roothogs)   //  rhogsbig_tree_ready.view{"rhogsbig_tree_ready ${it}"}
-    (pickle_big_rhog, msas_out, genetrees_out, ready_hog_big) = hog_big(rhogsbig_tree_ready)
-
-    rhogsrest = rhogs_rest_list.flatten()
-    rhogsrest_tree = rhogsrest.combine(species_tree_checked)
-    rhogsrest_tree_ready = rhogsrest_tree.combine(ready_batch_roothogs_c)
-    (pickle_rest_rhog, msas_out_rest, genetrees_out_test, ready_hog_rest) = hog_rest(rhogsrest_tree_ready)
-
-    (orthoxml_file, OrthologousGroupsFasta, OrthologousGroups_tsv, rootHOGs_tsv) = collect_subhogs(ready_hog_rest.collect(), ready_hog_big.collect(), temp_pickles, gene_id_dic_xml, temp_omamer_rhogs)
-    // temp_omamer_rhogs.view{" output omamer_rhogs ${it}"}
-    // orthoxml_file.view{" output orthoxml file ${it}"}
+    (omamer_rhogs, gene_id_dic_xml, ready_infer_roothogs) = infer_roothogs(hogmap.collect(), proteome_folder, splice_folder)
+
+    (rhogs_rest_batches, rhogs_big_batches) = batch_roothogs(omamer_rhogs)
+
+    (pickle_big_rhog, msa_out_big, genetrees_out_rest) = hog_big(rhogs_big_batches.flatten(), species_tree)
+    (pickle_rest_rhog, msas_out_rest, genetrees_out_test) = hog_rest(rhogs_rest_batches.flatten(), species_tree)
+    channel.empty().concat(pickle_big_rhog, pickle_rest_rhog).set{ all_rhog_pickle }
+
+    (orthoxml_file, OrthologousGroupsFasta, OrthologousGroups_tsv, rootHOGs_tsv) = collect_subhogs(all_rhog_pickle.collect(), gene_id_dic_xml, omamer_rhogs)
 }
 
+workflow.onComplete {
+    def String report = ( params.report ? "\nNextflow report : ${params.statsdir}" : "");
+    println ""
+    println "Completed at : $workflow.complete"
+    println "Duration     : $workflow.duration"
+    println "Processes    : $workflow.workflowStats.succeedCount (success), $workflow.workflowStats.failedCount (failed)"
+    println "Output in    : $params.output_folder" + report
+    println ( workflow.success ? "Done!" : "Oops .. something went wrong" )
+}
 
-// todo: check input files very beginning (before omamer starts) e.g all species are in the species tree. No species chars in fasta record.
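The rewritten workflow above drops the old chain of `val true` ready-flags and
`.collect()` barriers in favour of real data dependencies between processes; the only
explicit merge that remains is gathering the big and rest pickle channels before
collect_subhogs. A runnable sketch of that merge (illustrative only; the file names are
made up):

    // merge.nf -- concat joins the result channels in order, and collect
    // gathers everything into one list so the downstream step runs exactly once
    workflow {
        big  = Channel.of('big_1.pickle', 'big_2.pickle')
        rest = Channel.of('rest_1.pickle')
        Channel.empty().concat(big, rest).collect().view()
        // -> [big_1.pickle, big_2.pickle, rest_1.pickle]
    }
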
diff --git a/FastOMA_light.nf b/FastOMA_light.nf deleted file mode 100644 index aac82a6..0000000 --- a/FastOMA_light.nf +++ /dev/null @@ -1,324 +0,0 @@ -import groovy.json.JsonBuilder; - -// NXF_WRAPPER_STAGE_FILE_THRESHOLD='50000' - -params.input_folder = "testdata/in_folder" -params.output_folder = "out_folder/" -params.proteome_folder = params.input_folder + "/proteome" -params.hogmap_in = params.input_folder + "/hogmap_in" -params.splice_folder = params.input_folder + "/splice" -params.species_tree = params.input_folder + "/species_tree.nwk" - - - -// output subfolder definition -params.genetrees_folder = params.output_folder + "/genetrees" -params.hogmap_folder = params.output_folder + "/hogmap" - -params.temp_output = params.output_folder +"/temp_output" //"/temp_omamer_rhogs" - - - -if (params.help) { - log.info """ - =========================================== - FastOMA -- PIPELINE - =========================================== - Usage: - Run the pipeline with default parameters: - nexflow run FastOMA.nf - - Run with user parameters: - nextflow run FastOMA.nf --input_folder {input.dir} --output_folder {results.dir} - - Mandatory arguments: - --input_folder Input data folder. Defaults to ${params.input_folder}. This folder - must contain the proteomes (in a subfolder named 'proteome') and - a species tree file. Optionally the folder might contain - - a sub-folder 'splice' containing splicing variant mappings - - a sub-folder 'hogmap_in' containing precomputed OMAmer - placement results for all proteomes - - All sub-folders and sub-files can also be placed in orther - locations if you provide alternative values for them (see below on - optional arguments section). - - --output_folder Path where all the output should be stored. Defaults to - ${params.output_folder} - - - Profile selection: - -profile FastOMA can be run using several execution profiles. The default - set of available profiles is - - docker Run pipeline using docker containers. Docker needs - to be installed on your system. Containers will be - fetched automatically from dockerhub. See also - additional options '--container_version' and - '--container_name'. - - - singlularity Run pipeline using singularity. Singularity needs - to be installed on your system. On HPC clusters, - it often needs to be loaded as a seperate module. - Containers will be fetched automatically from - dockerhub. See also additional options - '--container_version' and '--container_name'. - - - conda Run pipeline in a conda environment. Conda needs - to be installed on your system. The environment - will be created automatically. - - - standard Run pipeline on your local system. Mainly intended - for development purpose. All dependencies must be - installed in the calling environment. - - - slurm_singularity - Run pipeline using SLURM job scheduler and - singularity containers. This profile can also be a - template for other HPC clusters that use different - schedulers. - - - slurm_conda Run pipeline using SLURM job scheduler and conda - environment. - - Profiles are defined in nextflow.config and can be extended or - adjusted according to your needs. - - - Additional options: - --proteome_folder Overwrite location of proteomes (default ${params.proteome_folder}) - --species_tree Overwrite location of species tree file (newick format). - Defaults to ${params.species_tree} - --splice_folder Overwrite location of splice file folder. The splice files must be - named .splice. 
-                        Defaults to ${params.splice_folder}
-    --omamer_db         Path or URL to download the OMAmer database from.
-                        Defaults to ${params.omamer_db}
-    --hogmap_in         Optional path where precomputed omamer mapping files are located.
-                        Defaults to ${params.hogmap_in}
-
-    Flags:
-    --help              Display this message
-    --debug_enabled     Store addtional information that might be helpful to debug in case
-                        of a problem with FastOMA.
-    --report            Produce nextflow report and timeline and store them in
-                        ${params.statsdir}
-
-    """.stripIndent()
-
-    exit 1
-}
-
-
-log.info """
-===========================================
-  FastOMA -- PIPELINE
-===========================================
-
-    Project : ${workflow.projectDir}
-    Git info: ${workflow.repository} - ${workflow.revision} [${workflow.commitId}]
-    Cmd line: ${workflow.commandLine}
-    Manifest's pipeline version: ${workflow.manifest.version}
-
-Parameters:
-    input_folder          ${params.input_folder}
-    proteome folder       ${params.proteome_folder}
-    species_tree          ${params.species_tree}
-    splice_folder         ${params.splice_folder}
-    omamer_db             ${params.omamer_db}
-    hogmap_in             ${params.hogmap_in}
-
-    debug_enabled         ${params.debug_enabled}
-    report                ${params.report}
-""".stripIndent()
-
-
-process check_input{
-    label "process_single"
-    publishDir params.output_folder, mode: 'copy'
-
-    input:
-        path proteome_folder
-        path hogmap_folder
-        path species_tree
-        path omamerdb
-        path splice_folder
-    output:
-        path "species_tree_checked.nwk"
-        val "check_completed"
-    script:
-    """
-    fastoma-check-input --proteomes ${proteome_folder} \
-                        --species-tree ${species_tree} \
-                        --out-tree species_tree_checked.nwk \
-                        --splice ${splice_folder} \
-                        --hogmap ${hogmap_folder} \
-                        --omamer_db ${omamerdb} \
-                        -vv
-    """
-}
-
-
-process omamer_run{
-    label "process_single"
-    tag "$proteome"
-    publishDir params.hogmap_folder, mode: 'copy'
-
-    input:
-        tuple path(proteome), path(omamer_db), path(precomputed_hogmap_folder), val(ready)
-
-    output:
-        path "*.hogmap"
-
-    script:
-    """
-    if [ -f ${precomputed_hogmap_folder}/${proteome}.hogmap ] ; then
-        cp ${precomputed_hogmap_folder}/${proteome}.hogmap ${proteome}.hogmap
-    else
-        omamer search --db ${omamer_db} --query ${proteome} --out ${proteome}.hogmap
-    fi
-    """
-}
-
-
-process infer_roothogs{
-    label "process_single"
-    publishDir = [
-        path: params.temp_output,
-        enabled: params.debug_enabled,
-    ]
-
-    input:
-        path hogmaps, stageAs: "hogmaps/*"
-        path proteome_folder
-        path splice_folder
-    output:
-        path "omamer_rhogs/*"
-        path "gene_id_dic_xml.pickle"
-        path "selected_isoforms" , optional: true
-    script:
-    """
-    fastoma-infer-roothogs  --proteomes ${proteome_folder} \
-                            --hogmap hogmaps \
-                            --splice ${splice_folder} \
-                            --out-rhog-folder "omamer_rhogs/" \
-                            -vv
-    """
-}
-
-
-process batch_roothogs{
-    input:
-        path rhogs, stageAs: "omamer_rhogs/*"
-    output:
-        path "rhogs_rest/*", optional: true
-        path "rhogs_big/*" , optional: true
-    script:
-    """
-    fastoma-batch-roothogs  --input-roothogs omamer_rhogs/ \
-                            --out-big rhogs_big \
-                            --out-rest rhogs_rest \
-                            -vv
-    """
-}
-
-process hog_big{
-    label "process_high"
-
-    input:
-        each rhogsbig
-        path species_tree
-    output:
-        path "pickle_hogs"
-        path "msa/*.fa"         , optional: true   // msa         if write True
-        path "gene_trees/*.nwk" , optional: true   // gene trees  if write True
-    script:
-    """
-    fastoma-infer-subhogs  --input-rhog-folder ${rhogsbig} \
-                           --species-tree ${species_tree} \
-                           --fragment-detection \
-                           --low-so-detection \
-                           --parallel
-    """
-}
-
-process hog_rest{
-    label "process_single"
-
-    input:
-        each rhogsrest
-        path species_tree
-    output:
-        path "pickle_hogs"
-        path "msa/*.fa"         , optional: true   // msa         if write True
-        path "gene_trees/*.nwk" , optional: true   // gene trees  if write True
-    script:
-    """
-    fastoma-infer-subhogs  --input-rhog-folder ${rhogsrest} \
-                           --species-tree ${species_tree} \
-                           --fragment-detection \
-                           --low-so-detection
-    #--out pickle_hogs
-    """
-}
-
-
-process collect_subhogs{
-    publishDir params.output_folder, mode: 'copy'
-
-    input:
-        path pickles, stageAs: "pickle_folders/?"
-        path "gene_id_dic_xml.pickle"
-        path rhogs, stageAs: "omamer_rhogs/*"
-    output:
-        path "output_hog.orthoxml"
-        path "OrthologousGroupsFasta"
-        path "OrthologousGroups.tsv"
-        path "rootHOGs.tsv"
-    script:
-    """
-    fastoma-collect-subhogs --pickle-folder pickle_folders/ \
-                            --roothogs-folder omamer_rhogs/ \
-                            --gene-id-pickle-file gene_id_dic_xml.pickle \
-                            --out output_hog.orthoxml \
-                            --marker-groups-fasta OrthologousGroups.tsv \
-                            --roothog-tsv rootHOGs.tsv \
-                            -vv
-    """
-}
-
-workflow {
-    proteome_folder = Channel.fromPath(params.proteome_folder, type: "dir", checkIfExists:true).first()
-    proteomes = Channel.fromPath(params.proteome_folder + "/*", type:'any', checkIfExists:true)
-    species_tree = Channel.fromPath(params.species_tree, type: "file", checkIfExists:true).first()
-
-    splice_folder = Channel.fromPath(params.splice_folder, type: "dir")
-    genetrees_folder = Channel.fromPath(params.genetrees_folder, type: 'dir')
-    hogmap_in = Channel.fromPath(params.hogmap_in, type:'dir')
-
-    omamerdb = Channel.fromPath(params.omamer_db)
-    (species_tree_checked_, ready_input_check) = check_input(proteome_folder, hogmap_in, species_tree, omamerdb, splice_folder)
-    omamer_input_channel = proteomes.combine(omamerdb).combine(hogmap_in).combine(ready_input_check)
-    hogmap = omamer_run(omamer_input_channel)
-
-    (omamer_rhogs, gene_id_dic_xml, ready_infer_roothogs) = infer_roothogs(hogmap.collect(), proteome_folder, splice_folder)
-
-    (rhogs_rest_batches, rhogs_big_batches) = batch_roothogs(omamer_rhogs)
-
-    (pickle_big_rhog, msa_out_big, genetrees_out_rest) = hog_big(rhogs_big_batches.flatten(), species_tree)
-    (pickle_rest_rhog, msas_out_rest, genetrees_out_test) = hog_rest(rhogs_rest_batches.flatten(), species_tree)
-    channel.empty().concat(pickle_big_rhog, pickle_rest_rhog).set{ all_rhog_pickle }
-
-    (orthoxml_file, OrthologousGroupsFasta, OrthologousGroups_tsv, rootHOGs_tsv) = collect_subhogs(all_rhog_pickle.collect(), gene_id_dic_xml, omamer_rhogs)
-
-}
-
-workflow.onComplete {
-    def String report = ( params.report ? "\nNextflow report : ${params.statsdir}" : "");
-    println ""
-    println "Completed at : $workflow.complete"
-    println "Duration     : $workflow.duration"
-    println "Processes    : $workflow.workflowStats.succeedCount (success), $workflow.workflowStats.failedCount (failed)"
-    println "Output in    : $params.output_folder" + report
-    println ( workflow.success ? "Done!" : "Oops .. something went wrong" )
-}
-
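A closing note on the `each` input qualifier relied on by hog_big and hog_rest above: it
repeats the task once per element of the incoming collection while the other inputs stay
fixed, which is what fans the root-HOG batches out into parallel jobs. A self-contained
sketch (illustrative only; the process is hypothetical):

    // each.nf -- one task per element; 'shout' runs twice here
    process shout {
        input:
            each word
        output:
            stdout
        script:
            "echo ${word.toUpperCase()}"
    }

    workflow {
        shout(['big', 'rest']) | view   // two tasks: BIG and REST
    }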