From 7a7164048a16a8556538e99e0004c779707c5d08 Mon Sep 17 00:00:00 2001
From: Adrian Altenhoff
Date: Thu, 2 Nov 2023 14:32:55 +0100
Subject: [PATCH 01/22] manage python part of FastOMA with hatch

---
 pyproject.toml | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 pyproject.toml

diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..fb8a4fa
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,48 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "FastOMA"
+dynamic = ["version"]
+description = "FastOMA - a package to infer orthology information among proteomes"
+readme = "README.md"
+license = "MIT"
+requires-python = ">=3.8"
+authors = [
+    { name = "Sina Majidian", email = "sina.majidian@gmail.com" },
+    { name = "Adrian Altenhoff", email = "adrian.altenhoff@inf.ethz.ch" }
+]
+dependencies = [
+    "biopython ~=1.81",
+    "DendroPy ~=4.3",
+    "ete3 ~=3.1",
+    "lxml ~=4.3",
+    "omamer ~=2.0",
+    "pyham ~=1.1",
+    "pyparsing",
+]
+
+[project.optional-dependencies]
+nextflow = [
+    "nextflow"
+]
+
+
+[project.scripts]
+batch-roothogs = "FastOMA.batch_roothogs:batch_roothogs"
+check-input-fastoma = "FastOMA.check_input_fastoma:check_input_fastoma"
+collect-subhogs = "FastOMA.collect_subhogs:collect_subhogs"
+infer-roothogs = "FastOMA.infer_roothogs:infer_roothogs"
+infer-subhogs = "FastOMA.infer_subhogs:infer_subhogs"
+
+[project.urls]
+Homepage = "https://github.com/DessimozLab/FastOMA"
+
+[tool.hatch.version]
+path = "FastOMA/__init__.py"
+
+[tool.hatch.build.targets.sdist]
+include = [
+    "/FastOMA",
+]

From 579ab46da378c36a142b7ca5bb2c86b2752dcbec Mon Sep 17 00:00:00 2001
From: Adrian Altenhoff
Date: Thu, 2 Nov 2023 14:33:32 +0100
Subject: [PATCH 02/22] initial Dockerfile to generate container with necessary dependencies

---
 Dockerfile | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 Dockerfile

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..99ac18e
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,42 @@
+FROM python:3.11-slim as basis
+
+# set environment variables
+ENV PYTHONDONTWRITEBYTECODE 1
+ENV PYTHONUNBUFFERED 1
+
+
+FROM basis as builder
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+       build-essential \
+       fasttree \
+       libxml2 \
+       mafft \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /src
+RUN pip install --upgrade hatch pip
+COPY pyproject.toml .
+RUN python -m venv /app \
+    && hatch dep show requirements > requirements.txt \
+    && /app/bin/pip install -r requirements.txt
+
+COPY . .
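+# build the FastOMA wheel with hatch and install it into the /app virtualenv
+# that already holds the dependencies installed in the layer above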
+RUN ls -la \
+    && hatch build \
+    && ls -la dist/ \
+    && /app/bin/pip install dist/*.whl
+
+
+FROM basis as runtime
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+       fasttree \
+       libxml2 \
+       mafft \
+    && apt-get -y autoremove \
+    && apt-get -y autoclean \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY --from=builder /app /app
+ENV PATH="/app/bin:$PATH"

From 7b9d0d52ef85ded6d881c9b8d1a5bf9294860de3 Mon Sep 17 00:00:00 2001
From: Adrian Altenhoff
Date: Mon, 6 Nov 2023 07:48:51 +0100
Subject: [PATCH 03/22] adding .gitignore and .dockerignore

---
 .dockerignore | 7 +++++++
 .gitignore    | 7 +++++++
 2 files changed, 14 insertions(+)
 create mode 100644 .dockerignore
 create mode 100644 .gitignore

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..466368d
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,7 @@
+work
+.nextflow*
+.idea
+.git
+output
+testdata
+dist

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..35f96f5
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+.nextflow*
+work/
+.idea/
+dist/
+archive
+.git
+.gitignore

From ce3b3a4650ca920acc125444c23958e61714923b Mon Sep 17 00:00:00 2001
From: Adrian Altenhoff
Date: Mon, 6 Nov 2023 09:35:45 +0100
Subject: [PATCH 04/22] adding github workflow to build docker image

---
 .github/workflows/docker-image.yml | 73 ++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)
 create mode 100644 .github/workflows/docker-image.yml

diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml
new file mode 100644
index 0000000..73e5fb6
--- /dev/null
+++ b/.github/workflows/docker-image.yml
@@ -0,0 +1,73 @@
+name: Docker Image CI
+
+on:
+  push:
+  pull_request:
+  release:
+    types: [published]
+
+env:
+  TEST_TAG: dessimozlab/fastoma:test
+
+jobs:
+
+  build:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v4
+      with:
+        submodules: recursive
+
+    - name: Docker meta
+      id: meta
+      uses: docker/metadata-action@v4
+      with:
+        # list of Docker images to use as base name for tags
+        images: |
+          dessimozlab/fastoma
+        # generate Docker tags based on the following events/attributes
+        tags: |
+          type=schedule
+          type=ref,event=branch
+          type=ref,event=pr
+          type=semver,pattern={{version}}
+          type=semver,pattern={{major}}.{{minor}}
+          type=semver,pattern={{major}}
+          type=sha
+
+    - name: Set up QEMU
+      uses: docker/setup-qemu-action@v2
+
+    - name: Set up Docker Buildx
+      uses: docker/setup-buildx-action@v2
+
+    - name: Build and export to docker for testing
+      uses: docker/build-push-action@v3
+      with:
+        context: .
+        load: true
+        tags: ${{ env.TEST_TAG }}
+
+    #- name: Test
+    #  run: |
+    #    docker run --rm -i -v $PWD/tests:/input -v $PWD/tests/:/reads -v $PWD/output:/out -v $PWD/run:/run ${{ env.TEST_TAG }} --tree --standalone_path /input/marker_genes --dna_reference /input/cds-marker_genes.fasta.gz --reads /reads/sample_1.fastq --output_path /out
+    #    if [ ! -f output/tree_sample_1.nwk ] ; then exit 1; fi
+
+    - name: Login to DockerHub
+      uses: docker/login-action@v2
+      with:
+        username: ${{ secrets.DOCKER_HUB_USERNAME }}
+        password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }}
+
+    - name: Build and push
+      uses: docker/build-push-action@v3
+      with:
+        context: .
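+        # build multi-arch images; the QEMU and Buildx setup steps above make
+        # the non-native arm64 build possible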
+ platforms: linux/amd64,linux/arm64 + push: true + #${{ github.event_name != 'push' && github.event_name != 'pull_request' }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} From 62edd3b125d89786155d42e3a3c9b0b8e84c71a0 Mon Sep 17 00:00:00 2001 From: Adrian Altenhoff Date: Mon, 13 Nov 2023 16:36:43 +0100 Subject: [PATCH 05/22] [WIP] restructure pipeline such that no signaling is needed and data properly mounted --- FastOMA/batch_roothogs.py | 148 +++++++++++++++----------------------- FastOMA_light.nf | 66 +++++++++-------- README.md | 17 ++++- nextflow.config | 32 +++++++++ 4 files changed, 136 insertions(+), 127 deletions(-) create mode 100644 nextflow.config diff --git a/FastOMA/batch_roothogs.py b/FastOMA/batch_roothogs.py index 59a3ddb..55e04d2 100644 --- a/FastOMA/batch_roothogs.py +++ b/FastOMA/batch_roothogs.py @@ -1,98 +1,64 @@ - - -# from Bio import SeqIO -import os -from os import makedirs import shutil -from os import listdir from . import _config - - -def list_rhog_fastas(address_rhogs_folder): - """ - create orthoxml_to_newick.py list of rootHOG IDs stored in the folder of rHOG . - input: folder address - output: list of rhog Id (integer) - """ - rhog_files = listdir(address_rhogs_folder) - rhogid_list= [] - for rhog_file in rhog_files: - if rhog_file.split(".")[-1] == "fa" and rhog_file.split("_")[0] == "HOG": - rhogid = rhog_file.split(".")[0].split("_")[1] - rhogid_list.append(rhogid) - - return rhogid_list - - - -def folder_1h_rhog(address_rhogs_folder, output_folder_big, output_folder_rest): - - # in_folder = "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/gethog3_qfo/working_nfp/" - - #work_fldor_out = in_folder - - rhogid_list = list_rhog_fastas(address_rhogs_folder) - len(rhogid_list) - dic_rhognum_size = {} - for rhogid in rhogid_list[2:]: - rhog_i_prot_address = address_rhogs_folder + "/HOG_" + rhogid + ".fa" - # rhog_i = list(SeqIO.parse(rhog_i_prot_address, "fasta")) - rhog_i_size = os.path.getsize(rhog_i_prot_address) - dic_rhognum_size[rhogid] = rhog_i_size - len(dic_rhognum_size) - - - #list_list_rest_rhog = [[]] - list_list_rest_size = [[]] - #list_list_big = [] - - # to have at least one big and one rest rhog, this makes nextflow pipline easier, - # otherwise we need if condition collect_subhogs won't satisfy if one of them is empty - - list_list_rest_rhog = [[rhogid_list[0]],[]] # each insid list should take 1h to - list_list_rest_size = [[],[]] - list_list_big = [rhogid_list[1]] - - - - for rhognum, size in dic_rhognum_size.items(): - # print(rhognum, size) - if size > _config.big_rhog_filesize_thresh: - list_list_big.append(rhognum) - else: - if sum(list_list_rest_size[-1]) < _config.sum_list_rhogs_filesize_thresh: - list_list_rest_rhog[-1].append(rhognum) - list_list_rest_size[-1].append(size) +from pathlib import Path +logger = _config.logger_hog + + +class BatchBuilder: + def __init__(self, outdir: Path, max_size: int): + self.outdir = outdir + self.max_size = max_size + + def __enter__(self): + self.cur_batch = [] + self.cur_size = 0 + self.counter = 0 + self.outdir.mkdir(parents=True, exist_ok=True) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if len(self.cur_batch) > 0: + self._flush() + + def add_hog(self, hog_file: Path): + self.cur_batch.append(hog_file) + self.cur_size += hog_file.stat().st_size + if self.cur_size > self.max_size: + self._flush() + + def _flush(self): + batch_dir = self.outdir / str(self.counter) + batch_dir.mkdir() + for fn in self.cur_batch: + shutil.copy(fn, batch_dir) + 
logger.debug("creating batch %s with %d families; total size of files is %d", + batch_dir, len(self.cur_batch), self.cur_size) + self.cur_size = 0 + self.cur_batch = [] + + +def folder_1h_rhog(roothog_path:Path, output_folder_big:Path, output_folder_rest:Path): + # create a list of hogs in descending filesize order + hog_size_tuples = sorted([(f, f.stat().st_size) for f in roothog_path.rglob("*.fa")], key=lambda x: -x[1]) + with BatchBuilder(output_folder_big, 1) as big_hogs, \ + BatchBuilder(output_folder_rest, _config.sum_list_rhogs_filesize_thresh) as rest_hogs: + for hog, fsize in hog_size_tuples: + if fsize > _config.big_rhog_filesize_thresh: + big_hogs.add_hog(hog) else: - list_list_rest_rhog.append([rhognum]) - list_list_rest_size.append([size]) - - # makedirs(work_fldor_out+"rhogs_rest") - for folder_id, list_rhog in enumerate(list_list_rest_rhog): - # print(folder_id) - # output_folder_rest = work_fldor_out + "/rhogs_rest/" - makedirs( output_folder_rest + str(folder_id)) - for rhogid in list_rhog: - name = "HOG_" + rhogid + ".fa" - folder_rest =output_folder_rest + str(folder_id) + "/" - shutil.copy(address_rhogs_folder + name, folder_rest + name) - - # makedirs(work_fldor_out+"rhogs_big") - # output_folder_big = work_fldor_out + "rhogs_big/" - for folder_id, rhogid in enumerate(list_list_big): - name = "HOG_" + rhogid + ".fa" - folder_big = output_folder_big + "b" + str(folder_id) + "/" - makedirs(folder_big) - shutil.copy(address_rhogs_folder + name, folder_big + name) - - return 1 - + rest_hogs.add_hog(hog) def batch_roothogs(): - - input_rhog = "./omamer_rhogs/" # - output_folder_big = "./rhogs_big/" - output_folder_rest = "./rhogs_rest/" - folder_1h_rhog(input_rhog, output_folder_big, output_folder_rest) + import argparse + parser = argparse.ArgumentParser(description="Analyse roothog families and create batches for analysis") + parser.add_argument('--input-roothogs', required=True, help="folder where input roothogs are stored") + parser.add_argument('--out-big', required=True, help="folder where the big single family hogs should be stored") + parser.add_argument('--out-rest', required=True, help="folder where the remaining families should be stored in" + "batch subfolder structure.") + parser.add_argument('-v', default=0, action="count", help="incrase verbosity") + conf = parser.parse_args() + logger.setLevel(level=30 - 10 * min(conf.v, 2)) + + folder_1h_rhog(Path(conf.input_roothogs), Path(conf.out_big), Path(conf.out_rest)) diff --git a/FastOMA_light.nf b/FastOMA_light.nf index 7c1df26..009825f 100644 --- a/FastOMA_light.nf +++ b/FastOMA_light.nf @@ -17,20 +17,22 @@ params.genetrees_folder = params.output_folder + "/genetrees" process omamer_run{ time {4.h} publishDir params.hogmap_folder + input: - path proteomes_omamerdb_inputhog + tuple path(proteome), path(omamer_db), path(precomputed_hogmap_folder) + output: - path "*.hogmap" - val true + path "*.hogmap" + val true + script: """ - if [ -f ${proteomes_omamerdb_inputhog[2]}/${proteomes_omamerdb_inputhog[0]}.hogmap ] - then - cp ${proteomes_omamerdb_inputhog[2]}/${proteomes_omamerdb_inputhog[0]}.hogmap ${proteomes_omamerdb_inputhog[0]}.hogmap + if [ -f ${precomputed_hogmap_folder}/${proteome}.hogmap ] ; then + cp ${precomputed_hogmap_folder}/${proteome}.hogmap ${proteome}.hogmap else - omamer search --db ${proteomes_omamerdb_inputhog[1]} --query ${proteomes_omamerdb_inputhog[0]} --out ${proteomes_omamerdb_inputhog[0]}.hogmap + omamer search --db ${omamer_db} --query ${proteome} --out ${proteome}.hogmap fi - """ // 
--nthreads 10 + """ } @@ -41,9 +43,8 @@ process infer_roothogs{ path proteome_folder path splice_folder output: - path "omamer_rhogs" + path "omamer_rhogs/*" path "gene_id_dic_xml.pickle" - val true // nextflow-io.github.io/patterns/state-dependency/ script: """ infer-roothogs --logger-level DEBUG @@ -53,15 +54,13 @@ process infer_roothogs{ process batch_roothogs{ input: - val ready_infer_roothogs - path "omamer_rhogs" + path rhogs, stageAs: "omamer_rhogs/*" output: path "rhogs_rest/*", optional: true path "rhogs_big/*" , optional: true - val true script: """ - batch-roothogs + batch-roothogs --input-roothogs omamer_rhogs/ --out-big rhogs_big --out-rest rhogs_rest -vvv """ } @@ -85,7 +84,7 @@ process hog_big{ process hog_rest{ publishDir params.pickles_temp input: - val rhogsrest_tree_ready + tuple path(rhogsrest), path(species_tree) output: path "*.pickle" path "*.fa" , optional: true // msa if write True @@ -93,8 +92,8 @@ process hog_rest{ val true script: """ - infer-subhogs --input-rhog-folder ${rhogsrest_tree_ready[0]} --species-tree ${rhogsrest_tree_ready[1]} --fragment-detection --low-so-detection - """ // --parrallel False + infer-subhogs --input-rhog-folder ${rhogsrest} --species-tree ${species_tree} --fragment-detection --low-so-detection + """ } @@ -118,37 +117,36 @@ process collect_subhogs{ } workflow { - proteomes = Channel.fromPath(params.proteomes, type:'any' ,checkIfExists:true) + proteomes = Channel.fromPath(params.proteomes, type:'any', checkIfExists:true) proteome_folder = Channel.fromPath(params.proteome_folder) hogmap_folder = Channel.fromPath(params.hogmap_folder) splice_folder = Channel.fromPath(params.splice_folder) genetrees_folder = Channel.fromPath(params.genetrees_folder) - hogmap_in = Channel.fromPath(params.hogmap_in) + hogmap_in = Channel.fromPath(params.hogmap_in, type:'dir') pickles_temp = Channel.fromPath(params.pickles_temp) - omamerdb = Channel.fromPath(params.input_folder+"/omamerdb.h5") + omamerdb = Channel.fromPath(params.omamer_db) proteomes_omamerdb = proteomes.combine(omamerdb) - proteomes_omamerdb_inputhog = proteomes_omamerdb.combine(hogmap_in) // proteomes_omamerdb_inputhog.view{" rhogsbig ${it}"} - (hogmap, ready_omamer_run)= omamer_run(proteomes_omamerdb_inputhog) + proteomes_omamerdb_inputhog = proteomes_omamerdb.combine(hogmap_in) + + (hogmap, ready_omamer_run) = omamer_run(proteomes_omamerdb_inputhog) ready_omamer_run_c = ready_omamer_run.collect() (omamer_rhogs, gene_id_dic_xml, ready_infer_roothogs) = infer_roothogs(ready_omamer_run_c, hogmap_folder, proteome_folder, splice_folder) - ready_infer_roothogs_c = ready_infer_roothogs.collect() - (rhogs_rest_list, rhogs_big_list, ready_batch_roothogs) = batch_roothogs(ready_infer_roothogs_c, omamer_rhogs) - ready_batch_roothogs_c = ready_batch_roothogs.collect() + (rhogs_rest_list, rhogs_big_list) = batch_roothogs(omamer_rhogs) + species_tree = Channel.fromPath(params.species_tree) - rhogsbig = rhogs_big_list.flatten() - rhogsbig_tree = rhogsbig.combine(species_tree) - rhogsbig_tree_ready = rhogsbig_tree.combine(ready_batch_roothogs) // rhogsbig_tree_ready.view{"rhogsbig_tree_ready ${it}"} - (pickle_big_rhog, msas_out, genetrees_out, ready_hog_big) = hog_big(rhogsbig_tree_ready) - - rhogsrest = rhogs_rest_list.flatten() - rhogsrest_tree = rhogsrest.combine(species_tree) - rhogsrest_tree_ready = rhogsrest_tree.combine(ready_batch_roothogs_c) - (pickle_rest_rhog, msas_out_rest, genetrees_out_test, ready_hog_rest) = hog_rest(rhogsrest_tree_ready) + //rhogsbig = rhogs_big_list.flatten() + 
//rhogsbig_tree = rhogsbig.combine(species_tree)
+    //rhogsbig_tree_ready = rhogsbig_tree.combine(ready_batch_roothogs) // rhogsbig_tree_ready.view{"rhogsbig_tree_ready ${it}"}
+    //(pickle_big_rhog, msas_out, genetrees_out, ready_hog_big) = hog_big(rhogsbig_tree)
+
+    rhogsrest_tree = rhogs_rest_list.combine(species_tree)
+    //rhogsrest_tree_ready = rhogsrest_tree.combine(ready_batch_roothogs_c)
+    (pickle_rest_rhog, msas_out_rest, genetrees_out_test, ready_hog_rest) = hog_rest(rhogsrest_tree)
 
     (orthoxml_file, OrthologousGroupsFasta, OrthologousGroups_tsv, rootHOGs_tsv) = collect_subhogs(ready_hog_rest.collect(), ready_hog_big.collect(), pickles_temp, gene_id_dic_xml, omamer_rhogs)
     orthoxml_file.view{" output orthoxml file ${it}"}
 }
diff --git a/README.md b/README.md
index f9354dc..1ad49bf 100644
--- a/README.md
+++ b/README.md
@@ -40,9 +40,22 @@ which can be used with [PyHAM](https://github.com/DessimozLab/pyham).
 In summary, you need to 1) install FastOMA and its prerequisites (below), and 2) put the input files in the folder `in_folder` and 3) run FastOMA using the nextflow recipe `FastOMA_light.nf`.
 ```
-nextflow FastOMA_light.nf --input_folder /path/to/in_folder --output_folder /path/to/out_folder
+nextflow run FastOMA_light.nf -profile docker --input_folder /path/to/in_folder --output_folder /path/to/out_folder
 ```
-The script `FastOMA_light.nf` is tailored for a few species. To run FastOMA with hundreds of species, please use `FastOMA.nf`.
+The script `FastOMA_light.nf` is tailored for a few species. To run FastOMA with hundreds of species, please use `FastOMA.nf`.
+
+## More details on how to run
+We provide a Docker image for FastOMA on Docker Hub for every commit of the repository. You can specify the container
+as part of the nextflow command with the parameter `container_version`. If you want to use the container matching the
+currently checked-out commit, you can specify it in the following way:
+
+```bash
+nextflow run FastOMA.nf -profile docker \
+    --container_version "sha-$(git rev-list --max-count=1 --abbrev-commit HEAD)" \
+    --input_folder testdata/in_folder \
+    --output_folder myresult/
+```
+
 
 # How to install FastOMA
diff --git a/nextflow.config b/nextflow.config
new file mode 100644
index 0000000..866c569
--- /dev/null
+++ b/nextflow.config
@@ -0,0 +1,32 @@
+// General configuration used in all profiles
+manifest {
+    description = 'FastOMA - Infer orthology'
+    nextflowVersion = '22.10.4'
+}
+
+params {
+    container_name = "dessimozlab/fastoma"
+    container_version = "latest"
+    omamer_db = "https://omabrowser.org/All/Primates-v2.0.0.h5"
+}
+
+// Profiles configure nextflow depending on the environment (local, integration, live, etc.)
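+// For example, a profile can be selected at runtime like this (hypothetical invocation):
+//   nextflow run FastOMA_light.nf -profile singularity --input_folder in_folder --output_folder out_folder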
+profiles { + docker { + process { + container = "$params.container_name:$params.container_version" + } + docker.enabled = true + } + singularity { + process { + container = "$params.container_name:$params.container_version" + } + singularity.enabled = true + singularity.autoMounts = true + } + standard { + process.executor = 'local' + } +} + From 40950c04fec943086a73689d32812a3d30e75267 Mon Sep 17 00:00:00 2001 From: Adrian Altenhoff Date: Tue, 14 Nov 2023 10:09:24 +0100 Subject: [PATCH 06/22] [WIP] correct batching, missing collection phase --- FastOMA/_config.py | 2 +- FastOMA/batch_roothogs.py | 1 + FastOMA/infer_subhogs.py | 9 +-------- FastOMA_light.nf | 35 +++++++++++++++++++---------------- 4 files changed, 22 insertions(+), 25 deletions(-) diff --git a/FastOMA/_config.py b/FastOMA/_config.py index fe778f2..46fdf62 100644 --- a/FastOMA/_config.py +++ b/FastOMA/_config.py @@ -93,7 +93,7 @@ # batch_roothogs big_rhog_filesize_thresh = 400 * 1000 -sum_list_rhogs_filesize_thresh = 2 * 1e6 +sum_list_rhogs_filesize_thresh = 2 * 1e3 # big_rhog_filesize_thresh = 1.6 * 1000 # 600 would be better diff --git a/FastOMA/batch_roothogs.py b/FastOMA/batch_roothogs.py index 55e04d2..86de8b8 100644 --- a/FastOMA/batch_roothogs.py +++ b/FastOMA/batch_roothogs.py @@ -25,6 +25,7 @@ def add_hog(self, hog_file: Path): self.cur_size += hog_file.stat().st_size if self.cur_size > self.max_size: self._flush() + self.counter += 1 def _flush(self): batch_dir = self.outdir / str(self.counter) diff --git a/FastOMA/infer_subhogs.py b/FastOMA/infer_subhogs.py index 05670c7..2c685c9 100644 --- a/FastOMA/infer_subhogs.py +++ b/FastOMA/infer_subhogs.py @@ -3,13 +3,6 @@ from . import _utils_subhog from . import _infer_subhog from . import _config -# import os -# from os import listdir -# import os -import sys -# from os import listdir -# import _config - from ._config import logger_hog import os @@ -53,7 +46,7 @@ def infer_subhogs(): if inferhog_concurrent_on: print("parallelization for subhog inference is on.") - pickles_rhog_folder = "./" # pickles_temp/ pickle_rhogs + pickles_rhog_folder = "./pickle_hogs" # pickles_temp/ pickle_rhogs if not os.path.exists(pickles_rhog_folder): os.makedirs(pickles_rhog_folder) diff --git a/FastOMA_light.nf b/FastOMA_light.nf index 009825f..48882f3 100644 --- a/FastOMA_light.nf +++ b/FastOMA_light.nf @@ -65,7 +65,6 @@ process batch_roothogs{ } process hog_big{ - publishDir params.pickles_temp cpus 2 time {20.h} // for very big rhog it might need more, or you could re-run and add `-resume` input: @@ -82,17 +81,21 @@ process hog_big{ } process hog_rest{ - publishDir params.pickles_temp input: - tuple path(rhogsrest), path(species_tree) + each rhogsrest + path species_tree output: - path "*.pickle" - path "*.fa" , optional: true // msa if write True - path "*.nwk" , optional: true // gene trees if write True + path "pickle_hogs" + path "msa/*.fa" , optional: true // msa if write True + path "gene_trees/*.nwk" , optional: true // gene trees if write True val true script: """ - infer-subhogs --input-rhog-folder ${rhogsrest} --species-tree ${species_tree} --fragment-detection --low-so-detection + infer-subhogs --input-rhog-folder ${rhogsrest} \ + --species-tree ${species_tree} \ + --fragment-detection \ + --low-so-detection + #--out pickle_hogs """ } @@ -100,11 +103,9 @@ process hog_rest{ process collect_subhogs{ publishDir params.output_folder, mode: 'copy' input: - val ready_hog_rest - val ready_hog_big - path "pickles_temp" // this is the folder includes pickles_rhogs + path pickles, 
stageAs: "pickle_folders/?" path "gene_id_dic_xml.pickle" - path "omamer_rhogs" + path rhogs, stageAs: "omamer_rhogs/*" output: path "output_hog.orthoxml" path "OrthologousGroupsFasta" @@ -121,6 +122,7 @@ workflow { proteome_folder = Channel.fromPath(params.proteome_folder) hogmap_folder = Channel.fromPath(params.hogmap_folder) splice_folder = Channel.fromPath(params.splice_folder) + species_tree = Channel.fromPath(params.species_tree) genetrees_folder = Channel.fromPath(params.genetrees_folder) hogmap_in = Channel.fromPath(params.hogmap_in, type:'dir') @@ -135,20 +137,21 @@ workflow { (omamer_rhogs, gene_id_dic_xml, ready_infer_roothogs) = infer_roothogs(ready_omamer_run_c, hogmap_folder, proteome_folder, splice_folder) - (rhogs_rest_list, rhogs_big_list) = batch_roothogs(omamer_rhogs) + (rhogs_rest_batches, rhogs_big_list) = batch_roothogs(omamer_rhogs) + rhogs_rest_batches.flatten().view{ "batch $it" } - species_tree = Channel.fromPath(params.species_tree) //rhogsbig = rhogs_big_list.flatten() //rhogsbig_tree = rhogsbig.combine(species_tree) //rhogsbig_tree_ready = rhogsbig_tree.combine(ready_batch_roothogs) // rhogsbig_tree_ready.view{"rhogsbig_tree_ready ${it}"} //(pickle_big_rhog, msas_out, genetrees_out, ready_hog_big) = hog_big(rhogsbig_tree) - rhogsrest_tree = rhogs_rest_list.combine(species_tree) + //rhogsrest_tree = rhogs_rest_list.combine(species_tree) //rhogsrest_tree_ready = rhogsrest_tree.combine(ready_batch_roothogs_c) - (pickle_rest_rhog, msas_out_rest, genetrees_out_test, ready_hog_rest) = hog_rest(rhogsrest_tree) + (pickle_rest_rhog, msas_out_rest, genetrees_out_test, ready_hog_rest) = hog_rest(rhogs_rest_batches.flatten(), species_tree) + pickle_rest_rhog.view() - (orthoxml_file, OrthologousGroupsFasta, OrthologousGroups_tsv, rootHOGs_tsv) = collect_subhogs(ready_hog_rest.collect(), ready_hog_big.collect(), pickles_temp, gene_id_dic_xml, omamer_rhogs) + (orthoxml_file, OrthologousGroupsFasta, OrthologousGroups_tsv, rootHOGs_tsv) = collect_subhogs(pickle_rest_rhog.collect(), gene_id_dic_xml, omamer_rhogs) orthoxml_file.view{" output orthoxml file ${it}"} } From 02eb3792422fce55d2f99b70cd156eea21044f4e Mon Sep 17 00:00:00 2001 From: Adrian Altenhoff Date: Tue, 14 Nov 2023 11:57:58 +0100 Subject: [PATCH 07/22] [WIP] correct batching, orthoxml generated --- FastOMA/batch_roothogs.py | 2 +- FastOMA/collect_subhogs.py | 70 ++++++++++++++++++++++---------------- FastOMA_light.nf | 6 +++- 3 files changed, 46 insertions(+), 32 deletions(-) diff --git a/FastOMA/batch_roothogs.py b/FastOMA/batch_roothogs.py index 86de8b8..9245219 100644 --- a/FastOMA/batch_roothogs.py +++ b/FastOMA/batch_roothogs.py @@ -38,7 +38,7 @@ def _flush(self): self.cur_batch = [] -def folder_1h_rhog(roothog_path:Path, output_folder_big:Path, output_folder_rest:Path): +def folder_1h_rhog(roothog_path: Path, output_folder_big: Path, output_folder_rest: Path): # create a list of hogs in descending filesize order hog_size_tuples = sorted([(f, f.stat().st_size) for f in roothog_path.rglob("*.fa")], key=lambda x: -x[1]) with BatchBuilder(output_folder_big, 1) as big_hogs, \ diff --git a/FastOMA/collect_subhogs.py b/FastOMA/collect_subhogs.py index c10db60..af3923e 100644 --- a/FastOMA/collect_subhogs.py +++ b/FastOMA/collect_subhogs.py @@ -13,32 +13,60 @@ import os from FastOMA.zoo.hog.convert import orthoxml_to_newick from Bio import SeqIO +from pathlib import Path # This code collect subhogs and writes outputs. 

+def load_hogs(pickle_folder: Path):
+    hogs_all = []
+    cnt = 0
+    logger_hog.info("reading pickle files from %s", pickle_folder)
+    for root, dirs, files in os.walk(pickle_folder, followlinks=True):
+        for f in files:
+            if f.startswith('file_') and f.endswith('.pickle'):
+                with open(os.path.join(root, f), 'rb') as handle:
+                    cur = pickle.load(handle)
+                    hogs_all.extend(cur)
+                    cnt += 1
+                    if cnt % 500 == 0:
+                        logger_hog.info("read %d batch pickle files so far, resulting in %d roothogs so far",
+                                        cnt, len(hogs_all))
+    logger_hog.info("number of pickle files is %d.", cnt)
+    logger_hog.debug("number of hogs in all batches is %d", len(hogs_all))
+    return hogs_all
 
-def collect_subhogs():
-
-    logger_hog.info("started collecting pickle files ")
 
+def collect_subhogs():
+    import argparse
+    parser = argparse.ArgumentParser(description="collecting all computed HOGs and combine into a single orthoxml")
+    parser.add_argument('--pickle-folder', required=True,
+                        help="folder containing the pickle files. Will be searched recursively")
+    parser.add_argument('--roothogs-folder', required=True, help="folder containing the omamer roothogs")
+    parser.add_argument('--gene-id-pickle-file', required=True,
+                        help="file containing the gene-id dictionary in pickle format")
+    parser.add_argument('--out', required=False, default="output_hog.orthoxml", help="output filename in orthoxml")
+    parser.add_argument('-v', action="count", default=0)
+    conf = parser.parse_args()
+    logger_hog.setLevel(level=30 - 10 * min(conf.v, 2))
+    logger_hog.debug(conf)
+
+    write_hog_orthoxml(conf.pickle_folder, conf.out, conf.gene_id_pickle_file)
+
+
+def write_hog_orthoxml(pickle_folder, output_xml_name, gene_id_pickle_file):
     # todo as input argument/option in nextflow
     protein_format_qfo_dataset_before2022 = True #False
     # in benchmark dataset the output prot names should be short
     # tr|A0A0N7KCI6|A0A0N7KCI6_ORYSJ  # for qfo benchmark, the middle should be written in the file
 
-    pickle_folder = "./pickles_temp/"  #pickle_rhogs
-    output_xml_name = "./output_hog.orthoxml"
-    gene_id_pickle_file = "./gene_id_dic_xml.pickle"  # this file includes integer mapping of each protein
-
     orthoxml_file = ET.Element("orthoXML", attrib={"xmlns": "http://orthoXML.org/2011/", "origin": "OMA", "originVersion": "Nov 2021", "version": "0.3"})  #
 
     with open(gene_id_pickle_file, 'rb') as handle:
         gene_id_name = pickle.load(handle)
         # gene_id_name[query_species_name] = (gene_idx_integer, query_prot_name)
-    logger_hog.debug("We read the gene_id_name dictionary with"+str(len(gene_id_name))+"items")
+    logger_hog.debug("We read the gene_id_name dictionary with %d items", len(gene_id_name))
     logger_hog.debug("Now creating the header of orthoxml")
 
     # #### create the header of orthoxml ####
 
     logger_hog.debug("gene_xml is created.")
 
     # #### create the groups of orthoxml ####
-    pickle_files_adress_raw = listdir(pickle_folder)
-    pickle_files_adress = [i for i in pickle_files_adress_raw if i.endswith(".pickle") and i.startswith("file_")]
-
-    logger_hog.info("number of pickle files is "+str(len(pickle_files_adress))+".")
-    logger_hog.debug("pickle files are " + str(len(pickle_files_adress)) + ".")
-    hogs_a_rhog_xml_all = []
-    for pickle_file_adress in pickle_files_adress:
-        with open(pickle_folder + pickle_file_adress, 'rb') as handle:
-            hogs_a_rhog_xml_batch = pickle.load(handle)  # list of hog object.
-            hogs_a_rhog_xml_all.extend(hogs_a_rhog_xml_batch)
-    # hogs_rhogs_xml_all is orthoxml_to_newick.py list of hog object.
-
 
     groups_xml = ET.SubElement(orthoxml_file, "groups")
-
-    for hogs_a_rhog_xml in hogs_a_rhog_xml_all:
+    for hogs_a_rhog_xml in load_hogs(Path(pickle_folder)):
         groups_xml.append(hogs_a_rhog_xml)
 
     logger_hog.debug("converting the xml object to string.")
 
     file_xml.write(xml_str)
     file_xml.close()
 
-    logger_hog.info("orthoxml is written in " + output_xml_name)
-
-
-
-
+    logger_hog.info("orthoxml is written in %s", output_xml_name)
 
 
+def write_fasta():
     logger_hog.info("Now writing OG fasta files ")
 
diff --git a/FastOMA_light.nf b/FastOMA_light.nf
index 48882f3..be18a13 100644
--- a/FastOMA_light.nf
+++ b/FastOMA_light.nf
@@ -113,7 +113,11 @@ process collect_subhogs{
     path "rootHOGs.tsv"
   script:
     """
-    collect-subhogs
+    collect-subhogs --pickle-folder pickle_folders/ \
+        --roothogs-folder omamer_rhogs/ \
+        --gene-id-pickle-file gene_id_dic_xml.pickle \
+        --out output_hog.orthoxml \
+        -vv
     """
 }

From 151ede885789a6d85aae69876a25d1d27c193c40 Mon Sep 17 00:00:00 2001
From: Adrian Altenhoff
Date: Tue, 14 Nov 2023 15:48:10 +0100
Subject: [PATCH 08/22] [WIP] fix remaining output - big roothogs not done yet

---
 FastOMA/collect_subhogs.py | 53 ++++++++++++++++++++++----------------
 FastOMA_light.nf           |  6 ++---
 2 files changed, 34 insertions(+), 25 deletions(-)

diff --git a/FastOMA/collect_subhogs.py b/FastOMA/collect_subhogs.py
index af3923e..60e0c50 100644
--- a/FastOMA/collect_subhogs.py
+++ b/FastOMA/collect_subhogs.py
@@ -46,11 +46,22 @@ def collect_subhogs():
                         help="file containing the gene-id dictionary in pickle format")
     parser.add_argument('--out', required=False, default="output_hog.orthoxml", help="output filename in orthoxml")
     parser.add_argument('-v', action="count", default=0)
+    parser.add_argument('--roothog-tsv', default=None, required=False,
+                        help="If specified, a tsv file with the given path will be produced containing the roothog assignments as TSV file.")
+    parser.add_argument('--marker-groups-fasta', default=None, required=False,
+                        help="If specified, a folder named OrthologousFasta and a TSV file with the name provided in "
+                             "this argument will be generated that contains single copy groups, i.e. groups which have "
+                             "at most one gene per species.
Useful as phylogenetic marker genes to reconstruct species " + "trees.") conf = parser.parse_args() logger_hog.setLevel(level=30 - 10 * min(conf.v, 2)) logger_hog.debug(conf) write_hog_orthoxml(conf.pickle_folder, conf.out, conf.gene_id_pickle_file) + if conf.roothog_tsv is not None: + write_roothogs(conf.out, conf.roothog_tsv) + if conf.marker_groups_fasta is not None: + write_group_files(conf.out, conf.roothogs_folder, conf.marker_groups_fasta) def write_hog_orthoxml(pickle_folder, output_xml_name, gene_id_pickle_file): @@ -99,10 +110,13 @@ def write_hog_orthoxml(pickle_folder, output_xml_name, gene_id_pickle_file): logger_hog.info("orthoxml is written in %s", output_xml_name) -def write_fasta(): - logger_hog.info("Now writing OG fasta files ") - +def write_group_files(orthoxml: Path, roothog_folder: Path, output_file_og_tsv=None, output_fasta_groups=None): + if output_file_og_tsv is None: + output_file_og_tsv = "OrthologousGroups.tsv" + if output_fasta_groups is None: + output_fasta_groups = "OrthologousGroupsFasta" + logger_hog.info("Start writing OG fasta files") def max_og_tree(tree): for node in tree.traverse("preorder"): # for node in xml_tree.traverse(strategy="preorder", is_leaf_fn=lambda n: hasattr(n, "attriremoved") and n.attriremoved==True): @@ -145,14 +159,12 @@ def max_og_tree(tree): return og_prot_list - input_orthoxml = output_xml_name # sys.argv[1] # "out_folder/output_hog_.orthoxml" rhog_all_folder = "./omamer_rhogs/" #sys.argv[2] + "/" # "out_folder/rhogs_all/" fasta_format = "fa" # of the rhogs - output_file_og_tsv = "OrthologousGroups.tsv" - trees, species_dic = orthoxml_to_newick(input_orthoxml, return_gene_to_species=True) # encode_levels_as_nhx=False, xref_tag="protId", - print("We extracted " + str(len(trees)) + " trees in NHX format from the input HOG orthoxml" + input_orthoxml) + trees, species_dic = orthoxml_to_newick(orthoxml, return_gene_to_species=True) + logger_hog.info("We extracted %d trees in NHX format from the input HOG orthoxml %s", len(trees), orthoxml) OGs = {} for hog_id, tree_string in trees.items(): @@ -171,10 +183,9 @@ def max_og_tree(tree): print("We wrote the protein families information in the file " + output_file_og_tsv) - out_folder_ogs = "OrthologousGroupsFasta/" - os.makedirs(out_folder_ogs) + os.makedirs(output_fasta_groups) - print("start writing " + str(len(OGs)) + " OGs as fasta files in folder " + out_folder_ogs) + print("start writing " + str(len(OGs)) + " OGs as fasta files in folder " + output_fasta_groups) for hog_id, og_prot_list in OGs.items(): # hog_id="HOG_0667494_sub10524" rhog_id = "_".join(hog_id.split("_")[:2]) @@ -190,30 +201,28 @@ def max_og_tree(tree): og_prots.append(rhogs_prot) og_id = "OG_" + hog_id # one OG per rootHOG # "/HOG_"+ rhogid - SeqIO.write(og_prots, out_folder_ogs + og_id + ".fa", "fasta") - print("writing done") - + SeqIO.write(og_prots, output_fasta_groups + og_id + ".fa", "fasta") + logger_hog.info("writing done") +def write_roothogs(orthoxml, out): # import sys # input_orthoxml = output_xml_name - output_file = "rootHOGs.tsv" + if out is None: + out = "rootHOGs.tsv" toplevel_groups = [] - for grp in extract_flat_groups_at_level(input_orthoxml): + for grp in extract_flat_groups_at_level(orthoxml): toplevel_groups.append(set(g.xref for g in grp)) - # toplevel_groups is a list of sets + logger_hog.info("We extracted %d protein families from the input HOG orthoxml %s", len(toplevel_groups), orthoxml) + logger_hog.info("The first one contain %d proteins.", len(toplevel_groups[0])) - print("We extracted 
"+str(len(toplevel_groups))+" protein families from the input HOG orthoxml"+input_orthoxml) - print("The first one contain "+str(len(toplevel_groups[0]))+" proteins.") - - with open(output_file, 'w') as handle: + with open(out, 'w') as handle: for toplevel_group_idx, toplevel_group in enumerate(toplevel_groups): line_text = str(toplevel_group_idx)+"\t"+str(toplevel_group)[1:-1]+"\n" handle.write(line_text) - handle.close() - print("We wrote the protein families information in the file "+output_file) + logger_hog.info("We wrote the protein families information in the file %s", out) diff --git a/FastOMA_light.nf b/FastOMA_light.nf index be18a13..002274f 100644 --- a/FastOMA_light.nf +++ b/FastOMA_light.nf @@ -117,6 +117,8 @@ process collect_subhogs{ --roothogs-folder omamer_rhogs/ \ --gene-id-pickle-file gene_id_dic_xml.pickle \ --out output_hog.orthoxml \ + --marker-groups-fasta OrthologousGroups.tsv \ + --roothog-tsv rootHOGs.tsv \ -vv """ } @@ -150,9 +152,7 @@ workflow { //rhogsbig_tree_ready = rhogsbig_tree.combine(ready_batch_roothogs) // rhogsbig_tree_ready.view{"rhogsbig_tree_ready ${it}"} //(pickle_big_rhog, msas_out, genetrees_out, ready_hog_big) = hog_big(rhogsbig_tree) - //rhogsrest_tree = rhogs_rest_list.combine(species_tree) - //rhogsrest_tree_ready = rhogsrest_tree.combine(ready_batch_roothogs_c) - (pickle_rest_rhog, msas_out_rest, genetrees_out_test, ready_hog_rest) = hog_rest(rhogs_rest_batches.flatten(), species_tree) + (pickle_rest_rhog, msas_out_rest, genetrees_out_test) = hog_rest(rhogs_rest_batches.flatten(), species_tree) pickle_rest_rhog.view() (orthoxml_file, OrthologousGroupsFasta, OrthologousGroups_tsv, rootHOGs_tsv) = collect_subhogs(pickle_rest_rhog.collect(), gene_id_dic_xml, omamer_rhogs) From 39fa2d9af267aa773161fa9fa65c29771f395e9e Mon Sep 17 00:00:00 2001 From: Adrian Altenhoff Date: Thu, 16 Nov 2023 10:57:20 +0100 Subject: [PATCH 09/22] [WIP] integrate big_hogs part into pipeline, write combined orthoxml file. --- FastOMA/_config.py | 2 +- FastOMA/batch_roothogs.py | 1 + FastOMA/infer_roothogs.py | 4 ++-- FastOMA_light.nf | 37 +++++++++++++++++-------------------- 4 files changed, 21 insertions(+), 23 deletions(-) diff --git a/FastOMA/_config.py b/FastOMA/_config.py index 46fdf62..29d1504 100644 --- a/FastOMA/_config.py +++ b/FastOMA/_config.py @@ -92,7 +92,7 @@ inferhog_min_hog_size_xml = 2 # by setting this as 1, pyham won't work on xml output. 
# batch_roothogs -big_rhog_filesize_thresh = 400 * 1000 +big_rhog_filesize_thresh = 1300 sum_list_rhogs_filesize_thresh = 2 * 1e3 diff --git a/FastOMA/batch_roothogs.py b/FastOMA/batch_roothogs.py index 9245219..49b314e 100644 --- a/FastOMA/batch_roothogs.py +++ b/FastOMA/batch_roothogs.py @@ -23,6 +23,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): def add_hog(self, hog_file: Path): self.cur_batch.append(hog_file) self.cur_size += hog_file.stat().st_size + logger.debug("adding %s with size %d to batch %d", hog_file, hog_file.stat().st_size, self.counter) if self.cur_size > self.max_size: self._flush() self.counter += 1 diff --git a/FastOMA/infer_roothogs.py b/FastOMA/infer_roothogs.py index 09771dd..42848ea 100644 --- a/FastOMA/infer_roothogs.py +++ b/FastOMA/infer_roothogs.py @@ -33,10 +33,10 @@ def infer_roothogs(): # sys.exit(0) #folder = "/scratch/smajidi1/euk_omamer200.dev8_2/test/hogmap/FastOMA-main/testdata/in_folder/" - species_names, prot_recs_lists,fasta_format_keep = _utils_roothog.parse_proteomes() # optional input folder + species_names, prot_recs_lists, fasta_format_keep = _utils_roothog.parse_proteomes() # optional input folder prot_recs_all = _utils_roothog.add_species_name_prot_id(species_names, prot_recs_lists) - hogmaps = _utils_roothog.parse_hogmap_omamer(species_names,fasta_format_keep) # optional input folder + hogmaps = _utils_roothog.parse_hogmap_omamer(species_names, fasta_format_keep) # optional input folder splice_files = os.path.exists("./splice/") if splice_files: diff --git a/FastOMA_light.nf b/FastOMA_light.nf index 002274f..254c613 100644 --- a/FastOMA_light.nf +++ b/FastOMA_light.nf @@ -16,7 +16,7 @@ params.genetrees_folder = params.output_folder + "/genetrees" process omamer_run{ time {4.h} - publishDir params.hogmap_folder + publishDir params.hogmap_folder, mode: 'copy' input: tuple path(proteome), path(omamer_db), path(precomputed_hogmap_folder) @@ -67,16 +67,20 @@ process batch_roothogs{ process hog_big{ cpus 2 time {20.h} // for very big rhog it might need more, or you could re-run and add `-resume` - input: - val rhogsbig_tree_ready + input: + each rhogsbig + path species_tree output: - path "*.pickle" - path "*.fa", optional: true // msa if write True - path "*.nwk", optional: true // gene trees if write True - val true + path "pickle_hogs" + path "msa/*.fa" , optional: true // msa if write True + path "gene_trees/*.nwk" , optional: true // gene trees if write True script: """ - infer-subhogs --input-rhog-folder ${rhogsbig_tree_ready[0]} --species-tree ${rhogsbig_tree_ready[1]} --parallel --fragment-detection --low-so-detection + infer-subhogs --input-rhog-folder ${rhogsbig} \ + --species-tree ${species_tree} \ + --fragment-detection \ + --low-so-detection \ + --parallel """ } @@ -86,9 +90,8 @@ process hog_rest{ path species_tree output: path "pickle_hogs" - path "msa/*.fa" , optional: true // msa if write True + path "msa/*.fa" , optional: true // msa if write True path "gene_trees/*.nwk" , optional: true // gene trees if write True - val true script: """ infer-subhogs --input-rhog-folder ${rhogsrest} \ @@ -143,19 +146,13 @@ workflow { (omamer_rhogs, gene_id_dic_xml, ready_infer_roothogs) = infer_roothogs(ready_omamer_run_c, hogmap_folder, proteome_folder, splice_folder) - (rhogs_rest_batches, rhogs_big_list) = batch_roothogs(omamer_rhogs) - rhogs_rest_batches.flatten().view{ "batch $it" } - - - //rhogsbig = rhogs_big_list.flatten() - //rhogsbig_tree = rhogsbig.combine(species_tree) - //rhogsbig_tree_ready = 
rhogsbig_tree.combine(ready_batch_roothogs) // rhogsbig_tree_ready.view{"rhogsbig_tree_ready ${it}"}
-    //(pickle_big_rhog, msas_out, genetrees_out, ready_hog_big) = hog_big(rhogsbig_tree)
+    (rhogs_rest_batches, rhogs_big_batches) = batch_roothogs(omamer_rhogs)
 
+    (pickle_big_rhog, msa_out_big, genetrees_out_rest) = hog_big(rhogs_big_batches.flatten(), species_tree)
     (pickle_rest_rhog, msas_out_rest, genetrees_out_test) = hog_rest(rhogs_rest_batches.flatten(), species_tree)
-    pickle_rest_rhog.view()
+    channel.empty().concat(pickle_big_rhog, pickle_rest_rhog).set{ all_rhog_pickle }
 
-    (orthoxml_file, OrthologousGroupsFasta, OrthologousGroups_tsv, rootHOGs_tsv) = collect_subhogs(pickle_rest_rhog.collect(), gene_id_dic_xml, omamer_rhogs)
+    (orthoxml_file, OrthologousGroupsFasta, OrthologousGroups_tsv, rootHOGs_tsv) = collect_subhogs(all_rhog_pickle.collect(), gene_id_dic_xml, omamer_rhogs)
     orthoxml_file.view{" output orthoxml file ${it}"}
 }

From 2ab83614f0e1d40cb1c0ca4fc17ae9ee03b54e01 Mon Sep 17 00:00:00 2001
From: Adrian Altenhoff
Date: Fri, 17 Nov 2023 16:39:10 +0100
Subject: [PATCH 10/22] [WIP] made pipeline work again after merge

---
 FastOMA/_utils_roothog.py      |  97 +++++++-------
 FastOMA/check_fastoma_input.py | 230 +++++++++++++++++----------------
 FastOMA/infer_roothogs.py      |  64 ++++-----
 FastOMA_light.nf               |  42 +++---
 pyproject.toml                 |   2 +-
 5 files changed, 214 insertions(+), 221 deletions(-)

diff --git a/FastOMA/_utils_roothog.py b/FastOMA/_utils_roothog.py
index d6006df..122e3d3 100644
--- a/FastOMA/_utils_roothog.py
+++ b/FastOMA/_utils_roothog.py
@@ -1,3 +1,5 @@
+import collections
+import csv
 from Bio import SeqIO
 import pickle

from ._utils_subhog import logger_hog
from . import _config

HOGMapData = collections.namedtuple("HOGMapData", ("hogid", "score", "seqlen", "subfamily_medianseqlen"))

def parse_proteomes(folder=None):  # list_oma_species
    """
    parsing fasta files of proteins located in /proteome/
    using Bio.SeqIO.parse
    input: fasta files. Each fasta file is for one species.
    All proteomes should be with the same extension.
    output: prot_recs_lists: a dic with key as species name and its value list of Biopython record of species.
    """
    if folder is None:
        folder = "./proteome"

    fasta_format_keep = ""
    project_files = listdir(folder)
    logger_hog.debug("using '%s' as proteome folder, found %d files", folder, len(project_files))
    species_names = []  # query/input species name based on the file name
    for file in project_files:
        species_name, ext = file.rsplit('.', 1)
        logger_hog.debug("%s: %s | %s", file, species_name, ext)
        if ext in ("fa", "fasta"):
            species_names.append(species_name)
            fasta_format_keep = ext  # last one is stored either fa or fasta
    # todo accept all fasta formats in the input proteome folder, fasta, fa, fna, ..

    prot_recs_lists = {}  # key: species name, value is a dic of query protein Biopython records.
    # 'MYCGE': [SeqRecord(seq=Seq('MDFDK
    for species_name in species_names:
        prot_address = os.path.join(folder, species_name + "."
+ fasta_format_keep) prots_record = list(SeqIO.parse(prot_address, "fasta")) - prot_recs_lists[species_name]=prots_record + prot_recs_lists[species_name] = prots_record - logger_hog.info("There are "+str(len(species_names))+" species in the proteome folder.") + logger_hog.info("There are %d species in the proteome folder.", len(species_names)) return species_names, prot_recs_lists, fasta_format_keep - def add_species_name_prot_id(species_names, prot_recs_lists): """ adding the name of species to each protein record @@ -77,7 +83,7 @@ def add_species_name_prot_id(species_names, prot_recs_lists): return prot_recs_all -def parse_hogmap_omamer(species_names, fasta_format_keep, folder="./"): +def parse_hogmap_omamer(species_names, fasta_format_keep, folder=None): """ function for parsing output of omamer (hogmap files) located in /hogmap/ Each hogmap file correspond to one fasta file of species, with the same name. @@ -87,41 +93,29 @@ def parse_hogmap_omamer(species_names, fasta_format_keep, folder="./"): # sp|O66907|ATPA_AQUAE HOG:C0886513.1b Eukaryota 696.5519485850672 187 0.37302594463771466 0.2643067275644273 120 504 551 0.8230616302186878 output is dic of dic for all species: """ + if folder is None: + folder = "./hogmap" + hogmaps = {} - unmapped = {} + unmapped = collections.defaultdict(list) for species_name in species_names: - hogmap_address = folder + "/hogmap/" + species_name + "."+fasta_format_keep+".hogmap" - hogmap_file = open(hogmap_address, 'r') - - for line in hogmap_file: - line_strip = line.strip() - if not line_strip.startswith('!') and not line_strip.startswith('qs'): - # qseqid hogid hoglevel family_p family_count family_normcount subfamily_score subfamily_count qseqlen subfamily_medianseqlen qseq_overlap - line_split = line_strip.split("\t") - prot_id = line_split[0] - hogid = line_split[1] - score = line_split[3] - seqlen = line_split[8] - subfamily_medianseqlen = line_split[9] - - if hogid == "N/A": - if species_name in unmapped: - unmapped[species_name].append(prot_id) - else: - unmapped[species_name] = [prot_id] + hogmap_address = os.path.join(folder, species_name + "." 
+ fasta_format_keep + ".hogmap") + cur_species_hogmap = collections.defaultdict(list) + with open(hogmap_address, 'rt') as hogmap_file: + reader = csv.DictReader((line for line in hogmap_file if not line.startswith('!')), + dialect="excel-tab") + for row in reader: + if row['hogid'] == "N/A": + unmapped[species_name].append(row['qseqid']) else: - if species_name in hogmaps: - if prot_id in hogmaps[species_name]: - hogmaps[species_name][prot_id].append((hogid, score, seqlen, subfamily_medianseqlen)) - - else: - hogmaps[species_name][prot_id] = [(hogid, score, seqlen, subfamily_medianseqlen)] - else: - hogmaps[species_name] = {prot_id: [(hogid, score, seqlen, subfamily_medianseqlen)]} - - logger_hog.info("There are " + str(len(hogmaps)) + " species in the hogmap folder.") - logger_hog.info("The first species " + species_names[0] + " contains " + str(len(hogmaps[species_names[0]])) + " proteins.") - + cur_species_hogmap[row['qseqid']].append( + HOGMapData(row['hogid'], row['family_p'], row['qseqlen'], row['subfamily_medianseqlen'])) + hogmaps[species_name] = cur_species_hogmap + logger_hog.info("hogmap %s: %d proteins mapped to %d hogs, %d not mapped", + species_name, len(cur_species_hogmap), sum(len(x) for x in cur_species_hogmap.values()), + len(unmapped[species_name])) + + logger_hog.info("There are %d species in the hogmap folder.", len(hogmaps)) return hogmaps, unmapped @@ -583,14 +577,13 @@ def write_clusters(address_rhogs_folder, min_rhog_size): return cluster_iter-1 - -def parse_isoform_file(species_names, folder="."): +def parse_isoform_file(species_names, folder=None): + if folder is None: + folder = "./splice" isoform_by_gene_all = {} for species_name in species_names: # from fasta file - - file_splice_name = folder + "/splice/" + species_name + ".splice" - + file_splice_name = os.path.join(folder, species_name + ".splice") if os.path.isfile(file_splice_name): # from OMArk isoform_by_gene = [] @@ -601,12 +594,10 @@ def parse_isoform_file(species_names, folder="."): isoform_by_gene.append(splice) else: isoform_by_gene = [] - isoform_by_gene_all[species_name] = isoform_by_gene return isoform_by_gene_all - def find_nonbest_isoform(species_names, isoform_by_gene_all, hogmaps): isoform_selected = {} isoform_not_selected = {} diff --git a/FastOMA/check_fastoma_input.py b/FastOMA/check_fastoma_input.py index ee3b7f4..7f00b3e 100644 --- a/FastOMA/check_fastoma_input.py +++ b/FastOMA/check_fastoma_input.py @@ -1,65 +1,57 @@ - import sys from ete3 import Tree import os import collections +import argparse from . import _config from . import _utils_roothog -from ._utils_subhog import logger_hog - - - -""" -cd in_folder -python check_fastoma_input.py -""" +logger = _config.logger_hog -def check_proteome_files(): +def check_proteome_files(proteome_folder): + proteome_files = os.listdir(proteome_folder) + logger.info("There are %d files in the proteome folder.", len(proteome_files)) - proteome_files = os.listdir("./proteome/") - logger_hog.info("There are " + str(len(proteome_files)) + " files in the proteome folder. ") - - not_fa = [i for i in proteome_files if not (i.endswith(".fa") or i.endswith(".fasta") )] + not_fa = [i for i in proteome_files if not (i.endswith(".fa") or i.endswith(".fasta"))] if not_fa: - logger_hog.warning("We expect that only fa/fasta files are in the proteome folder. 
Better to remove these " + str(not_fa) ) - - fa_fasta = [i.split(".")[-1] for i in proteome_files if i.endswith(".fa") or i.endswith(".fasta")] - if len(fa_fasta)<=2: - logger_hog.error("There are not enough proteomes in the folder ") - logger_hog.error("Check input failed. FastOMA halted!") - sys.exit(1) + logger.warning("We expect that only fa/fasta files are in the proteome folder. Better to remove these %s", not_fa) - if len(set(fa_fasta))>1: - logger_hog.warning("We expect that all fasta files are with the same format either fa or fasta, we won't include some of them " + str(proteome_files) ) + fa_fasta = [i.split(".")[-1] for i in proteome_files if i.endswith(".fa") or i.endswith(".fasta")] + if len(fa_fasta) <= 2: + logger.error("There are not enough proteomes in the folder ") + logger.error("Check input failed. FastOMA halted!") + return False + if len(set(fa_fasta)) > 1: + logger.warning("We expect that all fasta files are with the same format either fa or fasta, we won't include some of them %s.", proteome_files) + return False - return 1 + return True -def check_proteome(species_names,prot_recs_lists): - +def check_proteome(species_names, prot_recs_lists, proteome_folder): + isOk = True for species_name in species_names: num_prots = len(prot_recs_lists[species_name]) if num_prots <= 2: - logger_hog.error("The input proteome looks empty or too few proteins or Bio.SeqIO couldn't read it, in_folder/proteome/" + species_name + "."+fasta_format_keep) - - return 1 - + logger.error("The input proteome looks empty or too few proteins or Bio.SeqIO couldn't read it, %s/%s.fa", proteome_folder, species_name) + isOk = False + return isOk -def check_hogmap_files(): +def check_hogmap_files(hogmap_folder): - hogmap_files = os.listdir("./hogmap_in/") - logger_hog.info("There are " + str(len(hogmap_files)) + " files in the hogmap folder. ") - not_hogmap = [i for i in hogmap_files if not i.endswith(".hogmap") ] + hogmap_files = os.listdir(hogmap_folder) + logger.info("There are %d files in the hogmap folder.", len(hogmap_files)) + not_hogmap = [i for i in hogmap_files if not i.endswith(".hogmap")] if not_hogmap: - logger_hog.warning("We expect that only hogmap files are in the hogmap_in folder. Better to remove these " + str(not_hogmap) ) + logger.warning("We expect that only hogmap files are in the %s folder. Better to remove these %s", + hogmap_folder, not_hogmap) - species_hogmaps = ["".join(i.split(".")[:-2]) for i in hogmap_files if i.endswith(".hogmap")] - logger_hog.info("There are "+str(len(species_hogmaps))+" hogmaps.") + species_hogmaps = ["".join(i.split(".")[:-2]) for i in hogmap_files if i.endswith(".hogmap")] + logger.info("There are "+str(len(species_hogmaps))+" hogmaps.") return species_hogmaps @@ -72,63 +64,63 @@ def check_speciestree_internalnode(species_tree): if not node.is_leaf(): internal_node_name = [] if (not node.name) or len(node.name) < 1: - logger_hog.warning("one of the internal node in species tree doesn't have a name. we'll update the species tree.") + logger.warning("one of the internal node in species tree doesn't have a name. we'll update the species tree.") else: internal_node_name.append(node.name) if len(internal_node_name) != len(set(internal_node_name)): - logger_hog.warning("All the internal node names should be unique. 
One of the internal nodes is repeated:")
            logger.warning([item for item, count in collections.Counter(internal_node_name).items() if count > 1])
            logger.warning("We'll change the internal node names.")
    return 1


def check_speciestree_leaves(species_tree, species_names):
    leaves_name = [i.name for i in species_tree.get_leaves()]
    logger.info("The species tree has %d leaves.", len(leaves_name))
    if len(set(leaves_name)) != len(leaves_name):
        logger.error("Leaf names should be unique in the species tree. You may try ete3 prune")
        logger.error("Check input failed. FastOMA halted!")
        return False

    species_names_not_intree = [i for i in species_names if i not in leaves_name]
    if species_names_not_intree:
        logger.error("These species are not in the species tree:\n  %s", "\n  ".join(species_names_not_intree))
        logger.error("This is a tree template: ((AQUAE,CHLTR)inter1,MYCGE)inter2; for proteome folder with these three files: AQUAE.fa CHLTR.fa MYCGE.fa. ")
        logger.error("Check input failed. FastOMA halted!")
        return False

    leaves_tree_not_proteome = [i for i in leaves_name if i not in species_names]
    if leaves_tree_not_proteome:
        logger.warning("there are %d leaves in the species tree for which there is no proteome. So we will discard them.", len(leaves_tree_not_proteome))
    return True


def check_omamer_db(omamerdb_adress=None):
    if omamerdb_adress is None:
        logger.warning("omamer_db not passed. will assume you know what you are doing.")
        return True

    if os.path.exists(omamerdb_adress):
        if os.path.getsize(omamerdb_adress) > 10000:  # 3 bytes
            omamerdb = True
            # todo we can do some checks on version omamer v2
        else:
            logger.warning("The omamer db looks very small.
are you sure it is correct?"+omamerdb_adress) omamerdb = False else: omamerdb =False - logger_hog.info("OMAmer db does not exist.") + logger.info("OMAmer db does not exist.") return omamerdb -def add_internal_node_prune(species_tree,species_names): - +def add_internal_node_prune(species_tree, species_names, out_fname): # add name for the internal, if no name is provided, removintg special chars counter_internal = 0 node_names = set() for node in species_tree.traverse(strategy="postorder"): - if not node.is_leaf() : + if not node.is_leaf(): node_name = node.name if len(node_name) < 3 or node_name in node_names: @@ -136,27 +128,26 @@ def add_internal_node_prune(species_tree,species_names): node.name = "internal_" + str(counter_internal) counter_internal += 1 node_names.add(node.name) - logger_hog.debug("The internal node name was too small or repeated "+node_name+" which is changed to "+node.name) - elif not all(e.isalnum() or e=="_" or e=="-" for e in node_name): #any(not c.isalnum() for c in node_name): - node_name_new = ''.join(e for e in node_name if e.isalnum() or e=="_" or e=="-") # removing special chars + logger.debug("The internal node name was too small or repeated "+node_name+" which is changed to "+node.name) + elif not all(e.isalnum() or e in ("_", "-") for e in node_name): #any(not c.isalnum() for c in node_name): + node_name_new = ''.join(e for e in node_name if e.isalnum() or e in ("_", "-")) # removing special chars if node_name_new in node_names: - node.name =node_name_new+"_"+str(counter_internal) + node_name_new += "_" + str(counter_internal) counter_internal += 1 - else: - node.name=node_name_new - + logger.debug("The internal node name has special chars or repeated '%s' which is changed to '%s'", + node.name, node_name_new) + node.name = node_name_new node_names.add(node.name) - logger_hog.debug("The internal node name has special chars or repeated " + node_name + " which is changed to " + node.name) else: node_names.add(node_name) - logger_hog.info("The species tree has " + str(len(species_tree)) + " leaves") - species_tree.prune(species_names) # , preserve_branch_length=True) - logger_hog.info("After pruning, the species tree has " + str(len(species_tree)) + " leaves") + logger.info("The species tree has %d leaves", len(species_tree)) + species_tree.prune(species_names) # , preserve_branch_length=True) + logger.info("After pruning, the species tree has %d leaves", len(species_tree)) - species_tree_output= "./species_tree_checked.nwk" - species_tree.write(format=1, format_root_node=True, outfile=species_tree_output) - logger_hog.debug("The new species tree is written "+species_tree_output) + species_tree_output = "./species_tree_checked.nwk" + species_tree.write(format=1, format_root_node=True, outfile=out_fname) + logger.debug("The new species tree is written %s", out_fname) return species_tree @@ -174,12 +165,12 @@ def check_splice(isoform_by_gene_all): total_isoforms+=len(isoform) total_genes+=1 - logger_hog.debug("For "+str(spliced_species_num)+" species, out of "+str(len(isoform_by_gene_all))+" , we have splice files.") + logger.debug("For "+str(spliced_species_num)+" species, out of "+str(len(isoform_by_gene_all))+" , we have splice files.") if total_genes ==0: - logger_hog.debug("It seems that in all of the splice files, each line has only one item. Make sure that the splitter in each line is semicolon ; ") + logger.debug("It seems that in all of the splice files, each line has only one item. 
Make sure that the splitter in each line is semicolon ; ") sys.exit(1) - logger_hog.debug("In total, for "+ str(total_genes)+" genes, we have " + str(total_isoforms)+" splices.") + logger.debug("In total, for "+ str(total_genes)+" genes, we have " + str(total_isoforms)+" splices.") # make sys back return 1 @@ -190,63 +181,74 @@ def check_splice(isoform_by_gene_all): def check_fastoma_input(): - _config.set_configs() - - # print(_config.in_folder) - print(_config.logger_level) - - print(_config.species_tree_address) # nwk format + import argparse + parser = argparse.ArgumentParser(description="checking parameters for FastOMA") + parser.add_argument("--proteomes", required=True, help="Path to the folder containing the input proteomes") + parser.add_argument("--species-tree", required=True, help="Path to the input species tree file in newick format") + parser.add_argument("--out-tree", required=True, help="Path to output file for sanitised species tree. ") + parser.add_argument("--splice", help="Path to the folder containing the splice information files") + parser.add_argument("--hogmap", help="Path to the folder containing the hogmap files") + parser.add_argument("--omamer_db", help="Path to the omamer database") + parser.add_argument('-v', action="count", default=0, help="Increase verbosity to info/debug") + conf = parser.parse_args() + logger.setLevel(level=30 - 10 * min(conf.v, 2)) + logger.debug("Arguments: %s", conf) + + proteomes_ok = check_proteome_files(conf.proteomes) + if not proteomes_ok: + logger.error("Halting FastOMA because of invalid proteome input data") + sys.exit(1) - result = check_proteome_files() + species_names, prot_recs_lists, fasta_format_keep = _utils_roothog.parse_proteomes(conf.proteomes) + if not check_proteome(species_names, prot_recs_lists, conf.proteomes): + logger.error("Halting FastOMA because of invalid proteome input data") + sys.exit(1) - species_names, prot_recs_lists, fasta_format_keep = _utils_roothog.parse_proteomes() # optional input folder - check_proteome(species_names, prot_recs_lists) + #check species tree try: - species_tree = Tree(_config.species_tree_address, format=1) + species_tree = Tree(conf.species_tree, format=1) except: try: - species_tree = Tree(_config.species_tree_address) + species_tree = Tree(conf.species_tree) # todo add check fro Phyloxml except: - logger_hog.error("We have problem with parsing species tree "+str(_config.species_tree_address) + " using ete3 Tree. Maybe there are some special chars.") + logger.error("We have problem with parsing species tree %s using ete3 Tree. 
Maybe there are some special chars.", conf.species_tree) + sys.exit(1) check_speciestree_internalnode(species_tree) - check_speciestree_leaves(species_tree,species_names) + if not check_speciestree_leaves(species_tree, species_names): + logger.error("Halting FastOMA because of invalid species tree") + sys.exit(1) - add_internal_node_prune(species_tree,species_names) + add_internal_node_prune(species_tree, species_names, conf.out_tree) - hogmap_files = os.path.exists("./hogmap_in/") - species_hogmaps =[] + hogmap_files = conf.hogmap is not None and os.path.exists(conf.hogmap) + species_hogmaps = [] if hogmap_files: - species_hogmaps = check_hogmap_files() + species_hogmaps = check_hogmap_files(conf.hogmap) - #print("species_hogmaps",species_hogmaps) - #print("species_names",species_names) hogmap_complete = False if set(species_hogmaps) == set(species_names): hogmap_complete = True - # print("hogmap_complete", hogmap_complete) - omamerdb = check_omamer_db() + omamerdb = check_omamer_db(conf.omamer_db) if not(omamerdb or hogmap_complete): - logger_hog.error("OMAmer db does not exit and no hogmap provided. ") - logger_hog.error("Check input failed. FastOMA halted!") + logger.error("OMAmer db does not exist and no hogmap provided.") + logger.error("Check input failed. FastOMA halted!") sys.exit(1) else: - logger_hog.info("OMAmer db or hogmap exist. It looks ok. ") + logger.info("OMAmer db or hogmap exist. It looks ok.") - #todo check splice file format . if the name matches with the proteome files. - splice_files = os.path.exists("./splice/") + # todo check splice file format . if the name matches with the proteome files. + splice_files = conf.splice is not None and os.path.exists(conf.splice) if splice_files: - logger_hog.debug("splice folder exist. Let's see its inside.") - isoform_by_gene_all = _utils_roothog.parse_isoform_file(species_names) + logger.debug("splice folder exist. Let's see its inside.") + isoform_by_gene_all = _utils_roothog.parse_isoform_file(species_names, conf.splice) check_splice(isoform_by_gene_all) else: - logger_hog.info("Splice folder doesn't exist and that's ok.") - - - logger_hog.info("Input check finished ! ") + logger.info("Splice folder doesn't exist and that's ok.") + logger.info("Input check finished ! ") diff --git a/FastOMA/infer_roothogs.py b/FastOMA/infer_roothogs.py index c08cf2b..6d62b35 100644 --- a/FastOMA/infer_roothogs.py +++ b/FastOMA/infer_roothogs.py @@ -1,11 +1,9 @@ - - import os.path from shutil import which -from ._utils_subhog import logger_hog from . import _utils_roothog from . import _config +logger = _config.logger_hog """ proteomes of species as fasta files in /proteome/ @@ -14,55 +12,47 @@ """ -# if not os.path.exists(_config.in_folder): -# os.mkdir(_config.in_folder) # s - -# in_folder+"omamer_database/oma_path/OmaServer.h5" -# logger_hog.info("rHOG inferece has started. 
The oma database address is in " + _config.oma_database_address) -# (oma_db, list_oma_species) = _utils_rhog.parse_oma_db(_config.oma_database_address) -# (query_species_names, query_prot_recs) = _utils_rhog.parse_proteome(list_oma_species) - - - def infer_roothogs(): - _config.set_configs() - - # print(_config.in_folder) - print(_config.logger_level) - # import sys - # sys.exit(0) - - #folder = "/scratch/smajidi1/euk_omamer200.dev8_2/test/hogmap/FastOMA-main/testdata/in_folder/" - species_names, prot_recs_lists, fasta_format_keep = _utils_roothog.parse_proteomes() # optional input folder + import argparse + parser = argparse.ArgumentParser(description="checking parameters for FastOMA") + parser.add_argument("--proteomes", required=True, help="Path to the folder containing the input proteomes") + parser.add_argument("--splice", help="Path to the folder containing the splice information files") + parser.add_argument("--hogmap", help="Path to the folder containing the hogmap files") + parser.add_argument("--out-rhog-folder", required=True, help="Folder where the roothog fasta files are written") + parser.add_argument('-v', action="count", default=0, help="Increase verbosity to info/debug") + conf = parser.parse_args() + logger.setLevel(level=30 - 10 * min(conf.v, 2)) + logger.debug("Arguments: %s", conf) + + species_names, prot_recs_lists, fasta_format_keep = _utils_roothog.parse_proteomes(conf.proteomes) # optional input folder prot_recs_all = _utils_roothog.add_species_name_prot_id(species_names, prot_recs_lists) - hogmaps, unmapped = _utils_roothog.parse_hogmap_omamer(species_names, fasta_format_keep) # optional input folder + hogmaps, unmapped = _utils_roothog.parse_hogmap_omamer(species_names, fasta_format_keep) # optional input folder - splice_files = os.path.exists("./splice/") + splice_files = conf.splice is not None and os.path.exists(conf.splice) if splice_files: - isoform_by_gene_all = _utils_roothog.parse_isoform_file(species_names) - isoform_selected, isoform_not_selected = _utils_roothog.find_nonbest_isoform(species_names,isoform_by_gene_all,hogmaps) - _utils_roothog.write_isoform_selected(isoform_by_gene_all, isoform_selected,prot_recs_lists) + isoform_by_gene_all = _utils_roothog.parse_isoform_file(species_names, folder=conf.splice) + isoform_selected, isoform_not_selected = _utils_roothog.find_nonbest_isoform( + species_names, isoform_by_gene_all, hogmaps + ) + _utils_roothog.write_isoform_selected(isoform_by_gene_all, isoform_selected, prot_recs_lists) # for each isoform file, there will be a file ending with _selected_isoforms.tsv - hogmaps = _utils_roothog.handle_splice(hogmaps,isoform_not_selected) - + hogmaps = _utils_roothog.handle_splice(hogmaps, isoform_not_selected) rhogs_prots = _utils_roothog.group_prots_roothogs(hogmaps) - - rhogs_prots = _utils_roothog.handle_singleton(rhogs_prots,hogmaps) + rhogs_prots = _utils_roothog.handle_singleton(rhogs_prots, hogmaps) rhogs_prots = _utils_roothog.merge_rhogs(hogmaps, rhogs_prots) rhogs_prots = _utils_roothog.filter_big_roothogs(hogmaps, rhogs_prots) - address_rhogs_folder = "./temp_omamer_rhogs/" min_rhog_size = 2 - rhogid_written_list = _utils_roothog.write_rhog(rhogs_prots, prot_recs_all, address_rhogs_folder, min_rhog_size) - linclust_available=which("mmseqs") # True # + rhogid_written_list = _utils_roothog.write_rhog(rhogs_prots, prot_recs_all, conf.out_rhog_folder, min_rhog_size) + linclust_available=which("mmseqs") # True # # if memseqs is not installed the output will be empty / None if linclust_available: 
num_unmapped_singleton = _utils_roothog.collect_unmapped_singleton(rhogs_prots, unmapped, prot_recs_all, "singleton_unmapped.fa") if num_unmapped_singleton: result_linclust = _utils_roothog.run_linclust(fasta_to_cluster="singleton_unmapped.fa") - logger_hog.debug(" linclust is done "+ result_linclust) - num_clusters = _utils_roothog.write_clusters(address_rhogs_folder, min_rhog_size) - logger_hog.debug("we wrote "+str(num_clusters)+" new clusters with linclust ") + logger.debug(" linclust is done %s", result_linclust) + num_clusters = _utils_roothog.write_clusters(conf.out_rhog_folder, min_rhog_size) + logger.debug("we wrote %d new clusters with linclust ", num_clusters) diff --git a/FastOMA_light.nf b/FastOMA_light.nf index df34866..1be668f 100644 --- a/FastOMA_light.nf +++ b/FastOMA_light.nf @@ -4,22 +4,22 @@ params.input_folder = "./in_folder/" params.output_folder = "./out_folder/" params.proteome_folder = params.input_folder + "/proteome" -params.proteomes = params.proteome_folder + "/*" params.hogmap_in = params.input_folder + "/hogmap_in" - -params.hogmap_folder = params.output_folder + "/hogmap" params.splice_folder = params.input_folder + "/splice" params.species_tree = params.input_folder + "/species_tree.nwk" -params.species_tree_checked = params.output_folder + "/species_tree_checked.nwk" -params.temp_pickles = params.output_folder + "/temp_pickles" + + + +// output subfolder definition params.genetrees_folder = params.output_folder + "/genetrees" +params.hogmap_folder = params.output_folder + "/hogmap" params.temp_output = params.output_folder +"/temp_output" //"/temp_omamer_rhogs" -process check_input{ +process check_input{ publishDir params.output_folder, mode: 'copy' input: @@ -30,10 +30,15 @@ process check_input{ path splice_folder output: path "species_tree_checked.nwk" - val true script: """ - check-fastoma-input + check-fastoma-input --proteomes ${proteome_folder} \ + --species-tree ${species_tree} \ + --out-tree species_tree_checked.nwk \ + --splice ${splice_folder} \ + --hogmap ${hogmap_folder} \ + --omamer_db ${omamerdb} \ + -vv """ } @@ -77,7 +82,11 @@ process infer_roothogs{ path "selected_isoforms" , optional: true script: """ - infer-roothogs --logger-level DEBUG + infer-roothogs --proteomes ${proteome_folder} \ + --hogmap ${hogmap_folder} \ + --splice ${splice_folder} \ + --out-rhog-folder "omamer_rhogs/" \ + -vv """ } @@ -157,20 +166,21 @@ process collect_subhogs{ } workflow { - proteomes = Channel.fromPath(params.proteomes, type:'any', checkIfExists:true) - proteome_folder = Channel.fromPath(params.proteome_folder) - hogmap_folder = Channel.fromPath(params.hogmap_folder) - splice_folder = Channel.fromPath(params.splice_folder) - species_tree = Channel.fromPath(params.species_tree) + proteome_folder = Channel.fromPath(params.proteome_folder, type: "dir", checkIfExists:true).first() + proteomes = Channel.fromPath(params.proteome_folder + "/*", type:'any', checkIfExists:true) + species_tree = Channel.fromPath(params.species_tree, type: "file", checkIfExists:true).first() + + hogmap_folder = Channel.fromPath(params.hogmap_folder, type: "dir") + splice_folder = Channel.fromPath(params.splice_folder, type: "dir") - genetrees_folder = Channel.fromPath(params.genetrees_folder) + genetrees_folder = Channel.fromPath(params.genetrees_folder, type: 'dir') hogmap_in = Channel.fromPath(params.hogmap_in, type:'dir') omamerdb = Channel.fromPath(params.omamer_db) proteomes_omamerdb = proteomes.combine(omamerdb) proteomes_omamerdb_inputhog = 
proteomes_omamerdb.combine(hogmap_in) - (species_tree_checked_, ready_input_check) = check_input(proteome_folder,hogmap_in,species_tree,omamerdb,splice_folder) + (species_tree_checked_, ready_input_check) = check_input(proteome_folder, hogmap_in, species_tree, omamerdb, splice_folder) (hogmap, ready_omamer_run) = omamer_run(proteomes_omamerdb_inputhog) ready_omamer_run_c = ready_omamer_run.collect() diff --git a/pyproject.toml b/pyproject.toml index fb8a4fa..66957d5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,7 @@ nextflow = [ [project.scripts] batch-roothogs = "FastOMA.batch_roothogs:batch_roothogs" -check-input-fastoma = "FastOMA.check_input_fastoma:check_input_fastoma" +check-fastoma-input = "FastOMA.check_fastoma_input:check_fastoma_input" collect-subhogs = "FastOMA.collect_subhogs:collect_subhogs" infer-roothogs = "FastOMA.infer_roothogs:infer_roothogs" infer-subhogs = "FastOMA.infer_subhogs:infer_subhogs" From 2abc946aa3a6e95c25fb51993bdf6e59ec6ae5d0 Mon Sep 17 00:00:00 2001 From: Adrian Altenhoff Date: Mon, 20 Nov 2023 07:19:21 +0100 Subject: [PATCH 11/22] fixing pipeline, all dependencies via dataflow --- FastOMA/_utils_roothog.py | 5 +---- FastOMA/infer_roothogs.py | 2 +- FastOMA_light.nf | 25 +++++++++---------------- nextflow.config | 1 + 4 files changed, 12 insertions(+), 21 deletions(-) diff --git a/FastOMA/_utils_roothog.py b/FastOMA/_utils_roothog.py index 122e3d3..aca5c46 100644 --- a/FastOMA/_utils_roothog.py +++ b/FastOMA/_utils_roothog.py @@ -647,13 +647,10 @@ def write_isoform_selected(isoform_by_gene_all, isoform_selected, prot_recs_list write isoforms """ - selected_isoforms_folder= "selected_isoforms/" + selected_isoforms_folder = "selected_isoforms/" if not os.path.exists(selected_isoforms_folder): os.mkdir(selected_isoforms_folder) - - - for species, isoform_selected_sp in isoform_selected.items(): isoform_by_gene = isoform_by_gene_all[species] assert len(isoform_by_gene) == len(isoform_selected_sp) diff --git a/FastOMA/infer_roothogs.py b/FastOMA/infer_roothogs.py index 6d62b35..3c14a4f 100644 --- a/FastOMA/infer_roothogs.py +++ b/FastOMA/infer_roothogs.py @@ -27,7 +27,7 @@ def infer_roothogs(): species_names, prot_recs_lists, fasta_format_keep = _utils_roothog.parse_proteomes(conf.proteomes) # optional input folder prot_recs_all = _utils_roothog.add_species_name_prot_id(species_names, prot_recs_lists) - hogmaps, unmapped = _utils_roothog.parse_hogmap_omamer(species_names, fasta_format_keep) # optional input folder + hogmaps, unmapped = _utils_roothog.parse_hogmap_omamer(species_names, fasta_format_keep, folder=conf.hogmap) # optional input folder splice_files = conf.splice is not None and os.path.exists(conf.splice) if splice_files: diff --git a/FastOMA_light.nf b/FastOMA_light.nf index 1be668f..8bb68c0 100644 --- a/FastOMA_light.nf +++ b/FastOMA_light.nf @@ -10,7 +10,6 @@ params.species_tree = params.input_folder + "/species_tree.nwk" - // output subfolder definition params.genetrees_folder = params.output_folder + "/genetrees" params.hogmap_folder = params.output_folder + "/hogmap" @@ -20,7 +19,6 @@ params.temp_output = params.output_folder +"/temp_output" //"/temp_omamer_rhogs" process check_input{ - publishDir params.output_folder, mode: 'copy' input: path proteome_folder @@ -30,6 +28,7 @@ process check_input{ path splice_folder output: path "species_tree_checked.nwk" + val "check_completed" script: """ check-fastoma-input --proteomes ${proteome_folder} \ @@ -48,11 +47,10 @@ process omamer_run{ publishDir params.hogmap_folder, mode: 'copy' 
input: - tuple path(proteome), path(omamer_db), path(precomputed_hogmap_folder) + tuple path(proteome), path(omamer_db), path(precomputed_hogmap_folder), val(ready) output: path "*.hogmap" - val true script: """ @@ -68,12 +66,12 @@ process omamer_run{ process infer_roothogs{ publishDir = [ path: params.temp_output, - mode: 'copy', // pattern: "temp_output", saveAs: { filename -> filename.equals('temp_omamer_rhogs') ? null : filename } + enabled: params.debug_enabled, + //mode: 'copy', saveAs: { filename -> filename.equals('temp_omamer_rhogs') ? null : filename } ] input: - val ready_omamer_run - path hogmap_folder + path hogmaps, stageAs: "hogmaps/*" path proteome_folder path splice_folder output: @@ -83,7 +81,7 @@ process infer_roothogs{ script: """ infer-roothogs --proteomes ${proteome_folder} \ - --hogmap ${hogmap_folder} \ + --hogmap hogmaps \ --splice ${splice_folder} \ --out-rhog-folder "omamer_rhogs/" \ -vv @@ -170,21 +168,16 @@ workflow { proteomes = Channel.fromPath(params.proteome_folder + "/*", type:'any', checkIfExists:true) species_tree = Channel.fromPath(params.species_tree, type: "file", checkIfExists:true).first() - hogmap_folder = Channel.fromPath(params.hogmap_folder, type: "dir") splice_folder = Channel.fromPath(params.splice_folder, type: "dir") - genetrees_folder = Channel.fromPath(params.genetrees_folder, type: 'dir') hogmap_in = Channel.fromPath(params.hogmap_in, type:'dir') omamerdb = Channel.fromPath(params.omamer_db) - proteomes_omamerdb = proteomes.combine(omamerdb) - proteomes_omamerdb_inputhog = proteomes_omamerdb.combine(hogmap_in) - (species_tree_checked_, ready_input_check) = check_input(proteome_folder, hogmap_in, species_tree, omamerdb, splice_folder) - (hogmap, ready_omamer_run) = omamer_run(proteomes_omamerdb_inputhog) - ready_omamer_run_c = ready_omamer_run.collect() + omamer_input_channel = proteomes.combine(omamerdb).combine(hogmap_in).combine(ready_input_check) + hogmap = omamer_run(omamer_input_channel) - (omamer_rhogs, gene_id_dic_xml, ready_infer_roothogs) = infer_roothogs(ready_omamer_run_c, hogmap_folder, proteome_folder, splice_folder) + (omamer_rhogs, gene_id_dic_xml, ready_infer_roothogs) = infer_roothogs(hogmap.collect(), proteome_folder, splice_folder) (rhogs_rest_batches, rhogs_big_batches) = batch_roothogs(omamer_rhogs) diff --git a/nextflow.config b/nextflow.config index 866c569..c810e92 100644 --- a/nextflow.config +++ b/nextflow.config @@ -8,6 +8,7 @@ params { container_name = "dessimozlab/fastoma" container_version = "latest" omamer_db = "https://omabrowser.org/All/Primates-v2.0.0.h5" + debug_enabled = false } // Profiles configure nextflow depending on the environment (local, integration, live, etc.) 
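
A note on the dataflow rewiring in the patch above: check_input now emits the literal
value "check_completed" as an output channel, and that value is combined into every
omamer_run input tuple, so no OMAmer placement task can start before the input check has
succeeded. Below is a minimal, self-contained sketch of this gating pattern; the process
names, the input glob, and the echo command are illustrative placeholders, not part of
FastOMA:

    // gate.nf -- sketch of gating a fan-out process on a check step (hypothetical names)
    nextflow.enable.dsl=2

    process CHECK {
        output:
        val "check_completed"
        script:
        "true"   // stands in for any validation command; a failure stops the run here
    }

    process CONSUME {
        input:
        tuple path(f), val(ready)
        script:
        "echo processing ${f}"
    }

    workflow {
        files = Channel.fromPath("proteome/*.fa")   // placeholder input glob
        gate  = CHECK()
        // combine() appends the single gate value to every file tuple, so each
        // CONSUME task implicitly depends on CHECK having finished successfully
        CONSUME(files.combine(gate))
    }
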
From b7d5fa1cdf77f4756e6cabf6e0a8de6ed9a00a42 Mon Sep 17 00:00:00 2001 From: Adrian Altenhoff Date: Wed, 22 Nov 2023 10:26:03 +0100 Subject: [PATCH 12/22] [WIP] adding conda profile - not tested yet --- FastOMA/requirements | 166 ------------------------------------------ environment-conda.yml | 12 +++ nextflow.config | 11 ++- 3 files changed, 22 insertions(+), 167 deletions(-) delete mode 100644 FastOMA/requirements create mode 100644 environment-conda.yml diff --git a/FastOMA/requirements b/FastOMA/requirements deleted file mode 100644 index fcba280..0000000 --- a/FastOMA/requirements +++ /dev/null @@ -1,166 +0,0 @@ -$ conda list -# packages in environment at /miniconda/envs/fastoma: -# -# Name Version Build Channel -_libgcc_mutex 0.1 conda_forge conda-forge -_openmp_mutex 4.5 2_gnu conda-forge -anyio 3.7.1 pyhd8ed1ab_0 conda-forge -argon2-cffi 23.1.0 pyhd8ed1ab_0 conda-forge -argon2-cffi-bindings 21.2.0 py39hb9d737c_3 conda-forge -arrow 1.2.3 pyhd8ed1ab_0 conda-forge -asttokens 2.2.1 pyhd8ed1ab_0 conda-forge -async-lru 2.0.4 pyhd8ed1ab_0 conda-forge -attrs 23.1.0 pyh71513ae_1 conda-forge -babel 2.12.1 pyhd8ed1ab_1 conda-forge -backcall 0.2.0 pyh9f0ad1d_0 conda-forge -backports 1.0 pyhd8ed1ab_3 conda-forge -backports.functools_lru_cache 1.6.5 pyhd8ed1ab_0 conda-forge -beautifulsoup4 4.12.2 pyha770c72_0 conda-forge -biopython 1.81 pypi_0 pypi -bleach 6.0.0 pyhd8ed1ab_0 conda-forge -blosc2 2.0.0 pypi_0 pypi -brotli-python 1.0.9 py39h5a03fae_9 conda-forge -bzip2 1.0.8 h7f98852_4 conda-forge -ca-certificates 2023.7.22 hbcca054_0 conda-forge -cached-property 1.5.2 hd8ed1ab_1 conda-forge -cached_property 1.5.2 pyha770c72_1 conda-forge -certifi 2023.7.22 pyhd8ed1ab_0 conda-forge -cffi 1.15.1 py39he91dace_3 conda-forge -charset-normalizer 3.2.0 pyhd8ed1ab_0 conda-forge -comm 0.1.4 pyhd8ed1ab_0 conda-forge -cython 3.0.1 pypi_0 pypi -debugpy 1.6.8 py39h3d6467e_0 conda-forge -decorator 5.1.1 pyhd8ed1ab_0 conda-forge -defusedxml 0.7.1 pyhd8ed1ab_0 conda-forge -dendropy 4.6.1 pypi_0 pypi -entrypoints 0.4 pyhd8ed1ab_0 conda-forge -ete3 3.1.3 pypi_0 pypi -exceptiongroup 1.1.3 pyhd8ed1ab_0 conda-forge -executing 1.2.0 pyhd8ed1ab_0 conda-forge -fastoma 0.0.6 dev_0 -fasttree 2.1.11 h031d066_2 bioconda -fqdn 1.5.1 pyhd8ed1ab_0 conda-forge -future 0.18.3 pypi_0 pypi -humanfriendly 10.0 pypi_0 pypi -idna 3.4 pyhd8ed1ab_0 conda-forge -importlib-metadata 6.8.0 pyha770c72_0 conda-forge -importlib_metadata 6.8.0 hd8ed1ab_0 conda-forge -importlib_resources 6.0.1 pyhd8ed1ab_0 conda-forge -ipykernel 6.25.1 pyh71e2992_0 conda-forge -ipython 8.14.0 pyh41d4057_0 conda-forge -iqtree 2.2.3 h21ec9f0_0 bioconda -isoduration 20.11.0 pyhd8ed1ab_0 conda-forge -jedi 0.19.0 pyhd8ed1ab_0 conda-forge -jinja2 3.1.2 pyhd8ed1ab_1 conda-forge -json5 0.9.14 pyhd8ed1ab_0 conda-forge -jsonpointer 2.0 py_0 conda-forge -jsonschema 4.19.0 pyhd8ed1ab_1 conda-forge -jsonschema-specifications 2023.7.1 pyhd8ed1ab_0 conda-forge -jsonschema-with-format-nongpl 4.19.0 pyhd8ed1ab_1 conda-forge -jupyter-lsp 2.2.0 pyhd8ed1ab_0 conda-forge -jupyter_client 8.3.0 pyhd8ed1ab_0 conda-forge -jupyter_core 5.3.1 py39hf3d152e_0 conda-forge -jupyter_events 0.7.0 pyhd8ed1ab_2 conda-forge -jupyter_server 2.7.1 pyhd8ed1ab_0 conda-forge -jupyter_server_terminals 0.4.4 pyhd8ed1ab_1 conda-forge -jupyterlab 4.0.5 pyhd8ed1ab_0 conda-forge -jupyterlab_pygments 0.2.2 pyhd8ed1ab_0 conda-forge -jupyterlab_server 2.24.0 pyhd8ed1ab_0 conda-forge -ld_impl_linux-64 2.40 h41732ed_0 conda-forge -libffi 3.4.2 h7f98852_5 conda-forge -libgcc-ng 13.1.0 he5830b7_0 conda-forge 
-libgomp 13.1.0 he5830b7_0 conda-forge -libnsl 2.0.0 h7f98852_0 conda-forge -libsodium 1.0.18 h36c2ea0_1 conda-forge -libsqlite 3.43.0 h2797004_0 conda-forge -libstdcxx-ng 13.1.0 hfd8a6a1_0 conda-forge -libuuid 2.38.1 h0b41bf4_0 conda-forge -libzlib 1.2.13 hd590300_5 conda-forge -llvmlite 0.40.1 pypi_0 pypi -lxml 4.9.3 pypi_0 pypi -mafft 7.520 h031d066_2 bioconda -markupsafe 2.1.3 py39hd1e30aa_0 conda-forge -matplotlib-inline 0.1.6 pyhd8ed1ab_0 conda-forge -mistune 3.0.1 pyhd8ed1ab_0 conda-forge -msgpack 1.0.5 pypi_0 pypi -nbclient 0.8.0 pyhd8ed1ab_0 conda-forge -nbconvert-core 7.7.4 pyhd8ed1ab_0 conda-forge -nbformat 5.9.2 pyhd8ed1ab_0 conda-forge -ncurses 6.4 hcb278e6_0 conda-forge -nest-asyncio 1.5.6 pyhd8ed1ab_0 conda-forge -nextflow 23.4.3 pypi_0 pypi -notebook-shim 0.2.3 pyhd8ed1ab_0 conda-forge -numba 0.57.1 pypi_0 pypi -numexpr 2.8.5 pypi_0 pypi -numpy 1.24.4 pypi_0 pypi -omamer 0.2.6 pypi_0 pypi -openssl 3.1.2 hd590300_0 conda-forge -overrides 7.4.0 pyhd8ed1ab_0 conda-forge -packaging 23.1 pyhd8ed1ab_0 conda-forge -pandas 2.0.3 pypi_0 pypi -pandocfilters 1.5.0 pyhd8ed1ab_0 conda-forge -parso 0.8.3 pyhd8ed1ab_0 conda-forge -pexpect 4.8.0 pyh1a96a4e_2 conda-forge -pickleshare 0.7.5 py_1003 conda-forge -pip 23.2.1 pyhd8ed1ab_0 conda-forge -pkgutil-resolve-name 1.3.10 pyhd8ed1ab_0 conda-forge -platformdirs 3.10.0 pyhd8ed1ab_0 conda-forge -prometheus_client 0.17.1 pyhd8ed1ab_0 conda-forge -prompt-toolkit 3.0.39 pyha770c72_0 conda-forge -prompt_toolkit 3.0.39 hd8ed1ab_0 conda-forge -property-manager 3.0 pypi_0 pypi -psutil 5.9.5 py39h72bdee0_0 conda-forge -ptyprocess 0.7.0 pyhd3deb0d_0 conda-forge -pure_eval 0.2.2 pyhd8ed1ab_0 conda-forge -py-cpuinfo 9.0.0 pypi_0 pypi -pycparser 2.21 pyhd8ed1ab_0 conda-forge -pygments 2.16.1 pyhd8ed1ab_0 conda-forge -pyparsing 3.1.1 pypi_0 pypi -pysais 1.1.0 pypi_0 pypi -pysocks 1.7.1 pyha2e5f31_6 conda-forge -python 3.9.17 h0755675_0_cpython conda-forge -python-dateutil 2.8.2 pyhd8ed1ab_0 conda-forge -python-fastjsonschema 2.18.0 pyhd8ed1ab_0 conda-forge -python-json-logger 2.0.7 pyhd8ed1ab_0 conda-forge -python_abi 3.9 3_cp39 conda-forge -pytz 2023.3 pyhd8ed1ab_0 conda-forge -pyyaml 6.0.1 py39hd1e30aa_0 conda-forge -pyzmq 25.1.1 py39hb257651_0 conda-forge -readline 8.2 h8228510_1 conda-forge -referencing 0.30.2 pyhd8ed1ab_0 conda-forge -requests 2.31.0 pyhd8ed1ab_0 conda-forge -rfc3339-validator 0.1.4 pyhd8ed1ab_0 conda-forge -rfc3986-validator 0.1.1 pyh9f0ad1d_0 conda-forge -rpds-py 0.9.2 py39h9fdd4d6_0 conda-forge -scipy 1.11.2 pypi_0 pypi -send2trash 1.8.2 pyh41d4057_0 conda-forge -setuptools 68.1.2 pyhd8ed1ab_0 conda-forge -six 1.16.0 pyh6c4a22f_0 conda-forge -sniffio 1.3.0 pyhd8ed1ab_0 conda-forge -soupsieve 2.3.2.post1 pyhd8ed1ab_0 conda-forge -stack_data 0.6.2 pyhd8ed1ab_0 conda-forge -tables 3.8.0 pypi_0 pypi -terminado 0.17.1 pyh41d4057_0 conda-forge -tinycss2 1.2.1 pyhd8ed1ab_0 conda-forge -tk 8.6.12 h27826a3_0 conda-forge -tomli 2.0.1 pyhd8ed1ab_0 conda-forge -tornado 6.3.3 py39hd1e30aa_0 conda-forge -tqdm 4.66.1 pypi_0 pypi -traitlets 5.9.0 pyhd8ed1ab_0 conda-forge -typing-extensions 4.7.1 hd8ed1ab_0 conda-forge -typing_extensions 4.7.1 pyha770c72_0 conda-forge -typing_utils 0.1.0 pyhd8ed1ab_0 conda-forge -tzdata 2023.3 pypi_0 pypi -uri-template 1.3.0 pyhd8ed1ab_0 conda-forge -urllib3 2.0.4 pyhd8ed1ab_0 conda-forge -verboselogs 1.7 pypi_0 pypi -wcwidth 0.2.6 pyhd8ed1ab_0 conda-forge -webcolors 1.13 pyhd8ed1ab_0 conda-forge -webencodings 0.5.1 py_1 conda-forge -websocket-client 1.6.2 pyhd8ed1ab_0 conda-forge -wheel 0.41.2 pyhd8ed1ab_0 
conda-forge
-xz 5.2.6 h166bdaf_0 conda-forge
-yaml 0.2.5 h7f98852_2 conda-forge
-zeromq 4.3.4 h9c3ff4c_1 conda-forge
-zipp 3.16.2 pyhd8ed1ab_0 conda-forge
-linclust
\ No newline at end of file
diff --git a/environment-conda.yml b/environment-conda.yml
new file mode 100644
index 0000000..906210a
--- /dev/null
+++ b/environment-conda.yml
@@ -0,0 +1,12 @@
+name: fastoma-env
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - omamer
+  - mafft
+  - fasttree
+  - pip
+  - pip:
+    - .
diff --git a/nextflow.config b/nextflow.config
index c810e92..d031874 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -11,14 +11,16 @@ params {
     debug_enabled = false
 }
 
-// Profiles configure nextflow depending on the environment (local, integration, live, etc.)
+// Profiles configure nextflow depending on the environment (local, docker, singularity)
 profiles {
+
     docker {
         process {
             container = "$params.container_name:$params.container_version"
         }
         docker.enabled = true
     }
+
     singularity {
         process {
             container = "$params.container_name:$params.container_version"
@@ -26,8 +28,15 @@ profiles {
         singularity.enabled = true
         singularity.autoMounts = true
     }
+
     standard {
         process.executor = 'local'
     }
+
+    conda {
+        process.conda = "envrionment-conda.yml"
+        conda.enabled = true
+    }
+
 }

From 6c5010f5dc89f4584925650ea6e9cf0a1ae69689 Mon Sep 17 00:00:00 2001
From: Adrian Altenhoff <adrian.altenhoff@inf.ethz.ch>
Date: Wed, 22 Nov 2023 13:28:29 +0100
Subject: [PATCH 13/22] [FIX] conda environment, add help text

---
 FastOMA_light.nf | 91 +++++++++++++++++++++++++++++++++++++++++++++++-
 nextflow.config  | 27 +++++++++++++-
 2 files changed, 116 insertions(+), 2 deletions(-)

diff --git a/FastOMA_light.nf b/FastOMA_light.nf
index 8bb68c0..c6a05e7 100644
--- a/FastOMA_light.nf
+++ b/FastOMA_light.nf
@@ -1,7 +1,7 @@
 // NXF_WRAPPER_STAGE_FILE_THRESHOLD='50000'
 
 
-params.input_folder = "./in_folder/"
+params.input_folder = "test_data/in_folder"
 params.output_folder = "./out_folder/"
 params.proteome_folder = params.input_folder + "/proteome"
 params.hogmap_in = params.input_folder + "/hogmap_in"
@@ -18,6 +18,95 @@ params.temp_output = params.output_folder +"/temp_output" //"/temp_omamer_rhogs"
 
 
+if (params.help) {
+    log.info """
+    ===========================================
+      FastOMA -- PIPELINE
+    ===========================================
+    Usage:
+    Run the pipeline with default parameters:
+    nextflow run FastOMA.nf
+
+    Run with user parameters:
+    nextflow run FastOMA.nf --input_folder {input.dir} --output_folder {results.dir}
+
+    Mandatory arguments:
+      --input_folder    Input data folder. Defaults to ${params.input_folder}. This folder
+                        must contain the proteomes (in a subfolder named 'proteome') and
+                        a species tree file. Optionally the folder might contain
+                        - a sub-folder 'splice' containing splicing variant mappings
+                        - a sub-folder 'hogmap_in' containing precomputed OMAmer
+                          placement results for all proteomes
+
+                        All sub-folders and sub-files can also be placed in other
+                        locations if you provide alternative values for them (see below on
+                        optional arguments section).
+
+      --output_folder   Path where all the output should be stored. Defaults to
+                        ${params.output_folder}
+
+
+    Profile selection:
+      -profile          FastOMA can be run using several execution profiles. The default
+                        set of available profiles is
+                        - docker        Run pipeline using docker containers. Docker needs
+                                        to be installed on your system. Containers will be
+                                        fetched automatically from dockerhub. See also
+                                        additional options '--container_version' and
+                                        '--container_name'.
+
+                        - singularity   Run pipeline using singularity. Singularity needs
+                                        to be installed on your system. On HPC clusters,
+                                        it often needs to be loaded as a separate module.
+                                        Containers will be fetched automatically from
+                                        dockerhub. See also additional options
+                                        '--container_version' and '--container_name'.
+
+                        - conda         Run pipeline in a conda environment. Conda needs
+                                        to be installed on your system. The environment
+                                        will be created automatically.
+
+                        - standard      Run pipeline on your local system. Mainly intended
+                                        for development purpose. All dependencies must be
+                                        installed in the calling environment.
+
+                        - slurm-singularity
+                                        Run pipeline using SLURM job scheduler and
+                                        singularity containers. This profile can also be a
+                                        template for other HPC clusters that use different
+                                        schedulers.
+
+                        - slurm-conda   Run pipeline using SLURM job scheduler and conda
+                                        environment.
+
+                        Profiles are defined in nextflow.config and can be extended or
+                        adjusted according to your needs.
+
+
+    Additional options:
+      --proteome_folder   Overwrite location of proteomes (default ${params.proteome_folder})
+      --species_tree      Overwrite location of species tree file (newick format).
+                          Defaults to ${params.species_tree}
+      --splice_folder     Overwrite location of splice file folder. The splice files must be
+                          named <proteome_file>.splice.
+                          Defaults to ${params.splice_folder}
+      --omamer_db         Path or URL to download the OMAmer database from.
+                          Defaults to ${params.omamer_db}
+      --hogmap_in         Optional path where precomputed omamer mapping files are located.
+                          Defaults to ${params.hogmap_in}
+
+    Flags:
+      --help              Display this message
+      --debug_enabled     Store additional information that might be helpful to debug in case
+                          of a problem with FastOMA.
+
+    """.stripIndent()
+
+    exit 1
+}
+
+
+
 process check_input{
     publishDir params.output_folder, mode: 'copy'
     input:
diff --git a/nextflow.config b/nextflow.config
index d031874..e1d6889 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -9,6 +9,7 @@ params {
     container_version = "latest"
     omamer_db = "https://omabrowser.org/All/Primates-v2.0.0.h5"
     debug_enabled = false
+    help = false
 }
 
 // Profiles configure nextflow depending on the environment (local, docker, singularity)
@@ -34,9 +35,33 @@ profiles {
     }
 
     conda {
-        process.conda = "envrionment-conda.yml"
+        process.conda = "environment-conda.yml"
         conda.enabled = true
     }
+
+    slurm-singularity {
+        process {
+            container = "$params.container_name:$params.container_version"
+            executor = "slurm"
+            time = 4.h
+            memory = 20.GB
+        }
+        singularity.enabled = true
+        singularity.autoMounts = true
+    }
+
+    slurm-conda {
+        process {
+            conda = "environment-conda.yml"
+            executor = "slurm"
+            time = 4.h
+            memory = 20.GB
+        }
+        conda.enabled = true
+    }
+
+
+
 }

From 0b1a92c302886c0053e65f170c1d399aab950c84 Mon Sep 17 00:00:00 2001
From: Adrian Altenhoff <adrian.altenhoff@inf.ethz.ch>
Date: Wed, 22 Nov 2023 14:03:17 +0100
Subject: [PATCH 14/22] [FIX] rename profiles and fix default input param

---
 FastOMA_light.nf | 8 ++++----
 nextflow.config  | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/FastOMA_light.nf b/FastOMA_light.nf
index c6a05e7..a3b8606 100644
--- a/FastOMA_light.nf
+++ b/FastOMA_light.nf
@@ -1,8 +1,8 @@
 // NXF_WRAPPER_STAGE_FILE_THRESHOLD='50000'
 
 
-params.input_folder = "test_data/in_folder"
-params.output_folder = "./out_folder/"
+params.input_folder = "testdata/in_folder"
+params.output_folder = "out_folder/"
 params.proteome_folder = params.input_folder + "/proteome"
 params.hogmap_in = params.input_folder + "/hogmap_in"
 params.splice_folder = params.input_folder + "/splice"
@@ 
-70,13 +70,13 @@ if (params.help) { for development purpose. All dependencies must be installed in the calling environment. - - slurm-singularity + - slurm_singularity Run pipeline using SLURM job scheduler and singularity containers. This profile can also be a template for other HPC clusters that use different schedulers. - - slurm-conda Run pipeline using SLURM job scheduler and conda + - slurm_conda Run pipeline using SLURM job scheduler and conda environment. Profiles are defined in nextflow.config and can be extended or diff --git a/nextflow.config b/nextflow.config index e1d6889..0b4bfcd 100644 --- a/nextflow.config +++ b/nextflow.config @@ -39,7 +39,7 @@ profiles { conda.enabled = true } - slurm-singularity { + slurm_singularity { process { container = "$params.container_name:$params.container_version" executor = "slurm" @@ -50,7 +50,7 @@ profiles { singularity.autoMounts = true } - slurm-conda { + slurm_conda { process { conda = "environment-conda.yml" executor = "slurm" From 83bcde59178952854091a82663f917c4511275f8 Mon Sep 17 00:00:00 2001 From: Adrian Altenhoff Date: Mon, 27 Nov 2023 23:29:01 +0100 Subject: [PATCH 15/22] make sure __pycache__ stays out of repo --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 35f96f5..a0c6a24 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ dist/ archive .git .gitignore +__pycache__ From a9d875813f7527919391ae8ab34011acbc3196b5 Mon Sep 17 00:00:00 2001 From: Adrian Altenhoff Date: Tue, 28 Nov 2023 09:10:58 +0100 Subject: [PATCH 16/22] adding UnionFind module from zoo --- FastOMA/batch_roothogs.py | 2 +- FastOMA/zoo/unionfind.py | 124 ++++++++++++++++++++++++++++++++++++++ pyproject.toml | 10 +-- 3 files changed, 130 insertions(+), 6 deletions(-) create mode 100644 FastOMA/zoo/unionfind.py diff --git a/FastOMA/batch_roothogs.py b/FastOMA/batch_roothogs.py index 49b314e..4937071 100644 --- a/FastOMA/batch_roothogs.py +++ b/FastOMA/batch_roothogs.py @@ -51,7 +51,7 @@ def folder_1h_rhog(roothog_path: Path, output_folder_big: Path, output_folder_re rest_hogs.add_hog(hog) -def batch_roothogs(): +def fastoma_batch_roothogs(): import argparse parser = argparse.ArgumentParser(description="Analyse roothog families and create batches for analysis") parser.add_argument('--input-roothogs', required=True, help="folder where input roothogs are stored") diff --git a/FastOMA/zoo/unionfind.py b/FastOMA/zoo/unionfind.py new file mode 100644 index 0000000..83036e1 --- /dev/null +++ b/FastOMA/zoo/unionfind.py @@ -0,0 +1,124 @@ +import collections + +"""UnionFind.py + +Union-find data structure. Based on Josiah Carlson's code, +http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/215912 +with significant additional changes by D. Eppstein and +Adrian Altenhoff. +""" + + +class UnionFind(object): + """Union-find data structure. + + Each unionFind instance X maintains a family of disjoint sets of + hashable objects, supporting the following two methods: + + - X[item] returns a name for the set containing the given item. + Each set is named by an arbitrarily-chosen one of its members; as + long as the set remains unchanged it will keep the same name. If + the item is not yet part of a set in X, a new singleton set is + created for it. + + - X.union(item1, item2, ...) merges the sets containing each item + into a single larger set. If any item is not yet part of a set + in X, it is added to X as one of the members of the merged set. + """ + + def __init__(self, elements=None): + """Create a new union-find structure. 
+
+        If elements is not None, the structure gets initialized
+        with each element as a singleton component.
+
+        :param elements: an iterable to initialize the structure.
+        """
+
+        self.weights = {}
+        self.parents = {}
+        if elements is not None:
+            for elem in iter(elements):
+                self.parents[elem] = elem
+                self.weights[elem] = 1
+
+    def __getitem__(self, obj):
+        """Return the name of the set which contains obj.
+
+        :param obj: the query object
+
+        :SeeAlso: :meth:`find`"""
+        return self.find(obj)
+
+    def find(self, obj):
+        """Find and return the name of the set containing the obj.
+
+        If the object is not found in any set, a new singleton set
+        is created that holds only this object until it is further merged."""
+
+        # check for previously unknown obj. If unknown, add it
+        # as a new cluster
+        if obj not in self.parents:
+            self.parents[obj] = obj
+            self.weights[obj] = 1
+            return obj
+
+        # find path of objects leading to the root
+        path = [obj]
+        root = self.parents[obj]
+        while root != path[-1]:
+            path.append(root)
+            root = self.parents[root]
+
+        # compress the path and return
+        for ancestor in path:
+            self.parents[ancestor] = root
+        return root
+
+    def remove(self, obj):
+        """Remove an object from the sets.
+
+        Removes an object entirely from the datastructure. The
+        containing set will shrink by this one element.
+
+        :Note: If one tries to access it afterwards using
+            :meth:`find`, it will be newly created and put as a
+            singleton.
+        """
+        if obj not in self.parents:
+            return
+        comp = self.find(obj)
+        self.weights[comp] -= 1
+        self.parents.pop(obj)
+
+    def __iter__(self):
+        """Iterate through all items ever found or unioned by this structure."""
+        return iter(self.parents)
+
+    def union(self, *objects):
+        """Find the sets containing the objects and merge them.
+
+        Any number of objects can be passed to this method and
+        all of them will be merged into one set containing at
+        least these objects.
+
+        :param objects: the objects to be merged. They all have to be
+            hashable. If they haven't been initially added to the UnionFind
+            data structure at instantiation time, they are added at this point.
+ """ + roots = [self[x] for x in objects] + heaviest = max([(self.weights[r], r) for r in roots], key=lambda x: x[0])[1] + for r in roots: + if r != heaviest: + self.weights[heaviest] += self.weights[r] + self.parents[r] = heaviest + + def get_components(self): + """return a list of sets corresponding to the connected + components of the structure.""" + comp_dict = collections.defaultdict(set) + for elem in iter(self): + comp_dict[self[elem]].add(elem) + comp = list(comp_dict.values()) + return comp diff --git a/pyproject.toml b/pyproject.toml index 66957d5..d0669c0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,11 +30,11 @@ nextflow = [ [project.scripts] -batch-roothogs = "FastOMA.batch_roothogs:batch_roothogs" -check-fastoma-input = "FastOMA.check_fastoma_input:check_fastoma_input" -collect-subhogs = "FastOMA.collect_subhogs:collect_subhogs" -infer-roothogs = "FastOMA.infer_roothogs:infer_roothogs" -infer-subhogs = "FastOMA.infer_subhogs:infer_subhogs" +fastoma-batch-roothogs = "FastOMA.batch_roothogs:fastoma_batch_roothogs" +fastoma-check-fastoma-input = "FastOMA.check_fastoma_input:fastoma_check_fastoma_input" +fastoma-collect-subhogs = "FastOMA.collect_subhogs:fastoma_collect_subhogs" +fastoma-infer-roothogs = "FastOMA.infer_roothogs:fastoma_infer_roothogs" +fastoma-infer-subhogs = "FastOMA.infer_subhogs:fastoma_infer_subhogs" [project.urls] Homepage = "https://github.com/DessimozLab/FastOMA" From 93c4ea95c2d5e2cc092b4af4984c51b5d26dce74 Mon Sep 17 00:00:00 2001 From: Adrian Altenhoff Date: Tue, 28 Nov 2023 09:11:47 +0100 Subject: [PATCH 17/22] rename and rewire script --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index d0669c0..fc5e48e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,7 @@ nextflow = [ [project.scripts] fastoma-batch-roothogs = "FastOMA.batch_roothogs:fastoma_batch_roothogs" -fastoma-check-fastoma-input = "FastOMA.check_fastoma_input:fastoma_check_fastoma_input" +fastoma-check-input = "FastOMA.check_input:fastoma_check_input" fastoma-collect-subhogs = "FastOMA.collect_subhogs:fastoma_collect_subhogs" fastoma-infer-roothogs = "FastOMA.infer_roothogs:fastoma_infer_roothogs" fastoma-infer-subhogs = "FastOMA.infer_subhogs:fastoma_infer_subhogs" From 2e5d521f63eaf1f1c8048a71c3e3f9753bfb1b32 Mon Sep 17 00:00:00 2001 From: Adrian Altenhoff Date: Tue, 28 Nov 2023 11:08:23 +0100 Subject: [PATCH 18/22] [FIX] write fasta files for OrthologGroups --- FastOMA/collect_subhogs.py | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/FastOMA/collect_subhogs.py b/FastOMA/collect_subhogs.py index 8732245..6fbe840 100644 --- a/FastOMA/collect_subhogs.py +++ b/FastOMA/collect_subhogs.py @@ -16,6 +16,7 @@ # This code collect subhogs and writes outputs. 
+ def load_hogs(pickle_folder: Path): hogs_all = [] cnt = 0 @@ -78,8 +79,6 @@ def max_og_tree(tree, species_dic): return og_prot_list - - def fastoma_collect_subhogs(): import argparse parser = argparse.ArgumentParser(description="collecting all computed HOGs and combine into a single orthoxml") @@ -105,7 +104,7 @@ def fastoma_collect_subhogs(): if conf.roothog_tsv is not None: write_roothogs(conf.out, conf.roothog_tsv) if conf.marker_groups_fasta is not None: - write_group_files(conf.out, conf.roothogs_folder, conf.marker_groups_fasta) + write_group_files(Path(conf.out), Path(conf.roothogs_folder), conf.marker_groups_fasta) def write_hog_orthoxml(pickle_folder, output_xml_name, gene_id_pickle_file): @@ -159,13 +158,13 @@ def write_group_files(orthoxml: Path, roothog_folder: Path, output_file_og_tsv=N output_file_og_tsv = "OrthologousGroups.tsv" if output_fasta_groups is None: output_fasta_groups = "OrthologousGroupsFasta" + output_fasta_groups = Path(output_fasta_groups) + logger_hog.info("Start writing OG fasta files ") - rhog_all_folder = "./omamer_rhogs/" #sys.argv[2] + "/" # "out_folder/rhogs_all/" fasta_format = "fa" # of the rhogs - trees, species_dic = orthoxml_to_newick(orthoxml, return_gene_to_species=True) logger_hog.info("We extracted %d trees in NHX format from the input HOG orthoxml %s", len(trees), orthoxml) @@ -181,7 +180,7 @@ def write_group_files(orthoxml: Path, roothog_folder: Path, output_file_og_tsv=N raise og_prot_list = max_og_tree(tree, species_dic) - if len(og_prot_list) >= 2: # a group should have at least 1 member/protein + if len(og_prot_list) >= 2: # a group should have at least 1 member/protein OGs[hog_id] = og_prot_list logger_hog.info("done extracting trees") @@ -190,29 +189,31 @@ def write_group_files(orthoxml: Path, roothog_folder: Path, output_file_og_tsv=N for hog_id, og_prot_list in OGs.items(): line_text = str(hog_id) + "\t" + str(og_prot_list)[1:-1] + "\n" handle.write(line_text) - handle.close() - - logger_hog.info("We wrote the protein families information in the file " + output_file_og_tsv) - os.makedirs(output_fasta_groups) + logger_hog.info("We wrote the protein families information in the file %s", output_file_og_tsv) + output_fasta_groups.mkdir(parents=True, exist_ok=True) logger_hog.info("start writing %d OGs as fasta files in folder %s.", len(OGs), output_fasta_groups) for hog_id, og_prot_list in OGs.items(): # hog_id="HOG_0667494_sub10524" rhog_id = "_".join(hog_id.split("_")[:2]) - omamer_rhogs_all_address = rhog_all_folder + rhog_id + "." + fasta_format - omamer_rhogs_all_prots = list(SeqIO.parse(omamer_rhogs_all_address, "fasta")) + rhog_fasta = roothog_folder / (rhog_id + "." 
+ fasta_format) + omamer_rhogs_all_prots = list(SeqIO.parse(rhog_fasta, "fasta")) og_prots = [] og_prot_list = OGs[hog_id] for rhogs_prot in omamer_rhogs_all_prots: - if rhogs_prot.id.split("||")[0] in og_prot_list: + prot_id = rhogs_prot.id + if _config.protein_format_qfo_dataset_before2022: + prot_id = prot_id.split('|')[1] + + if prot_id in og_prot_list: sp = rhogs_prot.id.split("||")[1] rhogs_prot.description += " [" + sp + "]" og_prots.append(rhogs_prot) og_id = "OG_" + hog_id # one OG per rootHOG # "/HOG_"+ rhogid - SeqIO.write(og_prots, output_fasta_groups + og_id + ".fa", "fasta") + SeqIO.write(og_prots, output_fasta_groups / (og_id + ".fa"), "fasta") logger_hog.info("writing done") @@ -237,3 +238,5 @@ def write_roothogs(orthoxml, out): logger_hog.info("We wrote the protein families information in the file %s", out) +if __name__ == "__main__": + fastoma_collect_subhogs() \ No newline at end of file From 0b8d63da2f0f1d61b438f740a37f1678cb2b6cf7 Mon Sep 17 00:00:00 2001 From: Adrian Altenhoff Date: Wed, 29 Nov 2023 09:36:54 +0100 Subject: [PATCH 19/22] [WIP] towards better reporting --- .dockerignore | 1 + Dockerfile | 1 + FastOMA/_config.py | 4 ++-- FastOMA_light.nf | 31 ++++++++++++++++++++++++++++++- nextflow.config | 23 +++++++++++++++++------ 5 files changed, 51 insertions(+), 9 deletions(-) diff --git a/.dockerignore b/.dockerignore index 466368d..2021e50 100644 --- a/.dockerignore +++ b/.dockerignore @@ -5,3 +5,4 @@ work output testdata dist +archive/ diff --git a/Dockerfile b/Dockerfile index 99ac18e..ee6c1f3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -34,6 +34,7 @@ RUN apt-get update \ fasttree \ libxml2 \ mafft \ + procps \ && apt-get -y autoremove \ && apt-get -y autoclean \ && rm -rf /var/lib/apt/lists/* diff --git a/FastOMA/_config.py b/FastOMA/_config.py index ae845c6..62db7bd 100644 --- a/FastOMA/_config.py +++ b/FastOMA/_config.py @@ -78,8 +78,8 @@ inferhog_min_hog_size_xml = 2 # by setting this as 1, pyham won't work on xml output. 
# batch_roothogs -big_rhog_filesize_thresh = 1300 -sum_list_rhogs_filesize_thresh = 2 * 1e3 +big_rhog_filesize_thresh = 400 * 1000 +sum_list_rhogs_filesize_thresh = 2 * 1e6 orthoxml_v03 = True diff --git a/FastOMA_light.nf b/FastOMA_light.nf index 0023a5c..78ae58c 100644 --- a/FastOMA_light.nf +++ b/FastOMA_light.nf @@ -106,6 +106,27 @@ if (params.help) { } +log.info """ +=========================================== + FastOMA -- PIPELINE +=========================================== + + Project : ${workflow.projectDir} + Git info: ${workflow.repository} - ${workflow.revision} [${workflow.commitId}] + Cmd line: ${workflow.commandLine} + Manifest's pipeline version: ${workflow.manifest.version} + +Parameters: + input_folder ${params.input_folder} + proteome folder ${params.proteome} + species_tree ${params.species_tree} + splice_folder ${params.splice_folder} (optional) + omamer_db ${params.omamer_db} + hogmap_in ${params.hogmap_in} (optional) + + debug_enabled ${params.debug_enabled} +""".stripIndent() + process check_input{ publishDir params.output_folder, mode: 'copy' @@ -278,6 +299,14 @@ workflow { channel.empty().concat(pickle_big_rhog, pickle_rest_rhog).set{ all_rhog_pickle } (orthoxml_file, OrthologousGroupsFasta, OrthologousGroups_tsv, rootHOGs_tsv) = collect_subhogs(all_rhog_pickle.collect(), gene_id_dic_xml, omamer_rhogs) - orthoxml_file.view{" output orthoxml file ${it}"} } + +//workflow.onComplete { +// println "Completed at : $workflow.complete" +// println "Duration : $workflow.duration" +// println "Output in : $params.output_folder" +// println "Nextflow report : $params.statsdir" +// println ( workflow.success ? "Done!" : "Oops .. something went wrong" ) +//} + diff --git a/nextflow.config b/nextflow.config index 0b4bfcd..d9a0320 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,7 +1,11 @@ // General configuration used in all profiles manifest { - description = 'FastOMA - Infer orthology ' - nextflowVersion = '22.10.4' + name = "FastOMA - Orthology inference" + description = """FastOMA computes Hierarchical Orthologous Groups from proteomes.""" + author = "Sina Majidian, Adrian Altenhoff" + homePage = "https://omabrowser.org" + mainScript = "FastOMA.nf" + nextflowVersion = "22.10.4" } params { @@ -10,6 +14,9 @@ params { omamer_db = "https://omabrowser.org/All/Primates-v2.0.0.h5" debug_enabled = false help = false + + output_folder = "Output" + statsdir = "${params.output_folder}/stats" } // Profiles configure nextflow depending on the environment (local, docker, singularity) @@ -59,9 +66,13 @@ profiles { } conda.enabled = true } - - - - } +timeline { + enabled = true + file = "${params.statsdir}/timeline.html" +} +report { + enabled = false + file = "${params.statsdir}/report.html" +} \ No newline at end of file From 6ea0659e7ffa2305173e6fb841e6074b01f9bcd8 Mon Sep 17 00:00:00 2001 From: Adrian Altenhoff Date: Wed, 29 Nov 2023 12:03:28 +0100 Subject: [PATCH 20/22] [FIX] better reporting of pipeline --- FastOMA_light.nf | 27 +++++++++++++++++---------- nextflow.config | 5 +++-- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/FastOMA_light.nf b/FastOMA_light.nf index 78ae58c..e8c28a2 100644 --- a/FastOMA_light.nf +++ b/FastOMA_light.nf @@ -1,3 +1,4 @@ +import groovy.json.JsonBuilder; // NXF_WRAPPER_STAGE_FILE_THRESHOLD='50000' @@ -99,6 +100,8 @@ if (params.help) { --help Display this message --debug_enabled Store addtional information that might be helpful to debug in case of a problem with FastOMA. 
+        --report            Produce nextflow report and timeline and store it in
+                            $params.statsdir
 
     """.stripIndent()
 
@@ -118,13 +121,14 @@ log.info """
 
 Parameters:
     input_folder            ${params.input_folder}
-    proteome folder         ${params.proteome}
+    proteome folder         ${params.proteome_folder}
     species_tree            ${params.species_tree}
-    splice_folder           ${params.splice_folder} (optional)
+    splice_folder           ${params.splice_folder}
     omamer_db               ${params.omamer_db}
-    hogmap_in               ${params.hogmap_in} (optional)
+    hogmap_in               ${params.hogmap_in}
 
     debug_enabled           ${params.debug_enabled}
+    report                  ${params.report}
 """.stripIndent()
@@ -302,11 +306,14 @@ workflow {
 }
 
 
-//workflow.onComplete {
-//    println "Completed at : $workflow.complete"
-//    println "Duration     : $workflow.duration"
-//    println "Output in    : $params.output_folder"
-//    println "Nextflow report : $params.statsdir"
-//    println ( workflow.success ? "Done!" : "Oops .. something went wrong" )
-//}
+workflow.onComplete {
+    def String report = ( params.report ? "\nNextflow report : ${params.statsdir}" : "");
+    println ""
+    println "Completed at : $workflow.complete"
+    println "Duration     : $workflow.duration"
+    println "Processes    : $workflow.workflowStats.succeedCount (success), $workflow.workflowStats.failedCount (failed)"
+    println "Output in    : $params.output_folder" + report
+    println ( workflow.success ? "Done!" : "Oops .. something went wrong" )
+}
+
 
diff --git a/nextflow.config b/nextflow.config
index d9a0320..0e2720d 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -14,6 +14,7 @@ params {
     omamer_db = "https://omabrowser.org/All/Primates-v2.0.0.h5"
     debug_enabled = false
    help = false
+    report = false
 
    output_folder = "Output"
    statsdir = "${params.output_folder}/stats"
@@ -69,10 +70,10 @@ profiles {
 }
 
 timeline {
-    enabled = true
+    enabled = params.report
    file = "${params.statsdir}/timeline.html"
}
report {
-    enabled = false
+    enabled = params.report
    file = "${params.statsdir}/report.html"
}
\ No newline at end of file

From 87f419ca3008938787ca3ae68da26a4197caf54f Mon Sep 17 00:00:00 2001
From: Adrian Altenhoff <adrian.altenhoff@inf.ethz.ch>
Date: Wed, 29 Nov 2023 23:10:14 +0100
Subject: [PATCH 21/22] add resource limits to workflow

---
 FastOMA_light.nf | 55 ++++++++++++++++++++++++++----------------------
 conf/base.config | 55 ++++++++++++++++++++++++++++++++++++++++++++++++
 nextflow.config  | 53 ++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 136 insertions(+), 27 deletions(-)
 create mode 100644 conf/base.config

diff --git a/FastOMA_light.nf b/FastOMA_light.nf
index e8c28a2..aac82a6 100644
--- a/FastOMA_light.nf
+++ b/FastOMA_light.nf
@@ -133,31 +133,34 @@
 
 process check_input{
-    publishDir params.output_folder, mode: 'copy'
-    input:
-        path proteome_folder
-        path hogmap_folder
-        path species_tree
-        path omamerdb
-        path splice_folder
-    output:
-        path "species_tree_checked.nwk"
-        val "check_completed"
-    script:
-    """
-    fastoma-check-input --proteomes ${proteome_folder} \
-                        --species-tree ${species_tree} \
-                        --out-tree species_tree_checked.nwk \
-                        --splice ${splice_folder} \
-                        --hogmap ${hogmap_folder} \
-                        --omamer_db ${omamerdb} \
-                        -vv
-    """
+    label "process_single"
+    publishDir params.output_folder, mode: 'copy'
+
+    input:
+        path proteome_folder
+        path hogmap_folder
+        path species_tree
+        path omamerdb
+        path splice_folder
+    output:
+        path "species_tree_checked.nwk"
+        val "check_completed"
+    script:
+        """
+        fastoma-check-input --proteomes ${proteome_folder} \
+                            --species-tree ${species_tree} \
+                            --out-tree species_tree_checked.nwk \
+                            --splice ${splice_folder} \
+                            --hogmap ${hogmap_folder} \
+                            
--omamer_db ${omamerdb} \ + -vv + """ } process omamer_run{ - time {4.h} + label "process_single" + tag "$proteome" publishDir params.hogmap_folder, mode: 'copy' input: @@ -178,11 +181,11 @@ process omamer_run{ process infer_roothogs{ + label "process_single" publishDir = [ path: params.temp_output, enabled: params.debug_enabled, - //mode: 'copy', saveAs: { filename -> filename.equals('temp_omamer_rhogs') ? null : filename } - ] + ] input: path hogmaps, stageAs: "hogmaps/*" @@ -219,8 +222,8 @@ process batch_roothogs{ } process hog_big{ - cpus 2 - time {20.h} // for very big rhog it might need more, or you could re-run and add `-resume` + label "process_high" + input: each rhogsbig path species_tree @@ -239,6 +242,8 @@ process hog_big{ } process hog_rest{ + label "process_single" + input: each rhogsrest path species_tree diff --git a/conf/base.config b/conf/base.config new file mode 100644 index 0000000..c92b610 --- /dev/null +++ b/conf/base.config @@ -0,0 +1,55 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + dessimozlab/FastOMA Nextflow base config file +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + A 'blank slate' config file, appropriate for general use on most high performance + compute environments. Assumes that all software is installed and available on + the PATH. Runs in `local` mode - all jobs will be run on the logged in environment. +---------------------------------------------------------------------------------------- +*/ + +process { + + cpus = { check_max( 1 * task.attempt, 'cpus' ) } + memory = { check_max( 6.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } + shell = ['/bin/bash', '-euo', 'pipefail'] + + errorStrategy = { task.exitStatus in ((130..145) + 104) ? 
diff --git a/conf/base.config b/conf/base.config
new file mode 100644
index 0000000..c92b610
--- /dev/null
+++ b/conf/base.config
@@ -0,0 +1,55 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    dessimozlab/FastOMA Nextflow base config file
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    A 'blank slate' config file, appropriate for general use on most high performance
+    compute environments. Assumes that all software is installed and available on
+    the PATH. Runs in `local` mode - all jobs will be run on the logged in environment.
+----------------------------------------------------------------------------------------
+*/
+
+process {
+
+    cpus   = { check_max( 1      * task.attempt, 'cpus'   ) }
+    memory = { check_max( 6.GB   * task.attempt, 'memory' ) }
+    time   = { check_max( 4.h    * task.attempt, 'time'   ) }
+    shell  = ['/bin/bash', '-euo', 'pipefail']
+
+    errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' }
+    maxRetries    = 1
+    maxErrors     = '-1'
+
+    withLabel:process_single {
+        cpus   = { check_max( 1                   , 'cpus'   ) }
+        memory = { check_max( 12.GB * task.attempt, 'memory' ) }
+        time   = { check_max( 4.h   * task.attempt, 'time'   ) }
+    }
+    withLabel:process_low {
+        cpus   = { check_max( 2     * task.attempt, 'cpus'   ) }
+        memory = { check_max( 12.GB * task.attempt, 'memory' ) }
+        time   = { check_max( 4.h   * task.attempt, 'time'   ) }
+    }
+    withLabel:process_medium {
+        cpus   = { check_max( 6     * task.attempt, 'cpus'   ) }
+        memory = { check_max( 36.GB * task.attempt, 'memory' ) }
+        time   = { check_max( 8.h   * task.attempt, 'time'   ) }
+    }
+    withLabel:process_high {
+        cpus   = { check_max( 12    * task.attempt, 'cpus'   ) }
+        memory = { check_max( 72.GB * task.attempt, 'memory' ) }
+        time   = { check_max( 16.h  * task.attempt, 'time'   ) }
+    }
+    withLabel:process_long {
+        time   = { check_max( 20.h  * task.attempt, 'time'   ) }
+    }
+    withLabel:process_high_memory {
+        memory = { check_max( 200.GB * task.attempt, 'memory' ) }
+    }
+    withLabel:error_ignore {
+        errorStrategy = 'ignore'
+    }
+    withLabel:error_retry {
+        errorStrategy = 'retry'
+        maxRetries    = 2
+    }
+}
\ No newline at end of file
diff --git a/nextflow.config b/nextflow.config
index 0e2720d..7529c85 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -18,6 +18,12 @@ params {
 
     output_folder = "Output"
     statsdir = "${params.output_folder}/stats"
+
+    // Max resource options
+    // Defaults only, expecting to be overwritten
+    max_memory = '128.GB'
+    max_cpus = 24
+    max_time = '120.h'
 }
 
 // Profiles configure nextflow depending on the environment (local, docker, singularity)
@@ -69,11 +75,54 @@ profiles {
     }
 }
 
+def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss')
 timeline {
   enabled = params.report
-  file = "${params.statsdir}/timeline.html"
+  file = "${params.statsdir}/timeline_${trace_timestamp}.html"
 }
 report {
   enabled = params.report
-  file = "${params.statsdir}/report.html"
+  file = "${params.statsdir}/report_${trace_timestamp}.html"
+}
+trace {
+    enabled = params.report
+    file    = "${params.statsdir}/trace_${trace_timestamp}.txt"
+}
+dag {
+    enabled = params.report
+    file    = "${params.statsdir}/pipeline_dag_${trace_timestamp}.html"
+}
+
+includeConfig "conf/base.config"
+
+// function to check maximum resources
+def check_max(obj, type) {
+    if (type == 'memory') {
+        try {
+            if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1)
+                return params.max_memory as nextflow.util.MemoryUnit
+            else
+                return obj
+        } catch (all) {
+            println "   ### ERROR ###   Max memory '${params.max_memory}' is not valid! Using default value: $obj"
+            return obj
+        }
+    } else if (type == 'time') {
+        try {
+            if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1)
+                return params.max_time as nextflow.util.Duration
+            else
+                return obj
+        } catch (all) {
+            println "   ### ERROR ###   Max time '${params.max_time}' is not valid! Using default value: $obj"
+            return obj
+        }
+    } else if (type == 'cpus') {
+        try {
+            return Math.min( obj, params.max_cpus as int )
+        } catch (all) {
+            println "   ### ERROR ###   Max cpus '${params.max_cpus}' is not valid! Using default value: $obj"
+            return obj
+        }
+    }
+}
\ No newline at end of file
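The check_max helper above follows the nf-core convention: on an eligible exit code the
task is retried, task.attempt multiplies the request, and check_max clamps the result to
the params.max_* ceilings (compareTo(...) == 1 means the request exceeds the cap). A
worked example under the base.config defaults (illustrative only; the override values
are hypothetical):

    // a process_medium task, with the default max_memory = '128.GB':
    //   attempt 1: 36.GB * 1 = 36.GB  -> below the cap, granted as-is
    //   attempt 2: 36.GB * 2 = 72.GB  -> still below the cap, granted
    // with a smaller site-specific ceiling the clamp kicks in:
    params {
        max_memory = '48.GB'   // hypothetical override, e.g. supplied via -c site.config
        max_cpus   = 8
        max_time   = '24.h'
    }
    // now check_max(72.GB, 'memory') returns 48.GB on the second attempt
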
From 18a8862644c2d7106284d21c26d825ab0c5b7c1d Mon Sep 17 00:00:00 2001
From: Adrian Altenhoff
Date: Wed, 29 Nov 2023 23:18:13 +0100
Subject: [PATCH 22/22] use only FastOMA.nf, light version can go

---
 FastOMA.nf       | 448 +++++++++++++++++++++++++++++------------------
 FastOMA_light.nf | 324 ----------------------------------
 2 files changed, 278 insertions(+), 494 deletions(-)
 delete mode 100644 FastOMA_light.nf

diff --git a/FastOMA.nf b/FastOMA.nf
index bb9c546..aac82a6 100644
--- a/FastOMA.nf
+++ b/FastOMA.nf
@@ -1,216 +1,324 @@
+import groovy.json.JsonBuilder;
+
 // NXF_WRAPPER_STAGE_FILE_THRESHOLD='50000'
 
-params.input_folder = "./in_folder/"
-params.output_folder = "./out_folder/"
+params.input_folder = "testdata/in_folder"
+params.output_folder = "out_folder/"
 params.proteome_folder = params.input_folder + "/proteome"
-params.proteomes = params.proteome_folder + "/*"
 params.hogmap_in = params.input_folder + "/hogmap_in"
-
-params.hogmap_folder = params.output_folder + "/hogmap"
 params.splice_folder = params.input_folder + "/splice"
 params.species_tree = params.input_folder + "/species_tree.nwk"
-params.species_tree_checked = params.output_folder + "/species_tree_checked.nwk"
-params.temp_pickles = params.output_folder + "/temp_pickles"
+
+
+// output subfolder definition
 params.genetrees_folder = params.output_folder + "/genetrees"
+params.hogmap_folder = params.output_folder + "/hogmap"
 
 params.temp_output = params.output_folder +"/temp_output" //"/temp_omamer_rhogs"
 
+
+if (params.help) {
+    log.info """
+    ===========================================
+      FastOMA -- PIPELINE
+    ===========================================
+    Usage:
+    Run the pipeline with default parameters:
+    nextflow run FastOMA.nf
+
+    Run with user parameters:
+    nextflow run FastOMA.nf --input_folder {input.dir} --output_folder {results.dir}
+
+    Mandatory arguments:
+    --input_folder      Input data folder. Defaults to ${params.input_folder}. This folder
+                        must contain the proteomes (in a subfolder named 'proteome') and
+                        a species tree file. Optionally the folder might contain
+                         - a sub-folder 'splice' containing splicing variant mappings
+                         - a sub-folder 'hogmap_in' containing precomputed OMAmer
+                           placement results for all proteomes
+
+                        All sub-folders and sub-files can also be placed in other
+                        locations if you provide alternative values for them (see below on
+                        optional arguments section).
+
+    --output_folder     Path where all the output should be stored. Defaults to
+                        ${params.output_folder}
+
+
+    Profile selection:
+    -profile            FastOMA can be run using several execution profiles. The default
+                        set of available profiles is
+                          - docker        Run pipeline using docker containers. Docker needs
+                                          to be installed on your system. Containers will be
+                                          fetched automatically from dockerhub. See also
+                                          additional options '--container_version' and
+                                          '--container_name'.
+
+                          - singularity   Run pipeline using singularity. Singularity needs
+                                          to be installed on your system. On HPC clusters,
+                                          it often needs to be loaded as a separate module.
+                                          Containers will be fetched automatically from
+                                          dockerhub. See also additional options
+                                          '--container_version' and '--container_name'.
+
+                          - conda         Run pipeline in a conda environment. Conda needs
+                                          to be installed on your system. The environment
+                                          will be created automatically.
+
+                          - standard      Run pipeline on your local system. Mainly intended
+                                          for development purposes. All dependencies must be
+                                          installed in the calling environment.
+
+                          - slurm_singularity
+                                          Run pipeline using SLURM job scheduler and
+                                          singularity containers. This profile can also be a
+                                          template for other HPC clusters that use different
+                                          schedulers.
+
+                          - slurm_conda   Run pipeline using SLURM job scheduler and conda
+                                          environment.
+
+                        Profiles are defined in nextflow.config and can be extended or
+                        adjusted according to your needs.
+
+
+    Additional options:
+    --proteome_folder   Overwrite location of proteomes (default ${params.proteome_folder})
+    --species_tree      Overwrite location of species tree file (newick format).
+                        Defaults to ${params.species_tree}
+    --splice_folder     Overwrite location of splice file folder. The splice files must be
+                        named .splice.
+                        Defaults to ${params.splice_folder}
+    --omamer_db         Path or URL to download the OMAmer database from.
+                        Defaults to ${params.omamer_db}
+    --hogmap_in         Optional path where precomputed omamer mapping files are located.
+                        Defaults to ${params.hogmap_in}
+
+    Flags:
+    --help              Display this message
+    --debug_enabled     Store additional information that might be helpful to debug in case
+                        of a problem with FastOMA.
+    --report            Produce nextflow report and timeline and store them in
+                        ${params.statsdir}
+
+    """.stripIndent()
+
+    exit 1
+}
+
+
+log.info """
+===========================================
+  FastOMA -- PIPELINE
+===========================================
+
+    Project : ${workflow.projectDir}
+    Git info: ${workflow.repository} - ${workflow.revision} [${workflow.commitId}]
+    Cmd line: ${workflow.commandLine}
+    Manifest's pipeline version: ${workflow.manifest.version}
+
+Parameters:
+    input_folder          ${params.input_folder}
+    proteome folder       ${params.proteome_folder}
+    species_tree          ${params.species_tree}
+    splice_folder         ${params.splice_folder}
+    omamer_db             ${params.omamer_db}
+    hogmap_in             ${params.hogmap_in}
+
+    debug_enabled         ${params.debug_enabled}
+    report                ${params.report}
+""".stripIndent()
+
+
 process check_input{
-    time {5.h}
-    memory {200.GB}
+    label "process_single"
     publishDir params.output_folder, mode: 'copy'
+
     input:
         path proteome_folder
         path hogmap_folder
         path species_tree
         path omamerdb
         path splice_folder
     output:
         path "species_tree_checked.nwk"
-        val true
+        val "check_completed"
     script:
     """
-    fastoma-check-input
+    fastoma-check-input --proteomes ${proteome_folder} \
+                        --species-tree ${species_tree} \
+                        --out-tree species_tree_checked.nwk \
+                        --splice ${splice_folder} \
+                        --hogmap ${hogmap_folder} \
+                        --omamer_db ${omamerdb} \
+                        -vv
    """
 }
 
 
 process omamer_run{
-    time {5.h}
-    memory {95.GB}
+    label "process_single"
+    tag "$proteome"
     publishDir params.hogmap_folder, mode: 'copy'
+
     input:
-        path proteomes_omamerdb_inputhog
-        val ready_input_check_c
+        tuple path(proteome), path(omamer_db), path(precomputed_hogmap_folder), val(ready)
-    output:
-        path "*.hogmap"
-        val true
-    script: //todo this if condition can be done as part of nextflow, so it won't submit job for cp
-    """
-    if [ -f ${proteomes_omamerdb_inputhog[2]}/${proteomes_omamerdb_inputhog[0]}.hogmap ]
-    then
-        cp ${proteomes_omamerdb_inputhog[2]}/${proteomes_omamerdb_inputhog[0]}.hogmap  ${proteomes_omamerdb_inputhog[0]}.hogmap
-    else
-        omamer search -n 10 --db ${proteomes_omamerdb_inputhog[1]} --query ${proteomes_omamerdb_inputhog[0]} --out ${proteomes_omamerdb_inputhog[0]}.hogmap
-    fi
-    """ // --nthreads 10
+
+    output:
+        path "*.hogmap"
+
+    script:
+    """
+    if [ -f ${precomputed_hogmap_folder}/${proteome}.hogmap ] ; then
+        cp ${precomputed_hogmap_folder}/${proteome}.hogmap ${proteome}.hogmap
+    else
+        omamer search --db ${omamer_db} --query ${proteome} --out ${proteome}.hogmap
+    fi
+    """
 }
 
 
 process infer_roothogs{
-    cpus  10          // useful for linclust
-    time {15.h}       // including linclust
-    memory {350.GB}
+    label "process_single"
     publishDir = [
         path: params.temp_output,
-        mode: 'copy', // pattern: "temp_output", saveAs: { filename -> filename.equals('temp_omamer_rhogs') ? null : filename }
-    ]
+        enabled: params.debug_enabled,
+    ]
 
     input:
-        val ready_omamer_run
-        path hogmap_folder
+        path hogmaps, stageAs: "hogmaps/*"
         path proteome_folder
         path splice_folder
     output:
-        path "temp_omamer_rhogs"
+        path "omamer_rhogs/*"
         path "gene_id_dic_xml.pickle"
-        path "selected_isoforms" , optional: true   // SPECIESNAME_selected_isoforms.tsv
-        val true    // nextflow-io.github.io/patterns/state-dependency/
+        path "selected_isoforms" , optional: true
     script:
     """
-    fastoma-infer-roothogs --logger-level DEBUG
+    fastoma-infer-roothogs  --proteomes ${proteome_folder} \
+                            --hogmap hogmaps \
+                            --splice ${splice_folder} \
+                            --out-rhog-folder "omamer_rhogs/" \
+                            -vv
    """
 }
 
 
 process batch_roothogs{
-    time {15.h}       // including linclust
-    memory {95.GB}
     input:
-        val ready_infer_roothogs
-        path "temp_omamer_rhogs"
+        path rhogs, stageAs: "omamer_rhogs/*"
     output:
         path "rhogs_rest/*", optional: true
         path "rhogs_big/*" , optional: true
-        val true
     script:
     """
-    fastoma-batch-roothogs
+    fastoma-batch-roothogs  --input-roothogs omamer_rhogs/ \
+                            --out-big rhogs_big \
+                            --out-rest rhogs_rest \
+                            -vv
    """
 }
 
 process hog_big{
-    cpus  6
-    time {10.h} // for very big rhog it might need more, or you could re-run and add `-resume`
-    memory {200.GB}
-    publishDir params.temp_pickles
+    label "process_high"
+
    input:
-        val rhogsbig_tree_ready
+        each rhogsbig
+        path species_tree
    output:
-        path "*.pickle"
-        path "*.fa", optional: true    // msa          if write True
-        path "*.nwk", optional: true   // gene trees   if write True
-        val true
+        path "pickle_hogs"
+        path "msa/*.fa"         , optional: true   // msa         if write True
+        path "gene_trees/*.nwk" , optional: true   // gene trees  if write True
    script:
    """
-    fastoma-infer-subhogs --input-rhog-folder ${rhogsbig_tree_ready[0]}  --species-tree ${rhogsbig_tree_ready[1]}  --parallel
+    fastoma-infer-subhogs  --input-rhog-folder ${rhogsbig} \
+                           --species-tree ${species_tree} \
+                           --fragment-detection \
+                           --low-so-detection \
+                           --parallel
    """
 }
 
-temp_pickles =2
-
-
 process hog_rest{
-    time {3.h}  // for very big rhog it might need more, or you could re-run and add `-resume`
-    memory {95.GB}
-    publishDir params.temp_pickles
-    input:
-        val rhogsrest_tree_ready
-    output:
-        path "*.pickle"
-        path "*.fa" , optional: true    // msa if write True
-        path "*.nwk" , optional: true   // gene trees if write True
-        val true
-    script:
-    """
-    fastoma-infer-subhogs  --input-rhog-folder ${rhogsrest_tree_ready[0]}  --species-tree ${rhogsrest_tree_ready[1]}
-    """ // --parrallel False
+    label "process_single"
+
+    input:
+        each rhogsrest
+        path species_tree
+    output:
+        path "pickle_hogs"
+        path "msa/*.fa"         , optional: true   // msa         if write True
+        path "gene_trees/*.nwk" , optional: true   // gene trees  if write True
+    script:
+    """
+    fastoma-infer-subhogs  --input-rhog-folder ${rhogsrest} \
+                           --species-tree ${species_tree} \
+                           --fragment-detection \
+                           --low-so-detection
+    #--out pickle_hogs
+    """
 }
 
 
 process collect_subhogs{
-    time {10.h}
-    memory {300.GB}   // orthoxml string can be very huge in memory
     publishDir params.output_folder, mode: 'copy'
+
     input:
-        val ready_hog_rest
-        val ready_hog_big
-        path "temp_pickles"   // this is the folder includes pickles_rhogs
-        path "gene_id_dic_xml.pickle"
-        path "temp_omamer_rhogs"
+        path pickles, stageAs: "pickle_folders/?"
+        path "gene_id_dic_xml.pickle"
+        path rhogs, stageAs: "omamer_rhogs/*"
     output:
         path "output_hog.orthoxml"
         path "OrthologousGroupsFasta"
         path "OrthologousGroups.tsv"
         path "rootHOGs.tsv"
     script:
     """
-    fastoma-collect-subhogs
+    fastoma-collect-subhogs --pickle-folder pickle_folders/ \
+                            --roothogs-folder omamer_rhogs/ \
+                            --gene-id-pickle-file gene_id_dic_xml.pickle \
+                            --out output_hog.orthoxml \
+                            --marker-groups-fasta OrthologousGroups.tsv \
+                            --roothog-tsv rootHOGs.tsv \
+                            -vv
    """
 }
 
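A remark on the staging idioms in the processes above: stageAs: "hogmaps/*" gathers many
files under one directory, while in collect_subhogs the ? in stageAs: "pickle_folders/?"
is, as far as Nextflow's staging rules go, expanded to a progressive number per input
element (pickle_folders/1, pickle_folders/2, ...), so identically named pickle folders
coming from different upstream tasks do not collide. A hypothetical sketch (illustrative
only, not part of the patch):

    // demo_stage is a made-up process: each incoming item is staged
    // under its own numbered path inside pickle_folders/
    process demo_stage {
        input:
            path pickles, stageAs: "pickle_folders/?"
        output:
            stdout
        script:
            "find pickle_folders/ | sort"
    }
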
 workflow {
+    proteome_folder = Channel.fromPath(params.proteome_folder, type: "dir", checkIfExists:true).first()
+    proteomes = Channel.fromPath(params.proteome_folder + "/*", type:'any', checkIfExists:true)
+    species_tree = Channel.fromPath(params.species_tree, type: "file", checkIfExists:true).first()
+
+    splice_folder = Channel.fromPath(params.splice_folder, type: "dir")
+    genetrees_folder = Channel.fromPath(params.genetrees_folder, type: 'dir')
+    hogmap_in = Channel.fromPath(params.hogmap_in, type:'dir')
+
+    omamerdb = Channel.fromPath(params.omamer_db)
+    (species_tree_checked_, ready_input_check) = check_input(proteome_folder, hogmap_in, species_tree, omamerdb, splice_folder)
+    omamer_input_channel = proteomes.combine(omamerdb).combine(hogmap_in).combine(ready_input_check)
+    hogmap = omamer_run(omamer_input_channel)
+
-    proteomes = Channel.fromPath(params.proteomes, type:'any' ,checkIfExists:true)
-    proteome_folder = Channel.fromPath(params.proteome_folder)
-    hogmap_folder = Channel.fromPath(params.hogmap_folder)
-    splice_folder = Channel.fromPath(params.splice_folder)
-    temp_output = Channel.fromPath(params.temp_output)
-    genetrees_folder = Channel.fromPath(params.genetrees_folder)
-    hogmap_in = Channel.fromPath(params.hogmap_in)
-
-    temp_pickles = Channel.fromPath(params.temp_pickles)
-    omamerdb = Channel.fromPath(params.input_folder+"/omamerdb.h5")
-    species_tree = Channel.fromPath(params.species_tree)
-    species_tree_checked = Channel.fromPath(params.species_tree_checked)
-
-    (species_tree_checked_, ready_input_check) = check_input(proteome_folder,hogmap_in,species_tree,omamerdb,splice_folder)
-    ready_input_check_c = ready_input_check.collect()
-    //species_tree_checked.view{"species_tree_checked ${it}"}
-
-    proteomes_omamerdb = proteomes.combine(omamerdb)
-    proteomes_omamerdb_inputhog = proteomes_omamerdb.combine(hogmap_in)   // proteomes_omamerdb_inputhog.view{" rhogsbig ${it}"}
-    //proteomes_omamerdb_inputhog_inputcheck = proteomes_omamerdb_inputhog.combine(ready_input_check_c)
-    (hogmap, ready_omamer_run)= omamer_run(proteomes_omamerdb_inputhog,ready_input_check_c)
-    ready_omamer_run_c = ready_omamer_run.collect()
-    (temp_omamer_rhogs, gene_id_dic_xml,selected_isoforms, ready_infer_roothogs) = infer_roothogs(ready_omamer_run_c, hogmap_folder, proteome_folder, splice_folder)
-    ready_infer_roothogs_c = ready_infer_roothogs.collect()
-
-    (rhogs_rest_list, rhogs_big_list, ready_batch_roothogs) = batch_roothogs(ready_infer_roothogs_c, temp_omamer_rhogs)
-    ready_batch_roothogs_c = ready_batch_roothogs.collect()
-
-
-    rhogsbig = rhogs_big_list.flatten()
-    rhogsbig_tree = rhogsbig.combine(species_tree_checked)
-    rhogsbig_tree_ready = rhogsbig_tree.combine(ready_batch_roothogs)   //  rhogsbig_tree_ready.view{"rhogsbig_tree_ready ${it}"}
-    (pickle_big_rhog, msas_out, genetrees_out, ready_hog_big) = hog_big(rhogsbig_tree_ready)
-
-    rhogsrest = rhogs_rest_list.flatten()
-    rhogsrest_tree = rhogsrest.combine(species_tree_checked)
-    rhogsrest_tree_ready = rhogsrest_tree.combine(ready_batch_roothogs_c)
-    (pickle_rest_rhog, msas_out_rest, genetrees_out_test, ready_hog_rest) = hog_rest(rhogsrest_tree_ready)
-
-    (orthoxml_file, OrthologousGroupsFasta, OrthologousGroups_tsv, rootHOGs_tsv) = collect_subhogs(ready_hog_rest.collect(), ready_hog_big.collect(), temp_pickles, gene_id_dic_xml, temp_omamer_rhogs)
-    // temp_omamer_rhogs.view{" output omamer_rhogs ${it}"}
-    // orthoxml_file.view{" output orthoxml file ${it}"}
+    (omamer_rhogs, gene_id_dic_xml, ready_infer_roothogs) = infer_roothogs(hogmap.collect(), proteome_folder, splice_folder)
+
+    (rhogs_rest_batches, rhogs_big_batches) = batch_roothogs(omamer_rhogs)
+
+    (pickle_big_rhog, msa_out_big, genetrees_out_rest) = hog_big(rhogs_big_batches.flatten(), species_tree)
+    (pickle_rest_rhog, msas_out_rest, genetrees_out_test) = hog_rest(rhogs_rest_batches.flatten(), species_tree)
+    channel.empty().concat(pickle_big_rhog, pickle_rest_rhog).set{ all_rhog_pickle }
+
+    (orthoxml_file, OrthologousGroupsFasta, OrthologousGroups_tsv, rootHOGs_tsv) = collect_subhogs(all_rhog_pickle.collect(), gene_id_dic_xml, omamer_rhogs)
 }
 
+workflow.onComplete {
+    def String report = ( params.report ? "\nNextflow report : ${params.statsdir}" : "");
+    println ""
+    println "Completed at : $workflow.complete"
+    println "Duration     : $workflow.duration"
+    println "Processes    : $workflow.workflowStats.succeedCount (success), $workflow.workflowStats.failedCount (failed)"
+    println "Output in    : $params.output_folder" + report
+    println ( workflow.success ? "Done!" : "Oops .. something went wrong" )
+}
 
-// todo: check input files very beginning (before omamer starts) e.g all species are in the species tree. No species chars in fasta record.
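The rewritten workflow above drops the old chain of `val true` ready-flags and
`.collect()` barriers in favour of real data dependencies between processes; the only
explicit merge that remains is gathering the big and rest pickle channels before
collect_subhogs. A runnable sketch of that merge (illustrative only; the file names are
made up):

    // merge.nf -- concat joins the result channels in order, and collect
    // gathers everything into one list so the downstream step runs exactly once
    workflow {
        big  = Channel.of('big_1.pickle', 'big_2.pickle')
        rest = Channel.of('rest_1.pickle')
        Channel.empty().concat(big, rest).collect().view()
        // -> [big_1.pickle, big_2.pickle, rest_1.pickle]
    }
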
diff --git a/FastOMA_light.nf b/FastOMA_light.nf deleted file mode 100644 index aac82a6..0000000 --- a/FastOMA_light.nf +++ /dev/null @@ -1,324 +0,0 @@ -import groovy.json.JsonBuilder; - -// NXF_WRAPPER_STAGE_FILE_THRESHOLD='50000' - -params.input_folder = "testdata/in_folder" -params.output_folder = "out_folder/" -params.proteome_folder = params.input_folder + "/proteome" -params.hogmap_in = params.input_folder + "/hogmap_in" -params.splice_folder = params.input_folder + "/splice" -params.species_tree = params.input_folder + "/species_tree.nwk" - - - -// output subfolder definition -params.genetrees_folder = params.output_folder + "/genetrees" -params.hogmap_folder = params.output_folder + "/hogmap" - -params.temp_output = params.output_folder +"/temp_output" //"/temp_omamer_rhogs" - - - -if (params.help) { - log.info """ - =========================================== - FastOMA -- PIPELINE - =========================================== - Usage: - Run the pipeline with default parameters: - nexflow run FastOMA.nf - - Run with user parameters: - nextflow run FastOMA.nf --input_folder {input.dir} --output_folder {results.dir} - - Mandatory arguments: - --input_folder Input data folder. Defaults to ${params.input_folder}. This folder - must contain the proteomes (in a subfolder named 'proteome') and - a species tree file. Optionally the folder might contain - - a sub-folder 'splice' containing splicing variant mappings - - a sub-folder 'hogmap_in' containing precomputed OMAmer - placement results for all proteomes - - All sub-folders and sub-files can also be placed in orther - locations if you provide alternative values for them (see below on - optional arguments section). - - --output_folder Path where all the output should be stored. Defaults to - ${params.output_folder} - - - Profile selection: - -profile FastOMA can be run using several execution profiles. The default - set of available profiles is - - docker Run pipeline using docker containers. Docker needs - to be installed on your system. Containers will be - fetched automatically from dockerhub. See also - additional options '--container_version' and - '--container_name'. - - - singlularity Run pipeline using singularity. Singularity needs - to be installed on your system. On HPC clusters, - it often needs to be loaded as a seperate module. - Containers will be fetched automatically from - dockerhub. See also additional options - '--container_version' and '--container_name'. - - - conda Run pipeline in a conda environment. Conda needs - to be installed on your system. The environment - will be created automatically. - - - standard Run pipeline on your local system. Mainly intended - for development purpose. All dependencies must be - installed in the calling environment. - - - slurm_singularity - Run pipeline using SLURM job scheduler and - singularity containers. This profile can also be a - template for other HPC clusters that use different - schedulers. - - - slurm_conda Run pipeline using SLURM job scheduler and conda - environment. - - Profiles are defined in nextflow.config and can be extended or - adjusted according to your needs. - - - Additional options: - --proteome_folder Overwrite location of proteomes (default ${params.proteome_folder}) - --species_tree Overwrite location of species tree file (newick format). - Defaults to ${params.species_tree} - --splice_folder Overwrite location of splice file folder. The splice files must be - named .splice. 
-                        Defaults to ${params.splice_folder}
-    --omamer_db         Path or URL to download the OMAmer database from.
-                        Defaults to ${params.omamer_db}
-    --hogmap_in         Optional path where precomputed omamer mapping files are located.
-                        Defaults to ${params.hogmap_in}
-
-    Flags:
-    --help              Display this message
-    --debug_enabled     Store addtional information that might be helpful to debug in case
-                        of a problem with FastOMA.
-    --report            Produce nextflow report and timeline and store them in
-                        ${params.statsdir}
-
-    """.stripIndent()
-
-    exit 1
-}
-
-
-log.info """
-===========================================
-  FastOMA -- PIPELINE
-===========================================
-
-    Project : ${workflow.projectDir}
-    Git info: ${workflow.repository} - ${workflow.revision} [${workflow.commitId}]
-    Cmd line: ${workflow.commandLine}
-    Manifest's pipeline version: ${workflow.manifest.version}
-
-Parameters:
-    input_folder          ${params.input_folder}
-    proteome folder       ${params.proteome_folder}
-    species_tree          ${params.species_tree}
-    splice_folder         ${params.splice_folder}
-    omamer_db             ${params.omamer_db}
-    hogmap_in             ${params.hogmap_in}
-
-    debug_enabled         ${params.debug_enabled}
-    report                ${params.report}
-""".stripIndent()
-
-
-process check_input{
-    label "process_single"
-    publishDir params.output_folder, mode: 'copy'
-
-    input:
-        path proteome_folder
-        path hogmap_folder
-        path species_tree
-        path omamerdb
-        path splice_folder
-    output:
-        path "species_tree_checked.nwk"
-        val "check_completed"
-    script:
-    """
-    fastoma-check-input --proteomes ${proteome_folder} \
-                        --species-tree ${species_tree} \
-                        --out-tree species_tree_checked.nwk \
-                        --splice ${splice_folder} \
-                        --hogmap ${hogmap_folder} \
-                        --omamer_db ${omamerdb} \
-                        -vv
-    """
-}
-
-
-process omamer_run{
-    label "process_single"
-    tag "$proteome"
-    publishDir params.hogmap_folder, mode: 'copy'
-
-    input:
-        tuple path(proteome), path(omamer_db), path(precomputed_hogmap_folder), val(ready)
-
-    output:
-        path "*.hogmap"
-
-    script:
-    """
-    if [ -f ${precomputed_hogmap_folder}/${proteome}.hogmap ] ; then
-        cp ${precomputed_hogmap_folder}/${proteome}.hogmap ${proteome}.hogmap
-    else
-        omamer search --db ${omamer_db} --query ${proteome} --out ${proteome}.hogmap
-    fi
-    """
-}
-
-
-process infer_roothogs{
-    label "process_single"
-    publishDir = [
-        path: params.temp_output,
-        enabled: params.debug_enabled,
-    ]
-
-    input:
-        path hogmaps, stageAs: "hogmaps/*"
-        path proteome_folder
-        path splice_folder
-    output:
-        path "omamer_rhogs/*"
-        path "gene_id_dic_xml.pickle"
-        path "selected_isoforms" , optional: true
-    script:
-    """
-    fastoma-infer-roothogs  --proteomes ${proteome_folder} \
-                            --hogmap hogmaps \
-                            --splice ${splice_folder} \
-                            --out-rhog-folder "omamer_rhogs/" \
-                            -vv
-    """
-}
-
-
-process batch_roothogs{
-    input:
-        path rhogs, stageAs: "omamer_rhogs/*"
-    output:
-        path "rhogs_rest/*", optional: true
-        path "rhogs_big/*" , optional: true
-    script:
-    """
-    fastoma-batch-roothogs  --input-roothogs omamer_rhogs/ \
-                            --out-big rhogs_big \
-                            --out-rest rhogs_rest \
-                            -vv
-    """
-}
-
-process hog_big{
-    label "process_high"
-
-    input:
-        each rhogsbig
-        path species_tree
-    output:
-        path "pickle_hogs"
-        path "msa/*.fa"         , optional: true   // msa         if write True
-        path "gene_trees/*.nwk" , optional: true   // gene trees  if write True
-    script:
-    """
-    fastoma-infer-subhogs  --input-rhog-folder ${rhogsbig} \
-                           --species-tree ${species_tree} \
-                           --fragment-detection \
-                           --low-so-detection \
-                           --parallel
-    """
-}
-
-process hog_rest{
-    label "process_single"
-
-    input:
-        each rhogsrest
-        path species_tree
-    output:
-        path "pickle_hogs"
-        path "msa/*.fa"         , optional: true   // msa         if write True
-        path "gene_trees/*.nwk" , optional: true   // gene trees  if write True
-    script:
-    """
-    fastoma-infer-subhogs  --input-rhog-folder ${rhogsrest} \
-                           --species-tree ${species_tree} \
-                           --fragment-detection \
-                           --low-so-detection
-    #--out pickle_hogs
-    """
-}
-
-
-process collect_subhogs{
-    publishDir params.output_folder, mode: 'copy'
-
-    input:
-        path pickles, stageAs: "pickle_folders/?"
-        path "gene_id_dic_xml.pickle"
-        path rhogs, stageAs: "omamer_rhogs/*"
-    output:
-        path "output_hog.orthoxml"
-        path "OrthologousGroupsFasta"
-        path "OrthologousGroups.tsv"
-        path "rootHOGs.tsv"
-    script:
-    """
-    fastoma-collect-subhogs --pickle-folder pickle_folders/ \
-                            --roothogs-folder omamer_rhogs/ \
-                            --gene-id-pickle-file gene_id_dic_xml.pickle \
-                            --out output_hog.orthoxml \
-                            --marker-groups-fasta OrthologousGroups.tsv \
-                            --roothog-tsv rootHOGs.tsv \
-                            -vv
-    """
-}
-
-workflow {
-    proteome_folder = Channel.fromPath(params.proteome_folder, type: "dir", checkIfExists:true).first()
-    proteomes = Channel.fromPath(params.proteome_folder + "/*", type:'any', checkIfExists:true)
-    species_tree = Channel.fromPath(params.species_tree, type: "file", checkIfExists:true).first()
-
-    splice_folder = Channel.fromPath(params.splice_folder, type: "dir")
-    genetrees_folder = Channel.fromPath(params.genetrees_folder, type: 'dir')
-    hogmap_in = Channel.fromPath(params.hogmap_in, type:'dir')
-
-    omamerdb = Channel.fromPath(params.omamer_db)
-    (species_tree_checked_, ready_input_check) = check_input(proteome_folder, hogmap_in, species_tree, omamerdb, splice_folder)
-    omamer_input_channel = proteomes.combine(omamerdb).combine(hogmap_in).combine(ready_input_check)
-    hogmap = omamer_run(omamer_input_channel)
-
-    (omamer_rhogs, gene_id_dic_xml, ready_infer_roothogs) = infer_roothogs(hogmap.collect(), proteome_folder, splice_folder)
-
-    (rhogs_rest_batches, rhogs_big_batches) = batch_roothogs(omamer_rhogs)
-
-    (pickle_big_rhog, msa_out_big, genetrees_out_rest) = hog_big(rhogs_big_batches.flatten(), species_tree)
-    (pickle_rest_rhog, msas_out_rest, genetrees_out_test) = hog_rest(rhogs_rest_batches.flatten(), species_tree)
-    channel.empty().concat(pickle_big_rhog, pickle_rest_rhog).set{ all_rhog_pickle }
-
-    (orthoxml_file, OrthologousGroupsFasta, OrthologousGroups_tsv, rootHOGs_tsv) = collect_subhogs(all_rhog_pickle.collect(), gene_id_dic_xml, omamer_rhogs)
-
-}
-
-workflow.onComplete {
-    def String report = ( params.report ? "\nNextflow report : ${params.statsdir}" : "");
-    println ""
-    println "Completed at : $workflow.complete"
-    println "Duration     : $workflow.duration"
-    println "Processes    : $workflow.workflowStats.succeedCount (success), $workflow.workflowStats.failedCount (failed)"
-    println "Output in    : $params.output_folder" + report
-    println ( workflow.success ? "Done!" : "Oops .. something went wrong" )
-}
-
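A closing note on the `each` input qualifier relied on by hog_big and hog_rest above: it
repeats the task once per element of the incoming collection while the other inputs stay
fixed, which is what fans the root-HOG batches out into parallel jobs. A self-contained
sketch (illustrative only; the process is hypothetical):

    // each.nf -- one task per element; 'shout' runs twice here
    process shout {
        input:
            each word
        output:
            stdout
        script:
            "echo ${word.toUpperCase()}"
    }

    workflow {
        shout(['big', 'rest']) | view   // two tasks: BIG and REST
    }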