diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..2021e50
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,8 @@
+work
+.nextflow*
+.idea
+.git
+output
+testdata
+dist
+archive/
diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml
new file mode 100644
index 0000000..73e5fb6
--- /dev/null
+++ b/.github/workflows/docker-image.yml
@@ -0,0 +1,73 @@
+name: Docker Image CI
+
+on:
+  push:
+  pull_request:
+  release:
+    types: [published]
+
+env:
+  TEST_TAG: dessimozlab/fastoma:test
+
+jobs:
+
+  build:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v4
+      with:
+        submodules: recursive
+
+    - name: Docker meta
+      id: meta
+      uses: docker/metadata-action@v4
+      with:
+        # list of Docker images to use as base name for tags
+        images: |
+          dessimozlab/fastoma
+        # generate Docker tags based on the following events/attributes
+        tags: |
+          type=schedule
+          type=ref,event=branch
+          type=ref,event=pr
+          type=semver,pattern={{version}}
+          type=semver,pattern={{major}}.{{minor}}
+          type=semver,pattern={{major}}
+          type=sha
+
+    - name: Set up QEMU
+      uses: docker/setup-qemu-action@v2
+
+    - name: Set up Docker Buildx
+      uses: docker/setup-buildx-action@v2
+
+    - name: Build and export to docker for testing
+      uses: docker/build-push-action@v3
+      with:
+        context: .
+        load: true
+        tags: ${{ env.TEST_TAG }}
+
+    #- name: Test
+    #  run: |
+    #    docker run --rm -i -v $PWD/tests:/input -v $PWD/tests/:/reads -v $PWD/output:/out -v $PWD/run:/run ${{ env.TEST_TAG }} --tree --standalone_path /input/marker_genes --dna_reference /input/cds-marker_genes.fasta.gz --reads /reads/sample_1.fastq --output_path /out
+    #    if [ ! -f output/tree_sample_1.nwk ] ; then exit 1; fi
+
+    - name: Login to DockerHub
+      uses: docker/login-action@v2
+      with:
+        username: ${{ secrets.DOCKER_HUB_USERNAME }}
+        password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }}
+
+    - name: Build and push
+      uses: docker/build-push-action@v3
+      with:
+        context: .
+        platforms: linux/amd64,linux/arm64
+        push: true
+        #${{ github.event_name != 'push' && github.event_name != 'pull_request' }}
+        tags: ${{ steps.meta.outputs.tags }}
+        labels: ${{ steps.meta.outputs.labels }}
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a0c6a24
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,8 @@
+.nextflow*
+work/
+.idea/
+dist/
+archive
+.git
+.gitignore
+__pycache__
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..ee6c1f3
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,43 @@
+FROM python:3.11-slim as basis
+
+# set environment variables
+ENV PYTHONDONTWRITEBYTECODE 1
+ENV PYTHONUNBUFFERED 1
+
+
+FROM basis as builder
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+       build-essential \
+       fasttree \
+       libxml2 \
+       mafft \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /src
+RUN pip install --upgrade hatch pip
+COPY pyproject.toml .
+RUN python -m venv /app \
+    && hatch dep show requirements > requirements.txt \
+    && /app/bin/pip install -r requirements.txt
+
+COPY . .
+RUN ls -la \
+    && hatch build \
+    && ls -la dist/ \
+    && /app/bin/pip install dist/*.whl
+
+
+FROM basis as runtime
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+       fasttree \
+       libxml2 \
+       mafft \
+       procps \
+    && apt-get -y autoremove \
+    && apt-get -y autoclean \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY --from=builder /app /app
+ENV PATH="/app/bin:$PATH"
diff --git a/FastOMA.nf b/FastOMA.nf
index bb9c546..aac82a6 100644
--- a/FastOMA.nf
+++ b/FastOMA.nf
@@ -1,216 +1,324 @@
+import groovy.json.JsonBuilder;
 // NXF_WRAPPER_STAGE_FILE_THRESHOLD='50000'

-params.input_folder = "./in_folder/"
-params.output_folder = "./out_folder/"
+params.input_folder = "testdata/in_folder"
+params.output_folder = "out_folder/"
 params.proteome_folder = params.input_folder + "/proteome"
-params.proteomes = params.proteome_folder + "/*"
 params.hogmap_in = params.input_folder + "/hogmap_in"
-
-params.hogmap_folder = params.output_folder + "/hogmap"
 params.splice_folder = params.input_folder + "/splice"
 params.species_tree = params.input_folder + "/species_tree.nwk"
-params.species_tree_checked = params.output_folder + "/species_tree_checked.nwk"
-params.temp_pickles = params.output_folder + "/temp_pickles"
+
+
+// output subfolder definition
 params.genetrees_folder = params.output_folder + "/genetrees"
+params.hogmap_folder = params.output_folder + "/hogmap"
 params.temp_output = params.output_folder +"/temp_output"  //"/temp_omamer_rhogs"
+
+if (params.help) {
+    log.info """
+    ===========================================
+      FastOMA -- PIPELINE
+    ===========================================
+    Usage:
+        Run the pipeline with default parameters:
+        nextflow run FastOMA.nf
+
+        Run with user parameters:
+        nextflow run FastOMA.nf --input_folder {input.dir} --output_folder {results.dir}
+
+    Mandatory arguments:
+        --input_folder          Input data folder. Defaults to ${params.input_folder}. This folder
+                                must contain the proteomes (in a subfolder named 'proteome') and
+                                a species tree file. Optionally the folder might contain
+                                 - a sub-folder 'splice' containing splicing variant mappings
+                                 - a sub-folder 'hogmap_in' containing precomputed OMAmer
+                                   placement results for all proteomes
+
+                                All sub-folders and sub-files can also be placed in other
+                                locations if you provide alternative values for them (see the
+                                optional arguments section below).
+
+        --output_folder         Path where all the output should be stored. Defaults to
+                                ${params.output_folder}
+
+
+    Profile selection:
+        -profile                FastOMA can be run using several execution profiles. The default
+                                set of available profiles is
+                                 - docker       Run pipeline using docker containers. Docker needs
+                                                to be installed on your system. Containers will be
+                                                fetched automatically from dockerhub. See also
+                                                additional options '--container_version' and
+                                                '--container_name'.
+
+                                 - singularity  Run pipeline using singularity. Singularity needs
+                                                to be installed on your system. On HPC clusters,
+                                                it often needs to be loaded as a separate module.
+                                                Containers will be fetched automatically from
+                                                dockerhub. See also additional options
+                                                '--container_version' and '--container_name'.
+
+                                 - conda        Run pipeline in a conda environment. Conda needs
+                                                to be installed on your system. The environment
+                                                will be created automatically.
+
+                                 - standard     Run pipeline on your local system. Mainly intended
+                                                for development purposes. All dependencies must be
+                                                installed in the calling environment.
+
+                                 - slurm_singularity
+                                                Run pipeline using the SLURM job scheduler and
+                                                singularity containers. This profile can also be a
+                                                template for other HPC clusters that use different
+                                                schedulers.
+
+                                 - slurm_conda  Run pipeline using the SLURM job scheduler and a
+                                                conda environment.
+
+                                Profiles are defined in nextflow.config and can be extended or
+                                adjusted according to your needs.
+
+
+    Additional options:
+        --proteome_folder       Overwrite location of proteomes (default ${params.proteome_folder})
+        --species_tree          Overwrite location of species tree file (newick format).
+                                Defaults to ${params.species_tree}
+        --splice_folder         Overwrite location of splice file folder. The splice files must be
+                                named <species_name>.splice.
+                                Defaults to ${params.splice_folder}
+        --omamer_db             Path or URL to download the OMAmer database from.
+                                Defaults to ${params.omamer_db}
+        --hogmap_in             Optional path where precomputed omamer mapping files are located.
+                                Defaults to ${params.hogmap_in}
+
+    Flags:
+        --help                  Display this message
+        --debug_enabled         Store additional information that might be helpful to debug in case
+                                of a problem with FastOMA.
+        --report                Produce nextflow report and timeline and store it in
+                                ${params.statsdir}
+
+    """.stripIndent()
+
+    exit 1
+}
+
+
+log.info """
+===========================================
+  FastOMA -- PIPELINE
+===========================================
+
+ Project : ${workflow.projectDir}
+ Git info: ${workflow.repository} - ${workflow.revision} [${workflow.commitId}]
+ Cmd line: ${workflow.commandLine}
+ Manifest's pipeline version: ${workflow.manifest.version}
+
+Parameters:
+   input_folder     ${params.input_folder}
+   proteome folder  ${params.proteome_folder}
+   species_tree     ${params.species_tree}
+   splice_folder    ${params.splice_folder}
+   omamer_db        ${params.omamer_db}
+   hogmap_in        ${params.hogmap_in}
+
+   debug_enabled    ${params.debug_enabled}
+   report           ${params.report}
+""".stripIndent()
+
+
 process check_input{
-    time {5.h}
-    memory {200.GB}
-    publishDir params.output_folder, mode: 'copy'
-    input:
-        path proteome_folder
-        path hogmap_folder
-        path species_tree
-        path omamerdb
-        path splice_folder
-    output:
-        path "species_tree_checked.nwk"
-        val true
-    script:
-        """
-        fastoma-check-input
-        """
+    label "process_single"
+    publishDir params.output_folder, mode: 'copy'
+
+    input:
+        path proteome_folder
+        path hogmap_folder
+        path species_tree
+        path omamerdb
+        path splice_folder
+    output:
+        path "species_tree_checked.nwk"
+        val "check_completed"
+    script:
+        """
+        fastoma-check-input --proteomes ${proteome_folder} \
+                            --species-tree ${species_tree} \
+                            --out-tree species_tree_checked.nwk \
+                            --splice ${splice_folder} \
+                            --hogmap ${hogmap_folder} \
+                            --omamer_db ${omamerdb} \
+                            -vv
+        """
 }

 process omamer_run{
-    time {5.h}
-    memory {95.GB}
-    publishDir params.hogmap_folder, mode: 'copy'
-    input:
-        path proteomes_omamerdb_inputhog
-        val ready_input_check_c
-    output:
-        path "*.hogmap"
-        val true
-    script: //todo this if condition can be done as part of nextflow, so it won't submit job for cp
-        """
-        if [ -f ${proteomes_omamerdb_inputhog[2]}/${proteomes_omamerdb_inputhog[0]}.hogmap ]
-        then
-            cp ${proteomes_omamerdb_inputhog[2]}/${proteomes_omamerdb_inputhog[0]}.hogmap ${proteomes_omamerdb_inputhog[0]}.hogmap
-        else
-            omamer search -n 10 --db ${proteomes_omamerdb_inputhog[1]} --query ${proteomes_omamerdb_inputhog[0]} --out ${proteomes_omamerdb_inputhog[0]}.hogmap
-        fi
-        """  // --nthreads 10
+    label "process_single"
+    tag "$proteome"
+    publishDir params.hogmap_folder, mode: 'copy'
+
+    input:
+        tuple path(proteome), path(omamer_db), path(precomputed_hogmap_folder), val(ready)
+
+    output:
+        path "*.hogmap"
+
+    script:
""" + if [ -f ${precomputed_hogmap_folder}/${proteome}.hogmap ] ; then + cp ${precomputed_hogmap_folder}/${proteome}.hogmap ${proteome}.hogmap + else + omamer search --db ${omamer_db} --query ${proteome} --out ${proteome}.hogmap + fi + """ } - process infer_roothogs{ - cpus 10 // useful for linclust - time {15.h} // including linclust - memory {350.GB} - publishDir = [ - path: params.temp_output, - mode: 'copy', // pattern: "temp_output", saveAs: { filename -> filename.equals('temp_omamer_rhogs') ? null : filename } - ] - input: - val ready_omamer_run - path hogmap_folder - path proteome_folder - path splice_folder - output: - path "temp_omamer_rhogs" - path "gene_id_dic_xml.pickle" - path "selected_isoforms" , optional: true // SPECIESNAME_selected_isoforms.tsv - val true // nextflow-io.github.io/patterns/state-dependency/ - script: - """ - fastoma-infer-roothogs --logger-level DEBUG - """ + label "process_single" + publishDir = [ + path: params.temp_output, + enabled: params.debug_enabled, + ] + + input: + path hogmaps, stageAs: "hogmaps/*" + path proteome_folder + path splice_folder + output: + path "omamer_rhogs/*" + path "gene_id_dic_xml.pickle" + path "selected_isoforms" , optional: true + script: + """ + fastoma-infer-roothogs --proteomes ${proteome_folder} \ + --hogmap hogmaps \ + --splice ${splice_folder} \ + --out-rhog-folder "omamer_rhogs/" \ + -vv + """ } process batch_roothogs{ - time {15.h} // including linclust - memory {95.GB} - input: - val ready_infer_roothogs - path "temp_omamer_rhogs" - output: - path "rhogs_rest/*", optional: true - path "rhogs_big/*" , optional: true - val true - script: - """ - fastoma-batch-roothogs - """ + input: + path rhogs, stageAs: "omamer_rhogs/*" + output: + path "rhogs_rest/*", optional: true + path "rhogs_big/*" , optional: true + script: + """ + fastoma-batch-roothogs --input-roothogs omamer_rhogs/ \ + --out-big rhogs_big \ + --out-rest rhogs_rest \ + -vv + """ } process hog_big{ - cpus 6 - time {10.h} // for very big rhog it might need more, or you could re-run and add `-resume` - memory {200.GB} - publishDir params.temp_pickles - input: - val rhogsbig_tree_ready - output: - path "*.pickle" - path "*.fa", optional: true // msa if write True - path "*.nwk", optional: true // gene trees if write True - val true - script: - """ - fastoma-infer-subhogs --input-rhog-folder ${rhogsbig_tree_ready[0]} --species-tree ${rhogsbig_tree_ready[1]} --parallel - """ + label "process_high" + + input: + each rhogsbig + path species_tree + output: + path "pickle_hogs" + path "msa/*.fa" , optional: true // msa if write True + path "gene_trees/*.nwk" , optional: true // gene trees if write True + script: + """ + fastoma-infer-subhogs --input-rhog-folder ${rhogsbig} \ + --species-tree ${species_tree} \ + --fragment-detection \ + --low-so-detection \ + --parallel + """ } -temp_pickles =2 - - process hog_rest{ - time {3.h} // for very big rhog it might need more, or you could re-run and add `-resume` - memory {95.GB} - publishDir params.temp_pickles - input: - val rhogsrest_tree_ready - output: - path "*.pickle" - path "*.fa" , optional: true // msa if write True - path "*.nwk" , optional: true // gene trees if write True - val true - script: - """ - fastoma-infer-subhogs --input-rhog-folder ${rhogsrest_tree_ready[0]} --species-tree ${rhogsrest_tree_ready[1]} - """ // --parrallel False + label "process_single" + + input: + each rhogsrest + path species_tree + output: + path "pickle_hogs" + path "msa/*.fa" , optional: true // msa if write True + path 
"gene_trees/*.nwk" , optional: true // gene trees if write True + script: + """ + fastoma-infer-subhogs --input-rhog-folder ${rhogsrest} \ + --species-tree ${species_tree} \ + --fragment-detection \ + --low-so-detection + #--out pickle_hogs + """ } process collect_subhogs{ - time {10.h} - memory {300.GB} // orthoxml string can be very huge in memory - publishDir params.output_folder, mode: 'copy' - input: - val ready_hog_rest - val ready_hog_big - path "temp_pickles" // this is the folder includes pickles_rhogs - path "gene_id_dic_xml.pickle" - path "temp_omamer_rhogs" - output: - path "output_hog.orthoxml" - path "OrthologousGroupsFasta" - path "OrthologousGroups.tsv" - path "rootHOGs.tsv" - script: - """ - fastoma-collect-subhogs - """ + publishDir params.output_folder, mode: 'copy' + input: + path pickles, stageAs: "pickle_folders/?" + path "gene_id_dic_xml.pickle" + path rhogs, stageAs: "omamer_rhogs/*" + output: + path "output_hog.orthoxml" + path "OrthologousGroupsFasta" + path "OrthologousGroups.tsv" + path "rootHOGs.tsv" + script: + """ + fastoma-collect-subhogs --pickle-folder pickle_folders/ \ + --roothogs-folder omamer_rhogs/ \ + --gene-id-pickle-file gene_id_dic_xml.pickle \ + --out output_hog.orthoxml \ + --marker-groups-fasta OrthologousGroups.tsv \ + --roothog-tsv rootHOGs.tsv \ + -vv + """ } +workflow { + proteome_folder = Channel.fromPath(params.proteome_folder, type: "dir", checkIfExists:true).first() + proteomes = Channel.fromPath(params.proteome_folder + "/*", type:'any', checkIfExists:true) + species_tree = Channel.fromPath(params.species_tree, type: "file", checkIfExists:true).first() + splice_folder = Channel.fromPath(params.splice_folder, type: "dir") + genetrees_folder = Channel.fromPath(params.genetrees_folder, type: 'dir') + hogmap_in = Channel.fromPath(params.hogmap_in, type:'dir') + omamerdb = Channel.fromPath(params.omamer_db) + (species_tree_checked_, ready_input_check) = check_input(proteome_folder, hogmap_in, species_tree, omamerdb, splice_folder) + omamer_input_channel = proteomes.combine(omamerdb).combine(hogmap_in).combine(ready_input_check) + hogmap = omamer_run(omamer_input_channel) -workflow { - proteomes = Channel.fromPath(params.proteomes, type:'any' ,checkIfExists:true) - proteome_folder = Channel.fromPath(params.proteome_folder) - hogmap_folder = Channel.fromPath(params.hogmap_folder) - splice_folder = Channel.fromPath(params.splice_folder) - temp_output = Channel.fromPath(params.temp_output) - genetrees_folder = Channel.fromPath(params.genetrees_folder) - hogmap_in = Channel.fromPath(params.hogmap_in) - - temp_pickles = Channel.fromPath(params.temp_pickles) - omamerdb = Channel.fromPath(params.input_folder+"/omamerdb.h5") - species_tree = Channel.fromPath(params.species_tree) - species_tree_checked = Channel.fromPath(params.species_tree_checked) - - (species_tree_checked_, ready_input_check) = check_input(proteome_folder,hogmap_in,species_tree,omamerdb,splice_folder) - ready_input_check_c = ready_input_check.collect() - //species_tree_checked.view{"species_tree_checked ${it}"} - - proteomes_omamerdb = proteomes.combine(omamerdb) - proteomes_omamerdb_inputhog = proteomes_omamerdb.combine(hogmap_in) // proteomes_omamerdb_inputhog.view{" rhogsbig ${it}"} - //proteomes_omamerdb_inputhog_inputcheck = proteomes_omamerdb_inputhog.combine(ready_input_check_c) - (hogmap, ready_omamer_run)= omamer_run(proteomes_omamerdb_inputhog,ready_input_check_c) - ready_omamer_run_c = ready_omamer_run.collect() - (temp_omamer_rhogs, gene_id_dic_xml,selected_isoforms, 
-    ready_infer_roothogs_c = ready_infer_roothogs.collect()
-
-    (rhogs_rest_list, rhogs_big_list, ready_batch_roothogs) = batch_roothogs(ready_infer_roothogs_c, temp_omamer_rhogs)
-    ready_batch_roothogs_c = ready_batch_roothogs.collect()
-
-
-    rhogsbig = rhogs_big_list.flatten()
-    rhogsbig_tree = rhogsbig.combine(species_tree_checked)
-    rhogsbig_tree_ready = rhogsbig_tree.combine(ready_batch_roothogs)  // rhogsbig_tree_ready.view{"rhogsbig_tree_ready ${it}"}
-    (pickle_big_rhog, msas_out, genetrees_out, ready_hog_big) = hog_big(rhogsbig_tree_ready)
-
-    rhogsrest = rhogs_rest_list.flatten()
-    rhogsrest_tree = rhogsrest.combine(species_tree_checked)
-    rhogsrest_tree_ready = rhogsrest_tree.combine(ready_batch_roothogs_c)
-    (pickle_rest_rhog, msas_out_rest, genetrees_out_test, ready_hog_rest) = hog_rest(rhogsrest_tree_ready)
-
-    (orthoxml_file, OrthologousGroupsFasta, OrthologousGroups_tsv, rootHOGs_tsv) = collect_subhogs(ready_hog_rest.collect(), ready_hog_big.collect(), temp_pickles, gene_id_dic_xml, temp_omamer_rhogs)
-    // temp_omamer_rhogs.view{" output omamer_rhogs ${it}"}
-    // orthoxml_file.view{" output orthoxml file ${it}"}
+    (omamer_rhogs, gene_id_dic_xml, selected_isoforms) = infer_roothogs(hogmap.collect(), proteome_folder, splice_folder)
+
+    (rhogs_rest_batches, rhogs_big_batches) = batch_roothogs(omamer_rhogs)
+
+    (pickle_big_rhog, msa_out_big, genetrees_out_big) = hog_big(rhogs_big_batches.flatten(), species_tree)
+    (pickle_rest_rhog, msas_out_rest, genetrees_out_rest) = hog_rest(rhogs_rest_batches.flatten(), species_tree)
+    channel.empty().concat(pickle_big_rhog, pickle_rest_rhog).set{ all_rhog_pickle }
+
+    (orthoxml_file, OrthologousGroupsFasta, OrthologousGroups_tsv, rootHOGs_tsv) = collect_subhogs(all_rhog_pickle.collect(), gene_id_dic_xml, omamer_rhogs)
 }

+workflow.onComplete {
+    def String report = ( params.report ? "\nNextflow report : ${params.statsdir}" : "");
+    println ""
+    println "Completed at  : $workflow.complete"
+    println "Duration      : $workflow.duration"
+    println "Processes     : $workflow.workflowStats.succeedCount (success), $workflow.workflowStats.failedCount (failed)"
+    println "Output in     : $params.output_folder" + report
+    println ( workflow.success ? "Done!" : "Oops .. something went wrong" )
+}

-// todo: check input files very beginning (before omamer starts) e.g all species are in the species tree. No species chars in fasta record.
diff --git a/FastOMA/_config.py b/FastOMA/_config.py
index 3453348..62db7bd 100644
--- a/FastOMA/_config.py
+++ b/FastOMA/_config.py
@@ -77,6 +77,7 @@

 inferhog_min_hog_size_xml = 2  # by setting this as 1, pyham won't work on xml output.

+# batch_roothogs
 big_rhog_filesize_thresh = 400 * 1000
 sum_list_rhogs_filesize_thresh = 2 * 1e6
diff --git a/FastOMA/_utils_roothog.py b/FastOMA/_utils_roothog.py
index e33cd20..5e87639 100644
--- a/FastOMA/_utils_roothog.py
+++ b/FastOMA/_utils_roothog.py
@@ -1,3 +1,5 @@
+import collections
+import csv
 from Bio import SeqIO
 import pickle
@@ -7,6 +9,7 @@
 from ._utils_subhog import logger_hog
 from .zoo.unionfind import UnionFind
 from . import _config
+HOGMapData = collections.namedtuple("HOGMapData", ("hogid", "score", "seqlen", "subfamily_medianseqlen"))

 import collections

@@ -134,7 +137,7 @@ def get_components(self):
         return comp


-def parse_proteomes(folder="./"):  # list_oma_species
+def parse_proteomes(folder=None):  # list_oma_species
     """
     parsing fasta files of proteins located in /proteome/
     using Bio.SeqIO.parse
     All proteoems should be with the same extension.
     output: prot_recs_lists: a dic with key as species name and its value list of Biopython record of species.
     """
+    if folder is None:
+        folder = "./proteome"

-    project_files = listdir(folder+"/proteome/")
-    species_names = []  # query/input species name based on the file name
+    fasta_format_keep = ""
+    project_files = listdir(folder)
+    logger_hog.debug("using '%s' as proteome folder, found %d files", folder, len(project_files))
+    species_names = []  # query/input species name based on the file name
     for file in project_files:
-        fasta_format = file.split(".")[-1]
-        if fasta_format == "fa" or fasta_format == "fasta":
-            file_name_split = file.split(".")[:-1]
-            species_names.append('.'.join(file_name_split))
-            fasta_format_keep = fasta_format  # last one is stored either fa or fasta
+        species_name, ext = file.rsplit('.', 1)
+        logger_hog.debug("%s: %s | %s", file, species_name, ext)
+        if ext in ("fa", "fasta"):
+            species_names.append(species_name)
+            fasta_format_keep = ext  # last one is stored either fa or fasta
+    # todo accept all fasta formats in the input proteome folder, fasta, fa, fna, ..

     prot_recs_lists = {}  # key: species name, value is a dic of query protein Biopython records.
     # 'MYCGE': [SeqRecord(seq=Seq('MDFDK
     for species_name in species_names:
-        prot_address = folder+"/proteome/" + species_name + "."+fasta_format_keep
+        prot_address = os.path.join(folder, species_name + "." + fasta_format_keep)
         prots_record = list(SeqIO.parse(prot_address, "fasta"))
-        prot_recs_lists[species_name]=prots_record
+        prot_recs_lists[species_name] = prots_record

-    logger_hog.info("There are "+str(len(species_names))+" species in the proteome folder.")
+    logger_hog.info("There are %d species in the proteome folder.", len(species_names))
     return species_names, prot_recs_lists, fasta_format_keep

-
 def add_species_name_prot_id(species_names, prot_recs_lists):
     """
     adding the name of species to each protein record
@@ -203,7 +209,7 @@
     return prot_recs_all


-def parse_hogmap_omamer(species_names, fasta_format_keep, folder="./"):
+def parse_hogmap_omamer(species_names, fasta_format_keep, folder=None):
     """
     function for parsing output of omamer (hogmap files) located in /hogmap/
     Each hogmap file correspond to one fasta file of species, with the same name.
@@ -213,41 +219,29 @@
     # sp|O66907|ATPA_AQUAE	HOG:C0886513.1b	Eukaryota	696.5519485850672	187	0.37302594463771466	0.2643067275644273	120	504	551	0.8230616302186878
     output is dic of dic for all species:
     """
+    if folder is None:
+        folder = "./hogmap"
+
     hogmaps = {}
-    unmapped = {}
+    unmapped = collections.defaultdict(list)
     for species_name in species_names:
-        hogmap_address = folder + "/hogmap/" + species_name + "."+fasta_format_keep+".hogmap"
-        hogmap_file = open(hogmap_address, 'r')
-
-        for line in hogmap_file:
-            line_strip = line.strip()
-            if not line_strip.startswith('!') and not line_strip.startswith('qs'):
-                # qseqid hogid hoglevel family_p family_count family_normcount subfamily_score subfamily_count qseqlen subfamily_medianseqlen qseq_overlap
-                line_split = line_strip.split("\t")
-                prot_id = line_split[0]
-                hogid = line_split[1]
-                score = line_split[3]
-                seqlen = line_split[8]
-                subfamily_medianseqlen = line_split[9]
-
-                if hogid == "N/A":
-                    if species_name in unmapped:
-                        unmapped[species_name].append(prot_id)
-                    else:
-                        unmapped[species_name] = [prot_id]
+        hogmap_address = os.path.join(folder, species_name + "." + fasta_format_keep + ".hogmap")
+        cur_species_hogmap = collections.defaultdict(list)
+        with open(hogmap_address, 'rt') as hogmap_file:
+            reader = csv.DictReader((line for line in hogmap_file if not line.startswith('!')),
+                                    dialect="excel-tab")
+            for row in reader:
+                if row['hogid'] == "N/A":
+                    unmapped[species_name].append(row['qseqid'])
                 else:
-                    if species_name in hogmaps:
-                        if prot_id in hogmaps[species_name]:
-                            hogmaps[species_name][prot_id].append((hogid, score, seqlen, subfamily_medianseqlen))
-
-                        else:
-                            hogmaps[species_name][prot_id] = [(hogid, score, seqlen, subfamily_medianseqlen)]
-                    else:
-                        hogmaps[species_name] = {prot_id: [(hogid, score, seqlen, subfamily_medianseqlen)]}
-
-    logger_hog.info("There are " + str(len(hogmaps)) + " species in the hogmap folder.")
-    logger_hog.info("The first species " + species_names[0] + " contains " + str(len(hogmaps[species_names[0]])) + " proteins.")
-
+                    cur_species_hogmap[row['qseqid']].append(
+                        HOGMapData(row['hogid'], row['family_p'], row['qseqlen'], row['subfamily_medianseqlen']))
+        hogmaps[species_name] = cur_species_hogmap
+        logger_hog.info("hogmap %s: %d proteins mapped to %d hogs, %d not mapped",
+                        species_name, len(cur_species_hogmap), sum(len(x) for x in cur_species_hogmap.values()),
+                        len(unmapped[species_name]))
+
+    logger_hog.info("There are %d species in the hogmap folder.", len(hogmaps))
     return hogmaps, unmapped

@@ -793,14 +787,13 @@ def write_clusters(address_rhogs_folder, min_rhog_size):
     return cluster_iter-1

-
-def parse_isoform_file(species_names, folder="."):
+def parse_isoform_file(species_names, folder=None):
+    if folder is None:
+        folder = "./splice"
     isoform_by_gene_all = {}
     for species_name in species_names:
         # from fasta file
-
-        file_splice_name = folder + "/splice/" + species_name + ".splice"
-
+        file_splice_name = os.path.join(folder, species_name + ".splice")
         if os.path.isfile(file_splice_name):
             # from OMArk
             isoform_by_gene = []
@@ -811,12 +804,10 @@
                 isoform_by_gene.append(splice)
         else:
             isoform_by_gene = []
-
         isoform_by_gene_all[species_name] = isoform_by_gene
     return isoform_by_gene_all

-
 def find_nonbest_isoform(species_names, isoform_by_gene_all, hogmaps):
     isoform_selected = {}
     isoform_not_selected = {}
@@ -866,13 +857,10 @@ def write_isoform_selected(isoform_by_gene_all, isoform_selected, prot_recs_list
     write isoforms
     """
-    selected_isoforms_folder= "selected_isoforms/"
+    selected_isoforms_folder = "selected_isoforms/"
     if not os.path.exists(selected_isoforms_folder):
         os.mkdir(selected_isoforms_folder)
-
-
-
     for species, isoform_selected_sp in isoform_selected.items():
         isoform_by_gene = isoform_by_gene_all[species]
         assert len(isoform_by_gene) == len(isoform_selected_sp)
diff --git a/FastOMA/batch_roothogs.py b/FastOMA/batch_roothogs.py
index 1d76a31..4937071 100644
--- a/FastOMA/batch_roothogs.py
+++ b/FastOMA/batch_roothogs.py
@@ -1,91 +1,66 @@
-
-
-import os
-from os import makedirs
 import shutil
-from os import listdir
-
 from . import _config
-
-
-def list_rhog_fastas(address_rhogs_folder):
-    """
-    create orthoxml_to_newick.py list of rootHOG IDs stored in the folder of rHOG .
-    input: folder address
-    output: list of rhog Id (integer)
-    """
-    rhog_files = listdir(address_rhogs_folder)
-    rhogid_list= []
-    for rhog_file in rhog_files:
-        if rhog_file.split(".")[-1] == "fa" and rhog_file.split("_")[0] == "HOG":
-            rhogid = rhog_file.split(".")[0].split("_")[1]
-            rhogid_list.append(rhogid)
-
-    return rhogid_list
-
-
-
-def folder_1h_rhog(address_rhogs_folder, output_folder_big, output_folder_rest):
-
-    # in_folder = "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/gethog3_qfo/working_nfp/"
-
-    #work_fldor_out = in_folder
-
-    rhogid_list = list_rhog_fastas(address_rhogs_folder)
-    len(rhogid_list)
-    dic_rhognum_size = {}
-    for rhogid in rhogid_list[2:]:
-        rhog_i_prot_address = address_rhogs_folder + "/HOG_" + rhogid + ".fa"
-        # rhog_i = list(SeqIO.parse(rhog_i_prot_address, "fasta"))
-        rhog_i_size = os.path.getsize(rhog_i_prot_address)
-        dic_rhognum_size[rhogid] = rhog_i_size
-    len(dic_rhognum_size)
-
-    # to have at least one big and one rest rhog, this makes nextflow pipline easier,
-    # otherwise we need if condition collect_subhogs won't satisfy if one of them is empty
-
-    list_list_rest_rhog = [[rhogid_list[0]],[]]  # each insid list should take around 1h
-    list_list_rest_size = [[],[]]
-    list_list_big = [rhogid_list[1]]
-
-    for rhognum, size in dic_rhognum_size.items():
-        # print(rhognum, size)
-        if size > _config.big_rhog_filesize_thresh:
-            list_list_big.append(rhognum)
-        else:
-            if sum(list_list_rest_size[-1]) < _config.sum_list_rhogs_filesize_thresh:
-                list_list_rest_rhog[-1].append(rhognum)
-                list_list_rest_size[-1].append(size)
+from pathlib import Path
+logger = _config.logger_hog
+
+
+class BatchBuilder:
+    def __init__(self, outdir: Path, max_size: int):
+        self.outdir = outdir
+        self.max_size = max_size
+
+    def __enter__(self):
+        self.cur_batch = []
+        self.cur_size = 0
+        self.counter = 0
+        self.outdir.mkdir(parents=True, exist_ok=True)
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if len(self.cur_batch) > 0:
+            self._flush()
+
+    def add_hog(self, hog_file: Path):
+        self.cur_batch.append(hog_file)
+        self.cur_size += hog_file.stat().st_size
+        logger.debug("adding %s with size %d to batch %d", hog_file, hog_file.stat().st_size, self.counter)
+        if self.cur_size > self.max_size:
+            self._flush()
+            self.counter += 1
+
+    def _flush(self):
+        batch_dir = self.outdir / str(self.counter)
+        batch_dir.mkdir()
+        for fn in self.cur_batch:
+            shutil.copy(fn, batch_dir)
+        logger.debug("creating batch %s with %d families; total size of files is %d",
+                     batch_dir, len(self.cur_batch), self.cur_size)
+        self.cur_size = 0
+        self.cur_batch = []
+
+
+def folder_1h_rhog(roothog_path: Path, output_folder_big: Path, output_folder_rest: Path):
+    # create a list of hogs in descending filesize order
+    hog_size_tuples = sorted([(f, f.stat().st_size) for f in roothog_path.rglob("*.fa")], key=lambda x: -x[1])
+    with BatchBuilder(output_folder_big, 1) as big_hogs, \
+         BatchBuilder(output_folder_rest, _config.sum_list_rhogs_filesize_thresh) as rest_hogs:
+        for hog, fsize in hog_size_tuples:
+            if fsize > _config.big_rhog_filesize_thresh:
+                big_hogs.add_hog(hog)
             else:
-                list_list_rest_rhog.append([rhognum])
-                list_list_rest_size.append([size])
-
-    # makedirs(work_fldor_out+"rhogs_rest")
-    for folder_id, list_rhog in enumerate(list_list_rest_rhog):
-        # print(folder_id)
-        # output_folder_rest = work_fldor_out + "/rhogs_rest/"
-        makedirs( output_folder_rest + str(folder_id))
-        for rhogid in list_rhog:
-            name = "HOG_" + rhogid + ".fa"
-            folder_rest =output_folder_rest + str(folder_id) + "/"
-            shutil.copy(address_rhogs_folder + name, folder_rest + name)
-
-    # makedirs(work_fldor_out+"rhogs_big")
-    # output_folder_big = work_fldor_out + "rhogs_big/"
-    for folder_id, rhogid in enumerate(list_list_big):
-        name = "HOG_" + rhogid + ".fa"
-        folder_big = output_folder_big + "b" + str(folder_id) + "/"
-        makedirs(folder_big)
-        shutil.copy(address_rhogs_folder + name, folder_big + name)
-
-    return 1
-
+                rest_hogs.add_hog(hog)


 def fastoma_batch_roothogs():
-
-    input_rhog = "./temp_omamer_rhogs/"  #
-    output_folder_big = "./rhogs_big/"
-    output_folder_rest = "./rhogs_rest/"
-    folder_1h_rhog(input_rhog, output_folder_big, output_folder_rest)
+    import argparse
+    parser = argparse.ArgumentParser(description="Analyse roothog families and create batches for analysis")
+    parser.add_argument('--input-roothogs', required=True, help="folder where input roothogs are stored")
+    parser.add_argument('--out-big', required=True, help="folder where the big single family hogs should be stored")
+    parser.add_argument('--out-rest', required=True, help="folder where the remaining families should be stored in "
+                                                          "batch subfolder structure.")
+    parser.add_argument('-v', default=0, action="count", help="increase verbosity")
+    conf = parser.parse_args()
+    logger.setLevel(level=30 - 10 * min(conf.v, 2))
+
+    folder_1h_rhog(Path(conf.input_roothogs), Path(conf.out_big), Path(conf.out_rest))
diff --git a/FastOMA/check_input.py b/FastOMA/check_input.py
index de3b506..e46441a 100644
--- a/FastOMA/check_input.py
+++ b/FastOMA/check_input.py
@@ -1,72 +1,64 @@
-
-
 import sys
 from ete3 import Tree
 import os
 import collections
+import argparse
 from . import _config
 from . import _utils_roothog
-from ._utils_subhog import logger_hog
-
-
-
-"""
-cd in_folder
-python check_input.py
-"""
+logger = _config.logger_hog


-def check_proteome_files():
+def check_proteome_files(proteome_folder):
+    proteome_files = os.listdir(proteome_folder)
+    logger.info("There are %d files in the proteome folder.", len(proteome_files))

-    proteome_files = os.listdir("./proteome/")
-    logger_hog.info("There are " + str(len(proteome_files)) + " files in the proteome folder. ")
-
-    not_fa = [i for i in proteome_files if not (i.endswith(".fa") or i.endswith(".fasta") )]
+    not_fa = [i for i in proteome_files if not (i.endswith(".fa") or i.endswith(".fasta"))]
     if not_fa:
-        logger_hog.warning("We expect that only fa/fasta files are in the proteome folder. Better to remove these " + str(not_fa) )
+        logger.warning("We expect that only fa/fasta files are in the proteome folder. Better to remove these %s", not_fa)
Better to remove these %s", not_fa) - fa_fasta = [i.split(".")[-1] for i in proteome_files if i.endswith(".fa") or i.endswith(".fasta")] - if len(fa_fasta)<=2: - logger_hog.error("There are not enough proteomes in the folder ") - logger_hog.error("Check input failed. FastOMA halted!") - sys.exit(1) + fa_fasta = [i.split(".")[-1] for i in proteome_files if i.endswith(".fa") or i.endswith(".fasta")] + if len(fa_fasta) <= 2: + logger.error("There are not enough proteomes in the folder ") + logger.error("Check input failed. FastOMA halted!") + return False - if len(set(fa_fasta))>1: - logger_hog.warning("We expect that all fasta files are with the same format either fa or fasta, we won't include some of them " + str(proteome_files) ) + if len(set(fa_fasta)) > 1: + logger.warning("We expect that all fasta files are with the same format either fa or fasta, we won't include some of them %s.", proteome_files) + return False - species_names1= [".".join(i.split(".")[:-1]) for i in proteome_files if i.endswith(".fa") or i.endswith(".fasta")] - return species_names1 + species_names = [os.path.splitext(i)[0] for i in proteome_files if i.endswith(".fa") or i.endswith(".fasta")] + return species_names -def check_proteome(species_names,prot_recs_lists): +def check_proteome(species_names, prot_recs_lists, proteome_folder): # ids_set=set() + isOk = True num_prots_all = 0 for species_name in species_names: num_prots = len(prot_recs_lists[species_name]) - num_prots_all+=num_prots + num_prots_all += num_prots # todo , check duplicated Ids for seq in prot_recs_lists[species_name]: # if seq.id in ids_set: report duplciated ids_set.add(seq.id), if num_prots <= 2: - logger_hog.error("The input proteome looks empty or too few proteins or Bio.SeqIO couldn't read it, in_folder/proteome/" + species_name + "."+fasta_format_keep) + logger.error("The input proteome looks empty or too few proteins or Bio.SeqIO couldn't read it, %s/%s.fa", proteome_folder, species_name) + isOk = False # todo write new protoems with cleaned record ids, keep the mapping, to be used in orthoxml writing # use the mapping back for orhtoxml - logger_hog.error("There are "+str(num_prots_all)+" proteins in total in the input proteome folder. ") - return 1 - + logger.info("There are %d proteins in total in the input proteome folder.", num_prots_all) + return isOk -def check_hogmap_files(): - - hogmap_files = os.listdir("./hogmap_in/") - logger_hog.info("There are " + str(len(hogmap_files)) + " files in the hogmap folder. ") - not_hogmap = [i for i in hogmap_files if not i.endswith(".hogmap") ] +def check_hogmap_files(hogmap_folder): + hogmap_files = os.listdir(hogmap_folder) + logger.info("There are %d files in the hogmap folder.", len(hogmap_files)) + not_hogmap = [i for i in hogmap_files if not i.endswith(".hogmap")] if not_hogmap: - logger_hog.warning("We expect that only hogmap files are in the hogmap_in folder. Better to remove these " + str(not_hogmap) ) - - species_hogmaps = ["".join(i.split(".")[:-2]) for i in hogmap_files if i.endswith(".hogmap")] - logger_hog.info("There are "+str(len(species_hogmaps))+" hogmaps.") + logger.warning("We expect that only hogmap files are in the %s folder. 
Better to remove these %s", + hogmap_folder, not_hogmap) + species_hogmaps = ["".join(i.split(".")[:-2]) for i in hogmap_files if i.endswith(".hogmap")] + logger.info("There are "+str(len(species_hogmaps))+" hogmaps.") return species_hogmaps @@ -78,63 +70,63 @@ def check_speciestree_internalnode(species_tree): if not node.is_leaf(): internal_node_name = [] if (not node.name) or len(node.name) < 1: - logger_hog.warning("one of the internal node in species tree doesn't have a name. we'll update the species tree.") + logger.warning("one of the internal node in species tree doesn't have a name. we'll update the species tree.") else: internal_node_name.append(node.name) if len(internal_node_name) != len(set(internal_node_name)): - logger_hog.warning("All the internal node names should be unique. One of the internal node is repeated:") - logger_hog.warning([item for item, count in collections.Counter(internal_node_name).items() if count > 1]) - logger_hog.warning("We'll change the internal node names.") - - - + logger.warning("All the internal node names should be unique. One of the internal node is repeated:") + logger.warning([item for item, count in collections.Counter(internal_node_name).items() if count > 1]) + logger.warning("We'll change the internal node names.") return 1 -def check_speciestree_leaves(species_tree,species_names): +def check_speciestree_leaves(species_tree, species_names): leaves_name = [i.name for i in species_tree.get_leaves()] - logger_hog.info("The species tree has " + str(len(leaves_name))+" leaves.") - if len(set(leaves_name))!=len(leaves_name): - logger_hog.error("Leaves name should be unique in the species tree. You may try ete3 prune") - logger_hog.error("Check input failed. FastOMA halted!") - sys.exit(1) + logger.info("The species tree has %d leaves.", len(leaves_name)) + if len(set(leaves_name)) != len(leaves_name): + logger.error("Leaves name should be unique in the species tree. You may try ete3 prune") + logger.error("Check input failed. FastOMA halted!") + return False - species_names_not_intree = [i for i in species_names if i not in leaves_name] + species_names_not_intree = [i for i in species_names if i not in leaves_name] if species_names_not_intree: - logger_hog.error("These species are not in the species tree "+ str(species_names_not_intree)) - logger_hog.error("This is a tree template: ((AQUAE,CHLTR)inter1,MYCGE)inter2; for proteome folder with these three files: AQUAE.fa CHLTR.fa MYCGE.fa. ") - logger_hog.error("Check input failed. FastOMA halted!") - sys.exit(1) - leaves_tree_not_proteome = [i for i in leaves_name if i not in species_names] + logger.error("These species are not in the species tree:\n %s", "\n ".join(species_names_not_intree)) + logger.error("This is a tree template: ((AQUAE,CHLTR)inter1,MYCGE)inter2; for proteome folder with these three files: AQUAE.fa CHLTR.fa MYCGE.fa. ") + logger.error("Check input failed. FastOMA halted!") + return False + + leaves_tree_not_proteome = [i for i in leaves_name if i not in species_names] if leaves_tree_not_proteome: - logger_hog.warning("there are "+str(len(leaves_tree_not_proteome))+" leaves in the species tree that there is no proteome. So we will discard them.") + logger.warning("there are %d leaves in the species tree that there is no proteome. So we will discard them.", len(leaves_tree_not_proteome)) + return True - return 1 -def check_omamer_db(omamerdb_adress="omamerdb.h5"): +def check_omamer_db(omamerdb_adress=None): + if omamerdb_adress is None: + logger.warning("omamer_db not passed. 
+        return True

-    if os.path.exists(omamerdb_adress):
+    if os.path.exists(omamerdb_adress):
         if os.path.getsize(omamerdb_adress) > 10000:  # 3 bytes
             omamerdb = True
             # todo we can do some checks on version omamer v2
         else:
-            logger_hog.warning("The omamer db looks very small. are you sure it is correct?"+omamerdb_adress)
+            logger.warning("The omamer db looks very small. Are you sure it is correct? %s", omamerdb_adress)
             omamerdb = False
     else:
         omamerdb =False
-        logger_hog.info("OMAmer db does not exist.")
+        logger.info("OMAmer db does not exist.")
     return omamerdb


-def add_internal_node_prune(species_tree,species_names):
-
+def add_internal_node_prune(species_tree, species_names, out_fname):
     # add name for the internal, if no name is provided, removintg special chars
     counter_internal = 0
     node_names = set()
     for node in species_tree.traverse(strategy="postorder"):
-        if not node.is_leaf() :
+        if not node.is_leaf():
             node_name = node.name

             if len(node_name) < 3 or node_name in node_names:
@@ -142,27 +134,26 @@
                 node.name = "internal_" + str(counter_internal)
                 counter_internal += 1
                 node_names.add(node.name)
-                logger_hog.debug("The internal node name was too small or repeated "+node_name+" which is changed to "+node.name)
-            elif not all(e.isalnum() or e=="_" or e=="-" for e in node_name):  #any(not c.isalnum() for c in node_name):
-                node_name_new = ''.join(e for e in node_name if e.isalnum() or e=="_" or e=="-")  # removing special chars
+                logger.debug("The internal node name was too small or repeated "+node_name+" which is changed to "+node.name)
+            elif not all(e.isalnum() or e in ("_", "-") for e in node_name):  #any(not c.isalnum() for c in node_name):
+                node_name_new = ''.join(e for e in node_name if e.isalnum() or e in ("_", "-"))  # removing special chars
                 if node_name_new in node_names:
-                    node.name =node_name_new+"_"+str(counter_internal)
+                    node_name_new += "_" + str(counter_internal)
                     counter_internal += 1
-                else:
-                    node.name=node_name_new
-
+                logger.debug("The internal node name has special chars or is repeated '%s' which is changed to '%s'",
+                             node.name, node_name_new)
+                node.name = node_name_new
                 node_names.add(node.name)
-                logger_hog.debug("The internal node name has special chars or repeated " + node_name + " which is changed to " + node.name)
             else:
                 node_names.add(node_name)

-    logger_hog.info("The species tree has " + str(len(species_tree)) + " leaves")
-    species_tree.prune(species_names)  # , preserve_branch_length=True)
-    logger_hog.info("After pruning, the species tree has " + str(len(species_tree)) + " leaves")
+    logger.info("The species tree has %d leaves", len(species_tree))
+    species_tree.prune(species_names)  # , preserve_branch_length=True)
+    logger.info("After pruning, the species tree has %d leaves", len(species_tree))

-    species_tree_output= "./species_tree_checked.nwk"
-    species_tree.write(format=1, format_root_node=True, outfile=species_tree_output)
-    logger_hog.debug("The new species tree is written "+species_tree_output)
+    species_tree.write(format=1, format_root_node=True, outfile=out_fname)
+    logger.debug("The new species tree is written %s", out_fname)
     return species_tree

@@ -180,76 +171,83 @@
             total_isoforms+=len(isoform)
             total_genes+=1

-    logger_hog.debug("For "+str(spliced_species_num)+" species, out of "+str(len(isoform_by_gene_all))+" , we have splice files.")
"+str(spliced_species_num)+" species, out of "+str(len(isoform_by_gene_all))+" , we have splice files.") if total_genes ==0: - logger_hog.debug("It seems that in all of the splice files, each line has only one item. Make sure that the splitter in each line is semicolon ; ") + logger.debug("It seems that in all of the splice files, each line has only one item. Make sure that the splitter in each line is semicolon ; ") sys.exit(1) - logger_hog.debug("In total, for "+ str(total_genes)+" genes, we have " + str(total_isoforms)+" splices.") + logger.debug("In total, for "+ str(total_genes)+" genes, we have " + str(total_isoforms)+" splices.") # make sys back return 1 - - - - def fastoma_check_input(): - _config.set_configs() - - # print(_config.in_folder) - print(_config.logger_level) - - print(_config.species_tree_address) # nwk format - - species_names1 = check_proteome_files() + import argparse + parser = argparse.ArgumentParser(description="checking parameters for FastOMA") + parser.add_argument("--proteomes", required=True, help="Path to the folder containing the input proteomes") + parser.add_argument("--species-tree", required=True, help="Path to the input species tree file in newick format") + parser.add_argument("--out-tree", required=True, help="Path to output file for sanitised species tree. ") + parser.add_argument("--splice", help="Path to the folder containing the splice information files") + parser.add_argument("--hogmap", help="Path to the folder containing the hogmap files") + parser.add_argument("--omamer_db", help="Path to the omamer database") + parser.add_argument('-v', action="count", default=0, help="Increase verbosity to info/debug") + conf = parser.parse_args() + logger.setLevel(level=30 - 10 * min(conf.v, 2)) + logger.debug("Arguments: %s", conf) + + species_names = check_proteome_files(conf.proteomes) + if not species_names: + logger.error("Halting FastOMA because of invalid proteome input data") + sys.exit(1) try: - species_tree = Tree(_config.species_tree_address, format=1) + species_tree = Tree(conf.species_tree, format=1) except: try: - species_tree = Tree(_config.species_tree_address) + species_tree = Tree(conf.species_tree) # todo add check fro Phyloxml except: - logger_hog.error("We have problem with parsing species tree "+str(_config.species_tree_address) + " using ete3 Tree. Is it in the in_folder ? Maybe there are some special chars.") + logger.error("We have problem with parsing species tree %s using ete3 Tree. 
+            sys.exit(1)

     check_speciestree_internalnode(species_tree)
-    check_speciestree_leaves(species_tree,species_names1)
-    add_internal_node_prune(species_tree,species_names1)
+    if not check_speciestree_leaves(species_tree, species_names):
+        logger.error("Halting FastOMA because of invalid species tree")
+        sys.exit(1)
+    add_internal_node_prune(species_tree, species_names, conf.out_tree)

-    species_names, prot_recs_lists, fasta_format_keep = _utils_roothog.parse_proteomes()  # optional input folder
-    check_proteome(species_names, prot_recs_lists)
-    hogmap_files = os.path.exists("./hogmap_in/")
-    species_hogmaps =[]
+    species_names, prot_recs_lists, fasta_format_keep = _utils_roothog.parse_proteomes(conf.proteomes)
+    if not check_proteome(species_names, prot_recs_lists, conf.proteomes):
+        logger.error("Halting FastOMA because of invalid proteome input data")
+        sys.exit(1)
+
+    hogmap_files = conf.hogmap is not None and os.path.exists(conf.hogmap)
+    species_hogmaps = []
     if hogmap_files:
-        species_hogmaps = check_hogmap_files()
+        species_hogmaps = check_hogmap_files(conf.hogmap)

-    #print("species_hogmaps",species_hogmaps)
-    #print("species_names",species_names)
     hogmap_complete = False
     if set(species_hogmaps) == set(species_names):
         hogmap_complete = True
-    # print("hogmap_complete", hogmap_complete)

-    omamerdb = check_omamer_db()
+    omamerdb = check_omamer_db(conf.omamer_db)
     if not(omamerdb or hogmap_complete):
-        logger_hog.error("OMAmer db does not exit and no hogmap provided. ")
-        logger_hog.error("Check input failed. FastOMA halted!")
+        logger.error("OMAmer db does not exist and no hogmap provided.")
+        logger.error("Check input failed. FastOMA halted!")
         sys.exit(1)
     else:
-        logger_hog.info("OMAmer db or hogmap exist. It looks ok. ")
+        logger.info("OMAmer db or hogmap exist. It looks ok.")

-    #todo check splice file format . if the name matches with the proteome files.
-    splice_files = os.path.exists("./splice/")
+    # todo check splice file format, and whether the names match the proteome files.
+    splice_files = conf.splice is not None and os.path.exists(conf.splice)
     if splice_files:
-        logger_hog.debug("splice folder exist. Let's see its inside.")
-        isoform_by_gene_all = _utils_roothog.parse_isoform_file(species_names)
+        logger.debug("splice folder exists. Let's look inside.")
+        isoform_by_gene_all = _utils_roothog.parse_isoform_file(species_names, conf.splice)
         check_splice(isoform_by_gene_all)
     else:
-        logger_hog.info("Splice folder doesn't exist and that's ok.")
-
-
-    logger_hog.info("Input check finished ! ")
+        logger.info("Splice folder doesn't exist and that's ok.")
+    logger.info("Input check finished!")
diff --git a/FastOMA/collect_subhogs.py b/FastOMA/collect_subhogs.py
index b173a22..6fbe840 100644
--- a/FastOMA/collect_subhogs.py
+++ b/FastOMA/collect_subhogs.py
@@ -6,6 +6,7 @@
 from ete3 import Tree
 import os
 from Bio import SeqIO
+from pathlib import Path

 from FastOMA.zoo.hog import extract_flat_groups_at_level
 from FastOMA.zoo.hog.convert import orthoxml_to_newick
@@ -16,7 +17,26 @@
 # This code collect subhogs and writes outputs.
+def load_hogs(pickle_folder: Path):
+    hogs_all = []
+    cnt = 0
+    logger_hog.info("reading pickle files from %s", pickle_folder)
+    for root, dirs, files in os.walk(pickle_folder, followlinks=True):
+        for f in files:
+            if f.startswith('file_') and f.endswith('.pickle'):
+                with open(os.path.join(root, f), 'rb') as handle:
+                    cur = pickle.load(handle)
+                    hogs_all.extend(cur)
+                    cnt += 1
+                    if cnt % 500 == 0:
+                        logger_hog.info("read %d batch pickle files so far, resulting in %d roothogs so far",
+                                        cnt, len(hogs_all))
+    logger_hog.info("number of pickle files is %d.", cnt)
+    logger_hog.debug("number of hogs in all batches is %d", len(hogs_all))
+    return hogs_all
+
+
-def max_og_tree(tree,species_dic):
+def max_og_tree(tree, species_dic):
     for node in tree.traverse("preorder"):
         # for node in xml_tree.traverse(strategy="preorder", is_leaf_fn=lambda n: hasattr(n, "attriremoved") and n.attriremoved==True):
         if not node.is_leaf() and hasattr(node, "Ev") and node.Ev == 'duplication':  # node.name[:3] == "dup"
@@ -59,34 +79,48 @@
     return og_prot_list

-
-
 def fastoma_collect_subhogs():
-
-
-
-    # todo this function needs >200 GB of memory for >1500 species. needs some optimisation probably
-
-    logger_hog.info("started collecting pickle files ")
-
+    import argparse
+    parser = argparse.ArgumentParser(description="collect all computed HOGs and combine them into a single orthoxml")
+    parser.add_argument('--pickle-folder', required=True,
+                        help="folder containing the pickle files. Will be searched recursively")
+    parser.add_argument('--roothogs-folder', required=True, help="folder containing the omamer roothogs")
+    parser.add_argument('--gene-id-pickle-file', required=True,
+                        help="file containing the gene-id dictionary in pickle format")
+    parser.add_argument('--out', required=False, default="output_hog.orthoxml", help="output filename in orthoxml")
+    parser.add_argument('-v', action="count", default=0)
+    parser.add_argument('--roothog-tsv', default=None, required=False,
+                        help="If specified, a tsv file with the given path will be produced containing the roothog assignments as TSV file.")
+    parser.add_argument('--marker-groups-fasta', default=None, required=False,
+                        help="If specified, a folder named OrthologousGroupsFasta and a TSV file with the name provided in "
+                             "this argument will be generated that contains single copy groups, i.e. groups which have "
+                             "at most one gene per species. "
+                             "Useful as phylogenetic marker genes to reconstruct species trees.")
+    conf = parser.parse_args()
+    logger_hog.setLevel(level=30 - 10 * min(conf.v, 2))
+    logger_hog.debug(conf)
+
+    write_hog_orthoxml(conf.pickle_folder, conf.out, conf.gene_id_pickle_file)
+    if conf.roothog_tsv is not None:
+        write_roothogs(conf.out, conf.roothog_tsv)
+    if conf.marker_groups_fasta is not None:
+        write_group_files(Path(conf.out), Path(conf.roothogs_folder), conf.marker_groups_fasta)
+
+
+def write_hog_orthoxml(pickle_folder, output_xml_name, gene_id_pickle_file):
     # todo as input argument/option in nextflow
     # in benchamrk dataset the output prot names should be short tr|A0A0N7KCI6|A0A0N7KCI6_ORYSJ
     # for qfo benchmark, the middle should be written in the file
-    pickle_folder = "./temp_pickles/"  #pickle_rhogs
-    output_xml_name = "./output_hog.orthoxml"
-    gene_id_pickle_file = "./gene_id_dic_xml.pickle"  # this file includes integer mapping of each protein
-
     orthoxml_file = ET.Element("orthoXML", attrib={"xmlns": "http://orthoXML.org/2011/", "origin": "OMA", "originVersion": "Nov 2021", "version": "0.3"})  #

     with open(gene_id_pickle_file, 'rb') as handle:
         gene_id_name = pickle.load(handle)
         # gene_id_name[query_species_name] = (gene_idx_integer, query_prot_name)
-    logger_hog.debug("We read the gene_id_name dictionary with"+str(len(gene_id_name))+"items")
+    logger_hog.debug("We read the gene_id_name dictionary with %d items", len(gene_id_name))
     logger_hog.debug("Now creating the header of orthoxml")

     # #### create the header of orthoxml ####
@@ -104,22 +138,8 @@
     logger_hog.debug("gene_xml is created.")

     # #### create the groups of orthoxml ####
-    pickle_files_adress_raw = listdir(pickle_folder)
-    pickle_files_adress = [i for i in pickle_files_adress_raw if i.endswith(".pickle") and i.startswith("file_")]
-
-    logger_hog.info("number of pickle files is "+str(len(pickle_files_adress))+".")
-    logger_hog.debug("pickle files are " + str(len(pickle_files_adress)) + ".")
-    hogs_a_rhog_xml_all = []
-    for pickle_file_adress in pickle_files_adress:
-        with open(pickle_folder + pickle_file_adress, 'rb') as handle:
-            hogs_a_rhog_xml_batch = pickle.load(handle)  # list of hog object.
-            hogs_a_rhog_xml_all.extend(hogs_a_rhog_xml_batch)
-    # hogs_rhogs_xml_all is orthoxml_to_newick.py list of hog object.
-
-    logger_hog.debug("number of hogs in all batches is "+str(len(hogs_a_rhog_xml_all))+" .")

     groups_xml = ET.SubElement(orthoxml_file, "groups")
-
-    for hogs_a_rhog_xml in hogs_a_rhog_xml_all:
+    for hogs_a_rhog_xml in load_hogs(Path(pickle_folder)):
         groups_xml.append(hogs_a_rhog_xml)
     logger_hog.debug("converting the xml object to string.")

@@ -130,93 +150,93 @@
     file_xml.write(xml_str)
     file_xml.close()

-    logger_hog.info("orthoxml is written in " + output_xml_name)
+    logger_hog.info("orthoxml is written in %s", output_xml_name)


+def write_group_files(orthoxml: Path, roothog_folder: Path, output_file_og_tsv=None, output_fasta_groups=None):
+    if output_file_og_tsv is None:
+        output_file_og_tsv = "OrthologousGroups.tsv"
+    if output_fasta_groups is None:
+        output_fasta_groups = "OrthologousGroupsFasta"
+    output_fasta_groups = Path(output_fasta_groups)
+    logger_hog.info("Start writing OG fasta files ")

-
-
-
-    logger_hog.info("Now writing OG fasta files ")
-
-
-    input_orthoxml = output_xml_name  # sys.argv[1]  # "out_folder/output_hog_.orthoxml"
-    rhog_all_folder = "./temp_omamer_rhogs/"  #sys.argv[2] + "/"  # "out_folder/rhogs_all/"
     fasta_format = "fa"  # of the rhogs
-    output_file_og_tsv = "OrthologousGroups.tsv"
-
-    trees, species_dic = orthoxml_to_newick(input_orthoxml, return_gene_to_species=True)  # encode_levels_as_nhx=False, xref_tag="protId",
-    print("We extracted " + str(len(trees)) + " trees in NHX format from the input HOG orthoxml" + input_orthoxml)
+    trees, species_dic = orthoxml_to_newick(orthoxml, return_gene_to_species=True)
+    logger_hog.info("We extracted %d trees in NHX format from the input HOG orthoxml %s", len(trees), orthoxml)

     OGs = {}
     for hog_id, tree_string in trees.items():
-        try :
+        try:
             tree = Tree(tree_string, format=1)
         except:
             try:
-                tree = Tree(tree_string, format=1,quoted_node_names=True)
+                tree = Tree(tree_string, format=1, quoted_node_names=True)
             except:
-                print("error", tree_string)
+                logger_hog.error("Error loading tree: %s", tree_string)
+                raise

-        og_prot_list = max_og_tree(tree,species_dic)
-        if len(og_prot_list) >= 2:  # a group should have at least 1 member/protein
+        og_prot_list = max_og_tree(tree, species_dic)
+        if len(og_prot_list) >= 2:  # a group should have at least 2 members/proteins
             OGs[hog_id] = og_prot_list

-    print("done")
+    logger_hog.info("done extracting trees")

     with open(output_file_og_tsv, 'w') as handle:
         for hog_id, og_prot_list in OGs.items():
             line_text = str(hog_id) + "\t" + str(og_prot_list)[1:-1] + "\n"
             handle.write(line_text)
-    handle.close()
-    print("We wrote the protein families information in the file " + output_file_og_tsv)
+    logger_hog.info("We wrote the protein families information in the file %s", output_file_og_tsv)

-    out_folder_ogs = "OrthologousGroupsFasta/"
-    os.makedirs(out_folder_ogs)
-
-    print("start writing " + str(len(OGs)) + " OGs as fasta files in folder " + out_folder_ogs)
+    output_fasta_groups.mkdir(parents=True, exist_ok=True)
+    logger_hog.info("start writing %d OGs as fasta files in folder %s.", len(OGs), output_fasta_groups)
     for hog_id, og_prot_list in OGs.items():  # hog_id="HOG_0667494_sub10524"
         rhog_id = "_".join(hog_id.split("_")[:2])
-        omamer_rhogs_all_address = rhog_all_folder + rhog_id + "." + fasta_format
-        omamer_rhogs_all_prots = list(SeqIO.parse(omamer_rhogs_all_address, "fasta"))
+        rhog_fasta = roothog_folder / (rhog_id + "." + fasta_format)
+ fasta_format) + omamer_rhogs_all_prots = list(SeqIO.parse(rhog_fasta, "fasta")) og_prots = [] og_prot_list = OGs[hog_id] for rhogs_prot in omamer_rhogs_all_prots: - if rhogs_prot.id.split("||")[0] in og_prot_list: + prot_id = rhogs_prot.id + if _config.protein_format_qfo_dataset_before2022: + prot_id = prot_id.split('|')[1] + + if prot_id in og_prot_list: sp = rhogs_prot.id.split("||")[1] rhogs_prot.description += " [" + sp + "]" og_prots.append(rhogs_prot) og_id = "OG_" + hog_id # one OG per rootHOG # "/HOG_"+ rhogid - SeqIO.write(og_prots, out_folder_ogs + og_id + ".fa", "fasta") - print("writing done") - + SeqIO.write(og_prots, output_fasta_groups / (og_id + ".fa"), "fasta") + logger_hog.info("writing done") +def write_roothogs(orthoxml, out): # import sys # input_orthoxml = output_xml_name - output_file = "rootHOGs.tsv" + if out is None: + out = "rootHOGs.tsv" toplevel_groups = [] - for grp in extract_flat_groups_at_level(input_orthoxml): + for grp in extract_flat_groups_at_level(orthoxml): toplevel_groups.append(set(g.xref for g in grp)) - # toplevel_groups is a list of sets - - print("We extracted "+str(len(toplevel_groups))+" protein families from the input HOG orthoxml"+input_orthoxml) - print("The first one contain "+str(len(toplevel_groups[0]))+" proteins.") + logger_hog.info("We extracted %d protein families from the input HOG orthoxml %s", len(toplevel_groups), orthoxml) + logger_hog.info("The first one contains %d proteins.", len(toplevel_groups[0])) - with open(output_file, 'w') as handle: + with open(out, 'w') as handle: for toplevel_group_idx, toplevel_group in enumerate(toplevel_groups): line_text = str(toplevel_group_idx)+"\t"+str(toplevel_group)[1:-1]+"\n" handle.write(line_text) - handle.close() - print("We wrote the protein families information in the file "+output_file) + logger_hog.info("We wrote the protein families information in the file %s", out) +if __name__ == "__main__": + fastoma_collect_subhogs() \ No newline at end of file diff --git a/FastOMA/infer_roothogs.py b/FastOMA/infer_roothogs.py index 35b3cf7..0decb1c 100644 --- a/FastOMA/infer_roothogs.py +++ b/FastOMA/infer_roothogs.py @@ -1,11 +1,9 @@ - - import os.path from shutil import which -from ._utils_subhog import logger_hog from . import _utils_roothog from . import _config +logger = _config.logger_hog """ proteomes of species as fasta files in /proteome/ @@ -14,58 +12,47 @@ """ -# if not os.path.exists(_config.in_folder): -# os.mkdir(_config.in_folder) # s - -# in_folder+"omamer_database/oma_path/OmaServer.h5" -# logger_hog.info("rHOG inferece has started.
The oma database address is in " + _config.oma_database_address) -# (oma_db, list_oma_species) = _utils_rhog.parse_oma_db(_config.oma_database_address) -# (query_species_names, query_prot_recs) = _utils_rhog.parse_proteome(list_oma_species) - - - def fastoma_infer_roothogs(): - _config.set_configs() - - # print(_config.in_folder) - print(_config.logger_level) - # import sys - # sys.exit(0) - - folder = "/scratch/smajidi1/qfo/23Nov/merging_v5/work/00/f7751409536d34dd1ba3192b79d6d1" - species_names, prot_recs_lists,fasta_format_keep = _utils_roothog.parse_proteomes(folder) # optional input folder + import argparse + parser = argparse.ArgumentParser(description="FastOMA: infer rootHOGs") + parser.add_argument("--proteomes", required=True, help="Path to the folder containing the input proteomes") + parser.add_argument("--splice", help="Path to the folder containing the splice information files") + parser.add_argument("--hogmap", help="Path to the folder containing the hogmap files") + parser.add_argument("--out-rhog-folder", required=True, help="Folder where the roothog fasta files are written") + parser.add_argument('-v', action="count", default=0, help="Increase verbosity to info/debug") + conf = parser.parse_args() + logger.setLevel(level=30 - 10 * min(conf.v, 2)) + logger.debug("Arguments: %s", conf) + + species_names, prot_recs_lists, fasta_format_keep = _utils_roothog.parse_proteomes(conf.proteomes) # optional input folder prot_recs_all = _utils_roothog.add_species_name_prot_id(species_names, prot_recs_lists) - hogmaps, unmapped = _utils_roothog.parse_hogmap_omamer(species_names,fasta_format_keep,folder) # optional input folder + hogmaps, unmapped = _utils_roothog.parse_hogmap_omamer(species_names, fasta_format_keep, folder=conf.hogmap) # optional input folder - splice_files = os.path.exists("./splice/") + splice_files = conf.splice is not None and os.path.exists(conf.splice) if splice_files: - isoform_by_gene_all = _utils_roothog.parse_isoform_file(species_names) - isoform_selected, isoform_not_selected = _utils_roothog.find_nonbest_isoform(species_names,isoform_by_gene_all,hogmaps) - _utils_roothog.write_isoform_selected(isoform_by_gene_all, isoform_selected,prot_recs_lists) + isoform_by_gene_all = _utils_roothog.parse_isoform_file(species_names, folder=conf.splice) + isoform_selected, isoform_not_selected = _utils_roothog.find_nonbest_isoform( + species_names, isoform_by_gene_all, hogmaps + ) + _utils_roothog.write_isoform_selected(isoform_by_gene_all, isoform_selected, prot_recs_lists) # for each isoform file, there will be a file ending with _selected_isoforms.tsv - hogmaps = _utils_roothog.handle_splice(hogmaps,isoform_not_selected) - + hogmaps = _utils_roothog.handle_splice(hogmaps, isoform_not_selected) rhogs_prots = _utils_roothog.group_prots_roothogs(hogmaps) - - rhogs_prots = _utils_roothog.handle_singleton(rhogs_prots,hogmaps) + rhogs_prots = _utils_roothog.handle_singleton(rhogs_prots, hogmaps) rhogs_prots = _utils_roothog.merge_rhogs(hogmaps, rhogs_prots) rhogs_prots = _utils_roothog.filter_big_roothogs(hogmaps, rhogs_prots) - address_rhogs_folder = "./temp_omamer_rhogs/" - min_rhog_size =2 - rhogid_written_list = _utils_roothog.write_rhog(rhogs_prots, prot_recs_all, address_rhogs_folder, min_rhog_size) - linclust_available=which("mmseqs") # True # + min_rhog_size = 2 + rhogid_written_list = _utils_roothog.write_rhog(rhogs_prots, prot_recs_all, conf.out_rhog_folder, min_rhog_size) + linclust_available = which("mmseqs") # if mmseqs is not installed the output
will be empty / None if linclust_available: num_unmapped_singleton = _utils_roothog.collect_unmapped_singleton(rhogs_prots, unmapped, prot_recs_all, "singleton_unmapped.fa") if num_unmapped_singleton: result_linclust = _utils_roothog.run_linclust(fasta_to_cluster="singleton_unmapped.fa") - logger_hog.debug(" linclust is done "+ result_linclust) - num_clusters = _utils_roothog.write_clusters(address_rhogs_folder, min_rhog_size) - logger_hog.debug("we wrote "+str(num_clusters)+" new clusters with linclust ") - - # if _config.add_outgroup: # already done - # _utils_roothog.write_outgroups_all(rhogs_prots,prot_recs_all) \ No newline at end of file + logger.debug(" linclust is done %s", result_linclust) + num_clusters = _utils_roothog.write_clusters(conf.out_rhog_folder, min_rhog_size) + logger.debug("we wrote %d new clusters with linclust ", num_clusters) diff --git a/FastOMA/infer_subhogs.py b/FastOMA/infer_subhogs.py index 6d5e9a6..d2ad48c 100644 --- a/FastOMA/infer_subhogs.py +++ b/FastOMA/infer_subhogs.py @@ -45,7 +45,7 @@ def fastoma_infer_subhogs(): if inferhog_concurrent_on: print("parallelization for subhog inference is on.") - pickles_rhog_folder = "./" # pickles_temp/ pickle_rhogs + pickles_rhog_folder = "./pickle_hogs" # pickles_temp/ pickle_rhogs if not os.path.exists(pickles_rhog_folder): os.makedirs(pickles_rhog_folder) diff --git a/FastOMA/requirements b/FastOMA/requirements deleted file mode 100644 index fcba280..0000000 --- a/FastOMA/requirements +++ /dev/null @@ -1,166 +0,0 @@ -$ conda list -# packages in environment at /miniconda/envs/fastoma: -# -# Name Version Build Channel -_libgcc_mutex 0.1 conda_forge conda-forge -_openmp_mutex 4.5 2_gnu conda-forge -anyio 3.7.1 pyhd8ed1ab_0 conda-forge -argon2-cffi 23.1.0 pyhd8ed1ab_0 conda-forge -argon2-cffi-bindings 21.2.0 py39hb9d737c_3 conda-forge -arrow 1.2.3 pyhd8ed1ab_0 conda-forge -asttokens 2.2.1 pyhd8ed1ab_0 conda-forge -async-lru 2.0.4 pyhd8ed1ab_0 conda-forge -attrs 23.1.0 pyh71513ae_1 conda-forge -babel 2.12.1 pyhd8ed1ab_1 conda-forge -backcall 0.2.0 pyh9f0ad1d_0 conda-forge -backports 1.0 pyhd8ed1ab_3 conda-forge -backports.functools_lru_cache 1.6.5 pyhd8ed1ab_0 conda-forge -beautifulsoup4 4.12.2 pyha770c72_0 conda-forge -biopython 1.81 pypi_0 pypi -bleach 6.0.0 pyhd8ed1ab_0 conda-forge -blosc2 2.0.0 pypi_0 pypi -brotli-python 1.0.9 py39h5a03fae_9 conda-forge -bzip2 1.0.8 h7f98852_4 conda-forge -ca-certificates 2023.7.22 hbcca054_0 conda-forge -cached-property 1.5.2 hd8ed1ab_1 conda-forge -cached_property 1.5.2 pyha770c72_1 conda-forge -certifi 2023.7.22 pyhd8ed1ab_0 conda-forge -cffi 1.15.1 py39he91dace_3 conda-forge -charset-normalizer 3.2.0 pyhd8ed1ab_0 conda-forge -comm 0.1.4 pyhd8ed1ab_0 conda-forge -cython 3.0.1 pypi_0 pypi -debugpy 1.6.8 py39h3d6467e_0 conda-forge -decorator 5.1.1 pyhd8ed1ab_0 conda-forge -defusedxml 0.7.1 pyhd8ed1ab_0 conda-forge -dendropy 4.6.1 pypi_0 pypi -entrypoints 0.4 pyhd8ed1ab_0 conda-forge -ete3 3.1.3 pypi_0 pypi -exceptiongroup 1.1.3 pyhd8ed1ab_0 conda-forge -executing 1.2.0 pyhd8ed1ab_0 conda-forge -fastoma 0.0.6 dev_0 -fasttree 2.1.11 h031d066_2 bioconda -fqdn 1.5.1 pyhd8ed1ab_0 conda-forge -future 0.18.3 pypi_0 pypi -humanfriendly 10.0 pypi_0 pypi -idna 3.4 pyhd8ed1ab_0 conda-forge -importlib-metadata 6.8.0 pyha770c72_0 conda-forge -importlib_metadata 6.8.0 hd8ed1ab_0 conda-forge -importlib_resources 6.0.1 pyhd8ed1ab_0 conda-forge -ipykernel 6.25.1 pyh71e2992_0 conda-forge -ipython 8.14.0 pyh41d4057_0 conda-forge -iqtree 2.2.3 h21ec9f0_0 bioconda -isoduration 20.11.0 pyhd8ed1ab_0 
conda-forge -jedi 0.19.0 pyhd8ed1ab_0 conda-forge -jinja2 3.1.2 pyhd8ed1ab_1 conda-forge -json5 0.9.14 pyhd8ed1ab_0 conda-forge -jsonpointer 2.0 py_0 conda-forge -jsonschema 4.19.0 pyhd8ed1ab_1 conda-forge -jsonschema-specifications 2023.7.1 pyhd8ed1ab_0 conda-forge -jsonschema-with-format-nongpl 4.19.0 pyhd8ed1ab_1 conda-forge -jupyter-lsp 2.2.0 pyhd8ed1ab_0 conda-forge -jupyter_client 8.3.0 pyhd8ed1ab_0 conda-forge -jupyter_core 5.3.1 py39hf3d152e_0 conda-forge -jupyter_events 0.7.0 pyhd8ed1ab_2 conda-forge -jupyter_server 2.7.1 pyhd8ed1ab_0 conda-forge -jupyter_server_terminals 0.4.4 pyhd8ed1ab_1 conda-forge -jupyterlab 4.0.5 pyhd8ed1ab_0 conda-forge -jupyterlab_pygments 0.2.2 pyhd8ed1ab_0 conda-forge -jupyterlab_server 2.24.0 pyhd8ed1ab_0 conda-forge -ld_impl_linux-64 2.40 h41732ed_0 conda-forge -libffi 3.4.2 h7f98852_5 conda-forge -libgcc-ng 13.1.0 he5830b7_0 conda-forge -libgomp 13.1.0 he5830b7_0 conda-forge -libnsl 2.0.0 h7f98852_0 conda-forge -libsodium 1.0.18 h36c2ea0_1 conda-forge -libsqlite 3.43.0 h2797004_0 conda-forge -libstdcxx-ng 13.1.0 hfd8a6a1_0 conda-forge -libuuid 2.38.1 h0b41bf4_0 conda-forge -libzlib 1.2.13 hd590300_5 conda-forge -llvmlite 0.40.1 pypi_0 pypi -lxml 4.9.3 pypi_0 pypi -mafft 7.520 h031d066_2 bioconda -markupsafe 2.1.3 py39hd1e30aa_0 conda-forge -matplotlib-inline 0.1.6 pyhd8ed1ab_0 conda-forge -mistune 3.0.1 pyhd8ed1ab_0 conda-forge -msgpack 1.0.5 pypi_0 pypi -nbclient 0.8.0 pyhd8ed1ab_0 conda-forge -nbconvert-core 7.7.4 pyhd8ed1ab_0 conda-forge -nbformat 5.9.2 pyhd8ed1ab_0 conda-forge -ncurses 6.4 hcb278e6_0 conda-forge -nest-asyncio 1.5.6 pyhd8ed1ab_0 conda-forge -nextflow 23.4.3 pypi_0 pypi -notebook-shim 0.2.3 pyhd8ed1ab_0 conda-forge -numba 0.57.1 pypi_0 pypi -numexpr 2.8.5 pypi_0 pypi -numpy 1.24.4 pypi_0 pypi -omamer 0.2.6 pypi_0 pypi -openssl 3.1.2 hd590300_0 conda-forge -overrides 7.4.0 pyhd8ed1ab_0 conda-forge -packaging 23.1 pyhd8ed1ab_0 conda-forge -pandas 2.0.3 pypi_0 pypi -pandocfilters 1.5.0 pyhd8ed1ab_0 conda-forge -parso 0.8.3 pyhd8ed1ab_0 conda-forge -pexpect 4.8.0 pyh1a96a4e_2 conda-forge -pickleshare 0.7.5 py_1003 conda-forge -pip 23.2.1 pyhd8ed1ab_0 conda-forge -pkgutil-resolve-name 1.3.10 pyhd8ed1ab_0 conda-forge -platformdirs 3.10.0 pyhd8ed1ab_0 conda-forge -prometheus_client 0.17.1 pyhd8ed1ab_0 conda-forge -prompt-toolkit 3.0.39 pyha770c72_0 conda-forge -prompt_toolkit 3.0.39 hd8ed1ab_0 conda-forge -property-manager 3.0 pypi_0 pypi -psutil 5.9.5 py39h72bdee0_0 conda-forge -ptyprocess 0.7.0 pyhd3deb0d_0 conda-forge -pure_eval 0.2.2 pyhd8ed1ab_0 conda-forge -py-cpuinfo 9.0.0 pypi_0 pypi -pycparser 2.21 pyhd8ed1ab_0 conda-forge -pygments 2.16.1 pyhd8ed1ab_0 conda-forge -pyparsing 3.1.1 pypi_0 pypi -pysais 1.1.0 pypi_0 pypi -pysocks 1.7.1 pyha2e5f31_6 conda-forge -python 3.9.17 h0755675_0_cpython conda-forge -python-dateutil 2.8.2 pyhd8ed1ab_0 conda-forge -python-fastjsonschema 2.18.0 pyhd8ed1ab_0 conda-forge -python-json-logger 2.0.7 pyhd8ed1ab_0 conda-forge -python_abi 3.9 3_cp39 conda-forge -pytz 2023.3 pyhd8ed1ab_0 conda-forge -pyyaml 6.0.1 py39hd1e30aa_0 conda-forge -pyzmq 25.1.1 py39hb257651_0 conda-forge -readline 8.2 h8228510_1 conda-forge -referencing 0.30.2 pyhd8ed1ab_0 conda-forge -requests 2.31.0 pyhd8ed1ab_0 conda-forge -rfc3339-validator 0.1.4 pyhd8ed1ab_0 conda-forge -rfc3986-validator 0.1.1 pyh9f0ad1d_0 conda-forge -rpds-py 0.9.2 py39h9fdd4d6_0 conda-forge -scipy 1.11.2 pypi_0 pypi -send2trash 1.8.2 pyh41d4057_0 conda-forge -setuptools 68.1.2 pyhd8ed1ab_0 conda-forge -six 1.16.0 pyh6c4a22f_0 conda-forge -sniffio 1.3.0 
pyhd8ed1ab_0 conda-forge -soupsieve 2.3.2.post1 pyhd8ed1ab_0 conda-forge -stack_data 0.6.2 pyhd8ed1ab_0 conda-forge -tables 3.8.0 pypi_0 pypi -terminado 0.17.1 pyh41d4057_0 conda-forge -tinycss2 1.2.1 pyhd8ed1ab_0 conda-forge -tk 8.6.12 h27826a3_0 conda-forge -tomli 2.0.1 pyhd8ed1ab_0 conda-forge -tornado 6.3.3 py39hd1e30aa_0 conda-forge -tqdm 4.66.1 pypi_0 pypi -traitlets 5.9.0 pyhd8ed1ab_0 conda-forge -typing-extensions 4.7.1 hd8ed1ab_0 conda-forge -typing_extensions 4.7.1 pyha770c72_0 conda-forge -typing_utils 0.1.0 pyhd8ed1ab_0 conda-forge -tzdata 2023.3 pypi_0 pypi -uri-template 1.3.0 pyhd8ed1ab_0 conda-forge -urllib3 2.0.4 pyhd8ed1ab_0 conda-forge -verboselogs 1.7 pypi_0 pypi -wcwidth 0.2.6 pyhd8ed1ab_0 conda-forge -webcolors 1.13 pyhd8ed1ab_0 conda-forge -webencodings 0.5.1 py_1 conda-forge -websocket-client 1.6.2 pyhd8ed1ab_0 conda-forge -wheel 0.41.2 pyhd8ed1ab_0 conda-forge -xz 5.2.6 h166bdaf_0 conda-forge -yaml 0.2.5 h7f98852_2 conda-forge -zeromq 4.3.4 h9c3ff4c_1 conda-forge -zipp 3.16.2 pyhd8ed1ab_0 conda-forge -linclust \ No newline at end of file diff --git a/FastOMA/zoo/unionfind.py b/FastOMA/zoo/unionfind.py new file mode 100644 index 0000000..83036e1 --- /dev/null +++ b/FastOMA/zoo/unionfind.py @@ -0,0 +1,124 @@ +import collections + +"""UnionFind.py + +Union-find data structure. Based on Josiah Carlson's code, +http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/215912 +with significant additional changes by D. Eppstein and +Adrian Altenhoff. +""" + + +class UnionFind(object): + """Union-find data structure. + + Each unionFind instance X maintains a family of disjoint sets of + hashable objects, supporting the following two methods: + + - X[item] returns a name for the set containing the given item. + Each set is named by an arbitrarily-chosen one of its members; as + long as the set remains unchanged it will keep the same name. If + the item is not yet part of a set in X, a new singleton set is + created for it. + + - X.union(item1, item2, ...) merges the sets containing each item + into a single larger set. If any item is not yet part of a set + in X, it is added to X as one of the members of the merged set. + """ + + def __init__(self, elements=None): + """Create a new union-find structure. + + If elements is not None, the structure gets initialized + with each element as a singleton component. + + :param elements: an iterable to initialize the structure. + """ + + self.weights = {} + self.parents = {} + if elements is not None: + for elem in iter(elements): + self.parents[elem] = elem + self.weights[elem] = 1 + + def __getitem__(self, obj): + """return the name of the set which contains obj. + + :param obj: the query object + + :SeeAlso: :meth:`find`""" + return self.find(obj) + + def find(self, obj): + """Find and return the name of the set containing the obj. + + If the object is not found in any set, a new singleton set + is created that holds only this object until it is further merged.""" + + # check for previously unknown obj. If unknown, add it + # as a new cluster + if obj not in self.parents: + self.parents[obj] = obj + self.weights[obj] = 1 + return obj + + # find path of objects leading to the root + path = [obj] + root = self.parents[obj] + while root != path[-1]: + path.append(root) + root = self.parents[root] + + # compress the path and return + for ancestor in path: + self.parents[ancestor] = root + return root + + def remove(self, obj): + """Remove an object from the sets. + + Removes an object entirely from the data structure.
The + containing set will shrink by this one element. + + :Note: If one tries to access it afterwards using + :meth:`find`, it will be newly created and put back as a + singleton. + """ + if obj not in self.parents: + return + comp = self.find(obj) + self.weights[comp] -= 1 + self.parents.pop(obj) + + def __iter__(self): + """Iterate through all items ever found or unioned by this structure.""" + return iter(self.parents) + + def union(self, *objects): + """Find the sets containing the objects and merge them. + + Any number of objects can be passed to this method and + all of them will be merged into one set containing at + least these objects. + + :param objects: the objects to be merged. They all have to be + hashable. If they haven't been initially added to the UnionFind + data structure at instantiation time, they are added at this + point. + """ + roots = [self[x] for x in objects] + heaviest = max([(self.weights[r], r) for r in roots], key=lambda x: x[0])[1] + for r in roots: + if r != heaviest: + self.weights[heaviest] += self.weights[r] + self.parents[r] = heaviest + + def get_components(self): + """return a list of sets corresponding to the connected + components of the structure.""" + comp_dict = collections.defaultdict(set) + for elem in iter(self): + comp_dict[self[elem]].add(elem) + comp = list(comp_dict.values()) + return comp diff --git a/FastOMA_light.nf b/FastOMA_light.nf deleted file mode 100644 index 44bf9d6..0000000 --- a/FastOMA_light.nf +++ /dev/null @@ -1,215 +0,0 @@ - -// NXF_WRAPPER_STAGE_FILE_THRESHOLD='50000' - -params.input_folder = "./in_folder/" -params.output_folder = "./out_folder/" -params.proteome_folder = params.input_folder + "/proteome" -params.proteomes = params.proteome_folder + "/*" -params.hogmap_in = params.input_folder + "/hogmap_in" - -params.hogmap_folder = params.output_folder + "/hogmap" -params.splice_folder = params.input_folder + "/splice" -params.species_tree = params.input_folder + "/species_tree.nwk" -params.species_tree_checked = params.output_folder + "/species_tree_checked.nwk" - -params.temp_pickles = params.output_folder + "/temp_pickles" -params.genetrees_folder = params.output_folder + "/genetrees" - -params.temp_output = params.output_folder +"/temp_output" //"/temp_omamer_rhogs" - - -process check_input{ - - - publishDir params.output_folder, mode: 'copy' - input: - path proteome_folder - path hogmap_folder - path species_tree - path omamerdb - path splice_folder - output: - path "species_tree_checked.nwk" - val true - script: - """ - fastoma-check-input - """ -} - - -process omamer_run{ - - - publishDir params.hogmap_folder, mode: 'copy' - input: - path proteomes_omamerdb_inputhog - val ready_input_check_c - output: - path "*.hogmap" - val true - script: //todo this if condition can be done as part of nextflow, so it won't submit job for cp - """ - if [ -f ${proteomes_omamerdb_inputhog[2]}/${proteomes_omamerdb_inputhog[0]}.hogmap ] - then - cp ${proteomes_omamerdb_inputhog[2]}/${proteomes_omamerdb_inputhog[0]}.hogmap ${proteomes_omamerdb_inputhog[0]}.hogmap - else - omamer search -n 10 --db ${proteomes_omamerdb_inputhog[1]} --query ${proteomes_omamerdb_inputhog[0]} --out ${proteomes_omamerdb_inputhog[0]}.hogmap - fi - """ // --nthreads 10 -} - - - -process infer_roothogs{ - - - publishDir = [ - path: params.temp_output, - mode: 'copy', // pattern: "temp_output", saveAs: { filename -> filename.equals('temp_omamer_rhogs') ?
null : filename } - ] - input: - val ready_omamer_run - path hogmap_folder - path proteome_folder - path splice_folder - output: - path "temp_omamer_rhogs" - path "gene_id_dic_xml.pickle" - path "selected_isoforms" , optional: true // SPECIESNAME_selected_isoforms.tsv - val true // nextflow-io.github.io/patterns/state-dependency/ - script: - """ - fastoma-infer-roothogs --logger-level DEBUG - """ -} - - -process batch_roothogs{ - - - input: - val ready_infer_roothogs - path "temp_omamer_rhogs" - output: - path "rhogs_rest/*", optional: true - path "rhogs_big/*" , optional: true - val true - script: - """ - fastoma-batch-roothogs - """ -} - -process hog_big{ - cpus 2 - - - publishDir params.temp_pickles - input: - val rhogsbig_tree_ready - output: - path "*.pickle" - path "*.fa", optional: true // msa if write True - path "*.nwk", optional: true // gene trees if write True - val true - script: - """ - fastoma-infer-subhogs --input-rhog-folder ${rhogsbig_tree_ready[0]} --species-tree ${rhogsbig_tree_ready[1]} --parallel - """ -} -temp_pickles =2 - - - -process hog_rest{ - - - publishDir params.temp_pickles - input: - val rhogsrest_tree_ready - output: - path "*.pickle" - path "*.fa" , optional: true // msa if write True - path "*.nwk" , optional: true // gene trees if write True - val true - script: - """ - fastoma-infer-subhogs --input-rhog-folder ${rhogsrest_tree_ready[0]} --species-tree ${rhogsrest_tree_ready[1]} - """ // --parrallel False -} - - -process collect_subhogs{ - - - publishDir params.output_folder, mode: 'copy' - input: - val ready_hog_rest - val ready_hog_big - path "temp_pickles" // this is the folder includes pickles_rhogs - path "gene_id_dic_xml.pickle" - path "temp_omamer_rhogs" - output: - path "output_hog.orthoxml" - path "OrthologousGroupsFasta" - path "OrthologousGroups.tsv" - path "rootHOGs.tsv" - script: - """ - fastoma-collect-subhogs - """ -} - - - - -workflow { - proteomes = Channel.fromPath(params.proteomes, type:'any' ,checkIfExists:true) - proteome_folder = Channel.fromPath(params.proteome_folder) - hogmap_folder = Channel.fromPath(params.hogmap_folder) - splice_folder = Channel.fromPath(params.splice_folder) - temp_output = Channel.fromPath(params.temp_output) - genetrees_folder = Channel.fromPath(params.genetrees_folder) - hogmap_in = Channel.fromPath(params.hogmap_in) - - temp_pickles = Channel.fromPath(params.temp_pickles) - omamerdb = Channel.fromPath(params.input_folder+"/omamerdb.h5") - species_tree = Channel.fromPath(params.species_tree) - species_tree_checked = Channel.fromPath(params.species_tree_checked) - - (species_tree_checked_, ready_input_check) = check_input(proteome_folder,hogmap_in,species_tree,omamerdb,splice_folder) - ready_input_check_c = ready_input_check.collect() - //species_tree_checked.view{"species_tree_checked ${it}"} - - proteomes_omamerdb = proteomes.combine(omamerdb) - proteomes_omamerdb_inputhog = proteomes_omamerdb.combine(hogmap_in) // proteomes_omamerdb_inputhog.view{" rhogsbig ${it}"} - //proteomes_omamerdb_inputhog_inputcheck = proteomes_omamerdb_inputhog.combine(ready_input_check_c) - (hogmap, ready_omamer_run)= omamer_run(proteomes_omamerdb_inputhog,ready_input_check_c) - ready_omamer_run_c = ready_omamer_run.collect() - (temp_omamer_rhogs, gene_id_dic_xml,selected_isoforms, ready_infer_roothogs) = infer_roothogs(ready_omamer_run_c, hogmap_folder, proteome_folder, splice_folder) - ready_infer_roothogs_c = ready_infer_roothogs.collect() - - (rhogs_rest_list, rhogs_big_list, ready_batch_roothogs) = 
batch_roothogs(ready_infer_roothogs_c, temp_omamer_rhogs) - ready_batch_roothogs_c = ready_batch_roothogs.collect() - - - rhogsbig = rhogs_big_list.flatten() - rhogsbig_tree = rhogsbig.combine(species_tree_checked) - rhogsbig_tree_ready = rhogsbig_tree.combine(ready_batch_roothogs) // rhogsbig_tree_ready.view{"rhogsbig_tree_ready ${it}"} - (pickle_big_rhog, msas_out, genetrees_out, ready_hog_big) = hog_big(rhogsbig_tree_ready) - - rhogsrest = rhogs_rest_list.flatten() - rhogsrest_tree = rhogsrest.combine(species_tree_checked) - rhogsrest_tree_ready = rhogsrest_tree.combine(ready_batch_roothogs_c) - (pickle_rest_rhog, msas_out_rest, genetrees_out_test, ready_hog_rest) = hog_rest(rhogsrest_tree_ready) - - (orthoxml_file, OrthologousGroupsFasta, OrthologousGroups_tsv, rootHOGs_tsv) = collect_subhogs(ready_hog_rest.collect(), ready_hog_big.collect(), temp_pickles, gene_id_dic_xml, temp_omamer_rhogs) - // temp_omamer_rhogs.view{" output omamer_rhogs ${it}"} - // orthoxml_file.view{" output orthoxml file ${it}"} - -} - - - -// todo: check input files very beginning (before omamer starts) e.g all species are in the species tree. No species chars in fasta record. diff --git a/README.md b/README.md index 88a75c9..99828af 100644 --- a/README.md +++ b/README.md @@ -54,9 +54,22 @@ The details of output are described [below](https://github.com/DessimozLab/FastO In summary, you need to 1) install FastOMA and its prerequisites (below), and 2) put the input files in the folder `in_folder` and 3) run FastOMA using the nextflow recipe `FastOMA_light.nf`. ``` -nextflow FastOMA_light.nf --input_folder /path/to/in_folder --output_folder /path/to/out_folder +nextflow run FastOMA_light.nf -profile docker --input_folder /path/to/in_folder --output_folder /path/to/out_folder ``` -The script `FastOMA_light.nf` is tailored for a few species. To run FastOMA with hundreds of species, please use `FastOMA.nf`. +The script `FastOMA_light.nf` is tailored for a few species. To run FastOMA with hundreds of species, please use `FastOMA.nf`. + +## More details on how to run +For every commit of the repository we provide a Docker image for FastOMA on Docker Hub. You can select the container as +part of the nextflow command with the parameter `container_version`. To use the image that matches your current +git checkout, specify it as follows: + +```bash +nextflow run FastOMA.nf -profile docker \ + --container_version "sha-$(git rev-list --max-count=1 --abbrev-commit HEAD)" \ + --input_folder testdata/in_folder \ + --output_folder myresult/ +``` + + # How to install FastOMA diff --git a/conf/base.config b/conf/base.config new file mode 100644 index 0000000..c92b610 --- /dev/null +++ b/conf/base.config @@ -0,0 +1,55 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + dessimozlab/FastOMA Nextflow base config file +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + A 'blank slate' config file, appropriate for general use on most high performance + compute environments. Assumes that all software is installed and available on + the PATH. Runs in `local` mode - all jobs will be run on the logged-in environment.
+---------------------------------------------------------------------------------------- +*/ + +process { + + cpus = { check_max( 1 * task.attempt, 'cpus' ) } + memory = { check_max( 6.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } + shell = ['/bin/bash', '-euo', 'pipefail'] + + errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } + maxRetries = 1 + maxErrors = '-1' + + withLabel:process_single { + cpus = { check_max( 1 , 'cpus' ) } + memory = { check_max( 12.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } + } + withLabel:process_low { + cpus = { check_max( 2 * task.attempt, 'cpus' ) } + memory = { check_max( 12.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } + } + withLabel:process_medium { + cpus = { check_max( 6 * task.attempt, 'cpus' ) } + memory = { check_max( 36.GB * task.attempt, 'memory' ) } + time = { check_max( 8.h * task.attempt, 'time' ) } + } + withLabel:process_high { + cpus = { check_max( 12 * task.attempt, 'cpus' ) } + memory = { check_max( 72.GB * task.attempt, 'memory' ) } + time = { check_max( 16.h * task.attempt, 'time' ) } + } + withLabel:process_long { + time = { check_max( 20.h * task.attempt, 'time' ) } + } + withLabel:process_high_memory { + memory = { check_max( 200.GB * task.attempt, 'memory' ) } + } + withLabel:error_ignore { + errorStrategy = 'ignore' + } + withLabel:error_retry { + errorStrategy = 'retry' + maxRetries = 2 + } +} \ No newline at end of file diff --git a/environment-conda.yml b/environment-conda.yml new file mode 100644 index 0000000..906210a --- /dev/null +++ b/environment-conda.yml @@ -0,0 +1,12 @@ +name: fastoma-env +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - omamer + - mafft + - fasttree + - pip + - pip: + - . 
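The `UnionFind` class added in `FastOMA/zoo/unionfind.py` earlier in this diff ships without a usage example; the short sketch below illustrates the merge/lookup cycle its methods support. The protein identifiers are made up for illustration.

```python
# Illustrative use of the UnionFind class from FastOMA/zoo/unionfind.py.
from FastOMA.zoo.unionfind import UnionFind

uf = UnionFind(["p1", "p2", "p3", "p4"])  # four singleton components
uf.union("p1", "p2")                      # merge the sets containing p1 and p2
uf.union("p2", "p3")                      # p1, p2 and p3 now share one component
assert uf["p1"] == uf["p3"]               # same representative element for both
print(uf.get_components())                # e.g. [{'p1', 'p2', 'p3'}, {'p4'}], order not guaranteed
```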
diff --git a/nextflow.config b/nextflow.config new file mode 100644 index 0000000..7529c85 --- /dev/null +++ b/nextflow.config @@ -0,0 +1,128 @@ +// General configuration used in all profiles +manifest { + name = "FastOMA - Orthology inference" + description = """FastOMA computes Hierarchical Orthologous Groups from proteomes.""" + author = "Sina Majidian, Adrian Altenhoff" + homePage = "https://omabrowser.org" + mainScript = "FastOMA.nf" + nextflowVersion = "22.10.4" +} + +params { + container_name = "dessimozlab/fastoma" + container_version = "latest" + omamer_db = "https://omabrowser.org/All/Primates-v2.0.0.h5" + debug_enabled = false + help = false + report = false + + output_folder = "Output" + statsdir = "${params.output_folder}/stats" + + // Max resource options + // Defaults only, expecting to be overwritten + max_memory = '128.GB' + max_cpus = 24 + max_time = '120.h' +} + +// Profiles configure nextflow depending on the environment (local, docker, singularity) +profiles { + + docker { + process { + container = "$params.container_name:$params.container_version" + } + docker.enabled = true + } + + singularity { + process { + container = "$params.container_name:$params.container_version" + } + singularity.enabled = true + singularity.autoMounts = true + } + + standard { + process.executor = 'local' + } + + conda { + process.conda = "environment-conda.yml" + conda.enabled = true + } + + slurm_singularity { + process { + container = "$params.container_name:$params.container_version" + executor = "slurm" + time = 4.h + memory = 20.GB + } + singularity.enabled = true + singularity.autoMounts = true + } + + slurm_conda { + process { + conda = "environment-conda.yml" + executor = "slurm" + time = 4.h + memory = 20.GB + } + conda.enabled = true + } +} + +def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') +timeline { + enabled = params.report + file = "${params.statsdir}/timeline_${trace_timestamp}.html" +} +report { + enabled = params.report + file = "${params.statsdir}/report_${trace_timestamp}.html" +} +trace { + enabled = params.report + file = "${params.statsdir}/trace_${trace_timestamp}.txt" +} +dag { + enabled = params.report + file = "${params.statsdir}/pipeline_dag_${trace_timestamp}.html" +} + +includeConfig "conf/base.config" + +// function to check maximum resources +def check_max(obj, type) { + if (type == 'memory') { + try { + if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1) + return params.max_memory as nextflow.util.MemoryUnit + else + return obj + } catch (all) { + println " ### ERROR ### Max memory '${params.max_memory}' is not valid! Using default value: $obj" + return obj + } + } else if (type == 'time') { + try { + if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1) + return params.max_time as nextflow.util.Duration + else + return obj + } catch (all) { + println " ### ERROR ### Max time '${params.max_time}' is not valid! Using default value: $obj" + return obj + } + } else if (type == 'cpus') { + try { + return Math.min( obj, params.max_cpus as int ) + } catch (all) { + println " ### ERROR ### Max cpus '${params.max_cpus}' is not valid!
Using default value: $obj" + return obj + } + } +} \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..fc5e48e --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,48 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "FastOMA" +dynamic = ["version"] +description = "FastOMA - a package to infer orthology information among proteomes" +readme = "README.md" +license = "MIT" +requires-python = ">=3.8" +authors = [ + { name = "Sina Majidian", email = "sina.majidian@gmail.com" }, + { name = "Adrian Altenhoff", email = "adrian.altenhoff@inf.ethz.ch" } +] +dependencies = [ + "biopython ~=1.81", + "DendroPy ~=4.3", + "ete3 ~=3.1", + "lxml ~=4.3", + "omamer ~=2.0", + "pyham ~=1.1", + "pyparsing", +] + +[project.optional-dependencies] +nextflow = [ + "nextflow" +] + + +[project.scripts] +fastoma-batch-roothogs = "FastOMA.batch_roothogs:fastoma_batch_roothogs" +fastoma-check-input = "FastOMA.check_input:fastoma_check_input" +fastoma-collect-subhogs = "FastOMA.collect_subhogs:fastoma_collect_subhogs" +fastoma-infer-roothogs = "FastOMA.infer_roothogs:fastoma_infer_roothogs" +fastoma-infer-subhogs = "FastOMA.infer_subhogs:fastoma_infer_subhogs" + +[project.urls] +Homepage = "https://github.com/DessimozLab/FastOMA" + +[tool.hatch.version] +path = "FastOMA/__init__.py" + +[tool.hatch.build.targets.sdist] +include = [ + "/FastOMA", +]
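Each entry in the `[project.scripts]` table above maps a console command to a module-level function, so the five `fastoma-*` commands become available on the PATH once the wheel is installed. A hypothetical smoke test (assuming the package has been installed, e.g. with `pip install .`) that merely checks each entry point resolves to a callable:

```python
# Hypothetical smoke test: verify the console-script targets are importable callables.
from importlib import import_module

ENTRY_POINTS = {
    "fastoma-batch-roothogs": "FastOMA.batch_roothogs:fastoma_batch_roothogs",
    "fastoma-check-input": "FastOMA.check_input:fastoma_check_input",
    "fastoma-collect-subhogs": "FastOMA.collect_subhogs:fastoma_collect_subhogs",
    "fastoma-infer-roothogs": "FastOMA.infer_roothogs:fastoma_infer_roothogs",
    "fastoma-infer-subhogs": "FastOMA.infer_subhogs:fastoma_infer_subhogs",
}

for script, target in ENTRY_POINTS.items():
    module_name, func_name = target.split(":")
    func = getattr(import_module(module_name), func_name)
    assert callable(func), f"{script} does not resolve to a callable"
print("all entry points resolve")
```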