diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..2021e50
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,8 @@
+work
+.nextflow*
+.idea
+.git
+output
+testdata
+dist
+archive/
diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml
new file mode 100644
index 0000000..73e5fb6
--- /dev/null
+++ b/.github/workflows/docker-image.yml
@@ -0,0 +1,73 @@
+name: Docker Image CI
+
+on:
+  push:
+  pull_request:
+  release:
+    types: [published]
+
+env:
+  TEST_TAG: dessimozlab/fastoma:test
+
+jobs:
+
+  build:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v4
+      with:
+        submodules: recursive
+
+    - name: Docker meta
+      id: meta
+      uses: docker/metadata-action@v4
+      with:
+        # list of Docker images to use as base name for tags
+        images: |
+          dessimozlab/fastoma
+        # generate Docker tags based on the following events/attributes
+        tags: |
+          type=schedule
+          type=ref,event=branch
+          type=ref,event=pr
+          type=semver,pattern={{version}}
+          type=semver,pattern={{major}}.{{minor}}
+          type=semver,pattern={{major}}
+          type=sha
+
+    - name: Set up QEMU
+      uses: docker/setup-qemu-action@v2
+
+    - name: Set up Docker Buildx
+      uses: docker/setup-buildx-action@v2
+
+    - name: Build and export to docker for testing
+      uses: docker/build-push-action@v3
+      with:
+        context: .
+        load: true
+        tags: ${{ env.TEST_TAG }}
+
+    #- name: Test
+    #  run: |
+    #    docker run --rm -i -v $PWD/tests:/input -v $PWD/tests/:/reads -v $PWD/output:/out -v $PWD/run:/run ${{ env.TEST_TAG }} --tree --standalone_path /input/marker_genes --dna_reference /input/cds-marker_genes.fasta.gz --reads /reads/sample_1.fastq --output_path /out
+    #    if [ ! -f output/tree_sample_1.nwk ] ; then exit 1; fi
+
+    - name: Login to DockerHub
+      uses: docker/login-action@v2
+      with:
+        username: ${{ secrets.DOCKER_HUB_USERNAME }}
+        password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }}
+
+    - name: Build and push
+      uses: docker/build-push-action@v3
+      with:
+        context: .
+        platforms: linux/amd64,linux/arm64
+        push: true
+        #${{ github.event_name != 'push' && github.event_name != 'pull_request' }}
+        tags: ${{ steps.meta.outputs.tags }}
+        labels: ${{ steps.meta.outputs.labels }}
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a0c6a24
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,8 @@
+.nextflow*
+work/
+.idea/
+dist/
+archive
+.git
+.gitignore
+__pycache__
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..ee6c1f3
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,43 @@
+FROM python:3.11-slim as basis
+
+# set environment variables
+ENV PYTHONDONTWRITEBYTECODE 1
+ENV PYTHONUNBUFFERED 1
+
+
+FROM basis as builder
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+       build-essential \
+       fasttree \
+       libxml2 \
+       mafft \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /src
+RUN pip install --upgrade hatch pip
+COPY pyproject.toml .
+RUN python -m venv /app \
+    && hatch dep show requirements > requirements.txt \
+    && /app/bin/pip install -r requirements.txt
+
+COPY . .
+RUN ls -la \
+    && hatch build \
+    && ls -la dist/ \
+    && /app/bin/pip install dist/*.whl
+
+
+FROM basis as runtime
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+       fasttree \
+       libxml2 \
+       mafft \
+       procps \
+    && apt-get -y autoremove \
+    && apt-get -y autoclean \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY --from=builder /app /app
+ENV PATH="/app/bin:$PATH"
diff --git a/FastOMA.nf b/FastOMA.nf
index bb9c546..aac82a6 100644
--- a/FastOMA.nf
+++ b/FastOMA.nf
@@ -1,216 +1,324 @@
+import groovy.json.JsonBuilder;
 // NXF_WRAPPER_STAGE_FILE_THRESHOLD='50000'

-params.input_folder = "./in_folder/"
-params.output_folder = "./out_folder/"
+params.input_folder = "testdata/in_folder"
+params.output_folder = "out_folder/"
 params.proteome_folder = params.input_folder + "/proteome"
-params.proteomes = params.proteome_folder + "/*"
 params.hogmap_in = params.input_folder + "/hogmap_in"
-
-params.hogmap_folder = params.output_folder + "/hogmap"
 params.splice_folder = params.input_folder + "/splice"
 params.species_tree = params.input_folder + "/species_tree.nwk"
-params.species_tree_checked = params.output_folder + "/species_tree_checked.nwk"
-params.temp_pickles = params.output_folder + "/temp_pickles"
+
+
+// output subfolder definition
 params.genetrees_folder = params.output_folder + "/genetrees"
+params.hogmap_folder = params.output_folder + "/hogmap"
 params.temp_output = params.output_folder +"/temp_output"  //"/temp_omamer_rhogs"
+
+if (params.help) {
+    log.info """
+    ===========================================
+      FastOMA -- PIPELINE
+    ===========================================
+    Usage:
+        Run the pipeline with default parameters:
+        nextflow run FastOMA.nf
+
+        Run with user parameters:
+        nextflow run FastOMA.nf --input_folder {input.dir} --output_folder {results.dir}
+
+    Mandatory arguments:
+        --input_folder          Input data folder. Defaults to ${params.input_folder}. This folder
+                                must contain the proteomes (in a subfolder named 'proteome') and
+                                a species tree file. Optionally the folder might contain
+                                 - a sub-folder 'splice' containing splicing variant mappings
+                                 - a sub-folder 'hogmap_in' containing precomputed OMAmer
+                                   placement results for all proteomes
+
+                                All sub-folders and sub-files can also be placed in other
+                                locations if you provide alternative values for them (see the
+                                optional arguments section below).
+
+        --output_folder         Path where all the output should be stored. Defaults to
+                                ${params.output_folder}
+
+
+    Profile selection:
+        -profile                FastOMA can be run using several execution profiles. The default
+                                set of available profiles is
+                                 - docker       Run pipeline using docker containers. Docker needs
+                                                to be installed on your system. Containers will be
+                                                fetched automatically from dockerhub. See also
+                                                additional options '--container_version' and
+                                                '--container_name'.
+
+                                 - singularity  Run pipeline using singularity. Singularity needs
+                                                to be installed on your system. On HPC clusters,
+                                                it often needs to be loaded as a separate module.
+                                                Containers will be fetched automatically from
+                                                dockerhub. See also additional options
+                                                '--container_version' and '--container_name'.
+
+                                 - conda        Run pipeline in a conda environment. Conda needs
+                                                to be installed on your system. The environment
+                                                will be created automatically.
+
+                                 - standard     Run pipeline on your local system. Mainly intended
+                                                for development purposes. All dependencies must be
+                                                installed in the calling environment.
+
+                                 - slurm_singularity
+                                                Run pipeline using the SLURM job scheduler and
+                                                singularity containers. This profile can also be a
+                                                template for other HPC clusters that use different
+                                                schedulers.
+
+                                 - slurm_conda  Run pipeline using the SLURM job scheduler and a
+                                                conda environment.
+
+                                Profiles are defined in nextflow.config and can be extended or
+                                adjusted according to your needs.
+
+
+    Additional options:
+        --proteome_folder       Overwrite location of proteomes (default ${params.proteome_folder})
+        --species_tree          Overwrite location of species tree file (newick format).
+                                Defaults to ${params.species_tree}
+        --splice_folder         Overwrite location of splice file folder. The splice files must be
+                                named <species_name>.splice.
+                                Defaults to ${params.splice_folder}
+        --omamer_db             Path or URL to download the OMAmer database from.
+                                Defaults to ${params.omamer_db}
+        --hogmap_in             Optional path where precomputed omamer mapping files are located.
+                                Defaults to ${params.hogmap_in}
+
+    Flags:
+        --help                  Display this message
+        --debug_enabled         Store additional information that might be helpful to debug in case
+                                of a problem with FastOMA.
+        --report                Produce nextflow report and timeline and store it in
+                                ${params.statsdir}
+
+    """.stripIndent()
+
+    exit 1
+}
+
+
+log.info """
+===========================================
+  FastOMA -- PIPELINE
+===========================================
+
+ Project : ${workflow.projectDir}
+ Git info: ${workflow.repository} - ${workflow.revision} [${workflow.commitId}]
+ Cmd line: ${workflow.commandLine}
+ Manifest's pipeline version: ${workflow.manifest.version}
+
+Parameters:
+   input_folder     ${params.input_folder}
+   proteome folder  ${params.proteome_folder}
+   species_tree     ${params.species_tree}
+   splice_folder    ${params.splice_folder}
+   omamer_db        ${params.omamer_db}
+   hogmap_in        ${params.hogmap_in}
+
+   debug_enabled    ${params.debug_enabled}
+   report           ${params.report}
+""".stripIndent()
+
+
 process check_input{
-    time {5.h}
-    memory {200.GB}
-    publishDir params.output_folder, mode: 'copy'
-    input:
-        path proteome_folder
-        path hogmap_folder
-        path species_tree
-        path omamerdb
-        path splice_folder
-    output:
-        path "species_tree_checked.nwk"
-        val true
-    script:
-        """
-        fastoma-check-input
-        """
+    label "process_single"
+    publishDir params.output_folder, mode: 'copy'
+
+    input:
+        path proteome_folder
+        path hogmap_folder
+        path species_tree
+        path omamerdb
+        path splice_folder
+    output:
+        path "species_tree_checked.nwk"
+        val "check_completed"
+    script:
+        """
+        fastoma-check-input --proteomes ${proteome_folder} \
+                            --species-tree ${species_tree} \
+                            --out-tree species_tree_checked.nwk \
+                            --splice ${splice_folder} \
+                            --hogmap ${hogmap_folder} \
+                            --omamer_db ${omamerdb} \
+                            -vv
+        """
 }

 process omamer_run{
-    time {5.h}
-    memory {95.GB}
-    publishDir params.hogmap_folder, mode: 'copy'
-    input:
-        path proteomes_omamerdb_inputhog
-        val ready_input_check_c
-    output:
-        path "*.hogmap"
-        val true
-    script: //todo this if condition can be done as part of nextflow, so it won't submit job for cp
-        """
-        if [ -f ${proteomes_omamerdb_inputhog[2]}/${proteomes_omamerdb_inputhog[0]}.hogmap ]
-        then
-            cp ${proteomes_omamerdb_inputhog[2]}/${proteomes_omamerdb_inputhog[0]}.hogmap ${proteomes_omamerdb_inputhog[0]}.hogmap
-        else
-            omamer search -n 10 --db ${proteomes_omamerdb_inputhog[1]} --query ${proteomes_omamerdb_inputhog[0]} --out ${proteomes_omamerdb_inputhog[0]}.hogmap
-        fi
-        """  // --nthreads 10
+    label "process_single"
+    tag "$proteome"
+    publishDir params.hogmap_folder, mode: 'copy'
+
+    input:
+        tuple path(proteome), path(omamer_db), path(precomputed_hogmap_folder), val(ready)
+
+    output:
+        path "*.hogmap"
+
+    script:
""" + if [ -f ${precomputed_hogmap_folder}/${proteome}.hogmap ] ; then + cp ${precomputed_hogmap_folder}/${proteome}.hogmap ${proteome}.hogmap + else + omamer search --db ${omamer_db} --query ${proteome} --out ${proteome}.hogmap + fi + """ } - process infer_roothogs{ - cpus 10 // useful for linclust - time {15.h} // including linclust - memory {350.GB} - publishDir = [ - path: params.temp_output, - mode: 'copy', // pattern: "temp_output", saveAs: { filename -> filename.equals('temp_omamer_rhogs') ? null : filename } - ] - input: - val ready_omamer_run - path hogmap_folder - path proteome_folder - path splice_folder - output: - path "temp_omamer_rhogs" - path "gene_id_dic_xml.pickle" - path "selected_isoforms" , optional: true // SPECIESNAME_selected_isoforms.tsv - val true // nextflow-io.github.io/patterns/state-dependency/ - script: - """ - fastoma-infer-roothogs --logger-level DEBUG - """ + label "process_single" + publishDir = [ + path: params.temp_output, + enabled: params.debug_enabled, + ] + + input: + path hogmaps, stageAs: "hogmaps/*" + path proteome_folder + path splice_folder + output: + path "omamer_rhogs/*" + path "gene_id_dic_xml.pickle" + path "selected_isoforms" , optional: true + script: + """ + fastoma-infer-roothogs --proteomes ${proteome_folder} \ + --hogmap hogmaps \ + --splice ${splice_folder} \ + --out-rhog-folder "omamer_rhogs/" \ + -vv + """ } process batch_roothogs{ - time {15.h} // including linclust - memory {95.GB} - input: - val ready_infer_roothogs - path "temp_omamer_rhogs" - output: - path "rhogs_rest/*", optional: true - path "rhogs_big/*" , optional: true - val true - script: - """ - fastoma-batch-roothogs - """ + input: + path rhogs, stageAs: "omamer_rhogs/*" + output: + path "rhogs_rest/*", optional: true + path "rhogs_big/*" , optional: true + script: + """ + fastoma-batch-roothogs --input-roothogs omamer_rhogs/ \ + --out-big rhogs_big \ + --out-rest rhogs_rest \ + -vv + """ } process hog_big{ - cpus 6 - time {10.h} // for very big rhog it might need more, or you could re-run and add `-resume` - memory {200.GB} - publishDir params.temp_pickles - input: - val rhogsbig_tree_ready - output: - path "*.pickle" - path "*.fa", optional: true // msa if write True - path "*.nwk", optional: true // gene trees if write True - val true - script: - """ - fastoma-infer-subhogs --input-rhog-folder ${rhogsbig_tree_ready[0]} --species-tree ${rhogsbig_tree_ready[1]} --parallel - """ + label "process_high" + + input: + each rhogsbig + path species_tree + output: + path "pickle_hogs" + path "msa/*.fa" , optional: true // msa if write True + path "gene_trees/*.nwk" , optional: true // gene trees if write True + script: + """ + fastoma-infer-subhogs --input-rhog-folder ${rhogsbig} \ + --species-tree ${species_tree} \ + --fragment-detection \ + --low-so-detection \ + --parallel + """ } -temp_pickles =2 - - process hog_rest{ - time {3.h} // for very big rhog it might need more, or you could re-run and add `-resume` - memory {95.GB} - publishDir params.temp_pickles - input: - val rhogsrest_tree_ready - output: - path "*.pickle" - path "*.fa" , optional: true // msa if write True - path "*.nwk" , optional: true // gene trees if write True - val true - script: - """ - fastoma-infer-subhogs --input-rhog-folder ${rhogsrest_tree_ready[0]} --species-tree ${rhogsrest_tree_ready[1]} - """ // --parrallel False + label "process_single" + + input: + each rhogsrest + path species_tree + output: + path "pickle_hogs" + path "msa/*.fa" , optional: true // msa if write True + path 
"gene_trees/*.nwk" , optional: true // gene trees if write True + script: + """ + fastoma-infer-subhogs --input-rhog-folder ${rhogsrest} \ + --species-tree ${species_tree} \ + --fragment-detection \ + --low-so-detection + #--out pickle_hogs + """ } process collect_subhogs{ - time {10.h} - memory {300.GB} // orthoxml string can be very huge in memory - publishDir params.output_folder, mode: 'copy' - input: - val ready_hog_rest - val ready_hog_big - path "temp_pickles" // this is the folder includes pickles_rhogs - path "gene_id_dic_xml.pickle" - path "temp_omamer_rhogs" - output: - path "output_hog.orthoxml" - path "OrthologousGroupsFasta" - path "OrthologousGroups.tsv" - path "rootHOGs.tsv" - script: - """ - fastoma-collect-subhogs - """ + publishDir params.output_folder, mode: 'copy' + input: + path pickles, stageAs: "pickle_folders/?" + path "gene_id_dic_xml.pickle" + path rhogs, stageAs: "omamer_rhogs/*" + output: + path "output_hog.orthoxml" + path "OrthologousGroupsFasta" + path "OrthologousGroups.tsv" + path "rootHOGs.tsv" + script: + """ + fastoma-collect-subhogs --pickle-folder pickle_folders/ \ + --roothogs-folder omamer_rhogs/ \ + --gene-id-pickle-file gene_id_dic_xml.pickle \ + --out output_hog.orthoxml \ + --marker-groups-fasta OrthologousGroups.tsv \ + --roothog-tsv rootHOGs.tsv \ + -vv + """ } +workflow { + proteome_folder = Channel.fromPath(params.proteome_folder, type: "dir", checkIfExists:true).first() + proteomes = Channel.fromPath(params.proteome_folder + "/*", type:'any', checkIfExists:true) + species_tree = Channel.fromPath(params.species_tree, type: "file", checkIfExists:true).first() + splice_folder = Channel.fromPath(params.splice_folder, type: "dir") + genetrees_folder = Channel.fromPath(params.genetrees_folder, type: 'dir') + hogmap_in = Channel.fromPath(params.hogmap_in, type:'dir') + omamerdb = Channel.fromPath(params.omamer_db) + (species_tree_checked_, ready_input_check) = check_input(proteome_folder, hogmap_in, species_tree, omamerdb, splice_folder) + omamer_input_channel = proteomes.combine(omamerdb).combine(hogmap_in).combine(ready_input_check) + hogmap = omamer_run(omamer_input_channel) -workflow { - proteomes = Channel.fromPath(params.proteomes, type:'any' ,checkIfExists:true) - proteome_folder = Channel.fromPath(params.proteome_folder) - hogmap_folder = Channel.fromPath(params.hogmap_folder) - splice_folder = Channel.fromPath(params.splice_folder) - temp_output = Channel.fromPath(params.temp_output) - genetrees_folder = Channel.fromPath(params.genetrees_folder) - hogmap_in = Channel.fromPath(params.hogmap_in) - - temp_pickles = Channel.fromPath(params.temp_pickles) - omamerdb = Channel.fromPath(params.input_folder+"/omamerdb.h5") - species_tree = Channel.fromPath(params.species_tree) - species_tree_checked = Channel.fromPath(params.species_tree_checked) - - (species_tree_checked_, ready_input_check) = check_input(proteome_folder,hogmap_in,species_tree,omamerdb,splice_folder) - ready_input_check_c = ready_input_check.collect() - //species_tree_checked.view{"species_tree_checked ${it}"} - - proteomes_omamerdb = proteomes.combine(omamerdb) - proteomes_omamerdb_inputhog = proteomes_omamerdb.combine(hogmap_in) // proteomes_omamerdb_inputhog.view{" rhogsbig ${it}"} - //proteomes_omamerdb_inputhog_inputcheck = proteomes_omamerdb_inputhog.combine(ready_input_check_c) - (hogmap, ready_omamer_run)= omamer_run(proteomes_omamerdb_inputhog,ready_input_check_c) - ready_omamer_run_c = ready_omamer_run.collect() - (temp_omamer_rhogs, gene_id_dic_xml,selected_isoforms, 
-    ready_infer_roothogs_c = ready_infer_roothogs.collect()
-
-    (rhogs_rest_list, rhogs_big_list, ready_batch_roothogs) = batch_roothogs(ready_infer_roothogs_c, temp_omamer_rhogs)
-    ready_batch_roothogs_c = ready_batch_roothogs.collect()
-
-
-    rhogsbig = rhogs_big_list.flatten()
-    rhogsbig_tree = rhogsbig.combine(species_tree_checked)
-    rhogsbig_tree_ready = rhogsbig_tree.combine(ready_batch_roothogs)  // rhogsbig_tree_ready.view{"rhogsbig_tree_ready ${it}"}
-    (pickle_big_rhog, msas_out, genetrees_out, ready_hog_big) = hog_big(rhogsbig_tree_ready)
-
-    rhogsrest = rhogs_rest_list.flatten()
-    rhogsrest_tree = rhogsrest.combine(species_tree_checked)
-    rhogsrest_tree_ready = rhogsrest_tree.combine(ready_batch_roothogs_c)
-    (pickle_rest_rhog, msas_out_rest, genetrees_out_test, ready_hog_rest) = hog_rest(rhogsrest_tree_ready)
-
-    (orthoxml_file, OrthologousGroupsFasta, OrthologousGroups_tsv, rootHOGs_tsv) = collect_subhogs(ready_hog_rest.collect(), ready_hog_big.collect(), temp_pickles, gene_id_dic_xml, temp_omamer_rhogs)
-    // temp_omamer_rhogs.view{" output omamer_rhogs ${it}"}
-    // orthoxml_file.view{" output orthoxml file ${it}"}
+    (omamer_rhogs, gene_id_dic_xml, selected_isoforms) = infer_roothogs(hogmap.collect(), proteome_folder, splice_folder)
+
+    (rhogs_rest_batches, rhogs_big_batches) = batch_roothogs(omamer_rhogs)
+
+    (pickle_big_rhog, msa_out_big, genetrees_out_big) = hog_big(rhogs_big_batches.flatten(), species_tree)
+    (pickle_rest_rhog, msas_out_rest, genetrees_out_rest) = hog_rest(rhogs_rest_batches.flatten(), species_tree)
+    channel.empty().concat(pickle_big_rhog, pickle_rest_rhog).set{ all_rhog_pickle }
+
+    (orthoxml_file, OrthologousGroupsFasta, OrthologousGroups_tsv, rootHOGs_tsv) = collect_subhogs(all_rhog_pickle.collect(), gene_id_dic_xml, omamer_rhogs)
 }

+workflow.onComplete {
+    def String report = ( params.report ? "\nNextflow report : ${params.statsdir}" : "");
+    println ""
+    println "Completed at  : $workflow.complete"
+    println "Duration      : $workflow.duration"
+    println "Processes     : $workflow.workflowStats.succeedCount (success), $workflow.workflowStats.failedCount (failed)"
+    println "Output in     : $params.output_folder" + report
+    println ( workflow.success ? "Done!" : "Oops .. something went wrong" )
+}

-// todo: check input files very beginning (before omamer starts) e.g all species are in the species tree. No species chars in fasta record.
diff --git a/FastOMA/_config.py b/FastOMA/_config.py
index 3453348..62db7bd 100644
--- a/FastOMA/_config.py
+++ b/FastOMA/_config.py
@@ -77,6 +77,7 @@

 inferhog_min_hog_size_xml = 2  # by setting this as 1, pyham won't work on xml output.

+# batch_roothogs
 big_rhog_filesize_thresh = 400 * 1000
 sum_list_rhogs_filesize_thresh = 2 * 1e6
diff --git a/FastOMA/_utils_roothog.py b/FastOMA/_utils_roothog.py
index e33cd20..5e87639 100644
--- a/FastOMA/_utils_roothog.py
+++ b/FastOMA/_utils_roothog.py
@@ -1,3 +1,5 @@
+import collections
+import csv
 from Bio import SeqIO
 import pickle
@@ -7,6 +9,7 @@
 from ._utils_subhog import logger_hog
 from .zoo.unionfind import UnionFind
 from . import _config
+HOGMapData = collections.namedtuple("HOGMapData", ("hogid", "score", "seqlen", "subfamily_medianseqlen"))

 import collections

@@ -134,7 +137,7 @@ def get_components(self):
         return comp


-def parse_proteomes(folder="./"):  # list_oma_species
+def parse_proteomes(folder=None):  # list_oma_species
     """
     parsing fasta files of proteins located in /proteome/
     using Bio.SeqIO.parse
     All proteoems should be with the same extension.
     output: prot_recs_lists: a dic with key as species name and its value list of Biopython record of species.
     """
+    if folder is None:
+        folder = "./proteome"

-    project_files = listdir(folder+"/proteome/")
-    species_names = []  # query/input species name based on the file name
+    fasta_format_keep = ""
+    project_files = listdir(folder)
+    logger_hog.debug("using '%s' as proteome folder, found %d files", folder, len(project_files))
+    species_names = []  # query/input species name based on the file name
     for file in project_files:
-        fasta_format = file.split(".")[-1]
-        if fasta_format == "fa" or fasta_format == "fasta":
-            file_name_split = file.split(".")[:-1]
-            species_names.append('.'.join(file_name_split))
-            fasta_format_keep = fasta_format  # last one is stored either fa or fasta
+        species_name, ext = file.rsplit('.', 1)
+        logger_hog.debug("%s: %s | %s", file, species_name, ext)
+        if ext in ("fa", "fasta"):
+            species_names.append(species_name)
+            fasta_format_keep = ext  # last one is stored either fa or fasta
+    # todo accept all fasta formats in the input proteome folder, fasta, fa, fna, ..

     prot_recs_lists = {}  # key: species name, value is a dic of query protein Biopython records.
     # 'MYCGE': [SeqRecord(seq=Seq('MDFDK
     for species_name in species_names:
-        prot_address = folder+"/proteome/" + species_name + "."+fasta_format_keep
+        prot_address = os.path.join(folder, species_name + "." + fasta_format_keep)
         prots_record = list(SeqIO.parse(prot_address, "fasta"))
-        prot_recs_lists[species_name]=prots_record
+        prot_recs_lists[species_name] = prots_record

-    logger_hog.info("There are "+str(len(species_names))+" species in the proteome folder.")
+    logger_hog.info("There are %d species in the proteome folder.", len(species_names))
     return species_names, prot_recs_lists, fasta_format_keep

-
 def add_species_name_prot_id(species_names, prot_recs_lists):
     """
     adding the name of species to each protein record
@@ -203,7 +209,7 @@
     return prot_recs_all


-def parse_hogmap_omamer(species_names, fasta_format_keep, folder="./"):
+def parse_hogmap_omamer(species_names, fasta_format_keep, folder=None):
     """
     function for parsing output of omamer (hogmap files) located in /hogmap/
     Each hogmap file correspond to one fasta file of species, with the same name.
@@ -213,41 +219,29 @@
     # sp|O66907|ATPA_AQUAE	HOG:C0886513.1b	Eukaryota	696.5519485850672	187	0.37302594463771466	0.2643067275644273	120	504	551	0.8230616302186878
     output is dic of dic for all species:
     """
+    if folder is None:
+        folder = "./hogmap"
+
     hogmaps = {}
-    unmapped = {}
+    unmapped = collections.defaultdict(list)
     for species_name in species_names:
-        hogmap_address = folder + "/hogmap/" + species_name + "."+fasta_format_keep+".hogmap"
-        hogmap_file = open(hogmap_address, 'r')
-
-        for line in hogmap_file:
-            line_strip = line.strip()
-            if not line_strip.startswith('!') and not line_strip.startswith('qs'):
-                # qseqid hogid hoglevel family_p family_count family_normcount subfamily_score subfamily_count qseqlen subfamily_medianseqlen qseq_overlap
-                line_split = line_strip.split("\t")
-                prot_id = line_split[0]
-                hogid = line_split[1]
-                score = line_split[3]
-                seqlen = line_split[8]
-                subfamily_medianseqlen = line_split[9]
-
-                if hogid == "N/A":
-                    if species_name in unmapped:
-                        unmapped[species_name].append(prot_id)
-                    else:
-                        unmapped[species_name] = [prot_id]
+        hogmap_address = os.path.join(folder, species_name + "." + fasta_format_keep + ".hogmap")
+        cur_species_hogmap = collections.defaultdict(list)
+        with open(hogmap_address, 'rt') as hogmap_file:
+            reader = csv.DictReader((line for line in hogmap_file if not line.startswith('!')),
+                                    dialect="excel-tab")
+            for row in reader:
+                if row['hogid'] == "N/A":
+                    unmapped[species_name].append(row['qseqid'])
                 else:
-                    if species_name in hogmaps:
-                        if prot_id in hogmaps[species_name]:
-                            hogmaps[species_name][prot_id].append((hogid, score, seqlen, subfamily_medianseqlen))
-
-                        else:
-                            hogmaps[species_name][prot_id] = [(hogid, score, seqlen, subfamily_medianseqlen)]
-                    else:
-                        hogmaps[species_name] = {prot_id: [(hogid, score, seqlen, subfamily_medianseqlen)]}
-
-    logger_hog.info("There are " + str(len(hogmaps)) + " species in the hogmap folder.")
-    logger_hog.info("The first species " + species_names[0] + " contains " + str(len(hogmaps[species_names[0]])) + " proteins.")
-
+                    cur_species_hogmap[row['qseqid']].append(
+                        HOGMapData(row['hogid'], row['family_p'], row['qseqlen'], row['subfamily_medianseqlen']))
+        hogmaps[species_name] = cur_species_hogmap
+        logger_hog.info("hogmap %s: %d proteins mapped to %d hogs, %d not mapped",
+                        species_name, len(cur_species_hogmap), sum(len(x) for x in cur_species_hogmap.values()),
+                        len(unmapped[species_name]))
+
+    logger_hog.info("There are %d species in the hogmap folder.", len(hogmaps))
     return hogmaps, unmapped

@@ -793,14 +787,13 @@ def write_clusters(address_rhogs_folder, min_rhog_size):
     return cluster_iter-1

-
-def parse_isoform_file(species_names, folder="."):
+def parse_isoform_file(species_names, folder=None):
+    if folder is None:
+        folder = "./splice"
     isoform_by_gene_all = {}
     for species_name in species_names:
         # from fasta file
-
-        file_splice_name = folder + "/splice/" + species_name + ".splice"
-
+        file_splice_name = os.path.join(folder, species_name + ".splice")
         if os.path.isfile(file_splice_name):
             # from OMArk
             isoform_by_gene = []
@@ -811,12 +804,10 @@
                 isoform_by_gene.append(splice)
         else:
             isoform_by_gene = []
-
         isoform_by_gene_all[species_name] = isoform_by_gene
     return isoform_by_gene_all

-
 def find_nonbest_isoform(species_names, isoform_by_gene_all, hogmaps):
     isoform_selected = {}
     isoform_not_selected = {}
@@ -866,13 +857,10 @@ def write_isoform_selected(isoform_by_gene_all, isoform_selected, prot_recs_list
     write isoforms
     """
-    selected_isoforms_folder= "selected_isoforms/"
+    selected_isoforms_folder = "selected_isoforms/"
     if not os.path.exists(selected_isoforms_folder):
         os.mkdir(selected_isoforms_folder)
-
-
-
     for species, isoform_selected_sp in isoform_selected.items():
         isoform_by_gene = isoform_by_gene_all[species]
         assert len(isoform_by_gene) == len(isoform_selected_sp)
diff --git a/FastOMA/batch_roothogs.py b/FastOMA/batch_roothogs.py
index 1d76a31..4937071 100644
--- a/FastOMA/batch_roothogs.py
+++ b/FastOMA/batch_roothogs.py
@@ -1,91 +1,66 @@
-
-
-import os
-from os import makedirs
 import shutil
-from os import listdir
-
 from . import _config
-
-
-def list_rhog_fastas(address_rhogs_folder):
-    """
-    create orthoxml_to_newick.py list of rootHOG IDs stored in the folder of rHOG .
-    input: folder address
-    output: list of rhog Id (integer)
-    """
-    rhog_files = listdir(address_rhogs_folder)
-    rhogid_list= []
-    for rhog_file in rhog_files:
-        if rhog_file.split(".")[-1] == "fa" and rhog_file.split("_")[0] == "HOG":
-            rhogid = rhog_file.split(".")[0].split("_")[1]
-            rhogid_list.append(rhogid)
-
-    return rhogid_list
-
-
-
-def folder_1h_rhog(address_rhogs_folder, output_folder_big, output_folder_rest):
-
-    # in_folder = "/work/FAC/FBM/DBC/cdessim2/default/smajidi1/gethog3_qfo/working_nfp/"
-
-    #work_fldor_out = in_folder
-
-    rhogid_list = list_rhog_fastas(address_rhogs_folder)
-    len(rhogid_list)
-    dic_rhognum_size = {}
-    for rhogid in rhogid_list[2:]:
-        rhog_i_prot_address = address_rhogs_folder + "/HOG_" + rhogid + ".fa"
-        # rhog_i = list(SeqIO.parse(rhog_i_prot_address, "fasta"))
-        rhog_i_size = os.path.getsize(rhog_i_prot_address)
-        dic_rhognum_size[rhogid] = rhog_i_size
-    len(dic_rhognum_size)
-
-    # to have at least one big and one rest rhog, this makes nextflow pipline easier,
-    # otherwise we need if condition collect_subhogs won't satisfy if one of them is empty
-
-    list_list_rest_rhog = [[rhogid_list[0]],[]]  # each insid list should take around 1h
-    list_list_rest_size = [[],[]]
-    list_list_big = [rhogid_list[1]]
-
-    for rhognum, size in dic_rhognum_size.items():
-        # print(rhognum, size)
-        if size > _config.big_rhog_filesize_thresh:
-            list_list_big.append(rhognum)
-        else:
-            if sum(list_list_rest_size[-1]) < _config.sum_list_rhogs_filesize_thresh:
-                list_list_rest_rhog[-1].append(rhognum)
-                list_list_rest_size[-1].append(size)
+from pathlib import Path
+logger = _config.logger_hog
+
+
+class BatchBuilder:
+    def __init__(self, outdir: Path, max_size: int):
+        self.outdir = outdir
+        self.max_size = max_size
+
+    def __enter__(self):
+        self.cur_batch = []
+        self.cur_size = 0
+        self.counter = 0
+        self.outdir.mkdir(parents=True, exist_ok=True)
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if len(self.cur_batch) > 0:
+            self._flush()
+
+    def add_hog(self, hog_file: Path):
+        self.cur_batch.append(hog_file)
+        self.cur_size += hog_file.stat().st_size
+        logger.debug("adding %s with size %d to batch %d", hog_file, hog_file.stat().st_size, self.counter)
+        if self.cur_size > self.max_size:
+            self._flush()
+            self.counter += 1
+
+    def _flush(self):
+        batch_dir = self.outdir / str(self.counter)
+        batch_dir.mkdir()
+        for fn in self.cur_batch:
+            shutil.copy(fn, batch_dir)
+        logger.debug("creating batch %s with %d families; total size of files is %d",
+                     batch_dir, len(self.cur_batch), self.cur_size)
+        self.cur_size = 0
+        self.cur_batch = []
+
+
+def folder_1h_rhog(roothog_path: Path, output_folder_big: Path, output_folder_rest: Path):
+    # create a list of hogs in descending filesize order
+    hog_size_tuples = sorted([(f, f.stat().st_size) for f in roothog_path.rglob("*.fa")], key=lambda x: -x[1])
+    with BatchBuilder(output_folder_big, 1) as big_hogs, \
+         BatchBuilder(output_folder_rest, _config.sum_list_rhogs_filesize_thresh) as rest_hogs:
+        for hog, fsize in hog_size_tuples:
+            if fsize > _config.big_rhog_filesize_thresh:
+                big_hogs.add_hog(hog)
             else:
-                list_list_rest_rhog.append([rhognum])
-                list_list_rest_size.append([size])
-
-    # makedirs(work_fldor_out+"rhogs_rest")
-    for folder_id, list_rhog in enumerate(list_list_rest_rhog):
-        # print(folder_id)
-        # output_folder_rest = work_fldor_out + "/rhogs_rest/"
-        makedirs( output_folder_rest + str(folder_id))
-        for rhogid in list_rhog:
-            name = "HOG_" + rhogid + ".fa"
-            folder_rest =output_folder_rest + str(folder_id) + "/"
-            shutil.copy(address_rhogs_folder + name, folder_rest + name)
-
-    # makedirs(work_fldor_out+"rhogs_big")
-    # output_folder_big = work_fldor_out + "rhogs_big/"
-    for folder_id, rhogid in enumerate(list_list_big):
-        name = "HOG_" + rhogid + ".fa"
-        folder_big = output_folder_big + "b" + str(folder_id) + "/"
-        makedirs(folder_big)
-        shutil.copy(address_rhogs_folder + name, folder_big + name)
-
-    return 1
-
+                rest_hogs.add_hog(hog)


 def fastoma_batch_roothogs():
-
-    input_rhog = "./temp_omamer_rhogs/"  #
-    output_folder_big = "./rhogs_big/"
-    output_folder_rest = "./rhogs_rest/"
-    folder_1h_rhog(input_rhog, output_folder_big, output_folder_rest)
+    import argparse
+    parser = argparse.ArgumentParser(description="Analyse roothog families and create batches for analysis")
+    parser.add_argument('--input-roothogs', required=True, help="folder where input roothogs are stored")
+    parser.add_argument('--out-big', required=True, help="folder where the big single family hogs should be stored")
+    parser.add_argument('--out-rest', required=True, help="folder where the remaining families should be stored in "
+                                                          "batch subfolder structure.")
+    parser.add_argument('-v', default=0, action="count", help="increase verbosity")
+    conf = parser.parse_args()
+    logger.setLevel(level=30 - 10 * min(conf.v, 2))
+
+    folder_1h_rhog(Path(conf.input_roothogs), Path(conf.out_big), Path(conf.out_rest))
diff --git a/FastOMA/check_input.py b/FastOMA/check_input.py
index de3b506..e46441a 100644
--- a/FastOMA/check_input.py
+++ b/FastOMA/check_input.py
@@ -1,72 +1,64 @@
-
-
 import sys
 from ete3 import Tree
 import os
 import collections
+import argparse
 from . import _config
 from . import _utils_roothog
-from ._utils_subhog import logger_hog
-
-
-
-"""
-cd in_folder
-python check_input.py
-"""
+logger = _config.logger_hog


-def check_proteome_files():
+def check_proteome_files(proteome_folder):
+    proteome_files = os.listdir(proteome_folder)
+    logger.info("There are %d files in the proteome folder.", len(proteome_files))

-    proteome_files = os.listdir("./proteome/")
-    logger_hog.info("There are " + str(len(proteome_files)) + " files in the proteome folder. ")
-
-    not_fa = [i for i in proteome_files if not (i.endswith(".fa") or i.endswith(".fasta") )]
+    not_fa = [i for i in proteome_files if not (i.endswith(".fa") or i.endswith(".fasta"))]
     if not_fa:
-        logger_hog.warning("We expect that only fa/fasta files are in the proteome folder. Better to remove these " + str(not_fa) )
+        logger.warning("We expect that only fa/fasta files are in the proteome folder. Better to remove these %s", not_fa)
Better to remove these %s", not_fa) - fa_fasta = [i.split(".")[-1] for i in proteome_files if i.endswith(".fa") or i.endswith(".fasta")] - if len(fa_fasta)<=2: - logger_hog.error("There are not enough proteomes in the folder ") - logger_hog.error("Check input failed. FastOMA halted!") - sys.exit(1) + fa_fasta = [i.split(".")[-1] for i in proteome_files if i.endswith(".fa") or i.endswith(".fasta")] + if len(fa_fasta) <= 2: + logger.error("There are not enough proteomes in the folder ") + logger.error("Check input failed. FastOMA halted!") + return False - if len(set(fa_fasta))>1: - logger_hog.warning("We expect that all fasta files are with the same format either fa or fasta, we won't include some of them " + str(proteome_files) ) + if len(set(fa_fasta)) > 1: + logger.warning("We expect that all fasta files are with the same format either fa or fasta, we won't include some of them %s.", proteome_files) + return False - species_names1= [".".join(i.split(".")[:-1]) for i in proteome_files if i.endswith(".fa") or i.endswith(".fasta")] - return species_names1 + species_names = [os.path.splitext(i)[0] for i in proteome_files if i.endswith(".fa") or i.endswith(".fasta")] + return species_names -def check_proteome(species_names,prot_recs_lists): +def check_proteome(species_names, prot_recs_lists, proteome_folder): # ids_set=set() + isOk = True num_prots_all = 0 for species_name in species_names: num_prots = len(prot_recs_lists[species_name]) - num_prots_all+=num_prots + num_prots_all += num_prots # todo , check duplicated Ids for seq in prot_recs_lists[species_name]: # if seq.id in ids_set: report duplciated ids_set.add(seq.id), if num_prots <= 2: - logger_hog.error("The input proteome looks empty or too few proteins or Bio.SeqIO couldn't read it, in_folder/proteome/" + species_name + "."+fasta_format_keep) + logger.error("The input proteome looks empty or too few proteins or Bio.SeqIO couldn't read it, %s/%s.fa", proteome_folder, species_name) + isOk = False # todo write new protoems with cleaned record ids, keep the mapping, to be used in orthoxml writing # use the mapping back for orhtoxml - logger_hog.error("There are "+str(num_prots_all)+" proteins in total in the input proteome folder. ") - return 1 - + logger.info("There are %d proteins in total in the input proteome folder.", num_prots_all) + return isOk -def check_hogmap_files(): - - hogmap_files = os.listdir("./hogmap_in/") - logger_hog.info("There are " + str(len(hogmap_files)) + " files in the hogmap folder. ") - not_hogmap = [i for i in hogmap_files if not i.endswith(".hogmap") ] +def check_hogmap_files(hogmap_folder): + hogmap_files = os.listdir(hogmap_folder) + logger.info("There are %d files in the hogmap folder.", len(hogmap_files)) + not_hogmap = [i for i in hogmap_files if not i.endswith(".hogmap")] if not_hogmap: - logger_hog.warning("We expect that only hogmap files are in the hogmap_in folder. Better to remove these " + str(not_hogmap) ) - - species_hogmaps = ["".join(i.split(".")[:-2]) for i in hogmap_files if i.endswith(".hogmap")] - logger_hog.info("There are "+str(len(species_hogmaps))+" hogmaps.") + logger.warning("We expect that only hogmap files are in the %s folder. 
Better to remove these %s", + hogmap_folder, not_hogmap) + species_hogmaps = ["".join(i.split(".")[:-2]) for i in hogmap_files if i.endswith(".hogmap")] + logger.info("There are "+str(len(species_hogmaps))+" hogmaps.") return species_hogmaps @@ -78,63 +70,63 @@ def check_speciestree_internalnode(species_tree): if not node.is_leaf(): internal_node_name = [] if (not node.name) or len(node.name) < 1: - logger_hog.warning("one of the internal node in species tree doesn't have a name. we'll update the species tree.") + logger.warning("one of the internal node in species tree doesn't have a name. we'll update the species tree.") else: internal_node_name.append(node.name) if len(internal_node_name) != len(set(internal_node_name)): - logger_hog.warning("All the internal node names should be unique. One of the internal node is repeated:") - logger_hog.warning([item for item, count in collections.Counter(internal_node_name).items() if count > 1]) - logger_hog.warning("We'll change the internal node names.") - - - + logger.warning("All the internal node names should be unique. One of the internal node is repeated:") + logger.warning([item for item, count in collections.Counter(internal_node_name).items() if count > 1]) + logger.warning("We'll change the internal node names.") return 1 -def check_speciestree_leaves(species_tree,species_names): +def check_speciestree_leaves(species_tree, species_names): leaves_name = [i.name for i in species_tree.get_leaves()] - logger_hog.info("The species tree has " + str(len(leaves_name))+" leaves.") - if len(set(leaves_name))!=len(leaves_name): - logger_hog.error("Leaves name should be unique in the species tree. You may try ete3 prune") - logger_hog.error("Check input failed. FastOMA halted!") - sys.exit(1) + logger.info("The species tree has %d leaves.", len(leaves_name)) + if len(set(leaves_name)) != len(leaves_name): + logger.error("Leaves name should be unique in the species tree. You may try ete3 prune") + logger.error("Check input failed. FastOMA halted!") + return False - species_names_not_intree = [i for i in species_names if i not in leaves_name] + species_names_not_intree = [i for i in species_names if i not in leaves_name] if species_names_not_intree: - logger_hog.error("These species are not in the species tree "+ str(species_names_not_intree)) - logger_hog.error("This is a tree template: ((AQUAE,CHLTR)inter1,MYCGE)inter2; for proteome folder with these three files: AQUAE.fa CHLTR.fa MYCGE.fa. ") - logger_hog.error("Check input failed. FastOMA halted!") - sys.exit(1) - leaves_tree_not_proteome = [i for i in leaves_name if i not in species_names] + logger.error("These species are not in the species tree:\n %s", "\n ".join(species_names_not_intree)) + logger.error("This is a tree template: ((AQUAE,CHLTR)inter1,MYCGE)inter2; for proteome folder with these three files: AQUAE.fa CHLTR.fa MYCGE.fa. ") + logger.error("Check input failed. FastOMA halted!") + return False + + leaves_tree_not_proteome = [i for i in leaves_name if i not in species_names] if leaves_tree_not_proteome: - logger_hog.warning("there are "+str(len(leaves_tree_not_proteome))+" leaves in the species tree that there is no proteome. So we will discard them.") + logger.warning("there are %d leaves in the species tree that there is no proteome. So we will discard them.", len(leaves_tree_not_proteome)) + return True - return 1 -def check_omamer_db(omamerdb_adress="omamerdb.h5"): +def check_omamer_db(omamerdb_adress=None): + if omamerdb_adress is None: + logger.warning("omamer_db not passed. 
+        return True

-    if os.path.exists(omamerdb_adress):
+    if os.path.exists(omamerdb_adress):
         if os.path.getsize(omamerdb_adress) > 10000:  # 3 bytes
             omamerdb = True
             # todo we can do some checks on version omamer v2
         else:
-            logger_hog.warning("The omamer db looks very small. are you sure it is correct?"+omamerdb_adress)
+            logger.warning("The omamer db looks very small. Are you sure it is correct? %s", omamerdb_adress)
             omamerdb = False
     else:
         omamerdb =False
-        logger_hog.info("OMAmer db does not exist.")
+        logger.info("OMAmer db does not exist.")
     return omamerdb


-def add_internal_node_prune(species_tree,species_names):
-
+def add_internal_node_prune(species_tree, species_names, out_fname):
     # add name for the internal, if no name is provided, removintg special chars
     counter_internal = 0
     node_names = set()
     for node in species_tree.traverse(strategy="postorder"):
-        if not node.is_leaf() :
+        if not node.is_leaf():
             node_name = node.name

             if len(node_name) < 3 or node_name in node_names:
@@ -142,27 +134,26 @@
                 node.name = "internal_" + str(counter_internal)
                 counter_internal += 1
                 node_names.add(node.name)
-                logger_hog.debug("The internal node name was too small or repeated "+node_name+" which is changed to "+node.name)
-            elif not all(e.isalnum() or e=="_" or e=="-" for e in node_name):  #any(not c.isalnum() for c in node_name):
-                node_name_new = ''.join(e for e in node_name if e.isalnum() or e=="_" or e=="-")  # removing special chars
+                logger.debug("The internal node name was too small or repeated "+node_name+" which is changed to "+node.name)
+            elif not all(e.isalnum() or e in ("_", "-") for e in node_name):  #any(not c.isalnum() for c in node_name):
+                node_name_new = ''.join(e for e in node_name if e.isalnum() or e in ("_", "-"))  # removing special chars
                 if node_name_new in node_names:
-                    node.name =node_name_new+"_"+str(counter_internal)
+                    node_name_new += "_" + str(counter_internal)
                     counter_internal += 1
-                else:
-                    node.name=node_name_new
-
+                logger.debug("The internal node name has special chars or is repeated '%s' which is changed to '%s'",
+                             node.name, node_name_new)
+                node.name = node_name_new
                 node_names.add(node.name)
-                logger_hog.debug("The internal node name has special chars or repeated " + node_name + " which is changed to " + node.name)
             else:
                 node_names.add(node_name)

-    logger_hog.info("The species tree has " + str(len(species_tree)) + " leaves")
-    species_tree.prune(species_names)  # , preserve_branch_length=True)
-    logger_hog.info("After pruning, the species tree has " + str(len(species_tree)) + " leaves")
+    logger.info("The species tree has %d leaves", len(species_tree))
+    species_tree.prune(species_names)  # , preserve_branch_length=True)
+    logger.info("After pruning, the species tree has %d leaves", len(species_tree))

-    species_tree_output= "./species_tree_checked.nwk"
-    species_tree.write(format=1, format_root_node=True, outfile=species_tree_output)
-    logger_hog.debug("The new species tree is written "+species_tree_output)
+    species_tree.write(format=1, format_root_node=True, outfile=out_fname)
+    logger.debug("The new species tree is written %s", out_fname)
     return species_tree

@@ -180,76 +171,83 @@
             total_isoforms+=len(isoform)
             total_genes+=1

-    logger_hog.debug("For "+str(spliced_species_num)+" species, out of "+str(len(isoform_by_gene_all))+" , we have splice files.")
"+str(spliced_species_num)+" species, out of "+str(len(isoform_by_gene_all))+" , we have splice files.") if total_genes ==0: - logger_hog.debug("It seems that in all of the splice files, each line has only one item. Make sure that the splitter in each line is semicolon ; ") + logger.debug("It seems that in all of the splice files, each line has only one item. Make sure that the splitter in each line is semicolon ; ") sys.exit(1) - logger_hog.debug("In total, for "+ str(total_genes)+" genes, we have " + str(total_isoforms)+" splices.") + logger.debug("In total, for "+ str(total_genes)+" genes, we have " + str(total_isoforms)+" splices.") # make sys back return 1 - - - - def fastoma_check_input(): - _config.set_configs() - - # print(_config.in_folder) - print(_config.logger_level) - - print(_config.species_tree_address) # nwk format - - species_names1 = check_proteome_files() + import argparse + parser = argparse.ArgumentParser(description="checking parameters for FastOMA") + parser.add_argument("--proteomes", required=True, help="Path to the folder containing the input proteomes") + parser.add_argument("--species-tree", required=True, help="Path to the input species tree file in newick format") + parser.add_argument("--out-tree", required=True, help="Path to output file for sanitised species tree. ") + parser.add_argument("--splice", help="Path to the folder containing the splice information files") + parser.add_argument("--hogmap", help="Path to the folder containing the hogmap files") + parser.add_argument("--omamer_db", help="Path to the omamer database") + parser.add_argument('-v', action="count", default=0, help="Increase verbosity to info/debug") + conf = parser.parse_args() + logger.setLevel(level=30 - 10 * min(conf.v, 2)) + logger.debug("Arguments: %s", conf) + + species_names = check_proteome_files(conf.proteomes) + if not species_names: + logger.error("Halting FastOMA because of invalid proteome input data") + sys.exit(1) try: - species_tree = Tree(_config.species_tree_address, format=1) + species_tree = Tree(conf.species_tree, format=1) except: try: - species_tree = Tree(_config.species_tree_address) + species_tree = Tree(conf.species_tree) # todo add check fro Phyloxml except: - logger_hog.error("We have problem with parsing species tree "+str(_config.species_tree_address) + " using ete3 Tree. Is it in the in_folder ? Maybe there are some special chars.") + logger.error("We have problem with parsing species tree %s using ete3 Tree. 
+            sys.exit(1)

     check_speciestree_internalnode(species_tree)
-    check_speciestree_leaves(species_tree,species_names1)
-    add_internal_node_prune(species_tree,species_names1)
+    if not check_speciestree_leaves(species_tree, species_names):
+        logger.error("Halting FastOMA because of invalid species tree")
+        sys.exit(1)
+    add_internal_node_prune(species_tree, species_names, conf.out_tree)

-    species_names, prot_recs_lists, fasta_format_keep = _utils_roothog.parse_proteomes()  # optional input folder
-    check_proteome(species_names, prot_recs_lists)
-    hogmap_files = os.path.exists("./hogmap_in/")
-    species_hogmaps =[]
+    species_names, prot_recs_lists, fasta_format_keep = _utils_roothog.parse_proteomes(conf.proteomes)
+    if not check_proteome(species_names, prot_recs_lists, conf.proteomes):
+        logger.error("Halting FastOMA because of invalid proteome input data")
+        sys.exit(1)
+
+    hogmap_files = conf.hogmap is not None and os.path.exists(conf.hogmap)
+    species_hogmaps = []
     if hogmap_files:
-        species_hogmaps = check_hogmap_files()
+        species_hogmaps = check_hogmap_files(conf.hogmap)

-    #print("species_hogmaps",species_hogmaps)
-    #print("species_names",species_names)
     hogmap_complete = False
     if set(species_hogmaps) == set(species_names):
         hogmap_complete = True
-    # print("hogmap_complete", hogmap_complete)

-    omamerdb = check_omamer_db()
+    omamerdb = check_omamer_db(conf.omamer_db)
     if not(omamerdb or hogmap_complete):
-        logger_hog.error("OMAmer db does not exit and no hogmap provided. ")
-        logger_hog.error("Check input failed. FastOMA halted!")
+        logger.error("OMAmer db does not exist and no hogmap provided.")
+        logger.error("Check input failed. FastOMA halted!")
         sys.exit(1)
     else:
-        logger_hog.info("OMAmer db or hogmap exist. It looks ok. ")
+        logger.info("OMAmer db or hogmap exist. It looks ok.")

-    #todo check splice file format . if the name matches with the proteome files.
-    splice_files = os.path.exists("./splice/")
+    # todo check splice file format, and whether the names match the proteome files.
+    splice_files = conf.splice is not None and os.path.exists(conf.splice)
     if splice_files:
-        logger_hog.debug("splice folder exist. Let's see its inside.")
-        isoform_by_gene_all = _utils_roothog.parse_isoform_file(species_names)
+        logger.debug("splice folder exists. Let's look inside.")
+        isoform_by_gene_all = _utils_roothog.parse_isoform_file(species_names, conf.splice)
         check_splice(isoform_by_gene_all)
     else:
-        logger_hog.info("Splice folder doesn't exist and that's ok.")
-
-
-    logger_hog.info("Input check finished ! ")
+        logger.info("Splice folder doesn't exist and that's ok.")
+    logger.info("Input check finished!")
diff --git a/FastOMA/collect_subhogs.py b/FastOMA/collect_subhogs.py
index b173a22..6fbe840 100644
--- a/FastOMA/collect_subhogs.py
+++ b/FastOMA/collect_subhogs.py
@@ -6,6 +6,7 @@
 from ete3 import Tree
 import os
 from Bio import SeqIO
+from pathlib import Path

 from FastOMA.zoo.hog import extract_flat_groups_at_level
 from FastOMA.zoo.hog.convert import orthoxml_to_newick
@@ -16,7 +17,26 @@
 # This code collect subhogs and writes outputs.
+def load_hogs(pickle_folder: Path):
+    hogs_all = []
+    cnt = 0
+    logger_hog.info("reading pickle files from %s", pickle_folder)
+    for root, dirs, files in os.walk(pickle_folder, followlinks=True):
+        for f in files:
+            if f.startswith('file_') and f.endswith('.pickle'):
+                with open(os.path.join(root, f), 'rb') as handle:
+                    cur = pickle.load(handle)
+                    hogs_all.extend(cur)
+                    cnt += 1
+                    if cnt % 500 == 0:
+                        logger_hog.info("read %d batch pickle files so far, resulting in %d roothogs so far",
+                                        cnt, len(hogs_all))
+    logger_hog.info("number of pickle files is %d.", cnt)
+    logger_hog.debug("number of hogs in all batches is %d", len(hogs_all))
+    return hogs_all
+
+
-def max_og_tree(tree,species_dic):
+def max_og_tree(tree, species_dic):
     for node in tree.traverse("preorder"):
         # for node in xml_tree.traverse(strategy="preorder", is_leaf_fn=lambda n: hasattr(n, "attriremoved") and n.attriremoved==True):
         if not node.is_leaf() and hasattr(node, "Ev") and node.Ev == 'duplication':  # node.name[:3] == "dup"
@@ -59,34 +79,48 @@
     return og_prot_list

-
-
 def fastoma_collect_subhogs():
-
-
-
-    # todo this function needs >200 GB of memory for >1500 species. needs some optimisation probably
-
-    logger_hog.info("started collecting pickle files ")
-
+    import argparse
+    parser = argparse.ArgumentParser(description="collect all computed HOGs and combine them into a single orthoxml")
+    parser.add_argument('--pickle-folder', required=True,
+                        help="folder containing the pickle files. Will be searched recursively")
+    parser.add_argument('--roothogs-folder', required=True, help="folder containing the omamer roothogs")
+    parser.add_argument('--gene-id-pickle-file', required=True,
+                        help="file containing the gene-id dictionary in pickle format")
+    parser.add_argument('--out', required=False, default="output_hog.orthoxml", help="output filename in orthoxml")
+    parser.add_argument('-v', action="count", default=0)
+    parser.add_argument('--roothog-tsv', default=None, required=False,
+                        help="If specified, a tsv file with the given path will be produced containing the roothog assignments as TSV file.")
+    parser.add_argument('--marker-groups-fasta', default=None, required=False,
+                        help="If specified, a folder named OrthologousGroupsFasta and a TSV file with the name provided in "
+                             "this argument will be generated that contains single copy groups, i.e. groups which have "
+                             "at most one gene per species. "
+                             "Useful as phylogenetic marker genes to reconstruct species trees.")
+    conf = parser.parse_args()
+    logger_hog.setLevel(level=30 - 10 * min(conf.v, 2))
+    logger_hog.debug(conf)
+
+    write_hog_orthoxml(conf.pickle_folder, conf.out, conf.gene_id_pickle_file)
+    if conf.roothog_tsv is not None:
+        write_roothogs(conf.out, conf.roothog_tsv)
+    if conf.marker_groups_fasta is not None:
+        write_group_files(Path(conf.out), Path(conf.roothogs_folder), conf.marker_groups_fasta)
+
+
+def write_hog_orthoxml(pickle_folder, output_xml_name, gene_id_pickle_file):
     # todo as input argument/option in nextflow
     # in benchamrk dataset the output prot names should be short tr|A0A0N7KCI6|A0A0N7KCI6_ORYSJ
     # for qfo benchmark, the middle should be written in the file
-    pickle_folder = "./temp_pickles/"  #pickle_rhogs
-    output_xml_name = "./output_hog.orthoxml"
-    gene_id_pickle_file = "./gene_id_dic_xml.pickle"  # this file includes integer mapping of each protein
-
     orthoxml_file = ET.Element("orthoXML", attrib={"xmlns": "http://orthoXML.org/2011/", "origin": "OMA", "originVersion": "Nov 2021", "version": "0.3"})  #

     with open(gene_id_pickle_file, 'rb') as handle:
         gene_id_name = pickle.load(handle)
         # gene_id_name[query_species_name] = (gene_idx_integer, query_prot_name)
-    logger_hog.debug("We read the gene_id_name dictionary with"+str(len(gene_id_name))+"items")
+    logger_hog.debug("We read the gene_id_name dictionary with %d items", len(gene_id_name))
     logger_hog.debug("Now creating the header of orthoxml")

     # #### create the header of orthoxml ####
@@ -104,22 +138,8 @@
     logger_hog.debug("gene_xml is created.")

     # #### create the groups of orthoxml ####
-    pickle_files_adress_raw = listdir(pickle_folder)
-    pickle_files_adress = [i for i in pickle_files_adress_raw if i.endswith(".pickle") and i.startswith("file_")]
-
-    logger_hog.info("number of pickle files is "+str(len(pickle_files_adress))+".")
-    logger_hog.debug("pickle files are " + str(len(pickle_files_adress)) + ".")
-    hogs_a_rhog_xml_all = []
-    for pickle_file_adress in pickle_files_adress:
-        with open(pickle_folder + pickle_file_adress, 'rb') as handle:
-            hogs_a_rhog_xml_batch = pickle.load(handle)  # list of hog object.
-            hogs_a_rhog_xml_all.extend(hogs_a_rhog_xml_batch)
-    # hogs_rhogs_xml_all is orthoxml_to_newick.py list of hog object.
-
-    logger_hog.debug("number of hogs in all batches is "+str(len(hogs_a_rhog_xml_all))+" .")

     groups_xml = ET.SubElement(orthoxml_file, "groups")
-
-    for hogs_a_rhog_xml in hogs_a_rhog_xml_all:
+    for hogs_a_rhog_xml in load_hogs(Path(pickle_folder)):
         groups_xml.append(hogs_a_rhog_xml)
     logger_hog.debug("converting the xml object to string.")

@@ -130,93 +150,93 @@
     file_xml.write(xml_str)
     file_xml.close()

-    logger_hog.info("orthoxml is written in " + output_xml_name)
+    logger_hog.info("orthoxml is written in %s", output_xml_name)


+def write_group_files(orthoxml: Path, roothog_folder: Path, output_file_og_tsv=None, output_fasta_groups=None):
+    if output_file_og_tsv is None:
+        output_file_og_tsv = "OrthologousGroups.tsv"
+    if output_fasta_groups is None:
+        output_fasta_groups = "OrthologousGroupsFasta"
+    output_fasta_groups = Path(output_fasta_groups)
+    logger_hog.info("Start writing OG fasta files ")

-
-
-
-    logger_hog.info("Now writing OG fasta files ")
-
-
-    input_orthoxml = output_xml_name  # sys.argv[1]  # "out_folder/output_hog_.orthoxml"
-    rhog_all_folder = "./temp_omamer_rhogs/"  #sys.argv[2] + "/"  # "out_folder/rhogs_all/"
     fasta_format = "fa"  # of the rhogs
-    output_file_og_tsv = "OrthologousGroups.tsv"
-
-    trees, species_dic = orthoxml_to_newick(input_orthoxml, return_gene_to_species=True)  # encode_levels_as_nhx=False, xref_tag="protId",
-    print("We extracted " + str(len(trees)) + " trees in NHX format from the input HOG orthoxml" + input_orthoxml)
+    trees, species_dic = orthoxml_to_newick(orthoxml, return_gene_to_species=True)
+    logger_hog.info("We extracted %d trees in NHX format from the input HOG orthoxml %s", len(trees), orthoxml)

     OGs = {}
     for hog_id, tree_string in trees.items():
-        try :
+        try:
             tree = Tree(tree_string, format=1)
         except:
             try:
-                tree = Tree(tree_string, format=1,quoted_node_names=True)
+                tree = Tree(tree_string, format=1, quoted_node_names=True)
             except:
-                print("error", tree_string)
+                logger_hog.error("Error loading tree: %s", tree_string)
+                raise

-        og_prot_list = max_og_tree(tree,species_dic)
-        if len(og_prot_list) >= 2:  # a group should have at least 1 member/protein
+        og_prot_list = max_og_tree(tree, species_dic)
+        if len(og_prot_list) >= 2:  # a group should have at least 2 members/proteins
             OGs[hog_id] = og_prot_list

-    print("done")
+    logger_hog.info("done extracting trees")

     with open(output_file_og_tsv, 'w') as handle:
         for hog_id, og_prot_list in OGs.items():
             line_text = str(hog_id) + "\t" + str(og_prot_list)[1:-1] + "\n"
             handle.write(line_text)
-    handle.close()
-    print("We wrote the protein families information in the file " + output_file_og_tsv)
+    logger_hog.info("We wrote the protein families information in the file %s", output_file_og_tsv)

-    out_folder_ogs = "OrthologousGroupsFasta/"
-    os.makedirs(out_folder_ogs)
-
-    print("start writing " + str(len(OGs)) + " OGs as fasta files in folder " + out_folder_ogs)
+    output_fasta_groups.mkdir(parents=True, exist_ok=True)
+    logger_hog.info("start writing %d OGs as fasta files in folder %s.", len(OGs), output_fasta_groups)
     for hog_id, og_prot_list in OGs.items():  # hog_id="HOG_0667494_sub10524"
         rhog_id = "_".join(hog_id.split("_")[:2])
-        omamer_rhogs_all_address = rhog_all_folder + rhog_id + "." + fasta_format
-        omamer_rhogs_all_prots = list(SeqIO.parse(omamer_rhogs_all_address, "fasta"))
+        rhog_fasta = roothog_folder / (rhog_id + "." + fasta_format)
+ fasta_format) + omamer_rhogs_all_prots = list(SeqIO.parse(rhog_fasta, "fasta")) og_prots = [] og_prot_list = OGs[hog_id] for rhogs_prot in omamer_rhogs_all_prots: - if rhogs_prot.id.split("||")[0] in og_prot_list: + prot_id = rhogs_prot.id + if _config.protein_format_qfo_dataset_before2022: + prot_id = prot_id.split('|')[1] + + if prot_id in og_prot_list: sp = rhogs_prot.id.split("||")[1] rhogs_prot.description += " [" + sp + "]" og_prots.append(rhogs_prot) og_id = "OG_" + hog_id # one OG per rootHOG # "/HOG_"+ rhogid - SeqIO.write(og_prots, out_folder_ogs + og_id + ".fa", "fasta") - print("writing done") - + SeqIO.write(og_prots, output_fasta_groups / (og_id + ".fa"), "fasta") + logger_hog.info("writing done") +def write_roothogs(orthoxml, out): # import sys # input_orthoxml = output_xml_name - output_file = "rootHOGs.tsv" + if out is None: + out = "rootHOGs.tsv" toplevel_groups = [] - for grp in extract_flat_groups_at_level(input_orthoxml): + for grp in extract_flat_groups_at_level(orthoxml): toplevel_groups.append(set(g.xref for g in grp)) - # toplevel_groups is a list of sets - - print("We extracted "+str(len(toplevel_groups))+" protein families from the input HOG orthoxml"+input_orthoxml) - print("The first one contain "+str(len(toplevel_groups[0]))+" proteins.") + logger_hog.info("We extracted %d protein families from the input HOG orthoxml %s", len(toplevel_groups), orthoxml) + logger_hog.info("The first one contains %d proteins.", len(toplevel_groups[0])) - with open(output_file, 'w') as handle: + with open(out, 'w') as handle: for toplevel_group_idx, toplevel_group in enumerate(toplevel_groups): line_text = str(toplevel_group_idx)+"\t"+str(toplevel_group)[1:-1]+"\n" handle.write(line_text) - handle.close() - print("We wrote the protein families information in the file "+output_file) + logger_hog.info("We wrote the protein families information in the file %s", out) +if __name__ == "__main__": + fastoma_collect_subhogs() \ No newline at end of file diff --git a/FastOMA/infer_roothogs.py b/FastOMA/infer_roothogs.py index 35b3cf7..0decb1c 100644 --- a/FastOMA/infer_roothogs.py +++ b/FastOMA/infer_roothogs.py @@ -1,11 +1,9 @@ - - import os.path from shutil import which -from ._utils_subhog import logger_hog from . import _utils_roothog from . import _config +logger = _config.logger_hog """ proteomes of species as fasta files in /proteome/ @@ -14,58 +12,47 @@ """ -# if not os.path.exists(_config.in_folder): -# os.mkdir(_config.in_folder) # s - -# in_folder+"omamer_database/oma_path/OmaServer.h5" -# logger_hog.info("rHOG inferece has started.
The oma database address is in " + _config.oma_database_address) -# (oma_db, list_oma_species) = _utils_rhog.parse_oma_db(_config.oma_database_address) -# (query_species_names, query_prot_recs) = _utils_rhog.parse_proteome(list_oma_species) - - - def fastoma_infer_roothogs(): - _config.set_configs() - - # print(_config.in_folder) - print(_config.logger_level) - # import sys - # sys.exit(0) - - folder = "/scratch/smajidi1/qfo/23Nov/merging_v5/work/00/f7751409536d34dd1ba3192b79d6d1" - species_names, prot_recs_lists,fasta_format_keep = _utils_roothog.parse_proteomes(folder) # optional input folder + import argparse + parser = argparse.ArgumentParser(description="FastOMA: infer rootHOGs") + parser.add_argument("--proteomes", required=True, help="Path to the folder containing the input proteomes") + parser.add_argument("--splice", help="Path to the folder containing the splice information files") + parser.add_argument("--hogmap", help="Path to the folder containing the hogmap files") + parser.add_argument("--out-rhog-folder", required=True, help="Folder where the roothog fasta files are written") + parser.add_argument('-v', action="count", default=0, help="Increase verbosity to info/debug") + conf = parser.parse_args() + logger.setLevel(level=30 - 10 * min(conf.v, 2)) + logger.debug("Arguments: %s", conf) + + species_names, prot_recs_lists, fasta_format_keep = _utils_roothog.parse_proteomes(conf.proteomes) # optional input folder prot_recs_all = _utils_roothog.add_species_name_prot_id(species_names, prot_recs_lists) - hogmaps, unmapped = _utils_roothog.parse_hogmap_omamer(species_names,fasta_format_keep,folder) # optional input folder + hogmaps, unmapped = _utils_roothog.parse_hogmap_omamer(species_names, fasta_format_keep, folder=conf.hogmap) # optional input folder - splice_files = os.path.exists("./splice/") + splice_files = conf.splice is not None and os.path.exists(conf.splice) if splice_files: - isoform_by_gene_all = _utils_roothog.parse_isoform_file(species_names) - isoform_selected, isoform_not_selected = _utils_roothog.find_nonbest_isoform(species_names,isoform_by_gene_all,hogmaps) - _utils_roothog.write_isoform_selected(isoform_by_gene_all, isoform_selected,prot_recs_lists) + isoform_by_gene_all = _utils_roothog.parse_isoform_file(species_names, folder=conf.splice) + isoform_selected, isoform_not_selected = _utils_roothog.find_nonbest_isoform( + species_names, isoform_by_gene_all, hogmaps + ) + _utils_roothog.write_isoform_selected(isoform_by_gene_all, isoform_selected, prot_recs_lists) # for each isoform file, there will be a file ending with _selected_isoforms.tsv - hogmaps = _utils_roothog.handle_splice(hogmaps,isoform_not_selected) - + hogmaps = _utils_roothog.handle_splice(hogmaps, isoform_not_selected) rhogs_prots = _utils_roothog.group_prots_roothogs(hogmaps) - - rhogs_prots = _utils_roothog.handle_singleton(rhogs_prots,hogmaps) + rhogs_prots = _utils_roothog.handle_singleton(rhogs_prots, hogmaps) rhogs_prots = _utils_roothog.merge_rhogs(hogmaps, rhogs_prots) rhogs_prots = _utils_roothog.filter_big_roothogs(hogmaps, rhogs_prots) - address_rhogs_folder = "./temp_omamer_rhogs/" - min_rhog_size =2 - rhogid_written_list = _utils_roothog.write_rhog(rhogs_prots, prot_recs_all, address_rhogs_folder, min_rhog_size) - linclust_available=which("mmseqs") # True # + min_rhog_size = 2 + rhogid_written_list = _utils_roothog.write_rhog(rhogs_prots, prot_recs_all, conf.out_rhog_folder, min_rhog_size) + linclust_available = which("mmseqs") # if mmseqs is not installed the output
will be empty / None if linclust_available: num_unmapped_singleton = _utils_roothog.collect_unmapped_singleton(rhogs_prots, unmapped, prot_recs_all, "singleton_unmapped.fa") if num_unmapped_singleton: result_linclust = _utils_roothog.run_linclust(fasta_to_cluster="singleton_unmapped.fa") - logger_hog.debug(" linclust is done "+ result_linclust) - num_clusters = _utils_roothog.write_clusters(address_rhogs_folder, min_rhog_size) - logger_hog.debug("we wrote "+str(num_clusters)+" new clusters with linclust ") - - # if _config.add_outgroup: # already done - # _utils_roothog.write_outgroups_all(rhogs_prots,prot_recs_all) \ No newline at end of file + logger.debug(" linclust is done %s", result_linclust) + num_clusters = _utils_roothog.write_clusters(conf.out_rhog_folder, min_rhog_size) + logger.debug("we wrote %d new clusters with linclust ", num_clusters) diff --git a/FastOMA/infer_subhogs.py b/FastOMA/infer_subhogs.py index 6d5e9a6..d2ad48c 100644 --- a/FastOMA/infer_subhogs.py +++ b/FastOMA/infer_subhogs.py @@ -45,7 +45,7 @@ def fastoma_infer_subhogs(): if inferhog_concurrent_on: print("parallelization for subhog inference is on.") - pickles_rhog_folder = "./" # pickles_temp/ pickle_rhogs + pickles_rhog_folder = "./pickle_hogs" # pickles_temp/ pickle_rhogs if not os.path.exists(pickles_rhog_folder): os.makedirs(pickles_rhog_folder) diff --git a/FastOMA/requirements b/FastOMA/requirements deleted file mode 100644 index fcba280..0000000 --- a/FastOMA/requirements +++ /dev/null @@ -1,166 +0,0 @@ -$ conda list -# packages in environment at /miniconda/envs/fastoma: -# -# Name Version Build Channel -_libgcc_mutex 0.1 conda_forge conda-forge -_openmp_mutex 4.5 2_gnu conda-forge -anyio 3.7.1 pyhd8ed1ab_0 conda-forge -argon2-cffi 23.1.0 pyhd8ed1ab_0 conda-forge -argon2-cffi-bindings 21.2.0 py39hb9d737c_3 conda-forge -arrow 1.2.3 pyhd8ed1ab_0 conda-forge -asttokens 2.2.1 pyhd8ed1ab_0 conda-forge -async-lru 2.0.4 pyhd8ed1ab_0 conda-forge -attrs 23.1.0 pyh71513ae_1 conda-forge -babel 2.12.1 pyhd8ed1ab_1 conda-forge -backcall 0.2.0 pyh9f0ad1d_0 conda-forge -backports 1.0 pyhd8ed1ab_3 conda-forge -backports.functools_lru_cache 1.6.5 pyhd8ed1ab_0 conda-forge -beautifulsoup4 4.12.2 pyha770c72_0 conda-forge -biopython 1.81 pypi_0 pypi -bleach 6.0.0 pyhd8ed1ab_0 conda-forge -blosc2 2.0.0 pypi_0 pypi -brotli-python 1.0.9 py39h5a03fae_9 conda-forge -bzip2 1.0.8 h7f98852_4 conda-forge -ca-certificates 2023.7.22 hbcca054_0 conda-forge -cached-property 1.5.2 hd8ed1ab_1 conda-forge -cached_property 1.5.2 pyha770c72_1 conda-forge -certifi 2023.7.22 pyhd8ed1ab_0 conda-forge -cffi 1.15.1 py39he91dace_3 conda-forge -charset-normalizer 3.2.0 pyhd8ed1ab_0 conda-forge -comm 0.1.4 pyhd8ed1ab_0 conda-forge -cython 3.0.1 pypi_0 pypi -debugpy 1.6.8 py39h3d6467e_0 conda-forge -decorator 5.1.1 pyhd8ed1ab_0 conda-forge -defusedxml 0.7.1 pyhd8ed1ab_0 conda-forge -dendropy 4.6.1 pypi_0 pypi -entrypoints 0.4 pyhd8ed1ab_0 conda-forge -ete3 3.1.3 pypi_0 pypi -exceptiongroup 1.1.3 pyhd8ed1ab_0 conda-forge -executing 1.2.0 pyhd8ed1ab_0 conda-forge -fastoma 0.0.6 dev_0 -fasttree 2.1.11 h031d066_2 bioconda -fqdn 1.5.1 pyhd8ed1ab_0 conda-forge -future 0.18.3 pypi_0 pypi -humanfriendly 10.0 pypi_0 pypi -idna 3.4 pyhd8ed1ab_0 conda-forge -importlib-metadata 6.8.0 pyha770c72_0 conda-forge -importlib_metadata 6.8.0 hd8ed1ab_0 conda-forge -importlib_resources 6.0.1 pyhd8ed1ab_0 conda-forge -ipykernel 6.25.1 pyh71e2992_0 conda-forge -ipython 8.14.0 pyh41d4057_0 conda-forge -iqtree 2.2.3 h21ec9f0_0 bioconda -isoduration 20.11.0 pyhd8ed1ab_0 
conda-forge -jedi 0.19.0 pyhd8ed1ab_0 conda-forge -jinja2 3.1.2 pyhd8ed1ab_1 conda-forge -json5 0.9.14 pyhd8ed1ab_0 conda-forge -jsonpointer 2.0 py_0 conda-forge -jsonschema 4.19.0 pyhd8ed1ab_1 conda-forge -jsonschema-specifications 2023.7.1 pyhd8ed1ab_0 conda-forge -jsonschema-with-format-nongpl 4.19.0 pyhd8ed1ab_1 conda-forge -jupyter-lsp 2.2.0 pyhd8ed1ab_0 conda-forge -jupyter_client 8.3.0 pyhd8ed1ab_0 conda-forge -jupyter_core 5.3.1 py39hf3d152e_0 conda-forge -jupyter_events 0.7.0 pyhd8ed1ab_2 conda-forge -jupyter_server 2.7.1 pyhd8ed1ab_0 conda-forge -jupyter_server_terminals 0.4.4 pyhd8ed1ab_1 conda-forge -jupyterlab 4.0.5 pyhd8ed1ab_0 conda-forge -jupyterlab_pygments 0.2.2 pyhd8ed1ab_0 conda-forge -jupyterlab_server 2.24.0 pyhd8ed1ab_0 conda-forge -ld_impl_linux-64 2.40 h41732ed_0 conda-forge -libffi 3.4.2 h7f98852_5 conda-forge -libgcc-ng 13.1.0 he5830b7_0 conda-forge -libgomp 13.1.0 he5830b7_0 conda-forge -libnsl 2.0.0 h7f98852_0 conda-forge -libsodium 1.0.18 h36c2ea0_1 conda-forge -libsqlite 3.43.0 h2797004_0 conda-forge -libstdcxx-ng 13.1.0 hfd8a6a1_0 conda-forge -libuuid 2.38.1 h0b41bf4_0 conda-forge -libzlib 1.2.13 hd590300_5 conda-forge -llvmlite 0.40.1 pypi_0 pypi -lxml 4.9.3 pypi_0 pypi -mafft 7.520 h031d066_2 bioconda -markupsafe 2.1.3 py39hd1e30aa_0 conda-forge -matplotlib-inline 0.1.6 pyhd8ed1ab_0 conda-forge -mistune 3.0.1 pyhd8ed1ab_0 conda-forge -msgpack 1.0.5 pypi_0 pypi -nbclient 0.8.0 pyhd8ed1ab_0 conda-forge -nbconvert-core 7.7.4 pyhd8ed1ab_0 conda-forge -nbformat 5.9.2 pyhd8ed1ab_0 conda-forge -ncurses 6.4 hcb278e6_0 conda-forge -nest-asyncio 1.5.6 pyhd8ed1ab_0 conda-forge -nextflow 23.4.3 pypi_0 pypi -notebook-shim 0.2.3 pyhd8ed1ab_0 conda-forge -numba 0.57.1 pypi_0 pypi -numexpr 2.8.5 pypi_0 pypi -numpy 1.24.4 pypi_0 pypi -omamer 0.2.6 pypi_0 pypi -openssl 3.1.2 hd590300_0 conda-forge -overrides 7.4.0 pyhd8ed1ab_0 conda-forge -packaging 23.1 pyhd8ed1ab_0 conda-forge -pandas 2.0.3 pypi_0 pypi -pandocfilters 1.5.0 pyhd8ed1ab_0 conda-forge -parso 0.8.3 pyhd8ed1ab_0 conda-forge -pexpect 4.8.0 pyh1a96a4e_2 conda-forge -pickleshare 0.7.5 py_1003 conda-forge -pip 23.2.1 pyhd8ed1ab_0 conda-forge -pkgutil-resolve-name 1.3.10 pyhd8ed1ab_0 conda-forge -platformdirs 3.10.0 pyhd8ed1ab_0 conda-forge -prometheus_client 0.17.1 pyhd8ed1ab_0 conda-forge -prompt-toolkit 3.0.39 pyha770c72_0 conda-forge -prompt_toolkit 3.0.39 hd8ed1ab_0 conda-forge -property-manager 3.0 pypi_0 pypi -psutil 5.9.5 py39h72bdee0_0 conda-forge -ptyprocess 0.7.0 pyhd3deb0d_0 conda-forge -pure_eval 0.2.2 pyhd8ed1ab_0 conda-forge -py-cpuinfo 9.0.0 pypi_0 pypi -pycparser 2.21 pyhd8ed1ab_0 conda-forge -pygments 2.16.1 pyhd8ed1ab_0 conda-forge -pyparsing 3.1.1 pypi_0 pypi -pysais 1.1.0 pypi_0 pypi -pysocks 1.7.1 pyha2e5f31_6 conda-forge -python 3.9.17 h0755675_0_cpython conda-forge -python-dateutil 2.8.2 pyhd8ed1ab_0 conda-forge -python-fastjsonschema 2.18.0 pyhd8ed1ab_0 conda-forge -python-json-logger 2.0.7 pyhd8ed1ab_0 conda-forge -python_abi 3.9 3_cp39 conda-forge -pytz 2023.3 pyhd8ed1ab_0 conda-forge -pyyaml 6.0.1 py39hd1e30aa_0 conda-forge -pyzmq 25.1.1 py39hb257651_0 conda-forge -readline 8.2 h8228510_1 conda-forge -referencing 0.30.2 pyhd8ed1ab_0 conda-forge -requests 2.31.0 pyhd8ed1ab_0 conda-forge -rfc3339-validator 0.1.4 pyhd8ed1ab_0 conda-forge -rfc3986-validator 0.1.1 pyh9f0ad1d_0 conda-forge -rpds-py 0.9.2 py39h9fdd4d6_0 conda-forge -scipy 1.11.2 pypi_0 pypi -send2trash 1.8.2 pyh41d4057_0 conda-forge -setuptools 68.1.2 pyhd8ed1ab_0 conda-forge -six 1.16.0 pyh6c4a22f_0 conda-forge -sniffio 1.3.0 
pyhd8ed1ab_0 conda-forge -soupsieve 2.3.2.post1 pyhd8ed1ab_0 conda-forge -stack_data 0.6.2 pyhd8ed1ab_0 conda-forge -tables 3.8.0 pypi_0 pypi -terminado 0.17.1 pyh41d4057_0 conda-forge -tinycss2 1.2.1 pyhd8ed1ab_0 conda-forge -tk 8.6.12 h27826a3_0 conda-forge -tomli 2.0.1 pyhd8ed1ab_0 conda-forge -tornado 6.3.3 py39hd1e30aa_0 conda-forge -tqdm 4.66.1 pypi_0 pypi -traitlets 5.9.0 pyhd8ed1ab_0 conda-forge -typing-extensions 4.7.1 hd8ed1ab_0 conda-forge -typing_extensions 4.7.1 pyha770c72_0 conda-forge -typing_utils 0.1.0 pyhd8ed1ab_0 conda-forge -tzdata 2023.3 pypi_0 pypi -uri-template 1.3.0 pyhd8ed1ab_0 conda-forge -urllib3 2.0.4 pyhd8ed1ab_0 conda-forge -verboselogs 1.7 pypi_0 pypi -wcwidth 0.2.6 pyhd8ed1ab_0 conda-forge -webcolors 1.13 pyhd8ed1ab_0 conda-forge -webencodings 0.5.1 py_1 conda-forge -websocket-client 1.6.2 pyhd8ed1ab_0 conda-forge -wheel 0.41.2 pyhd8ed1ab_0 conda-forge -xz 5.2.6 h166bdaf_0 conda-forge -yaml 0.2.5 h7f98852_2 conda-forge -zeromq 4.3.4 h9c3ff4c_1 conda-forge -zipp 3.16.2 pyhd8ed1ab_0 conda-forge -linclust \ No newline at end of file diff --git a/FastOMA/zoo/unionfind.py b/FastOMA/zoo/unionfind.py new file mode 100644 index 0000000..83036e1 --- /dev/null +++ b/FastOMA/zoo/unionfind.py @@ -0,0 +1,124 @@ +import collections + +"""UnionFind.py + +Union-find data structure. Based on Josiah Carlson's code, +http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/215912 +with significant additional changes by D. Eppstein and +Adrian Altenhoff. +""" + + +class UnionFind(object): + """Union-find data structure. + + Each unionFind instance X maintains a family of disjoint sets of + hashable objects, supporting the following two methods: + + - X[item] returns a name for the set containing the given item. + Each set is named by an arbitrarily-chosen one of its members; as + long as the set remains unchanged it will keep the same name. If + the item is not yet part of a set in X, a new singleton set is + created for it. + + - X.union(item1, item2, ...) merges the sets containing each item + into a single larger set. If any item is not yet part of a set + in X, it is added to X as one of the members of the merged set. + """ + + def __init__(self, elements=None): + """Create a new union-find structure. + + If elements is not None, the structure gets initialized + with each element as a singleton component. + + :param elements: an iterable to initialize the structure. + """ + + self.weights = {} + self.parents = {} + if elements is not None: + for elem in iter(elements): + self.parents[elem] = elem + self.weights[elem] = 1 + + def __getitem__(self, obj): + """return the name of the set which contains obj. + + :param obj: the query object + + :SeeAlso: :meth:`find`""" + return self.find(obj) + + def find(self, obj): + """Find and return the name of the set containing the obj. + + If the object is not found in any set, a new singleton set + is created that holds only this object until it is further merged.""" + + # check for previously unknown obj. If unknown, add it + # as a new cluster + if obj not in self.parents: + self.parents[obj] = obj + self.weights[obj] = 1 + return obj + + # find path of objects leading to the root + path = [obj] + root = self.parents[obj] + while root != path[-1]: + path.append(root) + root = self.parents[root] + + # compress the path and return + for ancestor in path: + self.parents[ancestor] = root + return root + + def remove(self, obj): + """Remove an object from the sets. + + Removes an object entirely from the data structure.
The + containing set will shrink by this one element. + + :Note: If one tries to access it afterwards using + :meth:`find`, it will be newly created and put back as a + singleton. + """ + if obj not in self.parents: + return + comp = self.find(obj) + self.weights[comp] -= 1 + self.parents.pop(obj) + + def __iter__(self): + """Iterate through all items ever found or unioned by this structure.""" + return iter(self.parents) + + def union(self, *objects): + """Find the sets containing the objects and merge them. + + Any number of objects can be passed to this method and + all of them will be merged into one set containing at + least these objects. + + :param objects: the objects to be merged. They all have to be + hashable. If they haven't been initially added to the UnionFind + data structure at instantiation time, they are added at this + point. + """ + roots = [self[x] for x in objects] + heaviest = max([(self.weights[r], r) for r in roots], key=lambda x: x[0])[1] + for r in roots: + if r != heaviest: + self.weights[heaviest] += self.weights[r] + self.parents[r] = heaviest + + def get_components(self): + """return a list of sets corresponding to the connected + components of the structure.""" + comp_dict = collections.defaultdict(set) + for elem in iter(self): + comp_dict[self[elem]].add(elem) + comp = list(comp_dict.values()) + return comp diff --git a/FastOMA_light.nf b/FastOMA_light.nf deleted file mode 100644 index 44bf9d6..0000000 --- a/FastOMA_light.nf +++ /dev/null @@ -1,215 +0,0 @@ - -// NXF_WRAPPER_STAGE_FILE_THRESHOLD='50000' - -params.input_folder = "./in_folder/" -params.output_folder = "./out_folder/" -params.proteome_folder = params.input_folder + "/proteome" -params.proteomes = params.proteome_folder + "/*" -params.hogmap_in = params.input_folder + "/hogmap_in" - -params.hogmap_folder = params.output_folder + "/hogmap" -params.splice_folder = params.input_folder + "/splice" -params.species_tree = params.input_folder + "/species_tree.nwk" -params.species_tree_checked = params.output_folder + "/species_tree_checked.nwk" - -params.temp_pickles = params.output_folder + "/temp_pickles" -params.genetrees_folder = params.output_folder + "/genetrees" - -params.temp_output = params.output_folder +"/temp_output" //"/temp_omamer_rhogs" - - -process check_input{ - - - publishDir params.output_folder, mode: 'copy' - input: - path proteome_folder - path hogmap_folder - path species_tree - path omamerdb - path splice_folder - output: - path "species_tree_checked.nwk" - val true - script: - """ - fastoma-check-input - """ -} - - -process omamer_run{ - - - publishDir params.hogmap_folder, mode: 'copy' - input: - path proteomes_omamerdb_inputhog - val ready_input_check_c - output: - path "*.hogmap" - val true - script: //todo this if condition can be done as part of nextflow, so it won't submit job for cp - """ - if [ -f ${proteomes_omamerdb_inputhog[2]}/${proteomes_omamerdb_inputhog[0]}.hogmap ] - then - cp ${proteomes_omamerdb_inputhog[2]}/${proteomes_omamerdb_inputhog[0]}.hogmap ${proteomes_omamerdb_inputhog[0]}.hogmap - else - omamer search -n 10 --db ${proteomes_omamerdb_inputhog[1]} --query ${proteomes_omamerdb_inputhog[0]} --out ${proteomes_omamerdb_inputhog[0]}.hogmap - fi - """ // --nthreads 10 -} - - - -process infer_roothogs{ - - - publishDir = [ - path: params.temp_output, - mode: 'copy', // pattern: "temp_output", saveAs: { filename -> filename.equals('temp_omamer_rhogs') ?
null : filename } - ] - input: - val ready_omamer_run - path hogmap_folder - path proteome_folder - path splice_folder - output: - path "temp_omamer_rhogs" - path "gene_id_dic_xml.pickle" - path "selected_isoforms" , optional: true // SPECIESNAME_selected_isoforms.tsv - val true // nextflow-io.github.io/patterns/state-dependency/ - script: - """ - fastoma-infer-roothogs --logger-level DEBUG - """ -} - - -process batch_roothogs{ - - - input: - val ready_infer_roothogs - path "temp_omamer_rhogs" - output: - path "rhogs_rest/*", optional: true - path "rhogs_big/*" , optional: true - val true - script: - """ - fastoma-batch-roothogs - """ -} - -process hog_big{ - cpus 2 - - - publishDir params.temp_pickles - input: - val rhogsbig_tree_ready - output: - path "*.pickle" - path "*.fa", optional: true // msa if write True - path "*.nwk", optional: true // gene trees if write True - val true - script: - """ - fastoma-infer-subhogs --input-rhog-folder ${rhogsbig_tree_ready[0]} --species-tree ${rhogsbig_tree_ready[1]} --parallel - """ -} -temp_pickles =2 - - - -process hog_rest{ - - - publishDir params.temp_pickles - input: - val rhogsrest_tree_ready - output: - path "*.pickle" - path "*.fa" , optional: true // msa if write True - path "*.nwk" , optional: true // gene trees if write True - val true - script: - """ - fastoma-infer-subhogs --input-rhog-folder ${rhogsrest_tree_ready[0]} --species-tree ${rhogsrest_tree_ready[1]} - """ // --parrallel False -} - - -process collect_subhogs{ - - - publishDir params.output_folder, mode: 'copy' - input: - val ready_hog_rest - val ready_hog_big - path "temp_pickles" // this is the folder includes pickles_rhogs - path "gene_id_dic_xml.pickle" - path "temp_omamer_rhogs" - output: - path "output_hog.orthoxml" - path "OrthologousGroupsFasta" - path "OrthologousGroups.tsv" - path "rootHOGs.tsv" - script: - """ - fastoma-collect-subhogs - """ -} - - - - -workflow { - proteomes = Channel.fromPath(params.proteomes, type:'any' ,checkIfExists:true) - proteome_folder = Channel.fromPath(params.proteome_folder) - hogmap_folder = Channel.fromPath(params.hogmap_folder) - splice_folder = Channel.fromPath(params.splice_folder) - temp_output = Channel.fromPath(params.temp_output) - genetrees_folder = Channel.fromPath(params.genetrees_folder) - hogmap_in = Channel.fromPath(params.hogmap_in) - - temp_pickles = Channel.fromPath(params.temp_pickles) - omamerdb = Channel.fromPath(params.input_folder+"/omamerdb.h5") - species_tree = Channel.fromPath(params.species_tree) - species_tree_checked = Channel.fromPath(params.species_tree_checked) - - (species_tree_checked_, ready_input_check) = check_input(proteome_folder,hogmap_in,species_tree,omamerdb,splice_folder) - ready_input_check_c = ready_input_check.collect() - //species_tree_checked.view{"species_tree_checked ${it}"} - - proteomes_omamerdb = proteomes.combine(omamerdb) - proteomes_omamerdb_inputhog = proteomes_omamerdb.combine(hogmap_in) // proteomes_omamerdb_inputhog.view{" rhogsbig ${it}"} - //proteomes_omamerdb_inputhog_inputcheck = proteomes_omamerdb_inputhog.combine(ready_input_check_c) - (hogmap, ready_omamer_run)= omamer_run(proteomes_omamerdb_inputhog,ready_input_check_c) - ready_omamer_run_c = ready_omamer_run.collect() - (temp_omamer_rhogs, gene_id_dic_xml,selected_isoforms, ready_infer_roothogs) = infer_roothogs(ready_omamer_run_c, hogmap_folder, proteome_folder, splice_folder) - ready_infer_roothogs_c = ready_infer_roothogs.collect() - - (rhogs_rest_list, rhogs_big_list, ready_batch_roothogs) = 
batch_roothogs(ready_infer_roothogs_c, temp_omamer_rhogs) - ready_batch_roothogs_c = ready_batch_roothogs.collect() - - - rhogsbig = rhogs_big_list.flatten() - rhogsbig_tree = rhogsbig.combine(species_tree_checked) - rhogsbig_tree_ready = rhogsbig_tree.combine(ready_batch_roothogs) // rhogsbig_tree_ready.view{"rhogsbig_tree_ready ${it}"} - (pickle_big_rhog, msas_out, genetrees_out, ready_hog_big) = hog_big(rhogsbig_tree_ready) - - rhogsrest = rhogs_rest_list.flatten() - rhogsrest_tree = rhogsrest.combine(species_tree_checked) - rhogsrest_tree_ready = rhogsrest_tree.combine(ready_batch_roothogs_c) - (pickle_rest_rhog, msas_out_rest, genetrees_out_test, ready_hog_rest) = hog_rest(rhogsrest_tree_ready) - - (orthoxml_file, OrthologousGroupsFasta, OrthologousGroups_tsv, rootHOGs_tsv) = collect_subhogs(ready_hog_rest.collect(), ready_hog_big.collect(), temp_pickles, gene_id_dic_xml, temp_omamer_rhogs) - // temp_omamer_rhogs.view{" output omamer_rhogs ${it}"} - // orthoxml_file.view{" output orthoxml file ${it}"} - -} - - - -// todo: check input files very beginning (before omamer starts) e.g all species are in the species tree. No species chars in fasta record. diff --git a/README.md b/README.md index 88a75c9..99828af 100644 --- a/README.md +++ b/README.md @@ -54,9 +54,22 @@ The details of output are described [below](https://github.com/DessimozLab/FastO In summary, you need to 1) install FastOMA and its prerequisites (below), and 2) put the input files in the folder `in_folder` and 3) run FastOMA using the nextflow recipe `FastOMA_light.nf`. ``` -nextflow FastOMA_light.nf --input_folder /path/to/in_folder --output_folder /path/to/out_folder +nextflow run FastOMA_light.nf -profile docker --input_folder /path/to/in_folder --output_folder /path/to/out_folder ``` -The script `FastOMA_light.nf` is tailored for a few species. To run FastOMA with hundreds of species, please use `FastOMA.nf`. +The script `FastOMA_light.nf` is tailored for a few species. To run FastOMA with hundreds of species, please use `FastOMA.nf`. + +## More details on how to run +For every commit of the repository we provide a Docker image for FastOMA on Docker Hub. You can select the container as +part of the nextflow command with the parameter `container_version`. To use the image that matches your current +git checkout, specify it as follows: + +```bash +nextflow run FastOMA.nf -profile docker \ + --container_version "sha-$(git rev-list --max-count=1 --abbrev-commit HEAD)" \ + --input_folder testdata/in_folder \ + --output_folder myresult/ +``` + + # How to install FastOMA diff --git a/conf/base.config b/conf/base.config new file mode 100644 index 0000000..c92b610 --- /dev/null +++ b/conf/base.config @@ -0,0 +1,55 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + dessimozlab/FastOMA Nextflow base config file +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + A 'blank slate' config file, appropriate for general use on most high performance + compute environments. Assumes that all software is installed and available on + the PATH. Runs in `local` mode - all jobs will be run on the logged-in environment.
+---------------------------------------------------------------------------------------- +*/ + +process { + + cpus = { check_max( 1 * task.attempt, 'cpus' ) } + memory = { check_max( 6.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } + shell = ['/bin/bash', '-euo', 'pipefail'] + + errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } + maxRetries = 1 + maxErrors = '-1' + + withLabel:process_single { + cpus = { check_max( 1 , 'cpus' ) } + memory = { check_max( 12.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } + } + withLabel:process_low { + cpus = { check_max( 2 * task.attempt, 'cpus' ) } + memory = { check_max( 12.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } + } + withLabel:process_medium { + cpus = { check_max( 6 * task.attempt, 'cpus' ) } + memory = { check_max( 36.GB * task.attempt, 'memory' ) } + time = { check_max( 8.h * task.attempt, 'time' ) } + } + withLabel:process_high { + cpus = { check_max( 12 * task.attempt, 'cpus' ) } + memory = { check_max( 72.GB * task.attempt, 'memory' ) } + time = { check_max( 16.h * task.attempt, 'time' ) } + } + withLabel:process_long { + time = { check_max( 20.h * task.attempt, 'time' ) } + } + withLabel:process_high_memory { + memory = { check_max( 200.GB * task.attempt, 'memory' ) } + } + withLabel:error_ignore { + errorStrategy = 'ignore' + } + withLabel:error_retry { + errorStrategy = 'retry' + maxRetries = 2 + } +} \ No newline at end of file diff --git a/environment-conda.yml b/environment-conda.yml new file mode 100644 index 0000000..906210a --- /dev/null +++ b/environment-conda.yml @@ -0,0 +1,12 @@ +name: fastoma-env +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - omamer + - mafft + - fasttree + - pip + - pip: + - . 
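The `UnionFind` class added in `FastOMA/zoo/unionfind.py` earlier in this diff ships without a usage example; the short sketch below illustrates the merge/lookup cycle its methods support. The protein identifiers are made up for illustration.

```python
# Illustrative use of the UnionFind class from FastOMA/zoo/unionfind.py.
from FastOMA.zoo.unionfind import UnionFind

uf = UnionFind(["p1", "p2", "p3", "p4"])  # four singleton components
uf.union("p1", "p2")                      # merge the sets containing p1 and p2
uf.union("p2", "p3")                      # p1, p2 and p3 now share one component
assert uf["p1"] == uf["p3"]               # same representative element for both
print(uf.get_components())                # e.g. [{'p1', 'p2', 'p3'}, {'p4'}], order not guaranteed
```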
diff --git a/nextflow.config b/nextflow.config new file mode 100644 index 0000000..7529c85 --- /dev/null +++ b/nextflow.config @@ -0,0 +1,128 @@ +// General configuration used in all profiles +manifest { + name = "FastOMA - Orthology inference" + description = """FastOMA computes Hierarchical Orthologous Groups from proteomes.""" + author = "Sina Majidian, Adrian Altenhoff" + homePage = "https://omabrowser.org" + mainScript = "FastOMA.nf" + nextflowVersion = "22.10.4" +} + +params { + container_name = "dessimozlab/fastoma" + container_version = "latest" + omamer_db = "https://omabrowser.org/All/Primates-v2.0.0.h5" + debug_enabled = false + help = false + report = false + + output_folder = "Output" + statsdir = "${params.output_folder}/stats" + + // Max resource options + // Defaults only, expecting to be overwritten + max_memory = '128.GB' + max_cpus = 24 + max_time = '120.h' +} + +// Profiles configure nextflow depending on the environment (local, docker, singularity) +profiles { + + docker { + process { + container = "$params.container_name:$params.container_version" + } + docker.enabled = true + } + + singularity { + process { + container = "$params.container_name:$params.container_version" + } + singularity.enabled = true + singularity.autoMounts = true + } + + standard { + process.executor = 'local' + } + + conda { + process.conda = "environment-conda.yml" + conda.enabled = true + } + + slurm_singularity { + process { + container = "$params.container_name:$params.container_version" + executor = "slurm" + time = 4.h + memory = 20.GB + } + singularity.enabled = true + singularity.autoMounts = true + } + + slurm_conda { + process { + conda = "environment-conda.yml" + executor = "slurm" + time = 4.h + memory = 20.GB + } + conda.enabled = true + } +} + +def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') +timeline { + enabled = params.report + file = "${params.statsdir}/timeline_${trace_timestamp}.html" +} +report { + enabled = params.report + file = "${params.statsdir}/report_${trace_timestamp}.html" +} +trace { + enabled = params.report + file = "${params.statsdir}/trace_${trace_timestamp}.txt" +} +dag { + enabled = params.report + file = "${params.statsdir}/pipeline_dag_${trace_timestamp}.html" +} + +includeConfig "conf/base.config" + +// function to check maximum resources +def check_max(obj, type) { + if (type == 'memory') { + try { + if (obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1) + return params.max_memory as nextflow.util.MemoryUnit + else + return obj + } catch (all) { + println " ### ERROR ### Max memory '${params.max_memory}' is not valid! Using default value: $obj" + return obj + } + } else if (type == 'time') { + try { + if (obj.compareTo(params.max_time as nextflow.util.Duration) == 1) + return params.max_time as nextflow.util.Duration + else + return obj + } catch (all) { + println " ### ERROR ### Max time '${params.max_time}' is not valid! Using default value: $obj" + return obj + } + } else if (type == 'cpus') { + try { + return Math.min( obj, params.max_cpus as int ) + } catch (all) { + println " ### ERROR ### Max cpus '${params.max_cpus}' is not valid!
Using default value: $obj" + return obj + } + } +} \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..fc5e48e --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,48 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "FastOMA" +dynamic = ["version"] +description = "FastOMA - a package to infer orthology information among proteomes" +readme = "README.md" +license = "MIT" +requires-python = ">=3.8" +authors = [ + { name = "Sina Majidian", email = "sina.majidian@gmail.com" }, + { name = "Adrian Altenhoff", email = "adrian.altenhoff@inf.ethz.ch" } +] +dependencies = [ + "biopython ~=1.81", + "DendroPy ~=4.3", + "ete3 ~=3.1", + "lxml ~=4.3", + "omamer ~=2.0", + "pyham ~=1.1", + "pyparsing", +] + +[project.optional-dependencies] +nextflow = [ + "nextflow" +] + + +[project.scripts] +fastoma-batch-roothogs = "FastOMA.batch_roothogs:fastoma_batch_roothogs" +fastoma-check-input = "FastOMA.check_input:fastoma_check_input" +fastoma-collect-subhogs = "FastOMA.collect_subhogs:fastoma_collect_subhogs" +fastoma-infer-roothogs = "FastOMA.infer_roothogs:fastoma_infer_roothogs" +fastoma-infer-subhogs = "FastOMA.infer_subhogs:fastoma_infer_subhogs" + +[project.urls] +Homepage = "https://github.com/DessimozLab/FastOMA" + +[tool.hatch.version] +path = "FastOMA/__init__.py" + +[tool.hatch.build.targets.sdist] +include = [ + "/FastOMA", +]
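Each entry in the `[project.scripts]` table above maps a console command to a module-level function, so the five `fastoma-*` commands become available on the PATH once the wheel is installed. A hypothetical smoke test (assuming the package has been installed, e.g. with `pip install .`) that merely checks each entry point resolves to a callable:

```python
# Hypothetical smoke test: verify the console-script targets are importable callables.
from importlib import import_module

ENTRY_POINTS = {
    "fastoma-batch-roothogs": "FastOMA.batch_roothogs:fastoma_batch_roothogs",
    "fastoma-check-input": "FastOMA.check_input:fastoma_check_input",
    "fastoma-collect-subhogs": "FastOMA.collect_subhogs:fastoma_collect_subhogs",
    "fastoma-infer-roothogs": "FastOMA.infer_roothogs:fastoma_infer_roothogs",
    "fastoma-infer-subhogs": "FastOMA.infer_subhogs:fastoma_infer_subhogs",
}

for script, target in ENTRY_POINTS.items():
    module_name, func_name = target.split(":")
    func = getattr(import_module(module_name), func_name)
    assert callable(func), f"{script} does not resolve to a callable"
print("all entry points resolve")
```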