From 8069caa954485a8324e36a068021b5b3ee1c8398 Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Wed, 27 Jul 2022 16:55:52 +0200 Subject: [PATCH] eggnog mapper: implement search and annotation phase as separate tools for rationale see README --- tools/eggnog_mapper/eggnog_macros.xml | 468 +++++++++++++++++- tools/eggnog_mapper/eggnog_mapper/README | 21 + .../eggnog_mapper/eggnog_mapper.xml | 358 +------------- .../eggnog_mapper/eggnog_mapper_annotate.xml | 155 ++++++ .../eggnog_mapper/eggnog_mapper_search.xml | 111 +++++ 5 files changed, 763 insertions(+), 350 deletions(-) create mode 100644 tools/eggnog_mapper/eggnog_mapper/README create mode 100644 tools/eggnog_mapper/eggnog_mapper/eggnog_mapper_annotate.xml create mode 100644 tools/eggnog_mapper/eggnog_mapper/eggnog_mapper_search.xml diff --git a/tools/eggnog_mapper/eggnog_macros.xml b/tools/eggnog_mapper/eggnog_macros.xml index ac49d0b28..8a9142865 100644 --- a/tools/eggnog_mapper/eggnog_macros.xml +++ b/tools/eggnog_mapper/eggnog_macros.xml @@ -3,6 +3,7 @@ 2.1.8 3 5.0.2 + 22.01 + @@ -105,15 +127,316 @@ python '${__tool_directory__}/data_manager_eggnog.py' --config_file '$out_file' - query_name,seed_eggNOG_ortholog,seed_ortholog_evalue,seed_ortholog_score,query_start,query_end,seed_start,seed_end,pident,query_cov,seed_cov + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + value.metadata.columns == 11 + + + + + + + value.metadata.columns == 22 + + + + + + annotate_hits_table.tsv + && + #end if + ]]> + + - - - - - - + 
query_name,seed_eggNOG_ortholog,seed_ortholog_evalue,seed_ortholog_score,query_start,query_end,seed_start,seed_end,pident,query_cov,seed_cov + + + + ortho_method['m'] not in ['no_search', 'cache'] + + + + + @@ -122,6 +445,83 @@ python '${__tool_directory__}/data_manager_eggnog.py' --config_file '$out_file' + + + + + + + Min E-value expected when searching for seed eggNOG ortholog. Applies to phmmer/diamond searches. + Queries not having a significant seed orthologs (E-value less than threshold) will not be annotated. + + + + + Min bit score expected when searching for seed eggNOG ortholog. + Queries not having a significant seed orthologs will not be annotated. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ortho_method['m'] == 'cache' and ortho_method['output_no_annotations'] + + + + + + ortho_method['m'] != 'cache' + output_options['report_orthologs'] + + + + + + @@ -142,4 +542,58 @@ python '${__tool_directory__}/data_manager_eggnog.py' --config_file '$out_file' + + diff --git a/tools/eggnog_mapper/eggnog_mapper/README b/tools/eggnog_mapper/eggnog_mapper/README new file mode 100644 index 000000000..fdef9b01b --- /dev/null +++ b/tools/eggnog_mapper/eggnog_mapper/README @@ -0,0 +1,21 @@ +This folder contains three tools: + +1. eggnog_mapper: which runs the search and annotation phase in a single tool +2. eggnog_mapper_search: which implements the search phase +3. eggnog_mapper_annotate: which implements the annotation phase + +While the search phase of eggnog_mapper is very CPU intense and is efficient +also for larger numbers of threads, the annotation phase is very IO intensive +and can be very inefficient (depending on the configuration, e.g. if the +reference data is located on a slow partition). 
+ +While for most applications eggnog_mapper will be sufficient, separating the +two phases can be more efficient: + +- sending eggnog_mapper_search to a destination using many threads +- and eggnog_mapper_annotate to a destination using a small number of threads + +If eggnog_mapper_annotate is sent to a single core destination +the option `--dbmem` is activated which will copy the complete +EggNOG annotation DB into memory which is usually much faster +than using multiple cores (but needs approx. 37GB of RAM). \ No newline at end of file diff --git a/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper.xml b/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper.xml index 502422f39..0666d582e 100644 --- a/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper.xml +++ b/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper.xml @@ -1,4 +1,4 @@ - + functional sequence annotation by orthology eggnog_macros.xml @@ -6,86 +6,15 @@ annotate_hits_table.tsv - && - #end if + @MERGE_ANNOTATIONS@ emapper.py - --data_dir '$eggnog_data.fields.path' - -m '$ortho_method.m' - - #if $ortho_method.m in ['diamond', 'mmseqs', 'cache']: - -i '$ortho_method.input' - --itype '$ortho_method.input_trans.itype' - #if $ortho_method.input_trans.itype in ['CDS', 'genome', 'metagenome']: - $ortho_method.input_trans.translate - #end if - #if $ortho_method.input_trans.itype in ['genome', 'metagenome']: - --genepred $ortho_method.input_trans.genepred - #end if - #elif $ortho_method.m == "no_search" - --annotate_hits_table annotate_hits_table.tsv - #end if - - #if $ortho_method.m == 'cache' - --cache '$ortho_method.cache' - #end if - - #if $ortho_method.m in ['diamond', 'mmseqs']: - ## Diamond option - #if $ortho_method.m == "diamond": - --matrix '$ortho_method.matrix_gapcosts.matrix' - $ortho_method.matrix_gapcosts.gap_costs - --sensmode $ortho_method.sensmode - $ortho_method.dmnd_iterate - $ortho_method.dmnd_ignore_warnings - #elif $ortho_method.m == "mmseqs": - --start_sens $ortho_method.start_sens - --sens_steps 
$ortho_method.sens_steps - --final_sens $ortho_method.final_sens - #end if - - ## Common options for search filtering (applies to diamond and mmseqs only) - #if str($ortho_method.query_cover): - --query_cover $ortho_method.query_cover - #end if - #if str($ortho_method.subject_cover): - --subject_cover $ortho_method.subject_cover - #end if - #if str($ortho_method.pident): - --pident $ortho_method.pident - #end if - #if str($ortho_method.evalue): - --evalue $ortho_method.evalue - #end if - #if str($ortho_method.score): - --score $ortho_method.score - #end if - #end if - + @DB_TOKEN@ + @ORTHO_SEARCH_TOKEN@ #if $annotation_options.no_annot == "--no_annot" --no_annot #else - #if str($annotation_options.seed_ortholog_evalue): - --seed_ortholog_evalue $annotation_options.seed_ortholog_evalue - #end if - #if str($annotation_options.seed_ortholog_score): - --seed_ortholog_score $annotation_options.seed_ortholog_score - #end if - #if $annotation_options.tax_scope: - --tax_scope=$annotation_options.tax_scope - #end if - #if $annotation_options.target_orthologs: - --target_orthologs=$annotation_options.target_orthologs - #end if - #if $annotation_options.go_evidence: - --go_evidence=$annotation_options.go_evidence - #end if + @ANNOTATION_TOKEN@ #end if $output_options.no_file_comments $output_options.report_orthologs @@ -96,219 +25,15 @@ --temp_dir \${TEMP:-\$_GALAXY_JOB_TMP_DIR} ]]> - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - value.metadata.columns == 11 - - - - - - - value.metadata.columns == 22 - - - - - + + - - - Min E-value expected when searching for seed eggNOG ortholog. Applies to phmmer/diamond searches. 
- Queries not having a significant seed orthologs (E-value less than threshold) will not be annotated. - - - - - Min bit score expected when searching for seed eggNOG ortholog. - Queries not having a significant seed orthologs will not be annotated. - - - - - - - - - - - - - - - + @@ -317,40 +42,17 @@ + label="Output a file with the list of orthologs for each hit"/> - - ortho_method['m'] not in ['no_search', 'cache'] - - - - - + + annotation_options['no_annot'] == '' - - - - - - - - - - - - - ortho_method['m'] != 'cache' and output_options['report_orthologs'] - - - - - - ortho_method['m'] == 'cache' and output_options['output_no_annotations'] - + + @@ -502,40 +204,11 @@ EggNOG-mapper is also available as a public online resource: ` diff --git a/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper_annotate.xml b/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper_annotate.xml new file mode 100644 index 000000000..762dc999b --- /dev/null +++ b/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper_annotate.xml @@ -0,0 +1,155 @@ + + annotation phase + + eggnog_macros.xml + + + + + + + + +
+ +
+ +
+ + + +
+
+ + + + + + + + + + + + + +
+ + +
+ + +
+ + + + + + + + + +
+ + + +
+ + + + + + + +
+ + + + + + + +
+ +
+
+ + +
+ + + + + +
+
+ `_. + +Outputs +------- + +@HELP_ANNOTATION_OUTPUTS@ + +**Recommendation for large input data** + +EggNOG-mapper consists of two phases + +1. finding seed orthologous sequences (compute intensive) +2. expanding annotations (IO intensive) + +by default (i.e. if *Method to search seed orthologs* is not *Skip search stage...* and *Annotate seed orthologs* is *Yes*) +both phases are executed within one tool run. + +For large input FASTA datasets it can be favourable to split this into two separate +tool runs as follows: + +1. Split the FASTA (e.g. 1M seqs per data set) +2. Run the search phase only (set *Annotate seed orthologs* to *No*) on the separate FASTA files. +3. Run the annotation phase (set *Method to search seed orthologs* to *Skip search stage...*) + +See [also](https://github.com/eggnogdb/eggnog-mapper/wiki/eggNOG-mapper-v2.1.5-to-v2.1.8#Setting_up_large_annotation_jobs) + +Another alternative is to use cached annotations (produced in a run with --md5 enabled). + + + ]]> + +
diff --git a/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper_search.xml b/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper_search.xml new file mode 100644 index 000000000..c37e5d8c5 --- /dev/null +++ b/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper_search.xml @@ -0,0 +1,111 @@ + + search phase + + eggnog_macros.xml + + + + + + + +
+ + + +
+
+ + + + + + + + + + +
+ +
+ + +
+ + + + + + + + + +
+ + +
+ + + + +
+
+ `_. + +Outputs +------- + +@HELP_SEARCH_OUTPUTS@ + +**Recommendation for large input data** + +EggNOG-mapper consists of two phases + +1. finding seed orthologous sequences (compute intensive) +2. expanding annotations (IO intensive) + +by default (i.e. if *Method to search seed orthologs* is not *Skip search stage...* and *Annotate seed orthologs* is *Yes*) +both phases are executed within one tool run. + +For large input FASTA datasets it can be favourable to split this into two separate +tool runs as follows: + +1. Split the FASTA (e.g. 1M seqs per data set) +2. Run the search phase only (set *Annotate seed orthologs* to *No*) on the separate FASTA files. +3. Run the annotation phase (set *Method to search seed orthologs* to *Skip search stage...*) + +See [also](https://github.com/eggnogdb/eggnog-mapper/wiki/eggNOG-mapper-v2.1.5-to-v2.1.8#Setting_up_large_annotation_jobs) + +Another alternative is to use cached annotations (produced in a run with --md5 enabled). + + + ]]> + +