From 09f836d4911a07266b94c999d29d81988127392a Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Wed, 27 Jul 2022 16:55:52 +0200 Subject: [PATCH 1/4] eggnog mapper: implement search and annotation phase as separate tools for rationale see README --- tools/eggnog_mapper/eggnog_macros.xml | 468 +++++++++++++++++- tools/eggnog_mapper/eggnog_mapper/README | 21 + .../eggnog_mapper/eggnog_mapper.xml | 358 +------------- .../eggnog_mapper/eggnog_mapper_annotate.xml | 155 ++++++ .../eggnog_mapper/eggnog_mapper_search.xml | 111 +++++ 5 files changed, 763 insertions(+), 350 deletions(-) create mode 100644 tools/eggnog_mapper/eggnog_mapper/README create mode 100644 tools/eggnog_mapper/eggnog_mapper/eggnog_mapper_annotate.xml create mode 100644 tools/eggnog_mapper/eggnog_mapper/eggnog_mapper_search.xml diff --git a/tools/eggnog_mapper/eggnog_macros.xml b/tools/eggnog_mapper/eggnog_macros.xml index ac49d0b28..8a9142865 100644 --- a/tools/eggnog_mapper/eggnog_macros.xml +++ b/tools/eggnog_mapper/eggnog_macros.xml @@ -3,6 +3,7 @@ 2.1.8 3 5.0.2 + 22.01 + @@ -105,15 +127,316 @@ python '${__tool_directory__}/data_manager_eggnog.py' --config_file '$out_file' - query_name,seed_eggNOG_ortholog,seed_ortholog_evalue,seed_ortholog_score,query_start,query_end,seed_start,seed_end,pident,query_cov,seed_cov + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + value.metadata.columns == 11 + + + + + + + value.metadata.columns == 22 + + + + + + annotate_hits_table.tsv + && + #end if + ]]> + + - - - - - - + 
query_name,seed_eggNOG_ortholog,seed_ortholog_evalue,seed_ortholog_score,query_start,query_end,seed_start,seed_end,pident,query_cov,seed_cov + + + + ortho_method['m'] not in ['no_search', 'cache'] + + + + + @@ -122,6 +445,83 @@ python '${__tool_directory__}/data_manager_eggnog.py' --config_file '$out_file' + + + + + + + Min E-value expected when searching for seed eggNOG ortholog. Applies to phmmer/diamond searches. + Queries not having a significant seed orthologs (E-value less than threshold) will not be annotated. + + + + + Min bit score expected when searching for seed eggNOG ortholog. + Queries not having a significant seed orthologs will not be annotated. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ortho_method['m'] == 'cache' and ortho_method['output_no_annotations'] + + + + + + ortho_method['m'] != 'cache' + output_options['report_orthologs'] + + + + + + @@ -142,4 +542,58 @@ python '${__tool_directory__}/data_manager_eggnog.py' --config_file '$out_file' + + diff --git a/tools/eggnog_mapper/eggnog_mapper/README b/tools/eggnog_mapper/eggnog_mapper/README new file mode 100644 index 000000000..fdef9b01b --- /dev/null +++ b/tools/eggnog_mapper/eggnog_mapper/README @@ -0,0 +1,21 @@ +This folder contains three tools: + +1. eggnogg_mapper: which runs the search and annotation phase in a single tool +2. eggnogg_mapper_search: which implements the search phase +3. eggnogg_mapper_annotate: which implements the annotation phase + +While the search phase of eggnog_mapper is very CPU intense and is efficient +also for larger number of threads, the anotation phase is very IO intensive +and can be very inefficient (depending on the configuration, e.g. if the +reference data is located on a slow partition). 
+ +While for most applications eggnogg_mapper will be sufficient separating the +two phases can be more efficient: + +- sending eggnogg_mapper_search to a destination using many threads +- and eggnogg_mapper_annotate to a destination using a small numbe of threads + +If eggnogg_mapper_annotate is send to a single core destination +the option `--dbmem` is activated which will copy the complete +EggNOG annotation DB into memory which is usually much faster +than using multiple cores (but needs approx. 37GB of RAM). \ No newline at end of file diff --git a/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper.xml b/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper.xml index 502422f39..0666d582e 100644 --- a/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper.xml +++ b/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper.xml @@ -1,4 +1,4 @@ - + functional sequence annotation by orthology eggnog_macros.xml @@ -6,86 +6,15 @@ annotate_hits_table.tsv - && - #end if + @MERGE_ANNOTATIONS@ emapper.py - --data_dir '$eggnog_data.fields.path' - -m '$ortho_method.m' - - #if $ortho_method.m in ['diamond', 'mmseqs', 'cache']: - -i '$ortho_method.input' - --itype '$ortho_method.input_trans.itype' - #if $ortho_method.input_trans.itype in ['CDS', 'genome', 'metagenome']: - $ortho_method.input_trans.translate - #end if - #if $ortho_method.input_trans.itype in ['genome', 'metagenome']: - --genepred $ortho_method.input_trans.genepred - #end if - #elif $ortho_method.m == "no_search" - --annotate_hits_table annotate_hits_table.tsv - #end if - - #if $ortho_method.m == 'cache' - --cache '$ortho_method.cache' - #end if - - #if $ortho_method.m in ['diamond', 'mmseqs']: - ## Diamond option - #if $ortho_method.m == "diamond": - --matrix '$ortho_method.matrix_gapcosts.matrix' - $ortho_method.matrix_gapcosts.gap_costs - --sensmode $ortho_method.sensmode - $ortho_method.dmnd_iterate - $ortho_method.dmnd_ignore_warnings - #elif $ortho_method.m == "mmseqs": - --start_sens $ortho_method.start_sens - --sens_steps 
$ortho_method.sens_steps - --final_sens $ortho_method.final_sens - #end if - - ## Common options for search filtering (applies to diamond and mmseqs only) - #if str($ortho_method.query_cover): - --query_cover $ortho_method.query_cover - #end if - #if str($ortho_method.subject_cover): - --subject_cover $ortho_method.subject_cover - #end if - #if str($ortho_method.pident): - --pident $ortho_method.pident - #end if - #if str($ortho_method.evalue): - --evalue $ortho_method.evalue - #end if - #if str($ortho_method.score): - --score $ortho_method.score - #end if - #end if - + @DB_TOKEN@ + @ORTHO_SEARCH_TOKEN@ #if $annotation_options.no_annot == "--no_annot" --no_annot #else - #if str($annotation_options.seed_ortholog_evalue): - --seed_ortholog_evalue $annotation_options.seed_ortholog_evalue - #end if - #if str($annotation_options.seed_ortholog_score): - --seed_ortholog_score $annotation_options.seed_ortholog_score - #end if - #if $annotation_options.tax_scope: - --tax_scope=$annotation_options.tax_scope - #end if - #if $annotation_options.target_orthologs: - --target_orthologs=$annotation_options.target_orthologs - #end if - #if $annotation_options.go_evidence: - --go_evidence=$annotation_options.go_evidence - #end if + @ANNOTATION_TOKEN@ #end if $output_options.no_file_comments $output_options.report_orthologs @@ -96,219 +25,15 @@ --temp_dir \${TEMP:-\$_GALAXY_JOB_TMP_DIR} ]]> - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - value.metadata.columns == 11 - - - - - - - value.metadata.columns == 22 - - - - - + + - - - Min E-value expected when searching for seed eggNOG ortholog. Applies to phmmer/diamond searches. 
- Queries not having a significant seed orthologs (E-value less than threshold) will not be annotated. - - - - - Min bit score expected when searching for seed eggNOG ortholog. - Queries not having a significant seed orthologs will not be annotated. - - - - - - - - - - - - - - - + @@ -317,40 +42,17 @@ + label="Output a file with the list of orthologs for each hit"/> - - ortho_method['m'] not in ['no_search', 'cache'] - - - - - + + annotation_options['no_annot'] == '' - - - - - - - - - - - - - ortho_method['m'] != 'cache' and output_options['report_orthologs'] - - - - - - ortho_method['m'] == 'cache' and output_options['output_no_annotations'] - + + @@ -502,40 +204,11 @@ EggNOG-mapper is also available as a public online resource: ` diff --git a/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper_annotate.xml b/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper_annotate.xml new file mode 100644 index 000000000..762dc999b --- /dev/null +++ b/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper_annotate.xml @@ -0,0 +1,155 @@ + + annotation phase + + eggnog_macros.xml + + + + + + + + +
+ +
+ +
+ + + +
+
+ + + + + + + + + + + + + +
+ + +
+ + +
+ + + + + + + + + +
+ + + +
+ + + + + + + +
+ + + + + + + +
+ +
+
+ + +
+ + + + + +
+
+ `_. + +Outputs +------- + +@HELP_ANNOTATION_OUTPUTS@ + +**Recommendation for large input data** + +EggNOG-mapper consists of two phases + +1. finding seed orthologous sequences (compute intensive) +2. expanding annotations (IO intensive) + +by default (i.e. if *Method to search seed orthologs* is not *Skip search stage...* and *Annotate seed orthologs* is *Yes*) +both phases are executed within one tool run. + +For large input FASTA datasets it can be favourable to split this in two separate +tool runs as follows: + +1. Split the FASTA (e.g. 1M seqs per data set) +2. Run the search phase only (set *Annotate seed orthologs* to *No*) on the separate FASTA files. +3. Run the annotation phase (set *Method to search seed orthologs* to *Skip search stage...*) + +See [also](https://github.com/eggnogdb/eggnog-mapper/wiki/eggNOG-mapper-v2.1.5-to-v2.1.8#Setting_up_large_annotation_jobs) + +Another alternative is to use cached annotations (produced in a run with --md5 enabled). + + + ]]> + +
diff --git a/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper_search.xml b/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper_search.xml new file mode 100644 index 000000000..c37e5d8c5 --- /dev/null +++ b/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper_search.xml @@ -0,0 +1,111 @@ + + search phase + + eggnog_macros.xml + + + + + + + +
+ + + +
+
+ + + + + + + + + + +
+ +
+ + +
+ + + + + + + + + +
+ + +
+ + + + +
+
+ `_. + +Outputs +------- + +@HELP_SEARCH_OUTPUTS@ + +**Recommendation for large input data** + +EggNOG-mapper consists of two phases + +1. finding seed orthologous sequences (compute intensive) +2. expanding annotations (IO intensive) + +by default (i.e. if *Method to search seed orthologs* is not *Skip search stage...* and *Annotate seed orthologs* is *Yes*) +both phases are executed within one tool run. + +For large input FASTA datasets it can be favourable to split this in two separate +tool runs as follows: + +1. Split the FASTA (e.g. 1M seqs per data set) +2. Run the search phase only (set *Annotate seed orthologs* to *No*) on the separate FASTA files. +3. Run the annotation phase (set *Method to search seed orthologs* to *Skip search stage...*) + +See [also](https://github.com/eggnogdb/eggnog-mapper/wiki/eggNOG-mapper-v2.1.5-to-v2.1.8#Setting_up_large_annotation_jobs) + +Another alternative is to use cached annotations (produced in a run with --md5 enabled). + + + ]]> + +
From c67ed4314b574094c27fbb4bd902832d30588fb4 Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Tue, 4 Oct 2022 17:13:21 +0200 Subject: [PATCH 2/4] more --- tools/eggnog_mapper/eggnog_macros.xml | 44 ++++++++++++++----- .../eggnog_mapper/eggnog_mapper.xml | 22 +++------- .../eggnog_mapper/eggnog_mapper_annotate.xml | 9 +--- .../eggnog_mapper/eggnog_mapper_search.xml | 12 +---- 4 files changed, 39 insertions(+), 48 deletions(-) diff --git a/tools/eggnog_mapper/eggnog_macros.xml b/tools/eggnog_mapper/eggnog_macros.xml index 8a9142865..8edec65a2 100644 --- a/tools/eggnog_mapper/eggnog_macros.xml +++ b/tools/eggnog_mapper/eggnog_macros.xml @@ -426,6 +426,22 @@ python '${__tool_directory__}/data_manager_eggnog.py' --config_file '$out_file' #end if ]]> + + +
+ + +
+
+ + + + + + + + query_name,seed_eggNOG_ortholog,seed_ortholog_evalue,seed_ortholog_score,query_start,query_end,seed_start,seed_end,pident,query_cov,seed_cov @@ -437,14 +453,25 @@ python '${__tool_directory__}/data_manager_eggnog.py' --config_file '$out_file' - + + + + @@ -522,35 +549,28 @@ python '${__tool_directory__}/data_manager_eggnog.py' --config_file '$out_file' - + + - + + - -
- - - -
+ @@ -65,7 +58,6 @@
-
@@ -84,7 +76,6 @@
-
@@ -99,12 +90,12 @@
- +
- - - + + + @@ -118,7 +109,6 @@
-
@@ -141,7 +131,6 @@
-
@@ -161,7 +150,6 @@
-
diff --git a/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper_annotate.xml b/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper_annotate.xml index 762dc999b..fa6ad6963 100644 --- a/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper_annotate.xml +++ b/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper_annotate.xml @@ -37,14 +37,7 @@ -
- - - -
+ diff --git a/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper_search.xml b/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper_search.xml index c37e5d8c5..ed197b3ae 100644 --- a/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper_search.xml +++ b/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper_search.xml @@ -11,8 +11,6 @@ @ORTHO_SEARCH_TOKEN@ $output_options.no_file_comments - $output_options.report_orthologs - $output_options.md5 --output='results' --cpu "\${GALAXY_SLOTS:-4}" --scratch_dir \${TEMP:-\$_GALAXY_JOB_TMP_DIR} @@ -21,14 +19,7 @@ -
- - - -
+
@@ -56,7 +47,6 @@
-
From b0f30d8dde11059dbf75ab9599a027973f9fee18 Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Mon, 4 Sep 2023 13:29:53 +0200 Subject: [PATCH 3/4] minor fixes - more consistent macro names - readme reformulation --- tools/eggnog_mapper/eggnog_macros.xml | 2 +- tools/eggnog_mapper/eggnog_mapper/README | 8 ++++---- tools/eggnog_mapper/eggnog_mapper/eggnog_mapper.xml | 2 +- .../eggnog_mapper/eggnog_mapper/eggnog_mapper_search.xml | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/eggnog_mapper/eggnog_macros.xml b/tools/eggnog_mapper/eggnog_macros.xml index 8edec65a2..98f3e0f3d 100644 --- a/tools/eggnog_mapper/eggnog_macros.xml +++ b/tools/eggnog_mapper/eggnog_macros.xml @@ -444,7 +444,7 @@ python '${__tool_directory__}/data_manager_eggnog.py' --config_file '$out_file' query_name,seed_eggNOG_ortholog,seed_ortholog_evalue,seed_ortholog_score,query_start,query_end,seed_start,seed_end,pident,query_cov,seed_cov - + ortho_method['m'] not in ['no_search', 'cache'] diff --git a/tools/eggnog_mapper/eggnog_mapper/README b/tools/eggnog_mapper/eggnog_mapper/README index fdef9b01b..a72f33a4c 100644 --- a/tools/eggnog_mapper/eggnog_mapper/README +++ b/tools/eggnog_mapper/eggnog_mapper/README @@ -5,17 +5,17 @@ This folder contains three tools: 3. eggnogg_mapper_annotate: which implements the annotation phase While the search phase of eggnog_mapper is very CPU intense and is efficient -also for larger number of threads, the anotation phase is very IO intensive +also for a larger number of threads, the annotation phase is very IO intensive and can be very inefficient (depending on the configuration, e.g. if the reference data is located on a slow partition). 
-While for most applications eggnogg_mapper will be sufficient separating the +While for most applications eggnogg_mapper will be sufficient, separating the two phases can be more efficient: - sending eggnogg_mapper_search to a destination using many threads -- and eggnogg_mapper_annotate to a destination using a small numbe of threads +- and eggnogg_mapper_annotate to a destination using a small number of threads -If eggnogg_mapper_annotate is send to a single core destination +If eggnogg_mapper_annotate is sent to a single core destination the option `--dbmem` is activated which will copy the complete EggNOG annotation DB into memory which is usually much faster than using multiple cores (but needs approx. 37GB of RAM). \ No newline at end of file diff --git a/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper.xml b/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper.xml index 36b54213b..e06a09a6a 100644 --- a/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper.xml +++ b/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper.xml @@ -41,7 +41,7 @@ - + annotation_options['no_annot'] == '' diff --git a/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper_search.xml b/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper_search.xml index ed197b3ae..46729f49e 100644 --- a/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper_search.xml +++ b/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper_search.xml @@ -22,7 +22,7 @@ - + From 8a48179fe393e05e7b85e3da9e7e748b16058bcd Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Mon, 4 Sep 2023 14:09:35 +0200 Subject: [PATCH 4/4] make -dbmem explicit --- tools/eggnog_mapper/eggnog_macros.xml | 1 + tools/eggnog_mapper/eggnog_mapper/README | 7 +++---- .../eggnog_mapper/eggnog_mapper/eggnog_mapper_annotate.xml | 7 ------- 3 files changed, 4 insertions(+), 11 deletions(-) diff --git a/tools/eggnog_mapper/eggnog_macros.xml b/tools/eggnog_mapper/eggnog_macros.xml index 98f3e0f3d..cfed89b33 100644 --- a/tools/eggnog_mapper/eggnog_macros.xml +++
b/tools/eggnog_mapper/eggnog_macros.xml @@ -519,6 +519,7 @@ python '${__tool_directory__}/data_manager_eggnog.py' --config_file '$out_file' #if $annotation_options.go_evidence: --go_evidence=$annotation_options.go_evidence #end if + \$EGGNOG_DBMEM ]]>
diff --git a/tools/eggnog_mapper/eggnog_mapper/README b/tools/eggnog_mapper/eggnog_mapper/README index a72f33a4c..e665ac94f 100644 --- a/tools/eggnog_mapper/eggnog_mapper/README +++ b/tools/eggnog_mapper/eggnog_mapper/README @@ -15,7 +15,6 @@ two phases can be more efficient: - sending eggnogg_mapper_search to a destination using many threads - and eggnogg_mapper_annotate to a destination using a small number of threads -If eggnogg_mapper_annotate is sent to a single core destination -the option `--dbmem` is activated which will copy the complete -EggNOG annotation DB into memory which is usually much faster -than using multiple cores (but needs approx. 37GB of RAM). \ No newline at end of file +Admins can choose to set the environment variable ``EGGNOG_DBMEM=--dbmem`` +which will copy the complete EggNOG annotation DB into memory which is usually +much faster than using multiple cores (but needs approx. 37GB of RAM). \ No newline at end of file diff --git a/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper_annotate.xml b/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper_annotate.xml index fa6ad6963..14cc122fc 100644 --- a/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper_annotate.xml +++ b/tools/eggnog_mapper/eggnog_mapper/eggnog_mapper_annotate.xml @@ -8,12 +8,6 @@