diff --git a/CHANGELOG.md b/CHANGELOG.md index 5270234..afe6de6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +### Added +- Added the `--min-number-genes` parameter to the `summary` module. This parameter allows users to set the minimum number of genes a sequence must encode to be considered for classification as a plasmid or virus. The default value is `1`. When `--conservative` is used, this parameter is set to `1`. When `--relaxed` is used, this parameter is set to `0`. This filter has no effect if the `annotate` module is not executed. + ### Changed - Added a hyperlink to the official documentation in the help dialogue. - The virus taxonomic lineage is presented using a fixed number of fields separated by semicolons (`;`). As a result, for genomes that could not be assigned to the family level (the most specific taxonomic rank), there will be trailing semicolons at the end of the lineage string. diff --git a/docs/_source/post_classification_filtering.md b/docs/_source/post_classification_filtering.md index 7f7344a..2189223 100644 --- a/docs/_source/post_classification_filtering.md +++ b/docs/_source/post_classification_filtering.md @@ -9,7 +9,8 @@ To address these challenges, geNomad applies a series of filters to the classifi The following filters are available to users to generate the final lists of plasmids and viruses: - **`min-score`:** Minimum score to flag a sequence as virus or plasmid. -- **`max-fdr`:** Maximum accepted false discovery rate. This option will be ignored if the scores were not calibrated. +- **`max-fdr`:** Maximum accepted false discovery rate. This option will be ignored if the scores were not calibrated. +- **`min-number-genes`:** The minimum number of genes a sequence must encode to be considered for classification as a plasmid or virus. - **`min-plasmid-marker-enrichment`:** Minimum allowed value for the plasmid marker enrichment score. - **`min-virus-marker-enrichment`:** Minimum allowed value for the virus marker enrichment score. - **`min-plasmid-hallmarks`:** Minimum number of plasmid hallmarks in the identified plasmids. @@ -38,6 +39,7 @@ The values used to filter predictions when executing geNomad with default parame |:-----------------------------------|--------:|--------:|-------------:| | `min-score` | 0.70 | 0.00 | 0.80 | | `max-fdr` | 0.10 | 1.00 | 0.05 | +| `min-number-genes` | 1 | 0 | 1 | | `min-plasmid-marker-enrichment` | 0.10 | -100.00 | 1.50 | | `min-virus-marker-enrichment` | 0.00 | -100.00 | 1.50 | | `min-plasmid-hallmarks` | 0 | 0 | 1 | @@ -45,3 +47,8 @@ The values used to filter predictions when executing geNomad with default parame | `min-virus-hallmarks` | 0 | 0 | 1 | | `min-virus-hallmarks-short-seqs` | 1 | 0 | 1 | | `max-uscg` | 4 | 100 | 2 | + +```{admonition} Post-classification filtering in the absence of gene annotation +:class: tip +When the `annotate` module is not executed, most filters are disabled, since they rely on gene annotation. In such cases, only `min-score` and `max-fdr` are used to filter the classification results. +``` diff --git a/genomad/cli.py b/genomad/cli.py index 8014121..2bbb516 100644 --- a/genomad/cli.py +++ b/genomad/cli.py @@ -165,6 +165,7 @@ "options": [ "--min-score", "--max-fdr", + "--min-number-genes", "--min-plasmid-marker-enrichment", "--min-virus-marker-enrichment", "--min-plasmid-hallmarks", @@ -220,6 +221,7 @@ "options": [ "--min-score", "--max-fdr", + "--min-number-genes", "--min-plasmid-marker-enrichment", "--min-virus-marker-enrichment", "--min-plasmid-hallmarks", @@ -243,6 +245,7 @@ def use_preset(ctx, param, value): for i in [ "min_score", "max_fdr", + "min_number_genes", "min_plasmid_marker_enrichment", "min_virus_marker_enrichment", "min_plasmid_hallmarks", @@ -277,15 +280,16 @@ def use_preset(ctx, param, value): ) sys.exit(1) if value is False: - set_preset_values(ctx, 0, 1, -100, -100, 0, 0, 0, 0, 100) + set_preset_values(ctx, 0, 1, 0, -100, -100, 0, 0, 0, 0, 100) elif value is True: - set_preset_values(ctx, 0.8, 0.05, 1.5, 1.5, 1, 1, 1, 1, 2) + set_preset_values(ctx, 0.8, 0.05, 1, 1.5, 1.5, 1, 1, 1, 1, 2) def set_preset_values( ctx, min_score, max_fdr, + min_number_genes, min_plasmid_marker_enrichment, min_virus_marker_enrichment, min_plasmid_hallmarks, @@ -296,6 +300,7 @@ def set_preset_values( ): ctx.params["min_score"] = min_score ctx.params["max_fdr"] = max_fdr + ctx.params["min_number_genes"] = min_number_genes ctx.params["min_plasmid_marker_enrichment"] = min_plasmid_marker_enrichment ctx.params["min_virus_marker_enrichment"] = min_virus_marker_enrichment ctx.params["min_plasmid_hallmarks"] = min_plasmid_hallmarks @@ -813,6 +818,7 @@ def score_calibration(input, output, composition, force_auto, verbose): preset disables all post-classification filters.\n These presets cannot be used together with the following parameters: [cyan bold]--min-score[/], [cyan bold]--max-fdr[/], + [cyan bold]--min-number-genes[/], [cyan bold]--min-plasmid-marker-enrichment[/], [cyan bold]--min-virus-marker-enrichment[/], [cyan bold]--min-plasmid-hallmarks[/], @@ -838,6 +844,15 @@ def score_calibration(input, output, composition, force_auto, verbose): help="""Maximum accepted false discovery rate. This option will be ignored if the scores were not calibrated.""", ) +@click.option( + "--min-number-genes", + type=click.IntRange(min=0), + default=1, + show_default=True, + is_eager=True, + help="""The minimum number of genes a sequence must encode to be considered + for classification as a plasmid or virus.""", +) @click.option( "--min-plasmid-marker-enrichment", type=float, @@ -915,6 +930,7 @@ def summary( verbose, min_score, max_fdr, + min_number_genes, min_plasmid_marker_enrichment, min_virus_marker_enrichment, min_plasmid_hallmarks, @@ -936,6 +952,7 @@ def summary( verbose, min_score, max_fdr, + min_number_genes, min_plasmid_marker_enrichment, min_virus_marker_enrichment, min_plasmid_hallmarks, @@ -993,14 +1010,15 @@ def summary( classification is strongly supported. The [cyan bold]--relaxed[/] preset disables all post-classification filters.\n These presets cannot be used together with the following parameters: - [cyan bold]--min_score[/], [cyan bold]--max_fdr[/], - [cyan bold]--min_plasmid_marker_enrichment[/], - [cyan bold]--min_virus_marker_enrichment[/], - [cyan bold]--min_plasmid_hallmarks[/], - [cyan bold]--min_plasmid_hallmarks_short_seqs[/], - [cyan bold]--min_virus_hallmarks[/], - [cyan bold]--min_virus_hallmarks_short_seqs[/], - and [cyan bold]--max_uscg[/].""", + [cyan bold]--min-score[/], [cyan bold]--max-fdr[/], + [cyan bold]--min-number-genes[/], + [cyan bold]--min-plasmid-marker-enrichment[/], + [cyan bold]--min-virus-marker-enrichment[/], + [cyan bold]--min-plasmid-hallmarks[/], + [cyan bold]--min-plasmid-hallmarks-short-seqs[/], + [cyan bold]--min-virus-hallmarks[/], + [cyan bold]--min-virus-hallmarks-short-seqs[/], + and [cyan bold]--max-uscg[/].""", ) @click.option( "--disable-find-proviruses", @@ -1096,6 +1114,15 @@ def summary( help="""Maximum accepted false discovery rate. This option will be ignored if the scores were not calibrated.""", ) +@click.option( + "--min-number-genes", + type=click.IntRange(min=0), + default=1, + show_default=True, + is_eager=True, + help="""The minimum number of genes a sequence must encode to be considered + for classification as a plasmid or virus.""", +) @click.option( "--min-plasmid-marker-enrichment", type=float, @@ -1189,6 +1216,7 @@ def end_to_end( force_auto, min_score, max_fdr, + min_number_genes, min_plasmid_marker_enrichment, min_virus_marker_enrichment, min_plasmid_hallmarks, @@ -1305,6 +1333,7 @@ def end_to_end( output=output, min_score=min_score, max_fdr=max_fdr, + min_number_genes=min_number_genes, min_plasmid_marker_enrichment=min_plasmid_marker_enrichment, min_virus_marker_enrichment=min_virus_marker_enrichment, min_plasmid_hallmarks=min_plasmid_hallmarks, diff --git a/genomad/modules/summary.py b/genomad/modules/summary.py index b29a2fd..336ff2e 100644 --- a/genomad/modules/summary.py +++ b/genomad/modules/summary.py @@ -24,10 +24,12 @@ def flag_sequences( class_index, min_score, max_fdr, + min_number_genes, min_marker_enrichment, min_hallmarks, min_hallmarks_short, max_uscg, + n_genes_dict, filters_dict, annotate_exec, provirus_name_array=None, @@ -52,6 +54,7 @@ def flag_sequences( added_contigs = set() added_proviruses = set() for i in reversed(score_array[:, class_index].argsort()): + n_genes = n_genes_dict.get(name_array[i], 0) n_uscg, marker_enrichment, n_hallmarks = filters_dict.get( name_array[i], (0, np.zeros(3), (0, 0)) ) @@ -61,6 +64,7 @@ def flag_sequences( annotate_exec and score_array[i].argmax() == class_index and score_array[i, class_index] >= min_score + and n_genes >= min_number_genes and marker_enrichment >= min_marker_enrichment and ( n_hallmarks >= min_hallmarks @@ -105,6 +109,7 @@ def main( verbose, min_score, max_fdr, + min_number_genes, min_plasmid_marker_enrichment, min_virus_marker_enrichment, min_plasmid_hallmarks, @@ -369,7 +374,10 @@ def main( elif selected_classifier == "calibrated_nn": console.log("Using calibrated scores from [cyan]nn-classification[/cyan].") elif selected_classifier == "nn": - console.log("Using scores from [cyan]nn-classification[/cyan].") + console.log( + "Using scores from [cyan]nn-classification[/cyan]. Gene-based filters " + "will not be applied." + ) # Warn if aggregated-classification was not executed if ( @@ -454,10 +462,12 @@ def main( 1, min_score, max_fdr, + min_number_genes, min_plasmid_marker_enrichment, min_plasmid_hallmarks, min_plasmid_hallmarks_short_seqs, max_uscg, + n_genes_dict, filters_dict, annotate_exec, ) @@ -468,14 +478,16 @@ def main( 2, min_score, max_fdr, + min_number_genes, min_virus_marker_enrichment, min_virus_hallmarks, min_virus_hallmarks_short_seqs, max_uscg, + n_genes_dict, filters_dict, annotate_exec, - score_dict["provirus_names"], - score_dict["provirus_predictions"], + provirus_name_array=score_dict["provirus_names"], + provirus_score_array=score_dict["provirus_predictions"], ) plasmid_name_set = set(plasmid_name_array) virus_name_set = set(virus_name_array)