Skip to content

Commit

Permalink
Add the min-number-genes parameter
Browse files Browse the repository at this point in the history
  • Loading branch information
apcamargo committed Apr 11, 2024
1 parent 397f443 commit 70422bc
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 14 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
### Added
- Added the `--min-number-genes` parameter to the `summary` module. This parameter allows users to set the minimum number of genes a sequence must encode to be considered for classification as a plasmid or virus. The default value is `1`. When `--conservative` is used, this parameter is set to `1`. When `--relaxed` is used, this parameter is set to `0`. This filter has no effect if the `annotate` module is not executed.

### Changed
- Added a hyperlink to the official documentation in the help dialogue.
- The virus taxonomic lineage is presented using a fixed number of fields separated by semicolons (`;`). As a result, for genomes that could not be assigned to the family level (the most specific taxonomic rank), there will be trailing semicolons at the end of the lineage string.
Expand Down
9 changes: 8 additions & 1 deletion docs/_source/post_classification_filtering.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ To address these challenges, geNomad applies a series of filters to the classifi
The following filters are available to users to generate the final lists of plasmids and viruses:

- **`min-score`:** Minimum score to flag a sequence as virus or plasmid.
- **`max-fdr`:** Maximum accepted false discovery rate. This option will be ignored if the scores were not calibrated.
- **`max-fdr`:** Maximum accepted false discovery rate. This option will be ignored if the scores were not calibrated.
- **`min-number-genes`:** The minimum number of genes a sequence must encode to be considered for classification as a plasmid or virus.
- **`min-plasmid-marker-enrichment`:** Minimum allowed value for the plasmid marker enrichment score.
- **`min-virus-marker-enrichment`:** Minimum allowed value for the virus marker enrichment score.
- **`min-plasmid-hallmarks`:** Minimum number of plasmid hallmarks in the identified plasmids.
Expand Down Expand Up @@ -38,10 +39,16 @@ The values used to filter predictions when executing geNomad with default parame
|:-----------------------------------|--------:|--------:|-------------:|
| `min-score` | 0.70 | 0.00 | 0.80 |
| `max-fdr` | 0.10 | 1.00 | 0.05 |
| `min-number-genes` | 1 | 0 | 1 |
| `min-plasmid-marker-enrichment` | 0.10 | -100.00 | 1.50 |
| `min-virus-marker-enrichment` | 0.00 | -100.00 | 1.50 |
| `min-plasmid-hallmarks` | 0 | 0 | 1 |
| `min-plasmid-hallmarks-short-seqs` | 1 | 0 | 1 |
| `min-virus-hallmarks` | 0 | 0 | 1 |
| `min-virus-hallmarks-short-seqs` | 1 | 0 | 1 |
| `max-uscg` | 4 | 100 | 2 |

```{admonition} Post-classification filtering in the absence of gene annotation
:class: tip
When the `annotate` module is not executed, most filters are disabled, since they rely on gene annotation. In such cases, only `min-score` and `max-fdr` are used to filter the classification results.
```
49 changes: 39 additions & 10 deletions genomad/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@
"options": [
"--min-score",
"--max-fdr",
"--min-number-genes",
"--min-plasmid-marker-enrichment",
"--min-virus-marker-enrichment",
"--min-plasmid-hallmarks",
Expand Down Expand Up @@ -220,6 +221,7 @@
"options": [
"--min-score",
"--max-fdr",
"--min-number-genes",
"--min-plasmid-marker-enrichment",
"--min-virus-marker-enrichment",
"--min-plasmid-hallmarks",
Expand All @@ -243,6 +245,7 @@ def use_preset(ctx, param, value):
for i in [
"min_score",
"max_fdr",
"min_number_genes",
"min_plasmid_marker_enrichment",
"min_virus_marker_enrichment",
"min_plasmid_hallmarks",
Expand Down Expand Up @@ -277,15 +280,16 @@ def use_preset(ctx, param, value):
)
sys.exit(1)
if value is False:
set_preset_values(ctx, 0, 1, -100, -100, 0, 0, 0, 0, 100)
set_preset_values(ctx, 0, 1, 0, -100, -100, 0, 0, 0, 0, 100)
elif value is True:
set_preset_values(ctx, 0.8, 0.05, 1.5, 1.5, 1, 1, 1, 1, 2)
set_preset_values(ctx, 0.8, 0.05, 1, 1.5, 1.5, 1, 1, 1, 1, 2)


def set_preset_values(
ctx,
min_score,
max_fdr,
min_number_genes,
min_plasmid_marker_enrichment,
min_virus_marker_enrichment,
min_plasmid_hallmarks,
Expand All @@ -296,6 +300,7 @@ def set_preset_values(
):
ctx.params["min_score"] = min_score
ctx.params["max_fdr"] = max_fdr
ctx.params["min_number_genes"] = min_number_genes
ctx.params["min_plasmid_marker_enrichment"] = min_plasmid_marker_enrichment
ctx.params["min_virus_marker_enrichment"] = min_virus_marker_enrichment
ctx.params["min_plasmid_hallmarks"] = min_plasmid_hallmarks
Expand Down Expand Up @@ -813,6 +818,7 @@ def score_calibration(input, output, composition, force_auto, verbose):
preset disables all post-classification filters.\n
These presets cannot be used together with the following parameters:
[cyan bold]--min-score[/], [cyan bold]--max-fdr[/],
[cyan bold]--min-number-genes[/],
[cyan bold]--min-plasmid-marker-enrichment[/],
[cyan bold]--min-virus-marker-enrichment[/],
[cyan bold]--min-plasmid-hallmarks[/],
Expand All @@ -838,6 +844,15 @@ def score_calibration(input, output, composition, force_auto, verbose):
help="""Maximum accepted false discovery rate. This option will be ignored
if the scores were not calibrated.""",
)
@click.option(
"--min-number-genes",
type=click.IntRange(min=0),
default=1,
show_default=True,
is_eager=True,
help="""The minimum number of genes a sequence must encode to be considered
for classification as a plasmid or virus.""",
)
@click.option(
"--min-plasmid-marker-enrichment",
type=float,
Expand Down Expand Up @@ -915,6 +930,7 @@ def summary(
verbose,
min_score,
max_fdr,
min_number_genes,
min_plasmid_marker_enrichment,
min_virus_marker_enrichment,
min_plasmid_hallmarks,
Expand All @@ -936,6 +952,7 @@ def summary(
verbose,
min_score,
max_fdr,
min_number_genes,
min_plasmid_marker_enrichment,
min_virus_marker_enrichment,
min_plasmid_hallmarks,
Expand Down Expand Up @@ -993,14 +1010,15 @@ def summary(
classification is strongly supported. The [cyan bold]--relaxed[/]
preset disables all post-classification filters.\n
These presets cannot be used together with the following parameters:
[cyan bold]--min_score[/], [cyan bold]--max_fdr[/],
[cyan bold]--min_plasmid_marker_enrichment[/],
[cyan bold]--min_virus_marker_enrichment[/],
[cyan bold]--min_plasmid_hallmarks[/],
[cyan bold]--min_plasmid_hallmarks_short_seqs[/],
[cyan bold]--min_virus_hallmarks[/],
[cyan bold]--min_virus_hallmarks_short_seqs[/],
and [cyan bold]--max_uscg[/].""",
[cyan bold]--min-score[/], [cyan bold]--max-fdr[/],
[cyan bold]--min-number-genes[/],
[cyan bold]--min-plasmid-marker-enrichment[/],
[cyan bold]--min-virus-marker-enrichment[/],
[cyan bold]--min-plasmid-hallmarks[/],
[cyan bold]--min-plasmid-hallmarks-short-seqs[/],
[cyan bold]--min-virus-hallmarks[/],
[cyan bold]--min-virus-hallmarks-short-seqs[/],
and [cyan bold]--max-uscg[/].""",
)
@click.option(
"--disable-find-proviruses",
Expand Down Expand Up @@ -1096,6 +1114,15 @@ def summary(
help="""Maximum accepted false discovery rate. This option will be ignored
if the scores were not calibrated.""",
)
@click.option(
"--min-number-genes",
type=click.IntRange(min=0),
default=1,
show_default=True,
is_eager=True,
help="""The minimum number of genes a sequence must encode to be considered
for classification as a plasmid or virus.""",
)
@click.option(
"--min-plasmid-marker-enrichment",
type=float,
Expand Down Expand Up @@ -1189,6 +1216,7 @@ def end_to_end(
force_auto,
min_score,
max_fdr,
min_number_genes,
min_plasmid_marker_enrichment,
min_virus_marker_enrichment,
min_plasmid_hallmarks,
Expand Down Expand Up @@ -1305,6 +1333,7 @@ def end_to_end(
output=output,
min_score=min_score,
max_fdr=max_fdr,
min_number_genes=min_number_genes,
min_plasmid_marker_enrichment=min_plasmid_marker_enrichment,
min_virus_marker_enrichment=min_virus_marker_enrichment,
min_plasmid_hallmarks=min_plasmid_hallmarks,
Expand Down
18 changes: 15 additions & 3 deletions genomad/modules/summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,12 @@ def flag_sequences(
class_index,
min_score,
max_fdr,
min_number_genes,
min_marker_enrichment,
min_hallmarks,
min_hallmarks_short,
max_uscg,
n_genes_dict,
filters_dict,
annotate_exec,
provirus_name_array=None,
Expand All @@ -52,6 +54,7 @@ def flag_sequences(
added_contigs = set()
added_proviruses = set()
for i in reversed(score_array[:, class_index].argsort()):
n_genes = n_genes_dict.get(name_array[i], 0)
n_uscg, marker_enrichment, n_hallmarks = filters_dict.get(
name_array[i], (0, np.zeros(3), (0, 0))
)
Expand All @@ -61,6 +64,7 @@ def flag_sequences(
annotate_exec
and score_array[i].argmax() == class_index
and score_array[i, class_index] >= min_score
and n_genes >= min_number_genes
and marker_enrichment >= min_marker_enrichment
and (
n_hallmarks >= min_hallmarks
Expand Down Expand Up @@ -105,6 +109,7 @@ def main(
verbose,
min_score,
max_fdr,
min_number_genes,
min_plasmid_marker_enrichment,
min_virus_marker_enrichment,
min_plasmid_hallmarks,
Expand Down Expand Up @@ -369,7 +374,10 @@ def main(
elif selected_classifier == "calibrated_nn":
console.log("Using calibrated scores from [cyan]nn-classification[/cyan].")
elif selected_classifier == "nn":
console.log("Using scores from [cyan]nn-classification[/cyan].")
console.log(
"Using scores from [cyan]nn-classification[/cyan]. Gene-based filters "
"will not be applied."
)

# Warn if aggregated-classification was not executed
if (
Expand Down Expand Up @@ -454,10 +462,12 @@ def main(
1,
min_score,
max_fdr,
min_number_genes,
min_plasmid_marker_enrichment,
min_plasmid_hallmarks,
min_plasmid_hallmarks_short_seqs,
max_uscg,
n_genes_dict,
filters_dict,
annotate_exec,
)
Expand All @@ -468,14 +478,16 @@ def main(
2,
min_score,
max_fdr,
min_number_genes,
min_virus_marker_enrichment,
min_virus_hallmarks,
min_virus_hallmarks_short_seqs,
max_uscg,
n_genes_dict,
filters_dict,
annotate_exec,
score_dict["provirus_names"],
score_dict["provirus_predictions"],
provirus_name_array=score_dict["provirus_names"],
provirus_score_array=score_dict["provirus_predictions"],
)
plasmid_name_set = set(plasmid_name_array)
virus_name_set = set(virus_name_array)
Expand Down

0 comments on commit 70422bc

Please sign in to comment.