diff --git a/doc/command-line.md b/doc/command-line.md index 14e875a9d6..7b9cbe2615 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -551,8 +551,8 @@ The sourmash `tax` or `taxonomy` commands integrate taxonomic `gather` command (we cannot combine separate `gather` runs for the same query). For supported databases (e.g. GTDB, NCBI), we provide taxonomy csv files, but they can also be generated for user-generated - databases. As of v4.8, some sourmash taxonomy commands can also use `LIN` - lineage information. For more information, see [databases](databases.md). + databases. As of v4.8 and 4.8.6, respectively, some sourmash taxonomy + commands can also use `LIN` or `ICTV` lineage information. `tax` commands rely upon the fact that `gather` provides both the total fraction of the query matched to each database matched, as well as a diff --git a/src/sourmash/cli/tax/annotate.py b/src/sourmash/cli/tax/annotate.py index 7541440fc2..f30318ad0a 100644 --- a/src/sourmash/cli/tax/annotate.py +++ b/src/sourmash/cli/tax/annotate.py @@ -78,6 +78,13 @@ def subparser(subparsers): default=False, help="use LIN taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain LIN lineage information.", ) + subparser.add_argument( + "--ictv", + "--ictv-taxonomy", + action="store_true", + default=False, + help="use ICTV taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain ICTV ranks.", + ) def main(args): diff --git a/src/sourmash/cli/tax/genome.py b/src/sourmash/cli/tax/genome.py index b9712658a4..cd72499c31 100644 --- a/src/sourmash/cli/tax/genome.py +++ b/src/sourmash/cli/tax/genome.py @@ -124,6 +124,13 @@ def subparser(subparsers): default=None, help="CSV containing 'name', 'lin' columns, where 'lin' is the lingroup prefix. Will restrict classification to these groups.", ) + subparser.add_argument( + "--ictv", + "--ictv-taxonomy", + action="store_true", + default=False, + help="use ICTV taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain ICTV ranks.", + ) add_tax_threshold_arg(subparser, 0.1) add_rank_arg(subparser) diff --git a/src/sourmash/cli/tax/metagenome.py b/src/sourmash/cli/tax/metagenome.py index 563c6c3d81..f8e31902e8 100644 --- a/src/sourmash/cli/tax/metagenome.py +++ b/src/sourmash/cli/tax/metagenome.py @@ -116,6 +116,13 @@ def subparser(subparsers): default=None, help="CSV containing 'name', 'lin' columns, where 'lin' is the lingroup prefix. Will produce a 'lingroup' report containing taxonomic summarization for each group.", ) + subparser.add_argument( + "--ictv", + "--ictv-taxonomy", + action="store_true", + default=False, + help="use ICTV taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain ICTV ranks.", + ) add_rank_arg(subparser) diff --git a/src/sourmash/cli/tax/summarize.py b/src/sourmash/cli/tax/summarize.py index d430677b8f..76236f19ae 100644 --- a/src/sourmash/cli/tax/summarize.py +++ b/src/sourmash/cli/tax/summarize.py @@ -57,6 +57,13 @@ def subparser(subparsers): default=False, help="use LIN taxonomy in place of standard taxonomic ranks.", ) + subparser.add_argument( + "--ictv", + "--ictv-taxonomy", + action="store_true", + default=False, + help="use ICTV taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain ICTV ranks.", + ) def main(args): diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 8e490ae545..073977cb79 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -18,6 +18,7 @@ RankLineageInfo, LINLineageInfo, AnnotateTaxResult, + ICTVRankLineageInfo, ) usage = """ @@ -82,6 +83,7 @@ def metagenome(args): keep_identifier_versions=args.keep_identifier_versions, force=args.force, lins=args.lins, + ictv=args.ictv, ) available_ranks = tax_assign.available_ranks except ValueError as exc: @@ -113,6 +115,7 @@ def metagenome(args): keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions=args.keep_identifier_versions, lins=args.lins, + ictv=args.ictv, ) except ValueError as exc: error(f"ERROR: {str(exc)}") @@ -258,6 +261,7 @@ def genome(args): keep_identifier_versions=args.keep_identifier_versions, force=args.force, lins=args.lins, + ictv=args.ictv, ) available_ranks = tax_assign.available_ranks @@ -297,6 +301,7 @@ def genome(args): keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions=args.keep_identifier_versions, lins=args.lins, + ictv=args.ictv, ) except ValueError as exc: @@ -402,6 +407,7 @@ def annotate(args): keep_identifier_versions=args.keep_identifier_versions, force=args.force, lins=args.lins, + ictv=args.ictv, ) except ValueError as exc: @@ -466,6 +472,7 @@ def annotate(args): raw=row, id_col=id_col, lins=args.lins, + ictv=args.ictv, keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions=args.keep_identifier_versions, ) @@ -591,6 +598,7 @@ def summarize(args): keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions=args.keep_identifier_versions, lins=args.lins, + ictv=args.ictv, ) except ValueError as exc: error("ERROR while loading taxonomies!") @@ -637,6 +645,8 @@ def summarize(args): rank = lineage[-1].rank if args.lins: inf = LINLineageInfo(lineage=lineage) + elif args.ictv: + inf = ICTVRankLineageInfo(lineage=lineage) else: inf = RankLineageInfo(lineage=lineage) lin = inf.display_lineage() diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index d1827c2aad..55feed66d2 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -48,6 +48,36 @@ "unclassified": "U", } +ICTV_RANKS = ( + "realm", + "subrealm", + "kingdom", + "subkingdom", + "phylum", + "subphylum", + "class", + "subclass", + "order", + "suborder", + "family", + "subfamily", + "genus", + "subgenus", + "species", + "name", +) + +NCBI_RANKS = ( + "superkingdom", + "phylum", + "class", + "order", + "family", + "genus", + "species", + "strain", +) + class LineagePair(NamedTuple): rank: str @@ -314,7 +344,7 @@ def find_lca(self, other): @dataclass(frozen=True, order=True) class RankLineageInfo(BaseLineageInfo): """ - This RankLineageInfo class usees the BaseLineageInfo methods for a standard set + This RankLineageInfo class uses the BaseLineageInfo methods for a standard set of taxonomic ranks. Inputs: @@ -332,16 +362,7 @@ class RankLineageInfo(BaseLineageInfo): and will not be used or compared in any other class methods. """ - ranks: tuple = ( - "superkingdom", - "phylum", - "class", - "order", - "family", - "genus", - "species", - "strain", - ) + ranks: tuple = NCBI_RANKS lineage_dict: dict = field(default=None, compare=False) # dict of rank: name def __post_init__(self): @@ -408,6 +429,79 @@ def _init_from_lineage_dict(self): object.__setattr__(self, "filled_ranks", tuple(filled_ranks)) +@dataclass(frozen=True, order=True) +class ICTVRankLineageInfo(RankLineageInfo): + """ + This ICTV RankLineageInfo class uses the RankLineageInfo methods but uses + the 15-rank ICTV taxonomy. It also allows for a 'name' column in the taxonomy, + which reflects that virus name is sometimes used as a sub-species rank. + + Inputs: + optional: + ranks: tuple or list of hierarchical ranks + default: ('realm','subrealm','kingdom','subkingdom','phylum','subphylum', + 'class','subclass','order','suborder','family','subfamily', + 'genus','subgenus','species','name') + lineage: tuple or list of LineagePair + lineage_str: `;`- or `,`-separated string of names + lineage_dict: dictionary of {rank: name} + + If no inputs are provided, result will be ICTVRankLineageInfo with + default ranks and no lineage names. + + Input lineage information is only used for initialization of the final `lineage` + and will not be used or compared in any other class methods. + """ + + ranks: tuple = ICTV_RANKS + lineage_dict: dict = field(default=None, compare=False) # dict of rank: name + + def __post_init__(self): + "Initialize according to passed values" + object.__setattr__(self, "ranks", ICTV_RANKS) + if self.lineage is not None: + self._init_from_lineage_tuples() + elif self.lineage_str is not None: + self._init_from_lineage_str() + elif self.lineage_dict is not None: + self._init_from_lineage_dict() + elif self.ranks: + self._init_empty() + + def _init_from_lineage_dict(self): + """ + Initialize from lineage dict, e.g. from lineages csv. + Allows empty ranks/extra columns and reordering if necessary + """ + null_names = set(["[Blank]", "na", "null", "NA", ""]) + if not isinstance(self.lineage_dict, (dict)): + raise ValueError(f"{self.lineage_dict} is not dictionary") + new_lineage = [] + # build empty lineage and taxpath + for rank in self.ranks: + new_lineage.append(LineagePair(rank=rank)) + + # now add rank information in correct spots. This corrects for order and allows empty ranks and extra dict keys + for key, val in self.lineage_dict.items(): + name = None + try: + rank, name = key, val + rank_idx = self.rank_index(rank) + except ValueError: + continue # ignore dictionary entries (columns) that don't match a rank + + # filter null + if name is not None and name.strip() in null_names: + name = None + new_lineage[rank_idx] = LineagePair(rank=rank, name=name) + + # build list of filled ranks + filled_ranks = [a.rank for a in new_lineage if a.name] + # set lineage and filled_ranks + object.__setattr__(self, "lineage", tuple(new_lineage)) + object.__setattr__(self, "filled_ranks", tuple(filled_ranks)) + + @dataclass(frozen=True, order=True) class LINLineageInfo(BaseLineageInfo): """ @@ -557,7 +651,10 @@ def __post_init__(self): self.add_lineages(self.assignments) def add_lineage(self, lineage): - if isinstance(lineage, BaseLineageInfo | RankLineageInfo | LINLineageInfo): + if isinstance( + lineage, + BaseLineageInfo | RankLineageInfo | LINLineageInfo | ICTVRankLineageInfo, + ): lineage = lineage.filled_lineage node = self.tree for lineage_tup in lineage: @@ -724,6 +821,7 @@ def load_gather_results( keep_full_identifiers=False, keep_identifier_versions=False, lins=False, + ictv=False, ): "Load a single gather csv" if not seen_queries: @@ -761,6 +859,7 @@ def load_gather_results( keep_full_identifiers=keep_full_identifiers, keep_identifier_versions=keep_identifier_versions, lins=lins, + ictv=ictv, ) taxres.get_match_lineage( tax_assignments=tax_assignments, @@ -771,7 +870,8 @@ def load_gather_results( if not this_querytaxres or not this_querytaxres.is_compatible(taxres): # get existing or initialize new this_querytaxres = gather_results.get( - gatherRow.query_name, QueryTaxResult(taxres.query_info, lins=lins) + gatherRow.query_name, + QueryTaxResult(taxres.query_info, lins=lins, ictv=ictv), ) this_querytaxres.add_taxresult(taxres) gather_results[gatherRow.query_name] = this_querytaxres @@ -795,6 +895,7 @@ def check_and_load_gather_csvs( keep_full_identifiers=False, keep_identifier_versions=False, lins=False, + ictv=False, ): """ Load gather csvs, checking for empties and ids missing from taxonomic assignments. @@ -816,6 +917,7 @@ def check_and_load_gather_csvs( keep_identifier_versions=keep_identifier_versions, fail_on_missing_taxonomy=fail_on_missing_taxonomy, lins=lins, + ictv=ictv, ) except ValueError as exc: if force: @@ -1134,6 +1236,7 @@ def load( keep_full_identifiers=False, keep_identifier_versions=True, lins=False, + ictv=False, ): """ Load a taxonomy assignment CSV file into a LineageDB. @@ -1156,7 +1259,7 @@ def load( if os.path.isdir(filename): raise ValueError(f"'{filename}' is a directory") - with sourmash_args.FileInputCSV(filename) as r: + with sourmash_args.FileInputCSV(filename, delimiter=",") as r: header = r.fieldnames if not header: raise ValueError(f"cannot read taxonomy assignments from {filename}") @@ -1173,7 +1276,7 @@ def load( header = ["ident" if "accession" == x else x for x in header] elif "name" in header and "lineage" in header: return cls.load_from_gather_with_lineages( - filename, force=force, lins=lins + filename, force=force, lins=lins, ictv=ictv ) else: header_str = ",".join([repr(x) for x in header]) @@ -1181,12 +1284,21 @@ def load( f"No taxonomic identifiers found; headers are {header_str}" ) - if lins and "lin" not in header: - raise ValueError( - f"'lin' column not found: cannot read LIN taxonomy assignments from {filename}." - ) + if lins: + notify("Trying to read LIN taxonomy assignments.") + if "lin" not in header: + raise ValueError( + f"'lin' column not found: cannot read LIN taxonomy assignments from {filename}." + ) + + if ictv: + notify("Trying to read ICTV taxonomy assignments.") + # check that all ranks are in header + ranks = list(ICTVRankLineageInfo().taxlist) + if not set(ranks).issubset(header): + raise ValueError("Not all taxonomy ranks present") - if not lins: + if not lins and not ictv: # is "strain" an available rank? if "strain" in header: include_strain = True @@ -1217,10 +1329,12 @@ def load( "For taxonomic summarization, all LIN assignments must use the same number of LIN positions." ) else: - n_pos = ( - lineageInfo.n_lin_positions - ) # set n_pos with first entry + # set n_pos with first entry + n_pos = lineageInfo.n_lin_positions ranks = lineageInfo.ranks + elif ictv: + # read lineage from row dictionary + lineageInfo = ICTVRankLineageInfo(lineage_dict=row) else: # read lineage from row dictionary lineageInfo = RankLineageInfo(lineage_dict=row) @@ -1247,7 +1361,7 @@ def load( else: assignments[ident] = lineage - if not lins: + if not lins and not ictv: if lineage[-1].rank == "species": n_species += 1 elif lineage[-1].rank == "strain": @@ -1257,7 +1371,9 @@ def load( return LineageDB(assignments, ranks) @classmethod - def load_from_gather_with_lineages(cls, filename, *, force=False, lins=False): + def load_from_gather_with_lineages( + cls, filename, *, force=False, lins=False, ictv=False + ): """ Load an annotated gather-with-lineages CSV file produced by 'tax annotate' into a LineageDB. @@ -1294,6 +1410,8 @@ def load_from_gather_with_lineages(cls, filename, *, force=False, lins=False): if lins: lineageInfo = LINLineageInfo(lineage_str=row["lineage"]) + elif ictv: + lineageInfo = ICTVRankLineageInfo(lineage_str=row["lineage"]) else: lineageInfo = RankLineageInfo(lineage_str=row["lineage"]) @@ -1772,6 +1890,7 @@ class BaseTaxResult: missed_ident: bool = False match_lineage_attempted: bool = False lins: bool = False + ictv: bool = False def get_ident(self, id_col=None): # split identifiers = split on whitespace @@ -1799,6 +1918,8 @@ def get_match_lineage( if lin: if self.lins: self.lineageInfo = LINLineageInfo(lineage=lin) + elif self.ictv: + self.lineageInfo = ICTVRankLineageInfo(lineage=lin) else: self.lineageInfo = RankLineageInfo(lineage=lin) else: @@ -1858,7 +1979,7 @@ class TaxResult(BaseTaxResult): # get match lineage tax_res.get_match_lineage(taxD=taxonomic_assignments) - Use RankLineageInfo or LINLineageInfo to store lineage information. + Use RankLineageInfo, ICTVLineageInfo, or LINLineageInfo to store lineage information. """ raw: GatherRow @@ -1884,6 +2005,8 @@ def __post_init__(self): self.unique_intersect_bp = int(self.raw.unique_intersect_bp) if self.lins: self.lineageInfo = LINLineageInfo() + elif self.ictv: + self.lineageInfo = ICTVRankLineageInfo() else: self.lineageInfo = RankLineageInfo() @@ -2109,6 +2232,7 @@ class QueryTaxResult: query_info: QueryInfo # initialize with QueryInfo dataclass lins: bool = False + ictv: bool = False def __post_init__(self): self.query_name = self.query_info.query_name # for convenience @@ -2147,7 +2271,11 @@ def _init_classification_results(self): self.krona_header = [] def is_compatible(self, taxresult): - return taxresult.query_info == self.query_info and taxresult.lins == self.lins + return ( + taxresult.query_info == self.query_info + and taxresult.lins == self.lins + and taxresult.ictv == self.ictv + ) @property def ascending_ranks(self): @@ -2280,6 +2408,8 @@ def build_summarized_result(self, single_rank=None, force_resummarize=False): # record unclassified if self.lins: lineage = LINLineageInfo() + elif self.ictv: + lineage = ICTVRankLineageInfo() else: lineage = RankLineageInfo() query_ani = None diff --git a/tests/test-data/tax/test.ictv-taxonomy.csv b/tests/test-data/tax/test.ictv-taxonomy.csv new file mode 100644 index 0000000000..d99a76601c --- /dev/null +++ b/tests/test-data/tax/test.ictv-taxonomy.csv @@ -0,0 +1,8 @@ +ident,superkingdom,realm,subrealm,kingdom,subkingdom,phylum,subphylum,class,subclass,order,suborder,family,subfamily,genus,subgenus,species,name +GCF_000017325,Viruses,Riboviria,,Orthornavirae,,Negarnaviricota,Haploviricotina,Monjiviricetes,,Mononegavirales,,Filoviridae,,Orthoebolavirus,,Orthoebolavirus zairense,Ebola virus +GCF_000021665,Viruses,Riboviria,,Orthornavirae,,Negarnaviricota,Haploviricotina,Monjiviricetes,,Mononegavirales,,Filoviridae,,Orthoebolavirus,,Orthoebolavirus sudanense,Sudan virus +GCF_009494285,Viruses,Riboviria,,Orthornavirae,,Negarnaviricota,Haploviricotina,Monjiviricetes,,Mononegavirales,,Filoviridae,,Orthoebolavirus,,Orthoebolavirus taiense,Taï Forest virus +GCF_003471795,Viruses,Riboviria,,Orthornavirae,,Negarnaviricota,Haploviricotina,Monjiviricetes,,Mononegavirales,,Filoviridae,,Orthoebolavirus,,Orthoebolavirus restonense,Reston virus +GCF_001881345,Viruses,Riboviria,,Orthornavirae,,Negarnaviricota,Haploviricotina,Monjiviricetes,,Mononegavirales,,Filoviridae,,Orthoebolavirus,,Orthoebolavirus bundibugyoense,Bundibugyo virus +GCF_013368705,Viruses,Riboviria,,Orthornavirae,,Negarnaviricota,Haploviricotina,Monjiviricetes,,Mononegavirales,,Filoviridae,,Orthoebolavirus,,Orthoebolavirus bombaliense,Bombali virus +GCF_013311112,Viruses,Riboviria,,Orthornavirae,,Negarnaviricota,Haploviricotina,Monjiviricetes,,Mononegavirales,,Filoviridae,,other-genus,,other-sp,other test diff --git a/tests/test_tax.py b/tests/test_tax.py index 3f766f5e37..70b4f14fc0 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -1126,6 +1126,131 @@ def test_metagenome_bad_rank_krona(runtmp): ) +def test_metagenome_ictv(runtmp): + # test basic metagenome + c = runtmp + + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.ictv-taxonomy.csv") + + c.run_sourmash("tax", "metagenome", "-g", g_csv, "--taxonomy-csv", tax, "--ictv") + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + print(c.last_result.out) + assert ( + "query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank" + in c.last_result.out + ) + assert ( + "test1,realm,0.204,Riboviria,md5,test1.sig,0.131,1024000,0.950,0" + in c.last_result.out + ) + assert ( + "test1,realm,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" + in c.last_result.out + ) + assert ( + "test1,kingdom,0.204,Riboviria;;Orthornavirae,md5,test1.sig,0.131,1024000,0.950,0" + in c.last_result.out + ) + assert ( + "test1,kingdom,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" + in c.last_result.out + ) + assert ( + "test1,phylum,0.204,Riboviria;;Orthornavirae;;Negarnaviricota,md5,test1.sig,0.131,1024000,0.950,0" + in c.last_result.out + ) + assert ( + "test1,phylum,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" + in c.last_result.out + ) + assert ( + "test1,subphylum,0.204,Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina,md5,test1.sig,0.131,1024000,0.950,0" + in c.last_result.out + ) + assert ( + "test1,subphylum,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" + in c.last_result.out + ) + assert ( + "test1,class,0.204,Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes,md5,test1.sig,0.131,1024000,0.950,0" + in c.last_result.out + ) + assert ( + "test1,class,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" + in c.last_result.out + ) + assert ( + "test1,order,0.204,Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes;;Mononegavirales,md5,test1.sig,0.131,1024000,0.950,0" + in c.last_result.out + ) + assert ( + "test1,order,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" + in c.last_result.out + ) + assert ( + "test1,family,0.204,Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes;;Mononegavirales;;Filoviridae,md5,test1.sig,0.131,1024000,0.950,0" + in c.last_result.out + ) + assert ( + "test1,family,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" + in c.last_result.out + ) + assert ( + "test1,genus,0.204,Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes;;Mononegavirales;;Filoviridae;;Orthoebolavirus,md5,test1.sig,0.131,1024000,0.950,0" + in c.last_result.out + ) + assert ( + "test1,genus,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" + in c.last_result.out + ) + assert ( + "test1,species,0.088,Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes;;Mononegavirales;;Filoviridae;;Orthoebolavirus;;Orthoebolavirus bundibugyoense,md5,test1.sig,0.058,442000,0.925,0" + in c.last_result.out + ) + assert ( + "test1,species,0.078,Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes;;Mononegavirales;;Filoviridae;;Orthoebolavirus;;Orthoebolavirus taiense,md5,test1.sig,0.050,390000,0.921,0" + in c.last_result.out + ) + assert ( + "test1,species,0.028,Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes;;Mononegavirales;;Filoviridae;;Orthoebolavirus;;Orthoebolavirus bombaliense,md5,test1.sig,0.016,138000,0.891,0" + in c.last_result.out + ) + assert ( + "test1,species,0.011,Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes;;Mononegavirales;;Filoviridae;;Orthoebolavirus;;Orthoebolavirus restonense,md5,test1.sig,0.007,54000,0.864,0" + in c.last_result.out + ) + assert ( + "test1,species,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" + in c.last_result.out + ) + assert ( + "test1,name,0.088,Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes;;Mononegavirales;;Filoviridae;;Orthoebolavirus;;Orthoebolavirus bundibugyoense;Bundibugyo virus,md5,test1.sig,0.058,442000,0.925,0" + in c.last_result.out + ) + assert ( + "test1,name,0.078,Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes;;Mononegavirales;;Filoviridae;;Orthoebolavirus;;Orthoebolavirus taiense;Taï Forest virus,md5,test1.sig,0.050,390000,0.921,0" + in c.last_result.out + ) + assert ( + "test1,name,0.028,Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes;;Mononegavirales;;Filoviridae;;Orthoebolavirus;;Orthoebolavirus bombaliense;Bombali virus,md5,test1.sig,0.016,138000,0.891,0" + in c.last_result.out + ) + assert ( + "test1,name,0.011,Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes;;Mononegavirales;;Filoviridae;;Orthoebolavirus;;Orthoebolavirus restonense;Reston virus,md5,test1.sig,0.007,54000,0.864,0" + in c.last_result.out + ) + assert ( + "test1,name,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" + in c.last_result.out + ) + + def test_genome_no_rank_krona(runtmp): g_csv = utils.get_test_data("tax/test1.gather.csv") tax = utils.get_test_data("tax/test.taxonomy.csv") @@ -2897,6 +3022,121 @@ def test_genome_gather_two_queries(runtmp): ) +def test_genome_gather_ictv(runtmp): + """ + test genome classification with ictv taxonomy + """ + c = runtmp + taxonomy_csv = utils.get_test_data("tax/test.ictv-taxonomy.csv") + g_res = utils.get_test_data("tax/47+63_x_gtdb-rs202.gather.csv") + + c.run_sourmash( + "tax", + "genome", + "-g", + g_res, + "--taxonomy-csv", + taxonomy_csv, + "--containment-threshold", + "0", + "--ictv", + ) + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert "query_name,status,rank,fraction,lineage" in c.last_result.out + assert ( + "47+63,match,name,0.664,Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes;;Mononegavirales;;Filoviridae;;Orthoebolavirus;;Orthoebolavirus sudanense;Sudan virus,491c0a81,,0.664,5238000,0.987" + in c.last_result.out + ) + + +def test_genome_gather_ictv_twoqueries(runtmp): + """ + test genome classification with ictv taxonomy + """ + c = runtmp + taxonomy_csv = utils.get_test_data("tax/test.ictv-taxonomy.csv") + g_res = utils.get_test_data("tax/47+63_x_gtdb-rs202.gather.csv") + + # split 47+63 into two fake queries: q47, q63 + g_res2 = runtmp.output("two-queries.gather.csv") + q2_results = [x + "\n" for x in Path(g_res).read_text().splitlines()] + # rename queries + q2_results[1] = q2_results[1].replace("47+63", "q47") + q2_results[2] = q2_results[2].replace("47+63", "q63") + with open(g_res2, "w") as fp: + for line in q2_results: + print(line) + fp.write(line) + + c.run_sourmash( + "tax", + "genome", + "-g", + g_res2, + "--taxonomy-csv", + taxonomy_csv, + "--containment-threshold", + "0", + "--ictv", + ) + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + print(c.last_result.out) + assert "query_name,status,rank,fraction,lineage" in c.last_result.out + assert ( + "q47,match,name,0.664,Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes;;Mononegavirales;;Filoviridae;;Orthoebolavirus;;Orthoebolavirus sudanense;Sudan virus,491c0a81,,0.664,5238000,0.987" + in c.last_result.out + ) + assert ( + "q63,match,name,0.336,Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes;;Mononegavirales;;Filoviridae;;Orthoebolavirus;;Orthoebolavirus zairense;Ebola virus,491c0a81,,0.336,2648000,0.965" + in c.last_result.out + ) + + +def test_genome_gather_ictv_fail(runtmp): + """ + test genome classification with ictv taxonomy + """ + c = runtmp + taxonomy_csv = utils.get_test_data("tax/test.ictv-taxonomy.csv") + tax2_csv = runtmp.output("ictv-taxfail") + # copy taxonomy csv to new file, but remove one of the columns + with open(taxonomy_csv) as inF: + with open(tax2_csv, "w") as outF: + for line in inF.readlines(): + line = line.rsplit(",", 1)[0] + outF.write(f"{line}\n") + + g_res = utils.get_test_data("tax/47+63_x_gtdb-rs202.gather.csv") + + with pytest.raises(SourmashCommandFailed) as exc: + c.run_sourmash( + "tax", + "genome", + "-g", + g_res, + "--taxonomy-csv", + tax2_csv, + "--containment-threshold", + "0", + "--ictv", + ) + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status != 0 + print(c.last_result.out) + assert "Not all taxonomy ranks present" in str(exc.value) + + def test_genome_rank_duplicated_taxonomy_fail(runtmp): c = runtmp # write temp taxonomy with duplicates @@ -4159,6 +4399,57 @@ def test_annotate_gzipped_gather(runtmp): ) +def test_annotate_0_ictv(runtmp): + # test annotate basics + c = runtmp + + g_csv = utils.get_test_data("tax/test1.gather.csv") + tax = utils.get_test_data("tax/test.ictv-taxonomy.csv") + csvout = runtmp.output("test1.gather.with-lineages.csv") + out_dir = os.path.dirname(csvout) + + c.run_sourmash( + "tax", + "annotate", + "--gather-csv", + g_csv, + "--taxonomy-csv", + tax, + "-o", + out_dir, + "--ictv", + ) + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert os.path.exists(csvout) + + gather_results = [x.rstrip() for x in Path(csvout).read_text().splitlines()] + print("\n".join(gather_results)) + assert f"saving 'annotate' output to '{csvout}'" in runtmp.last_result.err + + assert "lineage" in gather_results[0] + assert ( + "Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes;;Mononegavirales;;Filoviridae;;Orthoebolavirus;;Orthoebolavirus bundibugyoense;Bundibugyo virus" + in gather_results[1] + ) + assert ( + "Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes;;Mononegavirales;;Filoviridae;;Orthoebolavirus;;Orthoebolavirus taiense;Taï Forest virus" + in gather_results[2] + ) + assert ( + "Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes;;Mononegavirales;;Filoviridae;;Orthoebolavirus;;Orthoebolavirus bombaliense;Bombali virus" + in gather_results[3] + ) + assert ( + "Riboviria;;Orthornavirae;;Negarnaviricota;Haploviricotina;Monjiviricetes;;Mononegavirales;;Filoviridae;;Orthoebolavirus;;Orthoebolavirus restonense;Reston virus" + in gather_results[4] + ) + + def test_annotate_0_LIN(runtmp): # test annotate basics c = runtmp @@ -5466,6 +5757,44 @@ def test_tax_summarize_strain_csv_with_lineages(runtmp): assert c["1"] == 11 +def test_tax_summarize_ictv(runtmp): + # test basic operation w/csv output on lineages-style file w/strain csv + taxfile = utils.get_test_data("tax/test.ictv-taxonomy.csv") + lineage_csv = runtmp.output("ictv-lins.csv") + + taxdb = tax_utils.LineageDB.load(taxfile) + with open(lineage_csv, "w", newline="") as fp: + w = csv.writer(fp) + w.writerow(["name", "lineage"]) + for k, v in taxdb.items(): + linstr = lca_utils.display_lineage(v) + w.writerow([k, linstr]) + + runtmp.sourmash("tax", "summarize", lineage_csv, "-o", "ranks.csv", "--ictv") + + out = runtmp.last_result.out + err = runtmp.last_result.err + + assert "number of distinct taxonomic lineages: 7" in out + assert "saved 14 lineage counts to" in err + + csv_out = runtmp.output("ranks.csv") + + with sourmash_args.FileInputCSV(csv_out) as r: + # count number across ranks as a cheap consistency check + c = Counter() + for row in r: + print(row) + val = row["lineage_count"] + c[val] += 1 + + print(list(c.most_common())) + print(c) + assert c["1"] == 8 + assert c["7"] == 5 + assert c["6"] == 1 + + def test_tax_summarize_LINS(runtmp): # test basic operation w/LINs taxfile = utils.get_test_data("tax/test.LIN-taxonomy.csv") diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index bd0060b65a..192406e251 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -28,6 +28,7 @@ BaseLineageInfo, RankLineageInfo, LINLineageInfo, + ICTVRankLineageInfo, aggregate_by_lineage_at_rank, format_for_krona, write_krona, @@ -38,6 +39,8 @@ LineageDB_Sqlite, MultiLineageDB, filter_row, + NCBI_RANKS, + ICTV_RANKS, ) @@ -2018,6 +2021,22 @@ def test_lca_RankLineageInfo_no_lca(): assert lca_from_lin1 == lca_from_lin2 is None +def test_ICTVLineageInfo_ranks_input_ignored(): + # check that ranks cannot be changed + taxinfo = ICTVRankLineageInfo(ranks=["one", "two"]) + assert taxinfo.taxlist == ICTV_RANKS + + +def test_ICTVLineageInfo_lineagedict_input(): + # check that ranks cannot be changed + dummy_names = [f"name{i}" for i in range(1, len(ICTV_RANKS) + 1)] + input_lindict = dict(zip(ICTV_RANKS, dummy_names)) + taxinfo = ICTVRankLineageInfo(lineage_dict=input_lindict) + print(taxinfo.display_lineage()) + assert taxinfo.display_lineage() == ";".join(dummy_names) + assert taxinfo.taxlist == ICTV_RANKS + + def test_incompatibility_LINLineageInfo_RankLineageInfo(): x = "a;b;c" lin1 = RankLineageInfo(lineage_str=x)